Diffstat (limited to 'src/gallium/drivers')
284 files changed, 42769 insertions, 3470 deletions
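The patch below adds a JOIN/STATIC_ASSERT macro pair to cell/common.h and uses STATIC_ASSERT(sizeof(...) % 16 == 0) checks on the batch-buffer command structs so SPU DMA transfers stay 16-byte aligned. As a minimal standalone sketch of how that compile-time check works (the two macros are copied from the patch; example_command is a made-up struct for illustration, not one from the driver):

#include <stdint.h>

#define JOIN(x, y) JOIN_AGAIN(x, y)
#define JOIN_AGAIN(x, y) x ## y

#define STATIC_ASSERT(e) \
   {typedef char JOIN(assertion_failed_at_line_, __LINE__) [(e) ? 1 : -1];}

typedef unsigned int opcode_t[4];   /* 16-byte opcode qword, as on the SPU side */

/* Hypothetical command struct, padded to a 16-byte multiple in the same
 * style as the cell_command_* structs in the patch. */
struct example_command {
   opcode_t opcode;          /* 16 bytes */
   uint32_t width, height;   /*  8 bytes */
   uint32_t pad_[2];         /*  8 bytes of padding -> 32 bytes total */
};

int main(void)
{
   /* Passes: 32 is a multiple of 16.  Shrinking pad_ to pad_[1] would make
    * the condition false, the typedef'd array size -1, and the build fail. */
   STATIC_ASSERT(sizeof(struct example_command) % 16 == 0);
   return 0;
}

Because __LINE__ is pasted into the typedef name, multiple asserts in one function do not collide, and a failure reports the offending line number in the type name at compile time rather than at run time.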
diff --git a/src/gallium/drivers/Makefile b/src/gallium/drivers/Makefile index 6161cb6ff8..9fe9b2c11d 100644 --- a/src/gallium/drivers/Makefile +++ b/src/gallium/drivers/Makefile @@ -1,20 +1,12 @@ +# src/gallium/drivers/Makefile TOP = ../../.. include $(TOP)/configs/current +SUBDIRS = $(GALLIUM_DRIVERS_DIRS) -SUBDIRS = $(GALLIUM_DRIVER_DIRS) - - -default: subdirs - - -subdirs: +default install clean: @for dir in $(SUBDIRS) ; do \ if [ -d $$dir ] ; then \ - (cd $$dir && $(MAKE)) || exit 1 ; \ + (cd $$dir && $(MAKE) $@) || exit 1; \ fi \ done - - -clean: - rm -f `find . -name \*.[oa]` diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index cb0631baf5..1f6860da11 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -49,6 +49,15 @@ } + +#define JOIN(x, y) JOIN_AGAIN(x, y) +#define JOIN_AGAIN(x, y) x ## y + +#define STATIC_ASSERT(e) \ +{typedef char JOIN(assertion_failed_at_line_, __LINE__) [(e) ? 1 : -1];} + + + /** for sanity checking */ #define ASSERT_ALIGN16(ptr) \ ASSERT((((unsigned long) (ptr)) & 0xf) == 0); @@ -64,9 +73,13 @@ #define ROUNDUP16(k) (((k) + 0xf) & ~0xf) -#define CELL_MAX_SPUS 6 +#define CELL_MAX_SPUS 8 #define CELL_MAX_SAMPLERS 4 +#define CELL_MAX_TEXTURE_LEVELS 12 /* 2k x 2k */ +#define CELL_MAX_CONSTANTS 32 /**< number of float[4] constants */ +#define CELL_MAX_WIDTH 1024 /**< max framebuffer width */ +#define CELL_MAX_HEIGHT 1024 /**< max framebuffer width */ #define TILE_SIZE 32 @@ -94,48 +107,106 @@ #define CELL_CMD_STATE_BIND_VS 18 #define CELL_CMD_STATE_FRAGMENT_PROGRAM 19 #define CELL_CMD_STATE_ATTRIB_FETCH 20 -#define CELL_CMD_VS_EXECUTE 22 -#define CELL_CMD_FLUSH_BUFFER_RANGE 23 +#define CELL_CMD_STATE_FS_CONSTANTS 21 +#define CELL_CMD_STATE_RASTERIZER 22 +#define CELL_CMD_VS_EXECUTE 23 +#define CELL_CMD_FLUSH_BUFFER_RANGE 24 +#define CELL_CMD_FENCE 25 +/** Command/batch buffers */ #define CELL_NUM_BUFFERS 4 #define CELL_BUFFER_SIZE (4*1024) /**< 16KB would be the max */ #define CELL_BUFFER_STATUS_FREE 10 #define CELL_BUFFER_STATUS_USED 20 +/** Debug flags */ +#define CELL_DEBUG_CHECKER (1 << 0) +#define CELL_DEBUG_ASM (1 << 1) +#define CELL_DEBUG_SYNC (1 << 2) +#define CELL_DEBUG_FRAGMENT_OPS (1 << 3) +#define CELL_DEBUG_FRAGMENT_OP_FALLBACK (1 << 4) +#define CELL_DEBUG_CMD (1 << 5) +#define CELL_DEBUG_CACHE (1 << 6) -#define CELL_DEBUG_CHECKER (1 << 0) -#define CELL_DEBUG_SYNC (1 << 1) +#define CELL_FENCE_IDLE 0 +#define CELL_FENCE_EMITTED 1 +#define CELL_FENCE_SIGNALLED 2 +#define CELL_FACING_FRONT 0 +#define CELL_FACING_BACK 1 +struct cell_fence +{ + /** There's a 16-byte status qword per SPU */ + volatile uint status[CELL_MAX_SPUS][4]; +}; -/** Max instructions for doing per-fragment operations */ -#define SPU_MAX_FRAGMENT_OPS_INSTS 64 +#ifdef __SPU__ +typedef vector unsigned int opcode_t; +#else +typedef unsigned int opcode_t[4]; +#endif + +/** + * Fence command sent to SPUs. In response, the SPUs will write + * CELL_FENCE_STATUS_SIGNALLED back to the fence status word in main memory. + */ +struct cell_command_fence +{ + opcode_t opcode; /**< CELL_CMD_FENCE */ + struct cell_fence *fence; + uint32_t pad_[3]; +}; /** * Command to specify per-fragment operations state and generated code. + * Note that this is a variant-length structure, allocated with as + * much memory as needed to hold the generated code; the "code" + * field *must* be the last field in the structure. 
Also, the entire + * length of the structure (including the variant code field) must be + * a multiple of 8 bytes; we require that this structure itself be + * a multiple of 8 bytes, and that the generated code also be a multiple + * of 8 bytes. + * + * Also note that the dsa, blend, blend_color fields are really only needed + * for the fallback/C per-pixel code. They're not used when we generate + * dynamic SPU fragment code (which is the normal case), and will eventually + * be removed from this structure. */ struct cell_command_fragment_ops { - uint64_t opcode; /**< CELL_CMD_STATE_FRAGMENT_OPS */ + opcode_t opcode; /**< CELL_CMD_STATE_FRAGMENT_OPS */ + + /* Fields for the fallback case */ struct pipe_depth_stencil_alpha_state dsa; struct pipe_blend_state blend; - unsigned code[SPU_MAX_FRAGMENT_OPS_INSTS]; + struct pipe_blend_color blend_color; + + /* Fields for the generated SPU code */ + unsigned total_code_size; + unsigned front_code_index; + unsigned back_code_index; + /* this field has variant length, and must be the last field in + * the structure + */ + unsigned code[0]; }; /** Max instructions for fragment programs */ -#define SPU_MAX_FRAGMENT_PROGRAM_INSTS 128 +#define SPU_MAX_FRAGMENT_PROGRAM_INSTS 512 /** - * Command to send a fragment progra to SPUs. + * Command to send a fragment program to SPUs. */ struct cell_command_fragment_program { - uint64_t opcode; /**< CELL_CMD_STATE_FRAGMENT_PROGRAM */ + opcode_t opcode; /**< CELL_CMD_STATE_FRAGMENT_PROGRAM */ uint num_inst; /**< Number of instructions */ + uint32_t pad[3]; unsigned code[SPU_MAX_FRAGMENT_PROGRAM_INSTS]; }; @@ -145,10 +216,21 @@ struct cell_command_fragment_program */ struct cell_command_framebuffer { - uint64_t opcode; /**< CELL_CMD_FRAMEBUFFER */ + opcode_t opcode; /**< CELL_CMD_STATE_FRAMEBUFFER */ int width, height; void *color_start, *depth_start; enum pipe_format color_format, depth_format; + uint32_t pad_[2]; +}; + + +/** + * Tell SPUs about rasterizer state. 
+ */ +struct cell_command_rasterizer +{ + opcode_t opcode; /**< CELL_CMD_STATE_RASTERIZER */ + struct pipe_rasterizer_state rasterizer; }; @@ -157,9 +239,10 @@ struct cell_command_framebuffer */ struct cell_command_clear_surface { - uint64_t opcode; /**< CELL_CMD_CLEAR_SURFACE */ + opcode_t opcode; /**< CELL_CMD_CLEAR_SURFACE */ uint surface; /**< Temporary: 0=color, 1=Z */ uint value; + uint32_t pad[2]; }; @@ -206,7 +289,7 @@ struct cell_shader_info #define SPU_VERTS_PER_BATCH 64 struct cell_command_vs { - uint64_t opcode; /**< CELL_CMD_VS_EXECUTE */ + opcode_t opcode; /**< CELL_CMD_VS_EXECUTE */ uint64_t vOut[SPU_VERTS_PER_BATCH]; unsigned num_elts; unsigned elts[SPU_VERTS_PER_BATCH]; @@ -218,7 +301,7 @@ struct cell_command_vs struct cell_command_render { - uint64_t opcode; /**< CELL_CMD_RENDER */ + opcode_t opcode; /**< CELL_CMD_RENDER */ uint prim_type; /**< PIPE_PRIM_x */ uint num_verts; uint vertex_size; /**< bytes per vertex */ @@ -227,44 +310,51 @@ struct cell_command_render float xmin, ymin, xmax, ymax; /* XXX another dummy field */ uint min_index; boolean inline_verts; + uint32_t pad_[1]; }; struct cell_command_release_verts { - uint64_t opcode; /**< CELL_CMD_RELEASE_VERTS */ + opcode_t opcode; /**< CELL_CMD_RELEASE_VERTS */ uint vertex_buf; /**< in [0, CELL_NUM_BUFFERS-1] */ + uint32_t pad_[3]; }; struct cell_command_sampler { - uint64_t opcode; /**< CELL_CMD_STATE_SAMPLER */ + opcode_t opcode; /**< CELL_CMD_STATE_SAMPLER */ uint unit; struct pipe_sampler_state state; + uint32_t pad_[1]; }; struct cell_command_texture { - uint64_t opcode; /**< CELL_CMD_STATE_TEXTURE */ + opcode_t opcode; /**< CELL_CMD_STATE_TEXTURE */ + uint target; /**< PIPE_TEXTURE_x */ uint unit; - void *start; /**< Address in main memory */ - ushort width, height; + void *start[CELL_MAX_TEXTURE_LEVELS]; /**< Address in main memory */ + ushort width[CELL_MAX_TEXTURE_LEVELS]; + ushort height[CELL_MAX_TEXTURE_LEVELS]; + ushort depth[CELL_MAX_TEXTURE_LEVELS]; }; -/** XXX unions don't seem to work */ -/* XXX this should go away; all commands should be placed in batch buffers */ -struct cell_command +#define MAX_SPU_FUNCTIONS 12 +/** + * Used to tell the PPU about the address of particular functions in the + * SPU's address space. 
+ */ +struct cell_spu_function_info { -#if 0 - struct cell_command_framebuffer fb; - struct cell_command_clear_surface clear; - struct cell_command_render render; -#endif - struct cell_command_vs vs; -} ALIGN16_ATTRIB; + uint num; + char names[MAX_SPU_FUNCTIONS][16]; + uint addrs[MAX_SPU_FUNCTIONS]; + char pad[12]; /**< Pad struct to multiple of 16 bytes (256 currently) */ +}; /** This is the object passed to spe_create_thread() */ @@ -273,11 +363,13 @@ struct cell_init_info unsigned id; unsigned num_spus; unsigned debug_flags; /**< mask of CELL_DEBUG_x flags */ - struct cell_command *cmd; + float inv_timebase; /**< 1.0/timebase, for perf measurement */ /** Buffers for command batches, vertex/index data */ ubyte *buffers[CELL_NUM_BUFFERS]; uint *buffer_status; /**< points at cell_context->buffer_status */ + + struct cell_spu_function_info *spu_functions; } ALIGN16_ATTRIB; diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile index b28f4c5c31..c92f8e5cba 100644 --- a/src/gallium/drivers/cell/ppu/Makefile +++ b/src/gallium/drivers/cell/ppu/Makefile @@ -24,6 +24,7 @@ SOURCES = \ cell_clear.c \ cell_context.c \ cell_draw_arrays.c \ + cell_fence.c \ cell_flush.c \ cell_gen_fragment.c \ cell_gen_fp.c \ @@ -38,8 +39,7 @@ SOURCES = \ cell_texture.c \ cell_vbuf.c \ cell_vertex_fetch.c \ - cell_vertex_shader.c \ - cell_winsys.c + cell_vertex_shader.c OBJECTS = $(SOURCES:.c=.o) \ @@ -54,6 +54,9 @@ INCLUDE_DIRS = \ $(CC) -c $(INCLUDE_DIRS) $(CFLAGS) $< -o $@ +.c.s: + $(CC) -S $(INCLUDE_DIRS) $(CFLAGS) $< -o $@ + default: $(CELL_LIB) diff --git a/src/gallium/drivers/cell/ppu/cell_batch.c b/src/gallium/drivers/cell/ppu/cell_batch.c index 16882c0129..fe144f8b84 100644 --- a/src/gallium/drivers/cell/ppu/cell_batch.c +++ b/src/gallium/drivers/cell/ppu/cell_batch.c @@ -28,6 +28,7 @@ #include "cell_context.h" #include "cell_batch.h" +#include "cell_fence.h" #include "cell_spu.h" @@ -42,7 +43,9 @@ uint cell_get_empty_buffer(struct cell_context *cell) { - uint buf = 0, tries = 0; + static uint prev_buffer = 0; + uint buf = (prev_buffer + 1) % CELL_NUM_BUFFERS; + uint tries = 0; /* Find a buffer that's marked as free by all SPUs */ while (1) { @@ -58,8 +61,13 @@ cell_get_empty_buffer(struct cell_context *cell) cell->buffer_status[spu][buf][0] = CELL_BUFFER_STATUS_USED; } /* - printf("PPU: ALLOC BUFFER %u\n", buf); + printf("PPU: ALLOC BUFFER %u, %u tries\n", buf, tries); */ + prev_buffer = buf; + + /* release tex buffer associated w/ prev use of this batch buf */ + cell_free_fenced_buffers(cell, &cell->fenced_buffers[buf]); + return buf; } } @@ -82,6 +90,38 @@ cell_get_empty_buffer(struct cell_context *cell) /** + * Append a fence command to the current batch buffer. + * Note that we're sure there's always room for this because of the + * adjusted size check in cell_batch_free_space(). 
+ */ +static void +emit_fence(struct cell_context *cell) +{ + const uint batch = cell->cur_batch; + const uint size = cell->buffer_size[batch]; + struct cell_command_fence *fence_cmd; + struct cell_fence *fence = &cell->fenced_buffers[batch].fence; + uint i; + + /* set fence status to emitted, not yet signalled */ + for (i = 0; i < cell->num_spus; i++) { + fence->status[i][0] = CELL_FENCE_EMITTED; + } + + STATIC_ASSERT(sizeof(struct cell_command_fence) % 16 == 0); + ASSERT(size % 16 == 0); + ASSERT(size + sizeof(struct cell_command_fence) <= CELL_BUFFER_SIZE); + + fence_cmd = (struct cell_command_fence *) (cell->buffer[batch] + size); + fence_cmd->opcode[0] = CELL_CMD_FENCE; + fence_cmd->fence = fence; + + /* update batch buffer size */ + cell->buffer_size[batch] = size + sizeof(struct cell_command_fence); +} + + +/** * Flush the current batch buffer to the SPUs. * An empty buffer will be found and set as the new current batch buffer * for subsequent commands/data. @@ -91,7 +131,7 @@ cell_batch_flush(struct cell_context *cell) { static boolean flushing = FALSE; uint batch = cell->cur_batch; - const uint size = cell->buffer_size[batch]; + uint size = cell->buffer_size[batch]; uint spu, cmd_word; assert(!flushing); @@ -99,6 +139,14 @@ cell_batch_flush(struct cell_context *cell) if (size == 0) return; + /* Before we use this batch buffer, make sure any fenced texture buffers + * are released. + */ + if (cell->fenced_buffers[batch].head) { + emit_fence(cell); + size = cell->buffer_size[batch]; + } + flushing = TRUE; assert(batch < CELL_NUM_BUFFERS); @@ -139,74 +187,24 @@ uint cell_batch_free_space(const struct cell_context *cell) { uint free = CELL_BUFFER_SIZE - cell->buffer_size[cell->cur_batch]; + free -= sizeof(struct cell_command_fence); return free; } /** - * Append data to the current batch buffer. - * \param data address of block of bytes to append - * \param bytes size of block of bytes - */ -void -cell_batch_append(struct cell_context *cell, const void *data, uint bytes) -{ - uint size; - - ASSERT(bytes % 8 == 0); - ASSERT(bytes <= CELL_BUFFER_SIZE); - ASSERT(cell->cur_batch >= 0); - -#ifdef ASSERT - { - uint spu; - for (spu = 0; spu < cell->num_spus; spu++) { - ASSERT(cell->buffer_status[spu][cell->cur_batch][0] - == CELL_BUFFER_STATUS_USED); - } - } -#endif - - size = cell->buffer_size[cell->cur_batch]; - - if (size + bytes > CELL_BUFFER_SIZE) { - cell_batch_flush(cell); - size = 0; - } - - ASSERT(size + bytes <= CELL_BUFFER_SIZE); - - memcpy(cell->buffer[cell->cur_batch] + size, data, bytes); - - cell->buffer_size[cell->cur_batch] = size + bytes; -} - - -/** * Allocate space in the current batch buffer for 'bytes' space. + * Bytes must be a multiple of 16 bytes. Allocation will be 16 byte aligned. * \return address in batch buffer to put data */ void * -cell_batch_alloc(struct cell_context *cell, uint bytes) -{ - return cell_batch_alloc_aligned(cell, bytes, 1); -} - - -/** - * Same as \sa cell_batch_alloc, but return an address at a particular - * alignment. 
- */ -void * -cell_batch_alloc_aligned(struct cell_context *cell, uint bytes, - uint alignment) +cell_batch_alloc16(struct cell_context *cell, uint bytes) { void *pos; - uint size, padbytes; + uint size; - ASSERT(bytes % 8 == 0); + ASSERT(bytes % 16 == 0); ASSERT(bytes <= CELL_BUFFER_SIZE); - ASSERT(alignment > 0); ASSERT(cell->cur_batch >= 0); #ifdef ASSERT @@ -221,17 +219,12 @@ cell_batch_alloc_aligned(struct cell_context *cell, uint bytes, size = cell->buffer_size[cell->cur_batch]; - padbytes = (alignment - (size % alignment)) % alignment; - - if (padbytes + size + bytes > CELL_BUFFER_SIZE) { + if (bytes > cell_batch_free_space(cell)) { cell_batch_flush(cell); size = 0; } - else { - size += padbytes; - } - ASSERT(size % alignment == 0); + ASSERT(size % 16 == 0); ASSERT(size + bytes <= CELL_BUFFER_SIZE); pos = (void *) (cell->buffer[cell->cur_batch] + size); diff --git a/src/gallium/drivers/cell/ppu/cell_batch.h b/src/gallium/drivers/cell/ppu/cell_batch.h index f74dd60079..290136031a 100644 --- a/src/gallium/drivers/cell/ppu/cell_batch.h +++ b/src/gallium/drivers/cell/ppu/cell_batch.h @@ -44,15 +44,8 @@ cell_batch_flush(struct cell_context *cell); extern uint cell_batch_free_space(const struct cell_context *cell); -extern void -cell_batch_append(struct cell_context *cell, const void *data, uint bytes); - -extern void * -cell_batch_alloc(struct cell_context *cell, uint bytes); - extern void * -cell_batch_alloc_aligned(struct cell_context *cell, uint bytes, - uint alignment); +cell_batch_alloc16(struct cell_context *cell, uint bytes); extern void cell_init_batch_buffers(struct cell_context *cell); diff --git a/src/gallium/drivers/cell/ppu/cell_clear.c b/src/gallium/drivers/cell/ppu/cell_clear.c index c9c0c721bb..edc06747ac 100644 --- a/src/gallium/drivers/cell/ppu/cell_clear.c +++ b/src/gallium/drivers/cell/ppu/cell_clear.c @@ -70,18 +70,12 @@ void cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps, unsigned clearValue) { - struct pipe_screen *screen = pipe->screen; struct cell_context *cell = cell_context(pipe); uint surfIndex; if (cell->dirty) cell_update_derived(cell); - - if (!cell->cbuf_map[0]) - cell->cbuf_map[0] = screen->surface_map(screen, ps, - PIPE_BUFFER_USAGE_GPU_WRITE); - if (ps == cell->framebuffer.zsbuf) { /* clear z/stencil buffer */ surfIndex = 1; @@ -99,11 +93,25 @@ cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps, /* Build a CLEAR command and place it in the current batch buffer */ { + STATIC_ASSERT(sizeof(struct cell_command_clear_surface) % 16 == 0); struct cell_command_clear_surface *clr = (struct cell_command_clear_surface *) - cell_batch_alloc(cell, sizeof(*clr)); - clr->opcode = CELL_CMD_CLEAR_SURFACE; + cell_batch_alloc16(cell, sizeof(*clr)); + clr->opcode[0] = CELL_CMD_CLEAR_SURFACE; clr->surface = surfIndex; clr->value = clearValue; } + + /* Technically, the surface's contents are now known and cleared, + * so we could set the status to PIPE_SURFACE_STATUS_CLEAR. But + * it turns out it's quite painful to recognize when any particular + * surface goes from PIPE_SURFACE_STATUS_CLEAR to + * PIPE_SURFACE_STATUS_DEFINED (i.e. with known contents), because + * the drawing commands could be operating on numerous draw buffers, + * which we'd have to iterate through to set all their stati... + * For now, we cheat a bit and set the surface's status to DEFINED + * right here. Later we should revisit this and set the status to + * CLEAR here, and find a better place to set the status to DEFINED. 
+ */ + ps->status = PIPE_SURFACE_STATUS_DEFINED; } diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c index 71f1a3049d..ae82ded334 100644 --- a/src/gallium/drivers/cell/ppu/cell_context.c +++ b/src/gallium/drivers/cell/ppu/cell_context.c @@ -36,7 +36,7 @@ #include "pipe/p_defines.h" #include "pipe/p_format.h" #include "util/u_memory.h" -#include "pipe/p_winsys.h" +#include "pipe/internal/p_winsys_screen.h" #include "pipe/p_screen.h" #include "draw/draw_context.h" @@ -47,6 +47,7 @@ #include "cell_clear.h" #include "cell_context.h" #include "cell_draw_arrays.h" +#include "cell_fence.h" #include "cell_flush.h" #include "cell_state.h" #include "cell_surface.h" @@ -62,6 +63,8 @@ cell_destroy_context( struct pipe_context *pipe ) { struct cell_context *cell = cell_context(pipe); + util_delete_keymap(cell->fragment_ops_cache, NULL); + cell_spu_exit(cell); align_free(cell); @@ -85,13 +88,16 @@ cell_draw_create(struct cell_context *cell) } -#ifdef DEBUG static const struct debug_named_value cell_debug_flags[] = { {"checker", CELL_DEBUG_CHECKER},/**< modulate tile clear color by SPU ID */ + {"asm", CELL_DEBUG_ASM}, /**< dump SPU asm code */ {"sync", CELL_DEBUG_SYNC}, /**< SPUs do synchronous DMA */ + {"fragops", CELL_DEBUG_FRAGMENT_OPS}, /**< SPUs emit fragment ops debug messages*/ + {"fragopfallback", CELL_DEBUG_FRAGMENT_OP_FALLBACK}, /**< SPUs use reference implementation for fragment ops*/ + {"cmd", CELL_DEBUG_CMD}, /**< SPUs dump command buffer info */ + {"cache", CELL_DEBUG_CACHE}, /**< report texture cache stats on exit */ {NULL, 0} }; -#endif struct pipe_context * @@ -99,6 +105,7 @@ cell_create_context(struct pipe_screen *screen, struct cell_winsys *cws) { struct cell_context *cell; + uint i; /* some fields need to be 16-byte aligned, so align the whole object */ cell = (struct cell_context*) align_malloc(sizeof(struct cell_context), 16); @@ -125,11 +132,14 @@ cell_create_context(struct pipe_screen *screen, cell_init_state_functions(cell); cell_init_shader_functions(cell); cell_init_surface_functions(cell); - cell_init_texture_functions(cell); cell_init_vertex_functions(cell); cell->draw = cell_draw_create(cell); + /* Create cache of fragment ops generated code */ + cell->fragment_ops_cache = + util_new_keymap(sizeof(struct cell_fragment_ops_key), ~0, NULL); + cell_init_vbuf(cell); draw_set_rasterize_stage(cell->draw, cell->vbuf); @@ -143,17 +153,31 @@ cell_create_context(struct pipe_screen *screen, cell_debug_flags, 0 ); + for (i = 0; i < CELL_NUM_BUFFERS; i++) + cell_fence_init(&cell->fenced_buffers[i].fence); + + /* * SPU stuff */ - cell->num_spus = 6; - /* XXX is this in SDK 3.0 only? - cell->num_spus = spe_cpu_info_get(SPE_COUNT_PHYSICAL_SPES, -1); - */ + /* This call only works with SDK 3.0. Anyone still using 2.1??? 
*/ + cell->num_cells = spe_cpu_info_get(SPE_COUNT_PHYSICAL_CPU_NODES, -1); + cell->num_spus = spe_cpu_info_get(SPE_COUNT_USABLE_SPES, -1); + if (cell->debug_flags) { + printf("Cell: found %d Cell(s) with %u SPUs\n", + cell->num_cells, cell->num_spus); + } + if (getenv("CELL_NUM_SPUS")) { + cell->num_spus = atoi(getenv("CELL_NUM_SPUS")); + assert(cell->num_spus > 0); + } cell_start_spus(cell); cell_init_batch_buffers(cell); + /* make sure SPU initializations are done before proceeding */ + cell_flush_int(cell, CELL_FLUSH_WAIT); + return &cell->pipe; } diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h index 14914b9c6f..ca03dc1511 100644 --- a/src/gallium/drivers/cell/ppu/cell_context.h +++ b/src/gallium/drivers/cell/ppu/cell_context.h @@ -38,6 +38,7 @@ #include "cell/common.h" #include "rtasm/rtasm_ppc_spe.h" #include "tgsi/tgsi_scan.h" +#include "util/u_keymap.h" struct cell_vbuf_render; @@ -67,31 +68,29 @@ struct cell_fragment_shader_state /** - * Cell blend state atom, subclass of pipe_blend_state. + * Key for mapping per-fragment state to cached SPU machine code. + * keymap(cell_fragment_ops_key) => cell_command_fragment_ops */ -struct cell_blend_state +struct cell_fragment_ops_key { - struct pipe_blend_state base; - - /** - * Generated code to perform alpha blending - */ - struct spe_function code; + struct pipe_blend_state blend; + struct pipe_blend_color blend_color; + struct pipe_depth_stencil_alpha_state dsa; + enum pipe_format color_format; + enum pipe_format zs_format; }; +struct cell_buffer_node; + /** - * Cell depth/stencil/alpha state atom, subclass of - * pipe_depth_stencil_alpha_state. + * Fenced buffer list. List of buffers which can be unreferenced after + * the fence has been executed/signalled. 
*/ -struct cell_depth_stencil_alpha_state +struct cell_buffer_list { - struct pipe_depth_stencil_alpha_state base; - - /** - * Generated code to perform alpha, stencil, and depth testing on the SPE - */ - struct spe_function code; + struct cell_fence fence ALIGN16_ATTRIB; + struct cell_buffer_node *head; }; @@ -104,10 +103,10 @@ struct cell_context struct cell_winsys *winsys; - const struct cell_blend_state *blend; + const struct pipe_blend_state *blend; const struct pipe_sampler_state *sampler[PIPE_MAX_SAMPLERS]; uint num_samplers; - const struct cell_depth_stencil_alpha_state *depth_stencil; + const struct pipe_depth_stencil_alpha_state *depth_stencil; const struct pipe_rasterizer_state *rasterizer; const struct cell_vertex_shader_state *vs; const struct cell_fragment_shader_state *fs; @@ -128,6 +127,9 @@ struct cell_context struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS]; uint num_vertex_elements; + struct pipe_transfer *cbuf_transfer[PIPE_MAX_COLOR_BUFS]; + struct pipe_transfer *zsbuf_transfer; + ubyte *cbuf_map[PIPE_MAX_COLOR_BUFS]; ubyte *zsbuf_map; @@ -135,6 +137,11 @@ struct cell_context uint *tex_map; uint dirty; + uint dirty_textures; /* bitmask of texture units */ + uint dirty_samplers; /* bitmask of sampler units */ + + /** Cache of code generated for per-fragment ops */ + struct keymap *fragment_ops_cache; /** The primitive drawing context */ struct draw_context *draw; @@ -149,8 +156,9 @@ struct cell_context /** Mapped constant buffers */ void *mapped_constants[PIPE_SHADER_TYPES]; + struct cell_spu_function_info spu_functions ALIGN16_ATTRIB; - uint num_spus; + uint num_cells, num_spus; /** Buffers for command batches, vertex/index data */ uint buffer_size[CELL_NUM_BUFFERS]; @@ -162,6 +170,14 @@ struct cell_context uint buffer_status[CELL_MAX_SPUS][CELL_NUM_BUFFERS][4] ALIGN16_ATTRIB; + /** Associated with each command/batch buffer is a list of pipe_buffers + * that are fenced. When the last command in a buffer is executed, the + * fence will be signalled, indicating that any pipe_buffers preceeding + * that fence can be unreferenced (and probably freed). 
+ */ + struct cell_buffer_list fenced_buffers[CELL_NUM_BUFFERS]; + + struct spe_function attrib_fetch; unsigned attrib_fetch_offsets[PIPE_MAX_ATTRIBS]; diff --git a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c index 880d535320..644496db40 100644 --- a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c +++ b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c @@ -33,7 +33,7 @@ #include "pipe/p_defines.h" #include "pipe/p_context.h" -#include "pipe/p_winsys.h" +#include "pipe/internal/p_winsys_screen.h" #include "pipe/p_inlines.h" #include "cell_context.h" @@ -51,9 +51,9 @@ cell_map_constant_buffers(struct cell_context *sp) struct pipe_winsys *ws = sp->pipe.winsys; uint i; for (i = 0; i < 2; i++) { - if (sp->constants[i].size) { + if (sp->constants[i].buffer && sp->constants[i].buffer->size) { sp->mapped_constants[i] = ws->buffer_map(ws, sp->constants[i].buffer, - PIPE_BUFFER_USAGE_CPU_READ); + PIPE_BUFFER_USAGE_CPU_READ); cell_flush_buffer_range(sp, sp->mapped_constants[i], sp->constants[i].buffer->size); } @@ -61,7 +61,7 @@ cell_map_constant_buffers(struct cell_context *sp) draw_set_mapped_constant_buffer(sp->draw, sp->mapped_constants[PIPE_SHADER_VERTEX], - sp->constants[PIPE_SHADER_VERTEX].size); + sp->constants[PIPE_SHADER_VERTEX].buffer->size); } static void @@ -70,7 +70,7 @@ cell_unmap_constant_buffers(struct cell_context *sp) struct pipe_winsys *ws = sp->pipe.winsys; uint i; for (i = 0; i < 2; i++) { - if (sp->constants[i].size) + if (sp->constants[i].buffer && sp->constants[i].buffer->size) ws->buffer_unmap(ws, sp->constants[i].buffer); sp->mapped_constants[i] = NULL; } diff --git a/src/gallium/drivers/cell/ppu/cell_fence.c b/src/gallium/drivers/cell/ppu/cell_fence.c new file mode 100644 index 0000000000..867b5dcaa0 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_fence.c @@ -0,0 +1,168 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#include <unistd.h> +#include "util/u_memory.h" +#include "pipe/p_inlines.h" +#include "cell_context.h" +#include "cell_batch.h" +#include "cell_fence.h" +#include "cell_texture.h" + + +void +cell_fence_init(struct cell_fence *fence) +{ + uint i; + ASSERT_ALIGN16(fence->status); + for (i = 0; i < CELL_MAX_SPUS; i++) { + fence->status[i][0] = CELL_FENCE_IDLE; + } +} + + +boolean +cell_fence_signalled(const struct cell_context *cell, + const struct cell_fence *fence) +{ + uint i; + for (i = 0; i < cell->num_spus; i++) { + if (fence->status[i][0] != CELL_FENCE_SIGNALLED) + return FALSE; + /*assert(fence->status[i][0] == CELL_FENCE_EMITTED);*/ + } + return TRUE; +} + + +void +cell_fence_finish(const struct cell_context *cell, + const struct cell_fence *fence) +{ + while (!cell_fence_signalled(cell, fence)) { + usleep(10); + } + +#ifdef DEBUG + { + uint i; + for (i = 0; i < cell->num_spus; i++) { + assert(fence->status[i][0] == CELL_FENCE_SIGNALLED); + } + } +#endif +} + + + + +struct cell_buffer_node +{ + struct pipe_buffer *buffer; + struct cell_buffer_node *next; +}; + + +static void +cell_add_buffer_to_list(struct cell_context *cell, + struct cell_buffer_list *list, + struct pipe_buffer *buffer) +{ + struct pipe_screen *ps = cell->pipe.screen; + struct cell_buffer_node *node = CALLOC_STRUCT(cell_buffer_node); + /* create new list node which references the buffer, insert at head */ + if (node) { + pipe_buffer_reference(ps, &node->buffer, buffer); + node->next = list->head; + list->head = node; + } +} + + +/** + * Wait for completion of the given fence, then unreference any buffers + * on the list. + * This typically unrefs/frees texture buffers after any rendering which uses + * them has completed. + */ +void +cell_free_fenced_buffers(struct cell_context *cell, + struct cell_buffer_list *list) +{ + if (list->head) { + struct pipe_screen *ps = cell->pipe.screen; + struct cell_buffer_node *node; + + cell_fence_finish(cell, &list->fence); + + /* traverse the list, unreferencing buffers, freeing nodes */ + node = list->head; + while (node) { + struct cell_buffer_node *next = node->next; + assert(node->buffer); + pipe_buffer_unmap(ps, node->buffer); +#if 0 + printf("Unref buffer %p\n", node->buffer); + if (node->buffer->refcount == 1) + printf(" Delete!\n"); +#endif + pipe_buffer_reference(ps, &node->buffer, NULL); + FREE(node); + node = next; + } + list->head = NULL; + } +} + + +/** + * This should be called for each render command. + * Any texture buffers that are current bound will be added to a fenced + * list to be freed later when the fence is executed/signalled. 
+ */ +void +cell_add_fenced_textures(struct cell_context *cell) +{ + struct cell_buffer_list *list = &cell->fenced_buffers[cell->cur_batch]; + uint i; + + for (i = 0; i < cell->num_textures; i++) { + struct cell_texture *ct = cell->texture[i]; + if (ct) { + uint level; + for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) { + if (ct->tiled_buffer[level]) { +#if 0 + printf("Adding texture %p buffer %p to list\n", + ct, ct->tiled_buffer[level]); +#endif + cell_add_buffer_to_list(cell, list, ct->tiled_buffer[level]); + } + } + } + } +} diff --git a/src/gallium/drivers/cell/ppu/cell_fence.h b/src/gallium/drivers/cell/ppu/cell_fence.h new file mode 100644 index 0000000000..536b4ba411 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_fence.h @@ -0,0 +1,57 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + + +#ifndef CELL_FENCE_H +#define CELL_FENCE_H + + +extern void +cell_fence_init(struct cell_fence *fence); + + +extern boolean +cell_fence_signalled(const struct cell_context *cell, + const struct cell_fence *fence); + + +extern void +cell_fence_finish(const struct cell_context *cell, + const struct cell_fence *fence); + + + +extern void +cell_free_fenced_buffers(struct cell_context *cell, + struct cell_buffer_list *list); + + +extern void +cell_add_fenced_textures(struct cell_context *cell); + + +#endif /* CELL_FENCE_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_flush.c b/src/gallium/drivers/cell/ppu/cell_flush.c index 6596b72010..8275c9dc9c 100644 --- a/src/gallium/drivers/cell/ppu/cell_flush.c +++ b/src/gallium/drivers/cell/ppu/cell_flush.c @@ -49,7 +49,7 @@ cell_flush(struct pipe_context *pipe, unsigned flags, flags |= CELL_FLUSH_WAIT; } - if (flags & PIPE_FLUSH_SWAPBUFFERS) + if (flags & (PIPE_FLUSH_SWAPBUFFERS | PIPE_FLUSH_RENDER_CACHE)) flags |= CELL_FLUSH_WAIT; draw_flush( cell->draw ); @@ -72,8 +72,9 @@ cell_flush_int(struct cell_context *cell, unsigned flags) flushing = TRUE; if (flags & CELL_FLUSH_WAIT) { - uint64_t *cmd = (uint64_t *) cell_batch_alloc(cell, sizeof(uint64_t)); - *cmd = CELL_CMD_FINISH; + STATIC_ASSERT(sizeof(opcode_t) % 16 == 0); + opcode_t *cmd = (opcode_t*) cell_batch_alloc16(cell, sizeof(opcode_t)); + *cmd[0] = CELL_CMD_FINISH; } cell_batch_flush(cell); @@ -101,11 +102,11 @@ void cell_flush_buffer_range(struct cell_context *cell, void *ptr, unsigned size) { - uint64_t batch[1 + (ROUNDUP8(sizeof(struct cell_buffer_range)) / 8)]; - struct cell_buffer_range *br = (struct cell_buffer_range *) & batch[1]; - + STATIC_ASSERT((sizeof(opcode_t) + sizeof(struct cell_buffer_range)) % 16 == 0); + uint32_t *batch = (uint32_t*)cell_batch_alloc16(cell, + sizeof(opcode_t) + sizeof(struct cell_buffer_range)); + struct cell_buffer_range *br = (struct cell_buffer_range *) &batch[4]; batch[0] = CELL_CMD_FLUSH_BUFFER_RANGE; br->base = (uintptr_t) ptr; br->size = size; - cell_batch_append(cell, batch, sizeof(batch)); } diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c index 6ffe94eb14..5a889a6119 100644 --- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c +++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c @@ -2,6 +2,7 @@ * * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. * All Rights Reserved. + * Copyright 2009 VMware, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the @@ -37,7 +38,7 @@ * \author Brian Paul */ - +#include <math.h> #include "pipe/p_defines.h" #include "pipe/p_state.h" #include "pipe/p_shader_tokens.h" @@ -51,25 +52,55 @@ #include "cell_gen_fp.h" -/** Set to 1 to enable debug/disassembly printfs */ -#define DISASSEM 01 +#define MAX_TEMPS 16 +#define MAX_IMMED 8 +#define CHAN_X 0 +#define CHAN_Y 1 +#define CHAN_Z 2 +#define CHAN_W 3 /** * Context needed during code generation. 
*/ struct codegen { + struct cell_context *cell; int inputs_reg; /**< 1st function parameter */ int outputs_reg; /**< 2nd function parameter */ int constants_reg; /**< 3rd function parameter */ - int temp_regs[8][4]; /**< maps TGSI temps to SPE registers */ + int temp_regs[MAX_TEMPS][4]; /**< maps TGSI temps to SPE registers */ + int imm_regs[MAX_IMMED][4]; /**< maps TGSI immediates to SPE registers */ + + int num_imm; /**< number of immediates */ int one_reg; /**< register containing {1.0, 1.0, 1.0, 1.0} */ + int addr_reg; /**< address register, integer values */ + /** Per-instruction temps / intermediate temps */ int num_itemps; - int itemps[3]; + int itemps[12]; + + /** Current IF/ELSE/ENDIF nesting level */ + int if_nesting; + /** Current BGNLOOP/ENDLOOP nesting level */ + int loop_nesting; + /** Location of start of current loop */ + int loop_start; + + /** Index of if/conditional mask register */ + int cond_mask_reg; + /** Index of loop mask register */ + int loop_mask_reg; + + /** Index of master execution mask register */ + int exec_mask_reg; + + /** KIL mask: indicates which fragments have been killed */ + int kill_mask_reg; + + int frame_size; /**< Stack frame size, in words */ struct spe_function *f; boolean error; @@ -112,19 +143,122 @@ get_const_one_reg(struct codegen *gen) { if (gen->one_reg <= 0) { gen->one_reg = spe_allocate_available_register(gen->f); - } - /* one = {1.0, 1.0, 1.0, 1.0} */ - spe_load_float(gen->f, gen->one_reg, 1.0f); -#if DISASSEM - printf("il\tr%d, 1.0f\n", gen->one_reg); -#endif + spe_indent(gen->f, 4); + spe_comment(gen->f, -4, "init constant reg = 1.0:"); + + /* one = {1.0, 1.0, 1.0, 1.0} */ + spe_load_float(gen->f, gen->one_reg, 1.0f); + + spe_indent(gen->f, -4); + } return gen->one_reg; } /** + * Return index of the address register. + * Used for indirect register loads/stores. + */ +static int +get_address_reg(struct codegen *gen) +{ + if (gen->addr_reg <= 0) { + gen->addr_reg = spe_allocate_available_register(gen->f); + + spe_indent(gen->f, 4); + spe_comment(gen->f, -4, "init address reg = 0:"); + + /* init addr = {0, 0, 0, 0} */ + spe_zero(gen->f, gen->addr_reg); + + spe_indent(gen->f, -4); + } + + return gen->addr_reg; +} + + +/** + * Return index of the master execution mask. + * The register is allocated an initialized upon the first call. + * + * The master execution mask controls which pixels in a quad are + * modified, according to surrounding conditionals, loops, etc. 
+ */ +static int +get_exec_mask_reg(struct codegen *gen) +{ + if (gen->exec_mask_reg <= 0) { + gen->exec_mask_reg = spe_allocate_available_register(gen->f); + + /* XXX this may not be needed */ + spe_comment(gen->f, 0*-4, "initialize master execution mask = ~0"); + spe_load_int(gen->f, gen->exec_mask_reg, ~0); + } + + return gen->exec_mask_reg; +} + + +/** Return index of the conditional (if/else) execution mask register */ +static int +get_cond_mask_reg(struct codegen *gen) +{ + if (gen->cond_mask_reg <= 0) { + gen->cond_mask_reg = spe_allocate_available_register(gen->f); + } + + return gen->cond_mask_reg; +} + + +/** Return index of the loop execution mask register */ +static int +get_loop_mask_reg(struct codegen *gen) +{ + if (gen->loop_mask_reg <= 0) { + gen->loop_mask_reg = spe_allocate_available_register(gen->f); + } + + return gen->loop_mask_reg; +} + + + +static boolean +is_register_src(struct codegen *gen, int channel, + const struct tgsi_full_src_register *src) +{ + int swizzle = tgsi_util_get_full_src_register_extswizzle(src, channel); + int sign_op = tgsi_util_get_full_src_register_sign_mode(src, channel); + + if (swizzle > TGSI_SWIZZLE_W || sign_op != TGSI_UTIL_SIGN_KEEP) { + return FALSE; + } + if (src->SrcRegister.File == TGSI_FILE_TEMPORARY || + src->SrcRegister.File == TGSI_FILE_IMMEDIATE) { + return TRUE; + } + return FALSE; +} + + +static boolean +is_memory_dst(struct codegen *gen, int channel, + const struct tgsi_full_dst_register *dst) +{ + if (dst->DstRegister.File == TGSI_FILE_OUTPUT) { + return TRUE; + } + else { + return FALSE; + } +} + + +/** * Return the index of the SPU temporary containing the named TGSI * source register. If the TGSI register is a TGSI_FILE_TEMPORARY we * just return the corresponding SPE register. If the TGIS register @@ -136,35 +270,99 @@ get_src_reg(struct codegen *gen, int channel, const struct tgsi_full_src_register *src) { - int reg; + int reg = -1; + int swizzle = tgsi_util_get_full_src_register_extswizzle(src, channel); + boolean reg_is_itemp = FALSE; + uint sign_op; + + assert(swizzle >= TGSI_SWIZZLE_X); + assert(swizzle <= TGSI_EXTSWIZZLE_ONE); + + if (swizzle == TGSI_EXTSWIZZLE_ONE) { + /* Load const one float and early out */ + reg = get_const_one_reg(gen); + } + else if (swizzle == TGSI_EXTSWIZZLE_ZERO) { + /* Load const zero float and early out */ + reg = get_itemp(gen); + spe_xor(gen->f, reg, reg, reg); + } + else { + int index = src->SrcRegister.Index; + + assert(swizzle < 4); + + if (src->SrcRegister.Indirect) { + /* XXX unfinished */ + } + + switch (src->SrcRegister.File) { + case TGSI_FILE_TEMPORARY: + reg = gen->temp_regs[index][swizzle]; + break; + case TGSI_FILE_INPUT: + { + /* offset is measured in quadwords, not bytes */ + int offset = index * 4 + swizzle; + reg = get_itemp(gen); + reg_is_itemp = TRUE; + /* Load: reg = memory[(machine_reg) + offset] */ + spe_lqd(gen->f, reg, gen->inputs_reg, offset * 16); + } + break; + case TGSI_FILE_IMMEDIATE: + reg = gen->imm_regs[index][swizzle]; + break; + case TGSI_FILE_CONSTANT: + { + /* offset is measured in quadwords, not bytes */ + int offset = index * 4 + swizzle; + reg = get_itemp(gen); + reg_is_itemp = TRUE; + /* Load: reg = memory[(machine_reg) + offset] */ + spe_lqd(gen->f, reg, gen->constants_reg, offset * 16); + } + break; + default: + assert(0); + } + } - /* XXX need to examine src swizzle info here. - * That will involve changing the channel var... + /* + * Handle absolute value, negate or set-negative of src register. 
*/ + sign_op = tgsi_util_get_full_src_register_sign_mode(src, channel); + if (sign_op != TGSI_UTIL_SIGN_KEEP) { + /* + * All sign ops are done by manipulating bit 31, the IEEE float sign bit. + */ + const int bit31mask_reg = get_itemp(gen); + int result_reg; + + if (reg_is_itemp) { + /* re-use 'reg' for the result */ + result_reg = reg; + } + else { + /* alloc a new reg for the result */ + result_reg = get_itemp(gen); + } + /* mask with bit 31 set, the rest cleared */ + spe_load_uint(gen->f, bit31mask_reg, (1 << 31)); - switch (src->SrcRegister.File) { - case TGSI_FILE_TEMPORARY: - reg = gen->temp_regs[src->SrcRegister.Index][channel]; - break; - case TGSI_FILE_INPUT: - { - /* offset is measured in quadwords, not bytes */ - int offset = src->SrcRegister.Index * 4 + channel; - reg = get_itemp(gen); - /* Load: reg = memory[(machine_reg) + offset] */ - spe_lqd(gen->f, reg, gen->inputs_reg, offset); -#if DISASSEM - printf("lqd\tr%d, r%d + %d\n", reg, gen->inputs_reg, offset); -#endif + if (sign_op == TGSI_UTIL_SIGN_CLEAR) { + spe_andc(gen->f, result_reg, reg, bit31mask_reg); } - break; - case TGSI_FILE_IMMEDIATE: - /* xxx fall-through for now / fix */ - case TGSI_FILE_CONSTANT: - /* xxx fall-through for now / fix */ - default: - assert(0); + else if (sign_op == TGSI_UTIL_SIGN_SET) { + spe_and(gen->f, result_reg, reg, bit31mask_reg); + } + else { + assert(sign_op == TGSI_UTIL_SIGN_TOGGLE); + spe_xor(gen->f, result_reg, reg, bit31mask_reg); + } + + reg = result_reg; } return reg; @@ -183,11 +381,14 @@ get_dst_reg(struct codegen *gen, int channel, const struct tgsi_full_dst_register *dest) { - int reg; + int reg = -1; switch (dest->DstRegister.File) { case TGSI_FILE_TEMPORARY: - reg = gen->temp_regs[dest->DstRegister.Index][channel]; + if (gen->if_nesting > 0 || gen->loop_nesting > 0) + reg = get_itemp(gen); + else + reg = gen->temp_regs[dest->DstRegister.Index][channel]; break; case TGSI_FILE_OUTPUT: reg = get_itemp(gen); @@ -211,19 +412,59 @@ store_dest_reg(struct codegen *gen, int value_reg, int channel, const struct tgsi_full_dst_register *dest) { + /* + * XXX need to implement dst reg clamping/saturation + */ +#if 0 + switch (inst->Instruction.Saturate) { + case TGSI_SAT_NONE: + break; + case TGSI_SAT_ZERO_ONE: + break; + case TGSI_SAT_MINUS_PLUS_ONE: + break; + default: + assert( 0 ); + } +#endif + switch (dest->DstRegister.File) { case TGSI_FILE_TEMPORARY: - /* no-op */ + if (gen->if_nesting > 0 || gen->loop_nesting > 0) { + int d_reg = gen->temp_regs[dest->DstRegister.Index][channel]; + int exec_reg = get_exec_mask_reg(gen); + /* Mix d with new value according to exec mask: + * d[i] = mask_reg[i] ? value_reg : d_reg + */ + spe_selb(gen->f, d_reg, d_reg, value_reg, exec_reg); + } + else { + /* we're not inside a condition or loop: do nothing special */ + + } break; case TGSI_FILE_OUTPUT: { /* offset is measured in quadwords, not bytes */ int offset = dest->DstRegister.Index * 4 + channel; - /* Store: memory[(machine_reg) + offset] = reg */ - spe_stqd(gen->f, value_reg, gen->outputs_reg, offset); -#if DISASSEM - printf("stqd\tr%d, r%d + %d\n", value_reg, gen->outputs_reg, offset); -#endif + if (gen->if_nesting > 0 || gen->loop_nesting > 0) { + int exec_reg = get_exec_mask_reg(gen); + int curval_reg = get_itemp(gen); + /* First read the current value from memory: + * Load: curval = memory[(machine_reg) + offset] + */ + spe_lqd(gen->f, curval_reg, gen->outputs_reg, offset * 16); + /* Mix curval with newvalue according to exec mask: + * d[i] = mask_reg[i] ? 
value_reg : d_reg + */ + spe_selb(gen->f, curval_reg, curval_reg, value_reg, exec_reg); + /* Store: memory[(machine_reg) + offset] = curval */ + spe_stqd(gen->f, curval_reg, gen->outputs_reg, offset * 16); + } + else { + /* Store: memory[(machine_reg) + offset] = reg */ + spe_stqd(gen->f, value_reg, gen->outputs_reg, offset * 16); + } } break; default: @@ -232,125 +473,1265 @@ store_dest_reg(struct codegen *gen, } + +static void +emit_prologue(struct codegen *gen) +{ + gen->frame_size = 1024; /* XXX temporary, should be dynamic */ + + spe_comment(gen->f, 0, "Function prologue:"); + + /* save $lr on stack # stqd $lr,16($sp) */ + spe_stqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16); + + if (gen->frame_size >= 512) { + /* offset is too large for ai instruction */ + int offset_reg = spe_allocate_available_register(gen->f); + int sp_reg = spe_allocate_available_register(gen->f); + /* offset = -framesize */ + spe_load_int(gen->f, offset_reg, -gen->frame_size); + /* sp = $sp */ + spe_move(gen->f, sp_reg, SPE_REG_SP); + /* $sp = $sp + offset_reg */ + spe_a(gen->f, SPE_REG_SP, SPE_REG_SP, offset_reg); + /* save $sp in stack frame */ + spe_stqd(gen->f, sp_reg, SPE_REG_SP, 0); + /* clean up */ + spe_release_register(gen->f, offset_reg); + spe_release_register(gen->f, sp_reg); + } + else { + /* save stack pointer # stqd $sp,-frameSize($sp) */ + spe_stqd(gen->f, SPE_REG_SP, SPE_REG_SP, -gen->frame_size); + + /* adjust stack pointer # ai $sp,$sp,-frameSize */ + spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, -gen->frame_size); + } +} + + +static void +emit_epilogue(struct codegen *gen) +{ + const int return_reg = 3; + + spe_comment(gen->f, 0, "Function epilogue:"); + + spe_comment(gen->f, 0, "return the killed mask"); + if (gen->kill_mask_reg > 0) { + /* shader called KIL, return the "alive" mask */ + spe_move(gen->f, return_reg, gen->kill_mask_reg); + } + else { + /* return {0,0,0,0} */ + spe_load_uint(gen->f, return_reg, 0); + } + + spe_comment(gen->f, 0, "restore stack and return"); + if (gen->frame_size >= 512) { + /* offset is too large for ai instruction */ + int offset_reg = spe_allocate_available_register(gen->f); + /* offset = framesize */ + spe_load_int(gen->f, offset_reg, gen->frame_size); + /* $sp = $sp + offset */ + spe_a(gen->f, SPE_REG_SP, SPE_REG_SP, offset_reg); + /* clean up */ + spe_release_register(gen->f, offset_reg); + } + else { + /* restore stack pointer # ai $sp,$sp,frameSize */ + spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, gen->frame_size); + } + + /* restore $lr # lqd $lr,16($sp) */ + spe_lqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16); + + /* return from function call */ + spe_bi(gen->f, SPE_REG_RA, 0, 0); +} + + +#define FOR_EACH_ENABLED_CHANNEL(inst, ch) \ + for (ch = 0; ch < 4; ch++) \ + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) + + +static boolean +emit_ARL(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch = 0, src_reg, addr_reg; + + src_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + addr_reg = get_address_reg(gen); + + /* convert float to int */ + spe_cflts(gen->f, addr_reg, src_reg, 0); + + free_itemps(gen); + + return TRUE; +} + + static boolean emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst) { + int ch, src_reg[4], dst_reg[4]; + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + src_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + dst_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + } + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + if (is_register_src(gen, ch, &inst->FullSrcRegisters[0]) && + 
is_memory_dst(gen, ch, &inst->FullDstRegisters[0])) { + /* special-case: register to memory store */ + store_dest_reg(gen, src_reg[ch], ch, &inst->FullDstRegisters[0]); + } + else { + spe_move(gen->f, dst_reg[ch], src_reg[ch]); + store_dest_reg(gen, dst_reg[ch], ch, &inst->FullDstRegisters[0]); + } + } + + free_itemps(gen); + + return TRUE; +} + +/** + * Emit binary operation + */ +static boolean +emit_binop(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch, s1_reg[4], s2_reg[4], d_reg[4]; + + /* Loop over Red/Green/Blue/Alpha channels, fetch src operands */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + } + + /* Loop over Red/Green/Blue/Alpha channels, do the op, store results */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + /* Emit actual SPE instruction: d = s1 + s2 */ + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_ADD: + spe_fa(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); + break; + case TGSI_OPCODE_SUB: + spe_fs(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); + break; + case TGSI_OPCODE_MUL: + spe_fm(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); + break; + default: + ; + } + } + + /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); + } + + /* Free any intermediate temps we allocated */ + free_itemps(gen); + + return TRUE; +} + + +/** + * Emit multiply add. See emit_ADD for comments. + */ +static boolean +emit_MAD(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch, s1_reg[4], s2_reg[4], s3_reg[4], d_reg[4]; + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + s3_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_fma(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch], s3_reg[ch]); + } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); + } + free_itemps(gen); + return TRUE; +} + + +/** + * Emit linear interpolate. See emit_ADD for comments. + */ +static boolean +emit_LERP(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch, s1_reg[4], s2_reg[4], s3_reg[4], d_reg[4], tmp_reg[4]; + + /* setup/get src/dst/temp regs */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + s3_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + tmp_reg[ch] = get_itemp(gen); + } + + /* d = s3 + s1(s2 - s3) */ + /* do all subtracts, then all fma, then all stores to better pipeline */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_fs(gen->f, tmp_reg[ch], s2_reg[ch], s3_reg[ch]); + } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_fma(gen->f, d_reg[ch], tmp_reg[ch], s1_reg[ch], s3_reg[ch]); + } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); + } + free_itemps(gen); + return TRUE; +} + + + +/** + * Emit reciprocal or recip sqrt. 
+ */ +static boolean +emit_RCP_RSQ(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch, s1_reg[4], d_reg[4], tmp_reg[4]; + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + tmp_reg[ch] = get_itemp(gen); + } + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + if (inst->Instruction.Opcode == TGSI_OPCODE_RCP) { + /* tmp = 1/s1 */ + spe_frest(gen->f, tmp_reg[ch], s1_reg[ch]); + } + else { + /* tmp = 1/sqrt(s1) */ + spe_frsqest(gen->f, tmp_reg[ch], s1_reg[ch]); + } + } + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + /* d = float_interp(s1, tmp) */ + spe_fi(gen->f, d_reg[ch], s1_reg[ch], tmp_reg[ch]); + } + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); + } + + free_itemps(gen); + return TRUE; +} + + +/** + * Emit absolute value. See emit_ADD for comments. + */ +static boolean +emit_ABS(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch, s1_reg[4], d_reg[4]; + const int bit31mask_reg = get_itemp(gen); + + /* mask with bit 31 set, the rest cleared */ + spe_load_uint(gen->f, bit31mask_reg, (1 << 31)); + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + } + + /* d = sign bit cleared in s1 */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_andc(gen->f, d_reg[ch], s1_reg[ch], bit31mask_reg); + } + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); + } + + free_itemps(gen); + return TRUE; +} + +/** + * Emit 3 component dot product. See emit_ADD for comments. + */ +static boolean +emit_DP3(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ int ch; - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - int src_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); - int dst_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - /* XXX we don't always need to actually emit a mov instruction here */ - spe_move(gen->f, dst_reg, src_reg); -#if DISASSEM - printf("mov\tr%d, r%d\n", dst_reg, src_reg); -#endif - store_dest_reg(gen, dst_reg, ch, &inst->FullDstRegisters[0]); - free_itemps(gen); + int s1x_reg, s1y_reg, s1z_reg; + int s2x_reg, s2y_reg, s2z_reg; + int t0_reg = get_itemp(gen), t1_reg = get_itemp(gen); + + s1x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]); + s2x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]); + s1y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]); + s2y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]); + s1z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]); + s2z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]); + + /* t0 = x0 * x1 */ + spe_fm(gen->f, t0_reg, s1x_reg, s2x_reg); + + /* t1 = y0 * y1 */ + spe_fm(gen->f, t1_reg, s1y_reg, s2y_reg); + + /* t0 = z0 * z1 + t0 */ + spe_fma(gen->f, t0_reg, s1z_reg, s2z_reg, t0_reg); + + /* t0 = t0 + t1 */ + spe_fa(gen->f, t0_reg, t0_reg, t1_reg); + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + spe_move(gen->f, d_reg, t0_reg); + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + } + + free_itemps(gen); + return TRUE; +} + +/** + * Emit 4 component dot product. See emit_ADD for comments. 
+ */ +static boolean +emit_DP4(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + int s0x_reg, s0y_reg, s0z_reg, s0w_reg; + int s1x_reg, s1y_reg, s1z_reg, s1w_reg; + int t0_reg = get_itemp(gen), t1_reg = get_itemp(gen); + + s0x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]); + s1x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]); + s0y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]); + s1y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]); + s0z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]); + s1z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]); + s0w_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[0]); + s1w_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[1]); + + /* t0 = x0 * x1 */ + spe_fm(gen->f, t0_reg, s0x_reg, s1x_reg); + + /* t1 = y0 * y1 */ + spe_fm(gen->f, t1_reg, s0y_reg, s1y_reg); + + /* t0 = z0 * z1 + t0 */ + spe_fma(gen->f, t0_reg, s0z_reg, s1z_reg, t0_reg); + + /* t1 = w0 * w1 + t1 */ + spe_fma(gen->f, t1_reg, s0w_reg, s1w_reg, t1_reg); + + /* t0 = t0 + t1 */ + spe_fa(gen->f, t0_reg, t0_reg, t1_reg); + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + spe_move(gen->f, d_reg, t0_reg); + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + } + + free_itemps(gen); + return TRUE; +} + +/** + * Emit homogeneous dot product. See emit_ADD for comments. + */ +static boolean +emit_DPH(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + /* XXX rewrite this function to look more like DP3/DP4 */ + int ch; + int s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]); + int tmp_reg = get_itemp(gen); + + /* t = x0 * x1 */ + spe_fm(gen->f, tmp_reg, s1_reg, s2_reg); + + s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]); + s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]); + /* t = y0 * y1 + t */ + spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg); + + s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]); + s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]); + /* t = z0 * z1 + t */ + spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg); + + s2_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[1]); + /* t = w1 + t */ + spe_fa(gen->f, tmp_reg, s2_reg, tmp_reg); + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + spe_move(gen->f, d_reg, tmp_reg); + store_dest_reg(gen, tmp_reg, ch, &inst->FullDstRegisters[0]); + } + + free_itemps(gen); + return TRUE; +} + +/** + * Emit 3-component vector normalize. 
+ */ +static boolean +emit_NRM3(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + int src_reg[3]; + int t0_reg = get_itemp(gen), t1_reg = get_itemp(gen); + + src_reg[0] = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]); + src_reg[1] = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]); + src_reg[2] = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]); + + /* t0 = x * x */ + spe_fm(gen->f, t0_reg, src_reg[0], src_reg[0]); + + /* t1 = y * y */ + spe_fm(gen->f, t1_reg, src_reg[1], src_reg[1]); + + /* t0 = z * z + t0 */ + spe_fma(gen->f, t0_reg, src_reg[2], src_reg[2], t0_reg); + + /* t0 = t0 + t1 */ + spe_fa(gen->f, t0_reg, t0_reg, t1_reg); + + /* t1 = 1.0 / sqrt(t0) */ + spe_frsqest(gen->f, t1_reg, t0_reg); + spe_fi(gen->f, t1_reg, t0_reg, t1_reg); + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + /* dst = src[ch] * t1 */ + spe_fm(gen->f, d_reg, src_reg[ch], t1_reg); + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + } + + free_itemps(gen); + return TRUE; +} + + +/** + * Emit cross product. See emit_ADD for comments. + */ +static boolean +emit_XPD(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]); + int tmp_reg = get_itemp(gen); + + /* t = z0 * y1 */ + spe_fm(gen->f, tmp_reg, s1_reg, s2_reg); + + s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]); + s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]); + /* t = y0 * z1 - t */ + spe_fms(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg); + + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << CHAN_X)) { + store_dest_reg(gen, tmp_reg, CHAN_X, &inst->FullDstRegisters[0]); + } + + s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]); + s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]); + /* t = x0 * z1 */ + spe_fm(gen->f, tmp_reg, s1_reg, s2_reg); + + s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]); + s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]); + /* t = z0 * x1 - t */ + spe_fms(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg); + + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << CHAN_Y)) { + store_dest_reg(gen, tmp_reg, CHAN_Y, &inst->FullDstRegisters[0]); + } + + s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]); + s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]); + /* t = y0 * x1 */ + spe_fm(gen->f, tmp_reg, s1_reg, s2_reg); + + s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]); + s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]); + /* t = x0 * y1 - t */ + spe_fms(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg); + + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << CHAN_Z)) { + store_dest_reg(gen, tmp_reg, CHAN_Z, &inst->FullDstRegisters[0]); + } + + free_itemps(gen); + return TRUE; +} + + +/** + * Emit inequality instruction. + * Note that the SPE fcgt instruction produces 0x0 and 0xffffffff as + * the result but OpenGL/TGSI needs 0.0 and 1.0 results. + * We can easily convert 0x0/0xffffffff to 0.0/1.0 with a bitwise AND. 
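
A one-lane sketch of that conversion, assuming 32-bit unsigned ints and IEEE-754 floats (0x3f800000 is the bit pattern of 1.0f); sgt_scalar is an illustrative name, and the SGE/SLE cases below use andc to apply the complement of the mask instead:

   static float
   sgt_scalar(float a, float b)
   {
      union { unsigned u; float f; } one, out;
      unsigned mask = (a > b) ? 0xffffffffu : 0x0u;   /* what spe_fcgt produces   */
      one.f = 1.0f;                                   /* bits 0x3f800000          */
      out.u = one.u & mask;                           /* 0x3f800000 or 0x00000000 */
      return out.f;                                   /* 1.0f or 0.0f             */
   }
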
+ */ +static boolean +emit_inequality(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch, s1_reg[4], s2_reg[4], d_reg[4], one_reg; + bool complement = FALSE; + + one_reg = get_const_one_reg(gen); + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + } + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_SGT: + spe_fcgt(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); + break; + case TGSI_OPCODE_SLT: + spe_fcgt(gen->f, d_reg[ch], s2_reg[ch], s1_reg[ch]); + break; + case TGSI_OPCODE_SGE: + spe_fcgt(gen->f, d_reg[ch], s2_reg[ch], s1_reg[ch]); + complement = TRUE; + break; + case TGSI_OPCODE_SLE: + spe_fcgt(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); + complement = TRUE; + break; + case TGSI_OPCODE_SEQ: + spe_fceq(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); + break; + case TGSI_OPCODE_SNE: + spe_fceq(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); + complement = TRUE; + break; + default: + assert(0); } } - return true; + + /* convert d from 0x0/0xffffffff to 0.0/1.0 */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + /* d = d & one_reg */ + if (complement) + spe_andc(gen->f, d_reg[ch], one_reg, d_reg[ch]); + else + spe_and(gen->f, d_reg[ch], one_reg, d_reg[ch]); + } + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); + } + + free_itemps(gen); + return TRUE; } /** - * Emit addition instructions. Recall that a single TGSI_OPCODE_ADD - * becomes (up to) four SPU "fa" instructions because we're doing SOA - * processing. + * Emit compare. */ static boolean -emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst) +emit_CMP(struct codegen *gen, const struct tgsi_full_instruction *inst) { int ch; - /* Loop over Red/Green/Blue/Alpha channels */ - for (ch = 0; ch < 4; ch++) { - /* If the dest R, G, B or A writemask is enabled... */ - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - /* get indexes of the two src, one dest SPE registers */ - int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); - int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); - int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - - /* Emit actual SPE instruction: d = s1 + s2 */ - spe_fa(gen->f, d_reg, s1_reg, s2_reg); -#if DISASSEM - printf("fa\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg); + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + int zero_reg = get_itemp(gen); + + spe_zero(gen->f, zero_reg); + + /* d = (s1 < 0) ? s2 : s3 */ + spe_fcgt(gen->f, d_reg, zero_reg, s1_reg); + spe_selb(gen->f, d_reg, s3_reg, s2_reg, d_reg); + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + + return TRUE; +} + +/** + * Emit trunc. 
+ * Convert float to signed int + * Convert signed int to float + */ +static boolean +emit_TRUNC(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch, s1_reg[4], d_reg[4]; + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + } + + /* Convert float to int */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_cflts(gen->f, d_reg[ch], s1_reg[ch], 0); + } + + /* Convert int to float */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_csflt(gen->f, d_reg[ch], d_reg[ch], 0); + } + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); + } + + free_itemps(gen); + return TRUE; +} + + +/** + * Emit floor. + * If negative int subtract one + * Convert float to signed int + * Convert signed int to float + */ +static boolean +emit_FLR(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch, s1_reg[4], d_reg[4], tmp_reg[4], zero_reg, one_reg; + + zero_reg = get_itemp(gen); + spe_zero(gen->f, zero_reg); + one_reg = get_const_one_reg(gen); + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + tmp_reg[ch] = get_itemp(gen); + } + + /* If negative, subtract 1.0 */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_fcgt(gen->f, tmp_reg[ch], zero_reg, s1_reg[ch]); + } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_selb(gen->f, tmp_reg[ch], zero_reg, one_reg, tmp_reg[ch]); + } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_fs(gen->f, tmp_reg[ch], s1_reg[ch], tmp_reg[ch]); + } + + /* Convert float to int */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_cflts(gen->f, tmp_reg[ch], tmp_reg[ch], 0); + } + + /* Convert int to float */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_csflt(gen->f, d_reg[ch], tmp_reg[ch], 0); + } + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); + } + + free_itemps(gen); + return TRUE; +} + + +/** + * Compute frac = Input - FLR(Input) + */ +static boolean +emit_FRC(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch, s1_reg[4], d_reg[4], tmp_reg[4], zero_reg, one_reg; + + zero_reg = get_itemp(gen); + spe_zero(gen->f, zero_reg); + one_reg = get_const_one_reg(gen); + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + tmp_reg[ch] = get_itemp(gen); + } + + /* If negative, subtract 1.0 */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_fcgt(gen->f, tmp_reg[ch], zero_reg, s1_reg[ch]); + } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_selb(gen->f, tmp_reg[ch], zero_reg, one_reg, tmp_reg[ch]); + } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_fs(gen->f, tmp_reg[ch], s1_reg[ch], tmp_reg[ch]); + } + + /* Convert float to int */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_cflts(gen->f, tmp_reg[ch], tmp_reg[ch], 0); + } + + /* Convert int to float */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_csflt(gen->f, tmp_reg[ch], tmp_reg[ch], 0); + } + + /* d = s1 - FLR(s1) */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_fs(gen->f, d_reg[ch], s1_reg[ch], tmp_reg[ch]); + } + + /* store result */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); + } + + free_itemps(gen); + return TRUE; +} + + +#if 0 +static void +print_functions(struct cell_context *cell) +{ + 
struct cell_spu_function_info *funcs = &cell->spu_functions; + uint i; + for (i = 0; i < funcs->num; i++) { + printf("SPU func %u: %s at %u\n", + i, funcs->names[i], funcs->addrs[i]); + } +} #endif - /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */ - store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); - /* Free any intermediate temps we allocated */ - free_itemps(gen); + +static uint +lookup_function(struct cell_context *cell, const char *funcname) +{ + const struct cell_spu_function_info *funcs = &cell->spu_functions; + uint i, addr = 0; + for (i = 0; i < funcs->num; i++) { + if (strcmp(funcs->names[i], funcname) == 0) { + addr = funcs->addrs[i]; } } - return true; + assert(addr && "spu function not found"); + return addr / 4; /* discard 2 least significant bits */ } /** - * Emit multiply. See emit_ADD for comments. + * Emit code to call a SPU function. + * Used to implement instructions like SIN/COS/POW/TEX/etc. + * If scalar, only the X components of the src regs are used, and the + * result is replicated across the dest register's XYZW components. */ static boolean -emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst) +emit_function_call(struct codegen *gen, + const struct tgsi_full_instruction *inst, + char *funcname, uint num_args, boolean scalar) +{ + const uint addr = lookup_function(gen->cell, funcname); + char comment[100]; + int s_regs[3]; + int func_called = FALSE; + uint a, ch; + int retval_reg = -1; + + assert(num_args <= 3); + + snprintf(comment, sizeof(comment), "CALL %s:", funcname); + spe_comment(gen->f, -4, comment); + + if (scalar) { + for (a = 0; a < num_args; a++) { + s_regs[a] = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[a]); + } + /* we'll call the function, put the return value in this register, + * then replicate it across all write-enabled components in d_reg. 
+ */ + retval_reg = spe_allocate_available_register(gen->f); + } + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + int d_reg; + ubyte usedRegs[SPE_NUM_REGS]; + uint i, numUsed; + + if (!scalar) { + for (a = 0; a < num_args; a++) { + s_regs[a] = get_src_reg(gen, ch, &inst->FullSrcRegisters[a]); + } + } + + d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + + if (!scalar || !func_called) { + /* for a scalar function, we'll really only call the function once */ + + numUsed = spe_get_registers_used(gen->f, usedRegs); + assert(numUsed < gen->frame_size / 16 - 2); + + /* save registers to stack */ + for (i = 0; i < numUsed; i++) { + uint reg = usedRegs[i]; + int offset = 2 + i; + spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset); + } + + /* setup function arguments */ + for (a = 0; a < num_args; a++) { + spe_move(gen->f, 3 + a, s_regs[a]); + } + + /* branch to function, save return addr */ + spe_brasl(gen->f, SPE_REG_RA, addr); + + /* save function's return value */ + if (scalar) + spe_move(gen->f, retval_reg, 3); + else + spe_move(gen->f, d_reg, 3); + + /* restore registers from stack */ + for (i = 0; i < numUsed; i++) { + uint reg = usedRegs[i]; + if (reg != d_reg && reg != retval_reg) { + int offset = 2 + i; + spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset); + } + } + + func_called = TRUE; + } + + if (scalar) { + spe_move(gen->f, d_reg, retval_reg); + } + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + + if (scalar) { + spe_release_register(gen->f, retval_reg); + } + + return TRUE; +} + + +static boolean +emit_TEX(struct codegen *gen, const struct tgsi_full_instruction *inst) { + const uint target = inst->InstructionExtTexture.Texture; + const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index; + uint addr; int ch; + int coord_regs[4], d_regs[4]; + + switch (target) { + case TGSI_TEXTURE_1D: + case TGSI_TEXTURE_2D: + addr = lookup_function(gen->cell, "spu_tex_2d"); + break; + case TGSI_TEXTURE_3D: + addr = lookup_function(gen->cell, "spu_tex_3d"); + break; + case TGSI_TEXTURE_CUBE: + addr = lookup_function(gen->cell, "spu_tex_cube"); + break; + default: + ASSERT(0 && "unsupported texture target"); + return FALSE; + } + + assert(inst->FullSrcRegisters[1].SrcRegister.File == TGSI_FILE_SAMPLER); + + spe_comment(gen->f, -4, "CALL tex:"); + + /* get src/dst reg info */ for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); - int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); - int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - /* d = s1 * s2 */ - spe_fm(gen->f, d_reg, s1_reg, s2_reg); -#if DISASSEM - printf("fm\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg); -#endif - store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); - free_itemps(gen); + coord_regs[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + d_regs[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + } + + { + ubyte usedRegs[SPE_NUM_REGS]; + uint i, numUsed; + + numUsed = spe_get_registers_used(gen->f, usedRegs); + assert(numUsed < gen->frame_size / 16 - 2); + + /* save registers to stack */ + for (i = 0; i < numUsed; i++) { + uint reg = usedRegs[i]; + int offset = 2 + i; + spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset); + } + + /* setup function arguments (XXX depends on target) */ + for (i = 0; i < 4; i++) { + spe_move(gen->f, 3 + i, coord_regs[i]); } + spe_load_uint(gen->f, 7, unit); /* sampler unit */ + + /* branch to function, save return addr */ 
+ spe_brasl(gen->f, SPE_REG_RA, addr); + + /* save function's return values (four pixel's colors) */ + for (i = 0; i < 4; i++) { + spe_move(gen->f, d_regs[i], 3 + i); + } + + /* restore registers from stack */ + for (i = 0; i < numUsed; i++) { + uint reg = usedRegs[i]; + if (reg != d_regs[0] && + reg != d_regs[1] && + reg != d_regs[2] && + reg != d_regs[3]) { + int offset = 2 + i; + spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset); + } + } + } + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + store_dest_reg(gen, d_regs[ch], ch, &inst->FullDstRegisters[0]); + free_itemps(gen); } - return true; + + return TRUE; } /** - * Emit set-if-greater-than. - * Note that the SPE fcgt instruction produces 0x0 and 0xffffffff as - * the result but OpenGL/TGSI needs 0.0 and 1.0 results. - * We can easily convert 0x0/0xffffffff to 0.0/1.0 with a bitwise AND. + * KILL if any of src reg values are less than zero. */ static boolean -emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst) +emit_KIL(struct codegen *gen, const struct tgsi_full_instruction *inst) { int ch; + int s_regs[4], kil_reg = -1, cmp_reg, zero_reg; - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); - int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); - int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - - /* d = (s1 > s2) */ - spe_fcgt(gen->f, d_reg, s1_reg, s2_reg); -#if DISASSEM - printf("fcgt\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg); -#endif + spe_comment(gen->f, -4, "CALL kil:"); - /* convert d from 0x0/0xffffffff to 0.0/1.0 */ - /* d = d & one_reg */ - spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen)); -#if DISASSEM - printf("and\tr%d, r%d, r%d\n", d_reg, d_reg, get_const_one_reg(gen)); -#endif + /* zero = {0,0,0,0} */ + zero_reg = get_itemp(gen); + spe_zero(gen->f, zero_reg); + + cmp_reg = get_itemp(gen); + + /* get src regs */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + s_regs[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + } + + /* test if any src regs are < 0 */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + if (kil_reg >= 0) { + /* cmp = 0 > src ? : ~0 : 0 */ + spe_fcgt(gen->f, cmp_reg, zero_reg, s_regs[ch]); + /* kil = kil | cmp */ + spe_or(gen->f, kil_reg, kil_reg, cmp_reg); + } + else { + kil_reg = get_itemp(gen); + /* kil = 0 > src ? : ~0 : 0 */ + spe_fcgt(gen->f, kil_reg, zero_reg, s_regs[ch]); + } + } + + if (gen->if_nesting || gen->loop_nesting) { + /* may have been a conditional kil */ + spe_and(gen->f, kil_reg, kil_reg, gen->exec_mask_reg); + } - store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); - free_itemps(gen); + /* allocate the kill mask reg if needed */ + if (gen->kill_mask_reg <= 0) { + gen->kill_mask_reg = spe_allocate_available_register(gen->f); + spe_move(gen->f, gen->kill_mask_reg, kil_reg); + } + else { + spe_or(gen->f, gen->kill_mask_reg, gen->kill_mask_reg, kil_reg); + } + + free_itemps(gen); + + return TRUE; +} + + + +/** + * Emit min or max. + */ +static boolean +emit_MIN_MAX(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch, s0_reg[4], s1_reg[4], d_reg[4], tmp_reg[4]; + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + s0_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + tmp_reg[ch] = get_itemp(gen); + } + + /* d = (s0 > s1) ? 
s0 : s1 */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + if (inst->Instruction.Opcode == TGSI_OPCODE_MAX) + spe_fcgt(gen->f, tmp_reg[ch], s0_reg[ch], s1_reg[ch]); + else + spe_fcgt(gen->f, tmp_reg[ch], s1_reg[ch], s0_reg[ch]); + } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_selb(gen->f, d_reg[ch], s1_reg[ch], s0_reg[ch], tmp_reg[ch]); + } + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); + } + + free_itemps(gen); + return TRUE; +} + + +/** + * Emit code to update the execution mask. + * This needs to be done whenever the execution status of a conditional + * or loop is changed. + */ +static void +emit_update_exec_mask(struct codegen *gen) +{ + const int exec_reg = get_exec_mask_reg(gen); + const int cond_reg = gen->cond_mask_reg; + const int loop_reg = gen->loop_mask_reg; + + spe_comment(gen->f, 0, "Update master execution mask"); + + if (gen->if_nesting > 0 && gen->loop_nesting > 0) { + /* exec_mask = cond_mask & loop_mask */ + assert(cond_reg > 0); + assert(loop_reg > 0); + spe_and(gen->f, exec_reg, cond_reg, loop_reg); + } + else if (gen->if_nesting > 0) { + assert(cond_reg > 0); + spe_move(gen->f, exec_reg, cond_reg); + } + else if (gen->loop_nesting > 0) { + assert(loop_reg > 0); + spe_move(gen->f, exec_reg, loop_reg); + } + else { + spe_load_int(gen->f, exec_reg, ~0x0); + } +} + + +static boolean +emit_IF(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + const int channel = 0; + int cond_reg; + + cond_reg = get_cond_mask_reg(gen); + + /* XXX push cond exec mask */ + + spe_comment(gen->f, 0, "init conditional exec mask = ~0:"); + spe_load_int(gen->f, cond_reg, ~0); + + /* update conditional execution mask with the predicate register */ + int tmp_reg = get_itemp(gen); + int s1_reg = get_src_reg(gen, channel, &inst->FullSrcRegisters[0]); + + /* tmp = (s1_reg == 0) */ + spe_ceqi(gen->f, tmp_reg, s1_reg, 0); + /* tmp = !tmp */ + spe_complement(gen->f, tmp_reg, tmp_reg); + /* cond_mask = cond_mask & tmp */ + spe_and(gen->f, cond_reg, cond_reg, tmp_reg); + + gen->if_nesting++; + + /* update the master execution mask */ + emit_update_exec_mask(gen); + + free_itemps(gen); + + return TRUE; +} + + +static boolean +emit_ELSE(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + const int cond_reg = get_cond_mask_reg(gen); + + spe_comment(gen->f, 0, "cond exec mask = !cond exec mask"); + spe_complement(gen->f, cond_reg, cond_reg); + emit_update_exec_mask(gen); + + return TRUE; +} + + +static boolean +emit_ENDIF(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + /* XXX todo: pop cond exec mask */ + + gen->if_nesting--; + + emit_update_exec_mask(gen); + + return TRUE; +} + + +static boolean +emit_BGNLOOP(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int exec_reg, loop_reg; + + exec_reg = get_exec_mask_reg(gen); + loop_reg = get_loop_mask_reg(gen); + + /* XXX push loop_exec mask */ + + spe_comment(gen->f, 0*-4, "initialize loop exec mask = ~0"); + spe_load_int(gen->f, loop_reg, ~0x0); + + gen->loop_nesting++; + gen->loop_start = spe_code_size(gen->f); /* in bytes */ + + return TRUE; +} + + +static boolean +emit_ENDLOOP(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + const int loop_reg = get_loop_mask_reg(gen); + const int tmp_reg = get_itemp(gen); + int offset; + + /* tmp_reg = exec[0] | exec[1] | exec[2] | exec[3] */ + spe_orx(gen->f, tmp_reg, loop_reg); + + offset = gen->loop_start - spe_code_size(gen->f); /* in bytes */ + + /* branch back to top of loop 
if tmp_reg != 0 */ + spe_brnz(gen->f, tmp_reg, offset / 4); + + /* XXX pop loop_exec mask */ + + gen->loop_nesting--; + + emit_update_exec_mask(gen); + + return TRUE; +} + + +static boolean +emit_BRK(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + const int exec_reg = get_exec_mask_reg(gen); + const int loop_reg = get_loop_mask_reg(gen); + + assert(gen->loop_nesting > 0); + + spe_comment(gen->f, 0, "loop exec mask &= ~master exec mask"); + spe_andc(gen->f, loop_reg, loop_reg, exec_reg); + + emit_update_exec_mask(gen); + + return TRUE; +} + + +static boolean +emit_CONT(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + assert(gen->loop_nesting > 0); + + return TRUE; +} + + +static boolean +emit_DDX_DDY(struct codegen *gen, const struct tgsi_full_instruction *inst, + boolean ddx) +{ + int ch; + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + int s_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + + int t1_reg = get_itemp(gen); + int t2_reg = get_itemp(gen); + + spe_splat_word(gen->f, t1_reg, s_reg, 0); /* upper-left pixel */ + if (ddx) { + spe_splat_word(gen->f, t2_reg, s_reg, 1); /* upper-right pixel */ + } + else { + spe_splat_word(gen->f, t2_reg, s_reg, 2); /* lower-left pixel */ } + spe_fs(gen->f, d_reg, t2_reg, t1_reg); + + free_itemps(gen); } - return true; + return TRUE; } + + /** * Emit END instruction. * We just return from the shader function at this point. @@ -361,12 +1742,8 @@ emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst) static boolean emit_END(struct codegen *gen) { - /* return from function call */ - spe_bi(gen->f, SPE_REG_RA, 0, 0); -#if DISASSEM - printf("bi\trRA\n"); -#endif - return true; + emit_epilogue(gen); + return TRUE; } @@ -378,24 +1755,153 @@ emit_instruction(struct codegen *gen, const struct tgsi_full_instruction *inst) { switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_ARL: + return emit_ARL(gen, inst); case TGSI_OPCODE_MOV: + case TGSI_OPCODE_SWZ: return emit_MOV(gen, inst); - case TGSI_OPCODE_MUL: - return emit_MUL(gen, inst); case TGSI_OPCODE_ADD: - return emit_ADD(gen, inst); + case TGSI_OPCODE_SUB: + case TGSI_OPCODE_MUL: + return emit_binop(gen, inst); + case TGSI_OPCODE_MAD: + return emit_MAD(gen, inst); + case TGSI_OPCODE_LERP: + return emit_LERP(gen, inst); + case TGSI_OPCODE_DP3: + return emit_DP3(gen, inst); + case TGSI_OPCODE_DP4: + return emit_DP4(gen, inst); + case TGSI_OPCODE_DPH: + return emit_DPH(gen, inst); + case TGSI_OPCODE_NRM: + return emit_NRM3(gen, inst); + case TGSI_OPCODE_XPD: + return emit_XPD(gen, inst); + case TGSI_OPCODE_RCP: + case TGSI_OPCODE_RSQ: + return emit_RCP_RSQ(gen, inst); + case TGSI_OPCODE_ABS: + return emit_ABS(gen, inst); case TGSI_OPCODE_SGT: - return emit_SGT(gen, inst); + case TGSI_OPCODE_SLT: + case TGSI_OPCODE_SGE: + case TGSI_OPCODE_SLE: + case TGSI_OPCODE_SEQ: + case TGSI_OPCODE_SNE: + return emit_inequality(gen, inst); + case TGSI_OPCODE_CMP: + return emit_CMP(gen, inst); + case TGSI_OPCODE_MIN: + case TGSI_OPCODE_MAX: + return emit_MIN_MAX(gen, inst); + case TGSI_OPCODE_TRUNC: + return emit_TRUNC(gen, inst); + case TGSI_OPCODE_FLR: + return emit_FLR(gen, inst); + case TGSI_OPCODE_FRC: + return emit_FRC(gen, inst); case TGSI_OPCODE_END: return emit_END(gen); + case TGSI_OPCODE_COS: + return emit_function_call(gen, inst, "spu_cos", 1, TRUE); + case TGSI_OPCODE_SIN: + return emit_function_call(gen, inst, "spu_sin", 1, TRUE); + case TGSI_OPCODE_POW: + return emit_function_call(gen, 
inst, "spu_pow", 2, TRUE); + case TGSI_OPCODE_EXPBASE2: + return emit_function_call(gen, inst, "spu_exp2", 1, TRUE); + case TGSI_OPCODE_LOGBASE2: + return emit_function_call(gen, inst, "spu_log2", 1, TRUE); + case TGSI_OPCODE_TEX: + /* fall-through for now */ + case TGSI_OPCODE_TXD: + /* fall-through for now */ + case TGSI_OPCODE_TXB: + /* fall-through for now */ + case TGSI_OPCODE_TXL: + /* fall-through for now */ + case TGSI_OPCODE_TXP: + return emit_TEX(gen, inst); + case TGSI_OPCODE_KIL: + return emit_KIL(gen, inst); + + case TGSI_OPCODE_IF: + return emit_IF(gen, inst); + case TGSI_OPCODE_ELSE: + return emit_ELSE(gen, inst); + case TGSI_OPCODE_ENDIF: + return emit_ENDIF(gen, inst); + + case TGSI_OPCODE_BGNLOOP2: + return emit_BGNLOOP(gen, inst); + case TGSI_OPCODE_ENDLOOP2: + return emit_ENDLOOP(gen, inst); + case TGSI_OPCODE_BRK: + return emit_BRK(gen, inst); + case TGSI_OPCODE_CONT: + return emit_CONT(gen, inst); + + case TGSI_OPCODE_DDX: + return emit_DDX_DDY(gen, inst, TRUE); + case TGSI_OPCODE_DDY: + return emit_DDX_DDY(gen, inst, FALSE); + /* XXX lots more cases to do... */ default: - return false; + fprintf(stderr, "Cell: unimplemented TGSI instruction %d!\n", + inst->Instruction.Opcode); + return FALSE; } - return true; + return TRUE; +} + + + +/** + * Emit code for a TGSI immediate value (vector of four floats). + * This involves register allocation and initialization. + * XXX the initialization should be done by a "prepare" stage, not + * per quad execution! + */ +static boolean +emit_immediate(struct codegen *gen, const struct tgsi_full_immediate *immed) +{ + int ch; + + assert(gen->num_imm < MAX_TEMPS); + + for (ch = 0; ch < 4; ch++) { + float val = immed->u.ImmediateFloat32[ch].Float; + + if (ch > 0 && val == immed->u.ImmediateFloat32[ch - 1].Float) { + /* re-use previous register */ + gen->imm_regs[gen->num_imm][ch] = gen->imm_regs[gen->num_imm][ch - 1]; + } + else { + char str[100]; + int reg = spe_allocate_available_register(gen->f); + + if (reg < 0) + return FALSE; + + sprintf(str, "init $%d = %f", reg, val); + spe_comment(gen->f, 0, str); + + /* update immediate map */ + gen->imm_regs[gen->num_imm][ch] = reg; + + /* emit initializer instruction */ + spe_load_float(gen->f, reg, val); + } + } + + gen->num_imm++; + + return TRUE; } @@ -405,44 +1911,46 @@ emit_instruction(struct codegen *gen, * We only care about TGSI TEMPORARY register declarations at this time. * For each TGSI TEMPORARY we allocate four SPE registers. */ -static void -emit_declaration(struct codegen *gen, const struct tgsi_full_declaration *decl) +static boolean +emit_declaration(struct cell_context *cell, + struct codegen *gen, const struct tgsi_full_declaration *decl) { int i, ch; switch (decl->Declaration.File) { case TGSI_FILE_TEMPORARY: -#if DISASSEM - printf("Declare temp reg %d .. %d\n", - decl->DeclarationRange.First, - decl->DeclarationRange.Last); -#endif for (i = decl->DeclarationRange.First; i <= decl->DeclarationRange.Last; i++) { + assert(i < MAX_TEMPS); for (ch = 0; ch < 4; ch++) { gen->temp_regs[i][ch] = spe_allocate_available_register(gen->f); + if (gen->temp_regs[i][ch] < 0) + return FALSE; /* out of regs */ } /* XXX if we run out of SPE registers, we need to spill * to SPU memory. someday... 
*/ -#if DISASSEM - printf(" SPE regs: %d %d %d %d\n", - gen->temp_regs[i][0], - gen->temp_regs[i][1], - gen->temp_regs[i][2], - gen->temp_regs[i][3]); -#endif + { + char buf[100]; + sprintf(buf, "TGSI temp[%d] maps to SPU regs [$%d $%d $%d $%d]", i, + gen->temp_regs[i][0], gen->temp_regs[i][1], + gen->temp_regs[i][2], gen->temp_regs[i][3]); + spe_comment(gen->f, 0, buf); + } } break; default: ; /* ignore */ } + + return TRUE; } + /** * Translate TGSI shader code to SPE instructions. This is done when * the state tracker gives us a new shader (via pipe->create_fs_state()). @@ -458,8 +1966,10 @@ cell_gen_fragment_program(struct cell_context *cell, { struct tgsi_parse_context parse; struct codegen gen; + uint ic = 0; memset(&gen, 0, sizeof(gen)); + gen.cell = cell; gen.f = f; /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */ @@ -472,50 +1982,63 @@ cell_gen_fragment_program(struct cell_context *cell, spe_allocate_register(f, gen.outputs_reg); spe_allocate_register(f, gen.constants_reg); -#if DISASSEM - printf("Begin %s\n", __FUNCTION__); - tgsi_dump(tokens, 0); -#endif + if (cell->debug_flags & CELL_DEBUG_ASM) { + spe_print_code(f, TRUE); + spe_indent(f, 2*8); + printf("Begin %s\n", __FUNCTION__); + tgsi_dump(tokens, 0); + } tgsi_parse_init(&parse, tokens); + emit_prologue(&gen); + while (!tgsi_parse_end_of_tokens(&parse) && !gen.error) { tgsi_parse_token(&parse); switch (parse.FullToken.Token.Type) { case TGSI_TOKEN_TYPE_IMMEDIATE: -#if 0 - if (!note_immediate(&gen, &parse.FullToken.FullImmediate )) - goto fail; -#endif + if (f->print) { + _debug_printf(" # "); + tgsi_dump_immediate(&parse.FullToken.FullImmediate); + } + if (!emit_immediate(&gen, &parse.FullToken.FullImmediate)) + gen.error = TRUE; break; case TGSI_TOKEN_TYPE_DECLARATION: - emit_declaration(&gen, &parse.FullToken.FullDeclaration); + if (f->print) { + _debug_printf(" # "); + tgsi_dump_declaration(&parse.FullToken.FullDeclaration); + } + if (!emit_declaration(cell, &gen, &parse.FullToken.FullDeclaration)) + gen.error = TRUE; break; case TGSI_TOKEN_TYPE_INSTRUCTION: - if (!emit_instruction(&gen, &parse.FullToken.FullInstruction )) { - gen.error = true; + if (f->print) { + _debug_printf(" # "); + ic++; + tgsi_dump_instruction(&parse.FullToken.FullInstruction, ic); } + if (!emit_instruction(&gen, &parse.FullToken.FullInstruction)) + gen.error = TRUE; break; default: assert(0); - } } - if (gen.error) { /* terminate the SPE code */ return emit_END(&gen); } -#if DISASSEM - printf("cell_gen_fragment_program nr instructions: %d\n", f->num_inst); - printf("End %s\n", __FUNCTION__); -#endif + if (cell->debug_flags & CELL_DEBUG_ASM) { + printf("cell_gen_fragment_program nr instructions: %d\n", f->num_inst); + printf("End %s\n", __FUNCTION__); + } tgsi_parse_free( &parse ); diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c index 06219d4e98..66d4b3b6a3 100644 --- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c +++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c @@ -2,6 +2,7 @@ * * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. * All Rights Reserved. + * Copyright 2009 VMware, Inc. All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the @@ -25,11 +26,10 @@ * **************************************************************************/ - - /** * Generate SPU per-fragment code (actually per-quad code). 
* \author Brian Paul + * \author Bob Ellison */ @@ -54,12 +54,17 @@ * \param ifragZ_reg register containing integer fragment Z values (in) * \param ifbZ_reg register containing integer frame buffer Z values (in/out) * \param zmask_reg register containing result of Z test/comparison (out) + * + * Returns TRUE if the Z-buffer needs to be updated. */ -static void -gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa, - struct spe_function *f, +static boolean +gen_depth_test(struct spe_function *f, + const struct pipe_depth_stencil_alpha_state *dsa, int mask_reg, int ifragZ_reg, int ifbZ_reg, int zmask_reg) { + /* NOTE: we use clgt below, not cgt, because we want to compare _unsigned_ + * quantities. This only makes a difference for 32-bit Z values though. + */ ASSERT(dsa->depth.enabled); switch (dsa->depth.func) { @@ -79,28 +84,28 @@ gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa, case PIPE_FUNC_GREATER: /* zmask = (ifragZ > ref) */ - spe_cgt(f, zmask_reg, ifragZ_reg, ifbZ_reg); + spe_clgt(f, zmask_reg, ifragZ_reg, ifbZ_reg); /* mask = (mask & zmask) */ spe_and(f, mask_reg, mask_reg, zmask_reg); break; case PIPE_FUNC_LESS: /* zmask = (ref > ifragZ) */ - spe_cgt(f, zmask_reg, ifbZ_reg, ifragZ_reg); + spe_clgt(f, zmask_reg, ifbZ_reg, ifragZ_reg); /* mask = (mask & zmask) */ spe_and(f, mask_reg, mask_reg, zmask_reg); break; case PIPE_FUNC_LEQUAL: /* zmask = (ifragZ > ref) */ - spe_cgt(f, zmask_reg, ifragZ_reg, ifbZ_reg); + spe_clgt(f, zmask_reg, ifragZ_reg, ifbZ_reg); /* mask = (mask & ~zmask) */ spe_andc(f, mask_reg, mask_reg, zmask_reg); break; case PIPE_FUNC_GEQUAL: /* zmask = (ref > ifragZ) */ - spe_cgt(f, zmask_reg, ifbZ_reg, ifragZ_reg); + spe_clgt(f, zmask_reg, ifbZ_reg, ifragZ_reg); /* mask = (mask & ~zmask) */ spe_andc(f, mask_reg, mask_reg, zmask_reg); break; @@ -129,7 +134,10 @@ gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa, * framebufferZ = (ztest_passed ? fragmentZ : framebufferZ; */ spe_selb(f, ifbZ_reg, ifbZ_reg, ifragZ_reg, mask_reg); + return TRUE; } + + return FALSE; } @@ -153,7 +161,7 @@ gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa, if ((dsa->alpha.func != PIPE_FUNC_NEVER) && (dsa->alpha.func != PIPE_FUNC_ALWAYS)) { /* load/splat the alpha reference float value */ - spe_load_float(f, ref_reg, dsa->alpha.ref); + spe_load_float(f, ref_reg, dsa->alpha.ref_value); } /* emit code to do the alpha comparison, updating 'mask' */ @@ -230,6 +238,134 @@ gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa, } +/** + * This pair of functions is used inline to allocate and deallocate + * optional constant registers. Once a constant is discovered to be + * needed, we will likely need it again, so we don't want to deallocate + * it and have to allocate and load it again unnecessarily. + */ +static INLINE void +setup_optional_register(struct spe_function *f, + int *r) +{ + if (*r < 0) + *r = spe_allocate_available_register(f); +} + +static INLINE void +release_optional_register(struct spe_function *f, + int r) +{ + if (r >= 0) + spe_release_register(f, r); +} + +static INLINE void +setup_const_register(struct spe_function *f, + int *r, + float value) +{ + if (*r >= 0) + return; + setup_optional_register(f, r); + spe_load_float(f, *r, value); +} + +static INLINE void +release_const_register(struct spe_function *f, + int r) +{ + release_optional_register(f, r); +} + + + +/** + * Unpack/convert framebuffer colors from four 32-bit packed colors + * (fbRGBA) to four float RGBA vectors (fbR, fbG, fbB, fbA). 
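
A scalar sketch of this unpacking for the PIPE_FORMAT_A8R8G8B8_UNORM layout handled below; unpack_argb_scalar is an illustrative name, and the SPU version masks and rotates four pixels at once before converting each component to a float with spe_cuflt:

   static void
   unpack_argb_scalar(unsigned pixel,
                      unsigned *r, unsigned *g, unsigned *b, unsigned *a)
   {
      *b = pixel & 0xff;              /* and with mask0                */
      *g = (pixel >> 8) & 0xff;       /* and with mask1, rotate by -8  */
      *r = (pixel >> 16) & 0xff;      /* and with mask2, rotate by -16 */
      *a = (pixel >> 24) & 0xff;      /* and with mask3, rotate by -24 */
   }
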
+ * Each 8-bit color component is expanded into a float in [0.0, 1.0]. + */ +static void +unpack_colors(struct spe_function *f, + enum pipe_format color_format, + int fbRGBA_reg, + int fbR_reg, int fbG_reg, int fbB_reg, int fbA_reg) +{ + int mask0_reg = spe_allocate_available_register(f); + int mask1_reg = spe_allocate_available_register(f); + int mask2_reg = spe_allocate_available_register(f); + int mask3_reg = spe_allocate_available_register(f); + + spe_load_int(f, mask0_reg, 0xff); + spe_load_int(f, mask1_reg, 0xff00); + spe_load_int(f, mask2_reg, 0xff0000); + spe_load_int(f, mask3_reg, 0xff000000); + + spe_comment(f, 0, "Unpack framebuffer colors, convert to floats"); + + switch (color_format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + /* fbB = fbRGBA & mask */ + spe_and(f, fbB_reg, fbRGBA_reg, mask0_reg); + + /* fbG = fbRGBA & mask */ + spe_and(f, fbG_reg, fbRGBA_reg, mask1_reg); + + /* fbR = fbRGBA & mask */ + spe_and(f, fbR_reg, fbRGBA_reg, mask2_reg); + + /* fbA = fbRGBA & mask */ + spe_and(f, fbA_reg, fbRGBA_reg, mask3_reg); + + /* fbG = fbG >> 8 */ + spe_roti(f, fbG_reg, fbG_reg, -8); + + /* fbR = fbR >> 16 */ + spe_roti(f, fbR_reg, fbR_reg, -16); + + /* fbA = fbA >> 24 */ + spe_roti(f, fbA_reg, fbA_reg, -24); + break; + + case PIPE_FORMAT_B8G8R8A8_UNORM: + /* fbA = fbRGBA & mask */ + spe_and(f, fbA_reg, fbRGBA_reg, mask0_reg); + + /* fbR = fbRGBA & mask */ + spe_and(f, fbR_reg, fbRGBA_reg, mask1_reg); + + /* fbG = fbRGBA & mask */ + spe_and(f, fbG_reg, fbRGBA_reg, mask2_reg); + + /* fbB = fbRGBA & mask */ + spe_and(f, fbB_reg, fbRGBA_reg, mask3_reg); + + /* fbR = fbR >> 8 */ + spe_roti(f, fbR_reg, fbR_reg, -8); + + /* fbG = fbG >> 16 */ + spe_roti(f, fbG_reg, fbG_reg, -16); + + /* fbB = fbB >> 24 */ + spe_roti(f, fbB_reg, fbB_reg, -24); + break; + + default: + ASSERT(0); + } + + /* convert int[4] in [0,255] to float[4] in [0.0, 1.0] */ + spe_cuflt(f, fbR_reg, fbR_reg, 8); + spe_cuflt(f, fbG_reg, fbG_reg, 8); + spe_cuflt(f, fbB_reg, fbB_reg, 8); + spe_cuflt(f, fbA_reg, fbA_reg, 8); + + spe_release_register(f, mask0_reg); + spe_release_register(f, mask1_reg); + spe_release_register(f, mask2_reg); + spe_release_register(f, mask3_reg); +} + /** * Generate SPE code to implement the given blend mode for a quad of pixels. @@ -242,6 +378,7 @@ gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa, */ static void gen_blend(const struct pipe_blend_state *blend, + const struct pipe_blend_color *blend_color, struct spe_function *f, enum pipe_format color_format, int fragR_reg, int fragG_reg, int fragB_reg, int fragA_reg, @@ -262,211 +399,464 @@ gen_blend(const struct pipe_blend_state *blend, int fbB_reg = spe_allocate_available_register(f); int fbA_reg = spe_allocate_available_register(f); - int one_reg = spe_allocate_available_register(f); int tmp_reg = spe_allocate_available_register(f); - boolean one_reg_set = false; /* avoid setting one_reg more than once */ - - ASSERT(blend->blend_enable); - - /* Unpack/convert framebuffer colors from four 32-bit packed colors - * (fbRGBA) to four float RGBA vectors (fbR, fbG, fbB, fbA). - * Each 8-bit color component is expanded into a float in [0.0, 1.0]. + /* Optional constant registers we might or might not end up using; + * if we do use them, make sure we only allocate them once by + * keeping a flag on each one. 
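
A usage sketch of that lazy-allocation pattern, using the constA_reg variable declared just below and the setup/release helpers defined above (the sequence is condensed for illustration):

   int constA_reg = -1;                                           /* not allocated yet        */
   setup_const_register(f, &constA_reg, blend_color->color[3]);   /* first use: alloc + load  */
   setup_const_register(f, &constA_reg, blend_color->color[3]);   /* later uses: no-op        */
   release_const_register(f, constA_reg);                         /* no-op if never allocated */
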
*/ - { - int mask_reg = spe_allocate_available_register(f); - - /* mask = {0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff} */ - spe_load_int(f, mask_reg, 0xff); - - /* XXX there may be more clever ways to implement the following code */ - switch (color_format) { - case PIPE_FORMAT_A8R8G8B8_UNORM: - /* fbB = fbB & mask */ - spe_and(f, fbB_reg, fbRGBA_reg, mask_reg); - /* mask = mask << 8 */ - spe_roti(f, mask_reg, mask_reg, 8); - - /* fbG = fbRGBA & mask */ - spe_and(f, fbG_reg, fbRGBA_reg, mask_reg); - /* fbG = fbG >> 8 */ - spe_roti(f, fbG_reg, fbG_reg, -8); - /* mask = mask << 8 */ - spe_roti(f, mask_reg, mask_reg, 8); - - /* fbR = fbRGBA & mask */ - spe_and(f, fbR_reg, fbRGBA_reg, mask_reg); - /* fbR = fbR >> 16 */ - spe_roti(f, fbR_reg, fbR_reg, -16); - /* mask = mask << 8 */ - spe_roti(f, mask_reg, mask_reg, 8); - - /* fbA = fbRGBA & mask */ - spe_and(f, fbA_reg, fbRGBA_reg, mask_reg); - /* fbA = fbA >> 24 */ - spe_roti(f, fbA_reg, fbA_reg, -24); - break; - - case PIPE_FORMAT_B8G8R8A8_UNORM: - /* fbA = fbA & mask */ - spe_and(f, fbA_reg, fbRGBA_reg, mask_reg); - /* mask = mask << 8 */ - spe_roti(f, mask_reg, mask_reg, 8); - - /* fbR = fbRGBA & mask */ - spe_and(f, fbR_reg, fbRGBA_reg, mask_reg); - /* fbR = fbR >> 8 */ - spe_roti(f, fbR_reg, fbR_reg, -8); - /* mask = mask << 8 */ - spe_roti(f, mask_reg, mask_reg, 8); - - /* fbG = fbRGBA & mask */ - spe_and(f, fbG_reg, fbRGBA_reg, mask_reg); - /* fbG = fbG >> 16 */ - spe_roti(f, fbG_reg, fbG_reg, -16); - /* mask = mask << 8 */ - spe_roti(f, mask_reg, mask_reg, 8); - - /* fbB = fbRGBA & mask */ - spe_and(f, fbB_reg, fbRGBA_reg, mask_reg); - /* fbB = fbB >> 24 */ - spe_roti(f, fbB_reg, fbB_reg, -24); - break; + int one_reg = -1; + int constR_reg = -1, constG_reg = -1, constB_reg = -1, constA_reg = -1; - default: - ASSERT(0); - } - - /* convert int[4] in [0,255] to float[4] in [0.0, 1.0] */ - spe_cuflt(f, fbR_reg, fbR_reg, 8); - spe_cuflt(f, fbG_reg, fbG_reg, 8); - spe_cuflt(f, fbB_reg, fbB_reg, 8); - spe_cuflt(f, fbA_reg, fbA_reg, 8); - - spe_release_register(f, mask_reg); - } + ASSERT(blend->blend_enable); + /* packed RGBA -> float colors */ + unpack_colors(f, color_format, fbRGBA_reg, + fbR_reg, fbG_reg, fbB_reg, fbA_reg); /* - * Compute Src RGB terms + * Compute Src RGB terms. We're actually looking for the value + * of (the appropriate RGB factors) * (the incoming source RGB color), + * because in some cases (like PIPE_BLENDFACTOR_ONE and + * PIPE_BLENDFACTOR_ZERO) we can avoid doing unnecessary math. 
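
For example, with PIPE_BLENDFACTOR_INV_SRC_ALPHA the factor (1-A) never has to be formed explicitly, because the whole term folds into one fused multiply per channel; in this one-lane sketch fragR and fragA stand for the per-channel values held in fragR_reg and fragA_reg:

   /* term = R * (1 - A) = R - R*A, i.e. fnms(term, R, A, R),
    * since fnms(a, b, c, d) computes a = d - b*c.
    */
   float term1R = fragR - fragR * fragA;   /* one spe_fnms instead of fs + fm */
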
*/ switch (blend->rgb_src_factor) { case PIPE_BLENDFACTOR_ONE: + /* factors = (1,1,1), so term = (R,G,B) */ spe_move(f, term1R_reg, fragR_reg); spe_move(f, term1G_reg, fragG_reg); spe_move(f, term1B_reg, fragB_reg); break; case PIPE_BLENDFACTOR_ZERO: - spe_zero(f, term1R_reg); - spe_zero(f, term1G_reg); - spe_zero(f, term1B_reg); + /* factors = (0,0,0), so term = (0,0,0) */ + spe_load_float(f, term1R_reg, 0.0f); + spe_load_float(f, term1G_reg, 0.0f); + spe_load_float(f, term1B_reg, 0.0f); break; case PIPE_BLENDFACTOR_SRC_COLOR: + /* factors = (R,G,B), so term = (R*R, G*G, B*B) */ spe_fm(f, term1R_reg, fragR_reg, fragR_reg); spe_fm(f, term1G_reg, fragG_reg, fragG_reg); spe_fm(f, term1B_reg, fragB_reg, fragB_reg); break; case PIPE_BLENDFACTOR_SRC_ALPHA: + /* factors = (A,A,A), so term = (R*A, G*A, B*A) */ spe_fm(f, term1R_reg, fragR_reg, fragA_reg); spe_fm(f, term1G_reg, fragG_reg, fragA_reg); spe_fm(f, term1B_reg, fragB_reg, fragA_reg); break; - /* XXX more cases */ + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + /* factors = (1-R,1-G,1-B), so term = (R*(1-R), G*(1-G), B*(1-B)) + * or in other words term = (R-R*R, G-G*G, B-B*B) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term1R_reg, fragR_reg, fragR_reg, fragR_reg); + spe_fnms(f, term1G_reg, fragG_reg, fragG_reg, fragG_reg); + spe_fnms(f, term1B_reg, fragB_reg, fragB_reg, fragB_reg); + break; + case PIPE_BLENDFACTOR_DST_COLOR: + /* factors = (Rfb,Gfb,Bfb), so term = (R*Rfb, G*Gfb, B*Bfb) */ + spe_fm(f, term1R_reg, fragR_reg, fbR_reg); + spe_fm(f, term1G_reg, fragG_reg, fbG_reg); + spe_fm(f, term1B_reg, fragB_reg, fbB_reg); + break; + case PIPE_BLENDFACTOR_INV_DST_COLOR: + /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (R*(1-Rfb),G*(1-Gfb),B*(1-Bfb)) + * or term = (R-R*Rfb, G-G*Gfb, B-B*Bfb) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term1R_reg, fragR_reg, fbR_reg, fragR_reg); + spe_fnms(f, term1G_reg, fragG_reg, fbG_reg, fragG_reg); + spe_fnms(f, term1B_reg, fragB_reg, fbB_reg, fragB_reg); + break; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + /* factors = (1-A,1-A,1-A), so term = (R*(1-A),G*(1-A),B*(1-A)) + * or term = (R-R*A,G-G*A,B-B*A) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term1R_reg, fragR_reg, fragA_reg, fragR_reg); + spe_fnms(f, term1G_reg, fragG_reg, fragA_reg, fragG_reg); + spe_fnms(f, term1B_reg, fragB_reg, fragA_reg, fragB_reg); + break; + case PIPE_BLENDFACTOR_DST_ALPHA: + /* factors = (Afb, Afb, Afb), so term = (R*Afb, G*Afb, B*Afb) */ + spe_fm(f, term1R_reg, fragR_reg, fbA_reg); + spe_fm(f, term1G_reg, fragG_reg, fbA_reg); + spe_fm(f, term1B_reg, fragB_reg, fbA_reg); + break; + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (R*(1-Afb),G*(1-Afb),B*(1-Afb)) + * or term = (R-R*Afb,G-G*Afb,b-B*Afb) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term1R_reg, fragR_reg, fbA_reg, fragR_reg); + spe_fnms(f, term1G_reg, fragG_reg, fbA_reg, fragG_reg); + spe_fnms(f, term1B_reg, fragB_reg, fbA_reg, fragB_reg); + break; + case PIPE_BLENDFACTOR_CONST_COLOR: + /* We need the optional constant color registers */ + setup_const_register(f, &constR_reg, blend_color->color[0]); + setup_const_register(f, &constG_reg, blend_color->color[1]); + setup_const_register(f, &constB_reg, blend_color->color[2]); + /* now, factor = (Rc,Gc,Bc), so term = (R*Rc,G*Gc,B*Bc) */ + spe_fm(f, term1R_reg, fragR_reg, constR_reg); + spe_fm(f, term1G_reg, fragG_reg, constG_reg); + spe_fm(f, term1B_reg, fragB_reg, constB_reg); + break; + case PIPE_BLENDFACTOR_CONST_ALPHA: + /* we'll need 
the optional constant alpha register */ + setup_const_register(f, &constA_reg, blend_color->color[3]); + /* factor = (Ac,Ac,Ac), so term = (R*Ac,G*Ac,B*Ac) */ + spe_fm(f, term1R_reg, fragR_reg, constA_reg); + spe_fm(f, term1G_reg, fragG_reg, constA_reg); + spe_fm(f, term1B_reg, fragB_reg, constA_reg); + break; + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + /* We need the optional constant color registers */ + setup_const_register(f, &constR_reg, blend_color->color[0]); + setup_const_register(f, &constG_reg, blend_color->color[1]); + setup_const_register(f, &constB_reg, blend_color->color[2]); + /* factor = (1-Rc,1-Gc,1-Bc), so term = (R*(1-Rc),G*(1-Gc),B*(1-Bc)) + * or term = (R-R*Rc, G-G*Gc, B-B*Bc) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term1R_reg, fragR_reg, constR_reg, fragR_reg); + spe_fnms(f, term1G_reg, fragG_reg, constG_reg, fragG_reg); + spe_fnms(f, term1B_reg, fragB_reg, constB_reg, fragB_reg); + break; + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + /* We need the optional constant color registers */ + setup_const_register(f, &constR_reg, blend_color->color[0]); + setup_const_register(f, &constG_reg, blend_color->color[1]); + setup_const_register(f, &constB_reg, blend_color->color[2]); + /* factor = (1-Ac,1-Ac,1-Ac), so term = (R*(1-Ac),G*(1-Ac),B*(1-Ac)) + * or term = (R-R*Ac,G-G*Ac,B-B*Ac) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term1R_reg, fragR_reg, constA_reg, fragR_reg); + spe_fnms(f, term1G_reg, fragG_reg, constA_reg, fragG_reg); + spe_fnms(f, term1B_reg, fragB_reg, constA_reg, fragB_reg); + break; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + /* We'll need the optional {1,1,1,1} register */ + setup_const_register(f, &one_reg, 1.0f); + /* factor = (min(A,1-Afb),min(A,1-Afb),min(A,1-Afb)), so + * term = (R*min(A,1-Afb), G*min(A,1-Afb), B*min(A,1-Afb)) + * We could expand the term (as a*min(b,c) == min(a*b,a*c) + * as long as a is positive), but then we'd have to do three + * spe_float_min() functions instead of one, so this is simpler. + */ + /* tmp = 1 - Afb */ + spe_fs(f, tmp_reg, one_reg, fbA_reg); + /* tmp = min(A,tmp) */ + spe_float_min(f, tmp_reg, fragA_reg, tmp_reg); + /* term = R*tmp */ + spe_fm(f, term1R_reg, fragR_reg, tmp_reg); + spe_fm(f, term1G_reg, fragG_reg, tmp_reg); + spe_fm(f, term1B_reg, fragB_reg, tmp_reg); + break; + + /* These are special D3D cases involving a second color output + * from the fragment shader. I'm not sure we can support them + * yet... XXX + */ + case PIPE_BLENDFACTOR_SRC1_COLOR: + case PIPE_BLENDFACTOR_SRC1_ALPHA: + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + default: ASSERT(0); } /* - * Compute Src Alpha term + * Compute Src Alpha term. Like the above, we're looking for + * the full term A*factor, not just the factor itself, because + * in many cases we can avoid doing unnecessary multiplies. 
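
Because only the alpha component matters here, the COLOR and ALPHA variants of each factor collapse to the same value, which is why so many cases in the next switch fall through; as a summary (A = fragment alpha, Afb = framebuffer alpha, Ac = constant alpha):

   /*
    *   SRC_ALPHA,  SRC_COLOR     factor = A       term = A*A        (fm)
    *   INV_SRC_*                 factor = 1-A     term = A - A*A    (fnms)
    *   DST_ALPHA,  DST_COLOR     factor = Afb     term = A*Afb      (fm)
    *   INV_DST_*                 factor = 1-Afb   term = A - A*Afb  (fnms)
    *   CONST_*                   factor = Ac      term = A*Ac       (fm)
    *   INV_CONST_*               factor = 1-Ac    term = A - A*Ac   (fnms)
    */
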
*/ switch (blend->alpha_src_factor) { + case PIPE_BLENDFACTOR_ZERO: + /* factor = 0, so term = 0 */ + spe_load_float(f, term1A_reg, 0.0f); + break; + + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* fall through */ case PIPE_BLENDFACTOR_ONE: + /* factor = 1, so term = A */ spe_move(f, term1A_reg, fragA_reg); break; + case PIPE_BLENDFACTOR_SRC_COLOR: + /* factor = A, so term = A*A */ spe_fm(f, term1A_reg, fragA_reg, fragA_reg); break; case PIPE_BLENDFACTOR_SRC_ALPHA: spe_fm(f, term1A_reg, fragA_reg, fragA_reg); break; - /* XXX more cases */ + + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + /* factor = 1-A, so term = A*(1-A) = A-A*A */ + /* fnms(a,b,c,d) computes a = d - b*c */ + spe_fnms(f, term1A_reg, fragA_reg, fragA_reg, fragA_reg); + break; + + case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_DST_COLOR: + /* factor = Afb, so term = A*Afb */ + spe_fm(f, term1A_reg, fragA_reg, fbA_reg); + break; + + case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_INV_DST_COLOR: + /* factor = 1-Afb, so term = A*(1-Afb) = A - A*Afb */ + /* fnms(a,b,c,d) computes a = d - b*c */ + spe_fnms(f, term1A_reg, fragA_reg, fbA_reg, fragA_reg); + break; + + case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_CONST_COLOR: + /* We need the optional constA_reg register */ + setup_const_register(f, &constA_reg, blend_color->color[3]); + /* factor = Ac, so term = A*Ac */ + spe_fm(f, term1A_reg, fragA_reg, constA_reg); + break; + + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + /* We need the optional constA_reg register */ + setup_const_register(f, &constA_reg, blend_color->color[3]); + /* factor = 1-Ac, so term = A*(1-Ac) = A-A*Ac */ + /* fnms(a,b,c,d) computes a = d - b*c */ + spe_fnms(f, term1A_reg, fragA_reg, constA_reg, fragA_reg); + break; + + /* These are special D3D cases involving a second color output + * from the fragment shader. I'm not sure we can support them + * yet... XXX + */ + case PIPE_BLENDFACTOR_SRC1_COLOR: + case PIPE_BLENDFACTOR_SRC1_ALPHA: + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: default: ASSERT(0); } /* - * Compute Dest RGB terms + * Compute Dest RGB term. Like the above, we're looking for + * the full term (Rfb,Gfb,Bfb)*(factor), not just the factor itself, because + * in many cases we can avoid doing unnecessary multiplies. 
*/ switch (blend->rgb_dst_factor) { case PIPE_BLENDFACTOR_ONE: + /* factors = (1,1,1), so term = (Rfb,Gfb,Bfb) */ spe_move(f, term2R_reg, fbR_reg); spe_move(f, term2G_reg, fbG_reg); spe_move(f, term2B_reg, fbB_reg); break; case PIPE_BLENDFACTOR_ZERO: - spe_zero(f, term2R_reg); - spe_zero(f, term2G_reg); - spe_zero(f, term2B_reg); + /* factor s= (0,0,0), so term = (0,0,0) */ + spe_load_float(f, term2R_reg, 0.0f); + spe_load_float(f, term2G_reg, 0.0f); + spe_load_float(f, term2B_reg, 0.0f); break; case PIPE_BLENDFACTOR_SRC_COLOR: + /* factors = (R,G,B), so term = (R*Rfb, G*Gfb, B*Bfb) */ spe_fm(f, term2R_reg, fbR_reg, fragR_reg); spe_fm(f, term2G_reg, fbG_reg, fragG_reg); spe_fm(f, term2B_reg, fbB_reg, fragB_reg); break; + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + /* factors = (1-R,1-G,1-B), so term = (Rfb*(1-R), Gfb*(1-G), Bfb*(1-B)) + * or in other words term = (Rfb-Rfb*R, Gfb-Gfb*G, Bfb-Bfb*B) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term2R_reg, fragR_reg, fbR_reg, fbR_reg); + spe_fnms(f, term2G_reg, fragG_reg, fbG_reg, fbG_reg); + spe_fnms(f, term2B_reg, fragB_reg, fbB_reg, fbB_reg); + break; case PIPE_BLENDFACTOR_SRC_ALPHA: + /* factors = (A,A,A), so term = (Rfb*A, Gfb*A, Bfb*A) */ spe_fm(f, term2R_reg, fbR_reg, fragA_reg); spe_fm(f, term2G_reg, fbG_reg, fragA_reg); spe_fm(f, term2B_reg, fbB_reg, fragA_reg); break; case PIPE_BLENDFACTOR_INV_SRC_ALPHA: - /* one = {1.0, 1.0, 1.0, 1.0} */ - if (!one_reg_set) { - spe_load_float(f, one_reg, 1.0f); - one_reg_set = true; - } - /* tmp = one - fragA */ - spe_fs(f, tmp_reg, one_reg, fragA_reg); - /* term = fb * tmp */ - spe_fm(f, term2R_reg, fbR_reg, tmp_reg); - spe_fm(f, term2G_reg, fbG_reg, tmp_reg); - spe_fm(f, term2B_reg, fbB_reg, tmp_reg); - break; - /* XXX more cases */ + /* factors = (1-A,1-A,1-A) so term = (Rfb-Rfb*A,Gfb-Gfb*A,Bfb-Bfb*A) */ + /* fnms(a,b,c,d) computes a = d - b*c */ + spe_fnms(f, term2R_reg, fbR_reg, fragA_reg, fbR_reg); + spe_fnms(f, term2G_reg, fbG_reg, fragA_reg, fbG_reg); + spe_fnms(f, term2B_reg, fbB_reg, fragA_reg, fbB_reg); + break; + case PIPE_BLENDFACTOR_DST_COLOR: + /* factors = (Rfb,Gfb,Bfb), so term = (Rfb*Rfb, Gfb*Gfb, Bfb*Bfb) */ + spe_fm(f, term2R_reg, fbR_reg, fbR_reg); + spe_fm(f, term2G_reg, fbG_reg, fbG_reg); + spe_fm(f, term2B_reg, fbB_reg, fbB_reg); + break; + case PIPE_BLENDFACTOR_INV_DST_COLOR: + /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (Rfb*(1-Rfb),Gfb*(1-Gfb),Bfb*(1-Bfb)) + * or term = (Rfb-Rfb*Rfb, Gfb-Gfb*Gfb, Bfb-Bfb*Bfb) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term2R_reg, fbR_reg, fbR_reg, fbR_reg); + spe_fnms(f, term2G_reg, fbG_reg, fbG_reg, fbG_reg); + spe_fnms(f, term2B_reg, fbB_reg, fbB_reg, fbB_reg); + break; + + case PIPE_BLENDFACTOR_DST_ALPHA: + /* factors = (Afb, Afb, Afb), so term = (Rfb*Afb, Gfb*Afb, Bfb*Afb) */ + spe_fm(f, term2R_reg, fbR_reg, fbA_reg); + spe_fm(f, term2G_reg, fbG_reg, fbA_reg); + spe_fm(f, term2B_reg, fbB_reg, fbA_reg); + break; + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (Rfb*(1-Afb),Gfb*(1-Afb),Bfb*(1-Afb)) + * or term = (Rfb-Rfb*Afb,Gfb-Gfb*Afb,Bfb-Bfb*Afb) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term2R_reg, fbR_reg, fbA_reg, fbR_reg); + spe_fnms(f, term2G_reg, fbG_reg, fbA_reg, fbG_reg); + spe_fnms(f, term2B_reg, fbB_reg, fbA_reg, fbB_reg); + break; + case PIPE_BLENDFACTOR_CONST_COLOR: + /* We need the optional constant color registers */ + setup_const_register(f, &constR_reg, blend_color->color[0]); + setup_const_register(f, &constG_reg, blend_color->color[1]); + 
setup_const_register(f, &constB_reg, blend_color->color[2]); + /* now, factor = (Rc,Gc,Bc), so term = (Rfb*Rc,Gfb*Gc,Bfb*Bc) */ + spe_fm(f, term2R_reg, fbR_reg, constR_reg); + spe_fm(f, term2G_reg, fbG_reg, constG_reg); + spe_fm(f, term2B_reg, fbB_reg, constB_reg); + break; + case PIPE_BLENDFACTOR_CONST_ALPHA: + /* we'll need the optional constant alpha register */ + setup_const_register(f, &constA_reg, blend_color->color[3]); + /* factor = (Ac,Ac,Ac), so term = (Rfb*Ac,Gfb*Ac,Bfb*Ac) */ + spe_fm(f, term2R_reg, fbR_reg, constA_reg); + spe_fm(f, term2G_reg, fbG_reg, constA_reg); + spe_fm(f, term2B_reg, fbB_reg, constA_reg); + break; + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + /* We need the optional constant color registers */ + setup_const_register(f, &constR_reg, blend_color->color[0]); + setup_const_register(f, &constG_reg, blend_color->color[1]); + setup_const_register(f, &constB_reg, blend_color->color[2]); + /* factor = (1-Rc,1-Gc,1-Bc), so term = (Rfb*(1-Rc),Gfb*(1-Gc),Bfb*(1-Bc)) + * or term = (Rfb-Rfb*Rc, Gfb-Gfb*Gc, Bfb-Bfb*Bc) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term2R_reg, fbR_reg, constR_reg, fbR_reg); + spe_fnms(f, term2G_reg, fbG_reg, constG_reg, fbG_reg); + spe_fnms(f, term2B_reg, fbB_reg, constB_reg, fbB_reg); + break; + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + /* We need the optional constant color registers */ + setup_const_register(f, &constR_reg, blend_color->color[0]); + setup_const_register(f, &constG_reg, blend_color->color[1]); + setup_const_register(f, &constB_reg, blend_color->color[2]); + /* factor = (1-Ac,1-Ac,1-Ac), so term = (Rfb*(1-Ac),Gfb*(1-Ac),Bfb*(1-Ac)) + * or term = (Rfb-Rfb*Ac,Gfb-Gfb*Ac,Bfb-Bfb*Ac) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term2R_reg, fbR_reg, constA_reg, fbR_reg); + spe_fnms(f, term2G_reg, fbG_reg, constA_reg, fbG_reg); + spe_fnms(f, term2B_reg, fbB_reg, constA_reg, fbB_reg); + break; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest RGB */ + ASSERT(0); + break; + + /* These are special D3D cases involving a second color output + * from the fragment shader. I'm not sure we can support them + * yet... XXX + */ + case PIPE_BLENDFACTOR_SRC1_COLOR: + case PIPE_BLENDFACTOR_SRC1_ALPHA: + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + default: ASSERT(0); } /* - * Compute Dest Alpha term + * Compute Dest Alpha term. Like the above, we're looking for + * the full term Afb*factor, not just the factor itself, because + * in many cases we can avoid doing unnecessary multiplies. 
*/ switch (blend->alpha_dst_factor) { case PIPE_BLENDFACTOR_ONE: + /* factor = 1, so term = Afb */ spe_move(f, term2A_reg, fbA_reg); break; case PIPE_BLENDFACTOR_ZERO: - spe_zero(f, term2A_reg); + /* factor = 0, so term = 0 */ + spe_load_float(f, term2A_reg, 0.0f); break; - case PIPE_BLENDFACTOR_SRC_ALPHA: + + case PIPE_BLENDFACTOR_SRC_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_SRC_COLOR: + /* factor = A, so term = Afb*A */ spe_fm(f, term2A_reg, fbA_reg, fragA_reg); break; - case PIPE_BLENDFACTOR_INV_SRC_ALPHA: - /* one = {1.0, 1.0, 1.0, 1.0} */ - if (!one_reg_set) { - spe_load_float(f, one_reg, 1.0f); - one_reg_set = true; - } - /* tmp = one - fragA */ - spe_fs(f, tmp_reg, one_reg, fragA_reg); - /* termA = fbA * tmp */ - spe_fm(f, term2A_reg, fbA_reg, tmp_reg); + + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + /* factor = 1-A, so term = Afb*(1-A) = Afb-Afb*A */ + /* fnms(a,b,c,d) computes a = d - b*c */ + spe_fnms(f, term2A_reg, fbA_reg, fragA_reg, fbA_reg); + break; + + case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_DST_COLOR: + /* factor = Afb, so term = Afb*Afb */ + spe_fm(f, term2A_reg, fbA_reg, fbA_reg); + break; + + case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_INV_DST_COLOR: + /* factor = 1-Afb, so term = Afb*(1-Afb) = Afb - Afb*Afb */ + /* fnms(a,b,c,d) computes a = d - b*c */ + spe_fnms(f, term2A_reg, fbA_reg, fbA_reg, fbA_reg); + break; + + case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_CONST_COLOR: + /* We need the optional constA_reg register */ + setup_const_register(f, &constA_reg, blend_color->color[3]); + /* factor = Ac, so term = Afb*Ac */ + spe_fm(f, term2A_reg, fbA_reg, constA_reg); break; - /* XXX more cases */ + + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + /* We need the optional constA_reg register */ + setup_const_register(f, &constA_reg, blend_color->color[3]); + /* factor = 1-Ac, so term = Afb*(1-Ac) = Afb-Afb*Ac */ + /* fnms(a,b,c,d) computes a = d - b*c */ + spe_fnms(f, term2A_reg, fbA_reg, constA_reg, fbA_reg); + break; + + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest alpha */ + ASSERT(0); + break; + + /* These are special D3D cases involving a second color output + * from the fragment shader. I'm not sure we can support them + * yet... XXX + */ + case PIPE_BLENDFACTOR_SRC1_COLOR: + case PIPE_BLENDFACTOR_SRC1_ALPHA: + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: default: ASSERT(0); } /* - * Combine Src/Dest RGB terms + * Combine Src/Dest RGB terms as per the blend equation. 
*/ switch (blend->rgb_func) { case PIPE_BLEND_ADD: @@ -479,7 +869,21 @@ gen_blend(const struct pipe_blend_state *blend, spe_fs(f, fragG_reg, term1G_reg, term2G_reg); spe_fs(f, fragB_reg, term1B_reg, term2B_reg); break; - /* XXX more cases */ + case PIPE_BLEND_REVERSE_SUBTRACT: + spe_fs(f, fragR_reg, term2R_reg, term1R_reg); + spe_fs(f, fragG_reg, term2G_reg, term1G_reg); + spe_fs(f, fragB_reg, term2B_reg, term1B_reg); + break; + case PIPE_BLEND_MIN: + spe_float_min(f, fragR_reg, term1R_reg, term2R_reg); + spe_float_min(f, fragG_reg, term1G_reg, term2G_reg); + spe_float_min(f, fragB_reg, term1B_reg, term2B_reg); + break; + case PIPE_BLEND_MAX: + spe_float_max(f, fragR_reg, term1R_reg, term2R_reg); + spe_float_max(f, fragG_reg, term1G_reg, term2G_reg); + spe_float_max(f, fragB_reg, term1B_reg, term2B_reg); + break; default: ASSERT(0); } @@ -494,7 +898,15 @@ gen_blend(const struct pipe_blend_state *blend, case PIPE_BLEND_SUBTRACT: spe_fs(f, fragA_reg, term1A_reg, term2A_reg); break; - /* XXX more cases */ + case PIPE_BLEND_REVERSE_SUBTRACT: + spe_fs(f, fragA_reg, term2A_reg, term1A_reg); + break; + case PIPE_BLEND_MIN: + spe_float_min(f, fragA_reg, term1A_reg, term2A_reg); + break; + case PIPE_BLEND_MAX: + spe_float_max(f, fragA_reg, term1A_reg, term2A_reg); + break; default: ASSERT(0); } @@ -514,8 +926,14 @@ gen_blend(const struct pipe_blend_state *blend, spe_release_register(f, fbB_reg); spe_release_register(f, fbA_reg); - spe_release_register(f, one_reg); spe_release_register(f, tmp_reg); + + /* Free any optional registers that actually got used */ + release_const_register(f, one_reg); + release_const_register(f, constR_reg); + release_const_register(f, constG_reg); + release_const_register(f, constB_reg); + release_const_register(f, constA_reg); } @@ -524,24 +942,74 @@ gen_logicop(const struct pipe_blend_state *blend, struct spe_function *f, int fragRGBA_reg, int fbRGBA_reg) { - /* XXX to-do */ - /* operate on 32-bit packed pixels, not float colors */ -} - - -static void -gen_colormask(uint colormask, - struct spe_function *f, - int fragRGBA_reg, int fbRGBA_reg) -{ - /* XXX to-do */ - /* operate on 32-bit packed pixels, not float colors */ + /* We've got four 32-bit RGBA packed pixels in each of + * fragRGBA_reg and fbRGBA_reg, not sets of floating-point + * reds, greens, blues, and alphas. 
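
The blend-equation switches above just combine the two precomputed terms per channel; a scalar reference for the newly supported cases (assumes gallium's PIPE_BLEND_* enums from pipe/p_defines.h are on the include path):

#include "pipe/p_defines.h"

/* Per-channel combine of src and dst terms, mirroring the switches above */
static float blend_combine(unsigned func, float term1, float term2)
{
   switch (func) {
   case PIPE_BLEND_ADD:              return term1 + term2;
   case PIPE_BLEND_SUBTRACT:         return term1 - term2;
   case PIPE_BLEND_REVERSE_SUBTRACT: return term2 - term1;
   case PIPE_BLEND_MIN:              return term1 < term2 ? term1 : term2;
   case PIPE_BLEND_MAX:              return term1 > term2 ? term1 : term2;
   default:                          return term1;   /* unreachable; the code ASSERTs */
   }
}
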
+ * */ + ASSERT(blend->logicop_enable); + + switch(blend->logicop_func) { + case PIPE_LOGICOP_CLEAR: /* 0 */ + spe_zero(f, fragRGBA_reg); + break; + case PIPE_LOGICOP_NOR: /* ~(s | d) */ + spe_nor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_AND_INVERTED: /* ~s & d */ + /* andc R, A, B computes R = A & ~B */ + spe_andc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg); + break; + case PIPE_LOGICOP_COPY_INVERTED: /* ~s */ + spe_complement(f, fragRGBA_reg, fragRGBA_reg); + break; + case PIPE_LOGICOP_AND_REVERSE: /* s & ~d */ + /* andc R, A, B computes R = A & ~B */ + spe_andc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_INVERT: /* ~d */ + /* Note that (A nor A) == ~(A|A) == ~A */ + spe_nor(f, fragRGBA_reg, fbRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_XOR: /* s ^ d */ + spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_NAND: /* ~(s & d) */ + spe_nand(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_AND: /* s & d */ + spe_and(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_EQUIV: /* ~(s ^ d) */ + spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); + spe_complement(f, fragRGBA_reg, fragRGBA_reg); + break; + case PIPE_LOGICOP_NOOP: /* d */ + spe_move(f, fragRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_OR_INVERTED: /* ~s | d */ + /* orc R, A, B computes R = A | ~B */ + spe_orc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg); + break; + case PIPE_LOGICOP_COPY: /* s */ + break; + case PIPE_LOGICOP_OR_REVERSE: /* s | ~d */ + /* orc R, A, B computes R = A | ~B */ + spe_orc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_OR: /* s | d */ + spe_or(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_SET: /* 1 */ + spe_load_int(f, fragRGBA_reg, 0xffffffff); + break; + default: + ASSERT(0); + } } - /** - * Generate code to pack a quad of float colors into a four 32-bit integers. + * Generate code to pack a quad of float colors into four 32-bit integers. * * \param f SPE function to append instruction onto. * \param color_format the dest color packing format @@ -557,13 +1025,16 @@ gen_pack_colors(struct spe_function *f, int r_reg, int g_reg, int b_reg, int a_reg, int rgba_reg) { + int rg_reg = spe_allocate_available_register(f); + int ba_reg = spe_allocate_available_register(f); + /* Convert float[4] in [0.0,1.0] to int[4] in [0,~0], with clamping */ spe_cfltu(f, r_reg, r_reg, 32); spe_cfltu(f, g_reg, g_reg, 32); spe_cfltu(f, b_reg, b_reg, 32); spe_cfltu(f, a_reg, a_reg, 32); - /* Shift the most significant bytes to least the significant positions. + /* Shift the most significant bytes to the least significant positions. 
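
Each logic op works directly on the 32-bit packed pixels. The only subtle points are the operand orders for andc (A & ~B) and orc (A | ~B); a scalar cross-check of the less obvious mappings (illustrative helpers, not driver code):

#include <stdint.h>

static uint32_t logicop_and_inverted(uint32_t s, uint32_t d) { return ~s & d; }       /* andc(d, s) */
static uint32_t logicop_or_reverse(uint32_t s, uint32_t d)   { return s | ~d; }       /* orc(s, d)  */
static uint32_t logicop_equiv(uint32_t s, uint32_t d)        { return ~(s ^ d); }     /* xor, then complement */
static uint32_t logicop_invert(uint32_t s, uint32_t d)       { (void) s; return ~d; } /* nor(d, d)  */
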
* I.e.: reg = reg >> 24 */ spe_rotmi(f, r_reg, r_reg, -24); @@ -595,12 +1066,936 @@ gen_pack_colors(struct spe_function *f, * OR-ing all those together gives us four packed colors: * RGBA = {0xffffffff, 0xaa114477, 0xbb225588, 0xcc336699} */ - spe_or(f, rgba_reg, r_reg, g_reg); - spe_or(f, rgba_reg, rgba_reg, b_reg); - spe_or(f, rgba_reg, rgba_reg, a_reg); + spe_or(f, rg_reg, r_reg, g_reg); + spe_or(f, ba_reg, a_reg, b_reg); + spe_or(f, rgba_reg, rg_reg, ba_reg); + + spe_release_register(f, rg_reg); + spe_release_register(f, ba_reg); } +static void +gen_colormask(struct spe_function *f, + uint colormask, + enum pipe_format color_format, + int fragRGBA_reg, int fbRGBA_reg) +{ + /* We've got four 32-bit RGBA packed pixels in each of + * fragRGBA_reg and fbRGBA_reg, not sets of floating-point + * reds, greens, blues, and alphas. Further, the pixels + * are packed according to the given color format, not + * necessarily RGBA... + */ + uint r_mask; + uint g_mask; + uint b_mask; + uint a_mask; + + /* Calculate exactly where the bits for any particular color + * end up, so we can mask them correctly. + */ + switch(color_format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + /* ARGB */ + a_mask = 0xff000000; + r_mask = 0x00ff0000; + g_mask = 0x0000ff00; + b_mask = 0x000000ff; + break; + case PIPE_FORMAT_B8G8R8A8_UNORM: + /* BGRA */ + b_mask = 0xff000000; + g_mask = 0x00ff0000; + r_mask = 0x0000ff00; + a_mask = 0x000000ff; + break; + default: + ASSERT(0); + } + + /* For each R, G, B, and A component we're supposed to mask out, + * clear its bits. Then our mask operation later will work + * as expected. + */ + if (!(colormask & PIPE_MASK_R)) { + r_mask = 0; + } + if (!(colormask & PIPE_MASK_G)) { + g_mask = 0; + } + if (!(colormask & PIPE_MASK_B)) { + b_mask = 0; + } + if (!(colormask & PIPE_MASK_A)) { + a_mask = 0; + } + + /* Get a temporary register to hold the mask that will be applied + * to the fragment + */ + int colormask_reg = spe_allocate_available_register(f); + + /* The actual mask we're going to use is an OR of the remaining R, G, B, + * and A masks. Load the result value into our temporary register. + */ + spe_load_uint(f, colormask_reg, r_mask | g_mask | b_mask | a_mask); + + /* Use the mask register to select between the fragment color + * values and the frame buffer color values. Wherever the + * mask has a 0 bit, the current frame buffer color should override + * the fragment color. Wherever the mask has a 1 bit, the + * fragment color should persevere. The Select Bits (selb rt, rA, rB, rM) + * instruction will select bits from its first operand rA wherever the + * the mask bits rM are 0, and from its second operand rB wherever the + * mask bits rM are 1. That means that the frame buffer color is the + * first operand, and the fragment color the second. + */ + spe_selb(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg, colormask_reg); + + /* Release the temporary register and we're done */ + spe_release_register(f, colormask_reg); +} + + +/** + * This function is annoyingly similar to gen_depth_test(), above, except + * that instead of comparing two varying values (i.e. fragment and buffer), + * we're comparing a varying value with a static value. As such, we have + * access to the Compare Immediate instructions where we don't in + * gen_depth_test(), which is what makes us very different. 
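
gen_colormask() relies on selb picking framebuffer bits where the mask is 0 and fragment bits where it is 1, so one per-channel mask built from the packed layout handles all four pixels at once. A scalar model (assumes the PIPE_MASK_* defines from pipe/p_defines.h; helper names are illustrative):

#include <stdint.h>
#include "pipe/p_defines.h"

/* Scalar model of selb: take bits of b where mask==1, bits of a where mask==0 */
static uint32_t selb(uint32_t a, uint32_t b, uint32_t mask)
{
   return (a & ~mask) | (b & mask);
}

/* Apply a colormask to one A8R8G8B8 pixel (A=31..24, R=23..16, G=15..8, B=7..0) */
static uint32_t apply_colormask(uint32_t frag, uint32_t fb, unsigned colormask)
{
   uint32_t mask = 0;
   if (colormask & PIPE_MASK_R) mask |= 0x00ff0000;
   if (colormask & PIPE_MASK_G) mask |= 0x0000ff00;
   if (colormask & PIPE_MASK_B) mask |= 0x000000ff;
   if (colormask & PIPE_MASK_A) mask |= 0xff000000;
   return selb(fb, frag, mask);   /* keep fb where masked out, frag where enabled */
}
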
+ * + * There's some added complexity if there's a non-trivial state->mask + * value; then stencil and reference both must be masked + * + * The return value in the stencil_pass_reg is a bitmask of valid + * fragments that also passed the stencil test. The bitmask of valid + * fragments that failed would be found in + * (fragment_mask_reg & ~stencil_pass_reg). + */ +static void +gen_stencil_test(struct spe_function *f, + const struct pipe_stencil_state *state, + uint stencil_max_value, + int fragment_mask_reg, + int fbS_reg, + int stencil_pass_reg) +{ + /* Generate code that puts the set of passing fragments into the + * stencil_pass_reg register, taking into account whether each fragment + * was active to begin with. + */ + switch (state->func) { + case PIPE_FUNC_EQUAL: + if (state->valuemask == stencil_max_value) { + /* stencil_pass = fragment_mask & (s == reference) */ + spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value); + spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + } + else { + /* stencil_pass = fragment_mask & ((s&mask) == (reference&mask)) */ + uint tmp_masked_stencil = spe_allocate_available_register(f); + spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->valuemask); + spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil, + state->valuemask & state->ref_value); + spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_masked_stencil); + } + break; + + case PIPE_FUNC_NOTEQUAL: + if (state->valuemask == stencil_max_value) { + /* stencil_pass = fragment_mask & ~(s == reference) */ + spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value); + spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + } + else { + /* stencil_pass = fragment_mask & ~((s&mask) == (reference&mask)) */ + int tmp_masked_stencil = spe_allocate_available_register(f); + spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->valuemask); + spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil, + state->valuemask & state->ref_value); + spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_masked_stencil); + } + break; + + case PIPE_FUNC_LESS: + if (state->valuemask == stencil_max_value) { + /* stencil_pass = fragment_mask & (reference < s) */ + spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value); + spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + } + else { + /* stencil_pass = fragment_mask & ((reference&mask) < (s & mask)) */ + int tmp_masked_stencil = spe_allocate_available_register(f); + spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->valuemask); + spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil, + state->valuemask & state->ref_value); + spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_masked_stencil); + } + break; + + case PIPE_FUNC_GREATER: + if (state->valuemask == stencil_max_value) { + /* stencil_pass = fragment_mask & (reference > s) */ + /* There's no convenient Compare Less Than Immediate instruction, so + * we'll have to do this one the harder way, by loading a register and + * comparing directly. Compare Logical Greater Than Word (clgt) + * treats its operands as unsigned - no sign extension. 
+ */ + int tmp_reg = spe_allocate_available_register(f); + spe_load_uint(f, tmp_reg, state->ref_value); + spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg); + spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_reg); + } + else { + /* stencil_pass = fragment_mask & ((reference&mask) > (s&mask)) */ + int tmp_reg = spe_allocate_available_register(f); + int tmp_masked_stencil = spe_allocate_available_register(f); + spe_load_uint(f, tmp_reg, state->valuemask & state->ref_value); + spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->valuemask); + spe_clgt(f, stencil_pass_reg, tmp_reg, tmp_masked_stencil); + spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_reg); + spe_release_register(f, tmp_masked_stencil); + } + break; + + case PIPE_FUNC_GEQUAL: + if (state->valuemask == stencil_max_value) { + /* stencil_pass = fragment_mask & (reference >= s) + * = fragment_mask & ~(s > reference) */ + spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, + state->ref_value); + spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + } + else { + /* stencil_pass = fragment_mask & ~((s&mask) > (reference&mask)) */ + int tmp_masked_stencil = spe_allocate_available_register(f); + spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->valuemask); + spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil, + state->valuemask & state->ref_value); + spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_masked_stencil); + } + break; + + case PIPE_FUNC_LEQUAL: + if (state->valuemask == stencil_max_value) { + /* stencil_pass = fragment_mask & (reference <= s) ] + * = fragment_mask & ~(reference > s) */ + /* As above, we have to do this by loading a register */ + int tmp_reg = spe_allocate_available_register(f); + spe_load_uint(f, tmp_reg, state->ref_value); + spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg); + spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_reg); + } + else { + /* stencil_pass = fragment_mask & ~((reference&mask) > (s&mask)) */ + int tmp_reg = spe_allocate_available_register(f); + int tmp_masked_stencil = spe_allocate_available_register(f); + spe_load_uint(f, tmp_reg, state->ref_value & state->valuemask); + spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->valuemask); + spe_clgt(f, stencil_pass_reg, tmp_reg, tmp_masked_stencil); + spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_reg); + spe_release_register(f, tmp_masked_stencil); + } + break; + + case PIPE_FUNC_NEVER: + /* stencil_pass = fragment_mask & 0 = 0 */ + spe_load_uint(f, stencil_pass_reg, 0); + break; + + case PIPE_FUNC_ALWAYS: + /* stencil_pass = fragment_mask & 1 = fragment_mask */ + spe_move(f, stencil_pass_reg, fragment_mask_reg); + break; + } + + /* The fragments that passed the stencil test are now in stencil_pass_reg. + * The fragments that failed would be (fragment_mask_reg & ~stencil_pass_reg). + */ +} + + +/** + * This function generates code that calculates a set of new stencil values + * given the earlier values and the operation to apply. It does not + * apply any tests. It is intended to be called up to 3 times + * (for the stencil fail operation, for the stencil pass-z fail operation, + * and for the stencil pass-z pass operation) to collect up to three + * possible sets of values, and for the caller to combine them based + * on the result of the tests. 
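
All of the comparison functions above reduce to the same shape: mask both the stored stencil and the reference with valuemask, compare, then AND with the live-fragment mask. A per-fragment scalar sketch (assumes the PIPE_FUNC_* values from pipe/p_defines.h; the generated code of course does this for four fragments at once):

#include <stdint.h>
#include "pipe/p_defines.h"

/* Does one fragment pass the stencil test?  The test is "reference FUNC stored". */
static int stencil_test_passes(unsigned func, uint32_t s, uint32_t ref, uint32_t valuemask)
{
   const uint32_t sm = s & valuemask, rm = ref & valuemask;
   switch (func) {
   case PIPE_FUNC_NEVER:    return 0;
   case PIPE_FUNC_LESS:     return rm < sm;
   case PIPE_FUNC_EQUAL:    return rm == sm;
   case PIPE_FUNC_LEQUAL:   return rm <= sm;
   case PIPE_FUNC_GREATER:  return rm > sm;
   case PIPE_FUNC_NOTEQUAL: return rm != sm;
   case PIPE_FUNC_GEQUAL:   return rm >= sm;
   case PIPE_FUNC_ALWAYS:   return 1;
   default:                 return 0;
   }
}
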
+ * + * stencil_max_value should be (2^n - 1) where n is the number of bits + * in the stencil buffer - in other words, it should be usable as a mask. + */ +static void +gen_stencil_values(struct spe_function *f, + uint stencil_op, + uint stencil_ref_value, + uint stencil_max_value, + int fbS_reg, + int newS_reg) +{ + /* The code below assumes that newS_reg and fbS_reg are not the same + * register; if they can be, the calculations below will have to use + * an additional temporary register. For now, mark the assumption + * with an assertion that will fail if they are the same. + */ + ASSERT(fbS_reg != newS_reg); + + /* The code also assumes the the stencil_max_value is of the form + * 2^n-1 and can therefore be used as a mask for the valid bits in + * addition to a maximum. Make sure this is the case as well. + * The clever math below exploits the fact that incrementing a + * binary number serves to flip all the bits of a number starting at + * the LSB and continuing to (and including) the first zero bit + * found. That means that a number and its increment will always + * have at least one bit in common (the high order bit, if nothing + * else) *unless* the number is zero, *or* the number is of a form + * consisting of some number of 1s in the low-order bits followed + * by nothing but 0s in the high-order bits. The latter case + * implies it's of the form 2^n-1. + */ + ASSERT(stencil_max_value > 0 && ((stencil_max_value + 1) & stencil_max_value) == 0); + + switch(stencil_op) { + case PIPE_STENCIL_OP_KEEP: + /* newS = S */ + spe_move(f, newS_reg, fbS_reg); + break; + + case PIPE_STENCIL_OP_ZERO: + /* newS = 0 */ + spe_zero(f, newS_reg); + break; + + case PIPE_STENCIL_OP_REPLACE: + /* newS = stencil reference value */ + spe_load_uint(f, newS_reg, stencil_ref_value); + break; + + case PIPE_STENCIL_OP_INCR: { + /* newS = (s == max ? max : s + 1) */ + int equals_reg = spe_allocate_available_register(f); + + spe_compare_equal_uint(f, equals_reg, fbS_reg, stencil_max_value); + /* Add Word Immediate computes rT = rA + 10-bit signed immediate */ + spe_ai(f, newS_reg, fbS_reg, 1); + /* Select from the current value or the new value based on the equality test */ + spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg); + + spe_release_register(f, equals_reg); + break; + } + case PIPE_STENCIL_OP_DECR: { + /* newS = (s == 0 ? 0 : s - 1) */ + int equals_reg = spe_allocate_available_register(f); + + spe_compare_equal_uint(f, equals_reg, fbS_reg, 0); + /* Add Word Immediate with a (-1) value works */ + spe_ai(f, newS_reg, fbS_reg, -1); + /* Select from the current value or the new value based on the equality test */ + spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg); + + spe_release_register(f, equals_reg); + break; + } + case PIPE_STENCIL_OP_INCR_WRAP: + /* newS = (s == max ? 0 : s + 1), but since max is 2^n-1, we can + * do a normal add and mask off the correct bits + */ + spe_ai(f, newS_reg, fbS_reg, 1); + spe_and_uint(f, newS_reg, newS_reg, stencil_max_value); + break; + + case PIPE_STENCIL_OP_DECR_WRAP: + /* newS = (s == 0 ? max : s - 1), but we'll pull the same mask trick as above */ + spe_ai(f, newS_reg, fbS_reg, -1); + spe_and_uint(f, newS_reg, newS_reg, stencil_max_value); + break; + + case PIPE_STENCIL_OP_INVERT: + /* newS = ~s. We take advantage of the mask/max value to invert only + * the valid bits for the field so we don't have to do an extra "and". 
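
The clamping INCR/DECR cases compute both candidates and select on an equality test, while the *_WRAP cases exploit stencil_max_value being 2^n-1, so a plain add followed by a mask wraps correctly. A scalar sketch, including the same power-of-two-minus-one check as the assertion above (helper names illustrative):

#include <assert.h>
#include <stdint.h>

static uint32_t stencil_incr_wrap(uint32_t s, uint32_t max /* must be 2^n - 1 */)
{
   assert(max > 0 && ((max + 1) & max) == 0);   /* e.g. 0xff for an 8-bit buffer */
   return (s + 1) & max;                        /* 255 + 1 wraps to 0 */
}

static uint32_t stencil_incr_clamp(uint32_t s, uint32_t max)
{
   return (s == max) ? max : s + 1;             /* the SPU code does this with ai + selb */
}
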
+ */ + spe_xor_uint(f, newS_reg, fbS_reg, stencil_max_value); + break; + + default: + ASSERT(0); + } +} + + +/** + * This function generates code to get all the necessary possible + * stencil values. For each of the output registers (fail_reg, + * zfail_reg, and zpass_reg), it either allocates a new register + * and calculates a new set of values based on the stencil operation, + * or it reuses a register allocation and calculation done for an + * earlier (matching) operation, or it reuses the fbS_reg register + * (if the stencil operation is KEEP, which doesn't change the + * stencil buffer). + * + * Since this function allocates a variable number of registers, + * to avoid incurring complex logic to free them, they should + * be allocated after a spe_allocate_register_set() call + * and released by the corresponding spe_release_register_set() call. + */ +static void +gen_get_stencil_values(struct spe_function *f, + const struct pipe_stencil_state *stencil, + const uint depth_enabled, + int fbS_reg, + int *fail_reg, + int *zfail_reg, + int *zpass_reg) +{ + uint zfail_op; + + /* Stenciling had better be enabled here */ + ASSERT(stencil->enabled); + + /* If the depth test is not enabled, it is treated as though it always + * passes, which means that the zfail_op is not considered - a + * failing stencil test triggers the fail_op, and a passing one + * triggers the zpass_op + * + * As an optimization, override calculation of the zfail_op values + * if they aren't going to be used. By setting the value of + * the operation to PIPE_STENCIL_OP_KEEP, its value will be assumed + * to match the incoming stencil values, and no calculation will + * be done. + */ + if (depth_enabled) { + zfail_op = stencil->zfail_op; + } + else { + zfail_op = PIPE_STENCIL_OP_KEEP; + } + + /* One-sided or front-facing stencil */ + if (stencil->fail_op == PIPE_STENCIL_OP_KEEP) { + *fail_reg = fbS_reg; + } + else { + *fail_reg = spe_allocate_available_register(f); + gen_stencil_values(f, stencil->fail_op, stencil->ref_value, + 0xff, fbS_reg, *fail_reg); + } + + /* Check the possibly overridden value, not the structure value */ + if (zfail_op == PIPE_STENCIL_OP_KEEP) { + *zfail_reg = fbS_reg; + } + else if (zfail_op == stencil->fail_op) { + *zfail_reg = *fail_reg; + } + else { + *zfail_reg = spe_allocate_available_register(f); + gen_stencil_values(f, stencil->zfail_op, stencil->ref_value, + 0xff, fbS_reg, *zfail_reg); + } + + if (stencil->zpass_op == PIPE_STENCIL_OP_KEEP) { + *zpass_reg = fbS_reg; + } + else if (stencil->zpass_op == stencil->fail_op) { + *zpass_reg = *fail_reg; + } + else if (stencil->zpass_op == zfail_op) { + *zpass_reg = *zfail_reg; + } + else { + *zpass_reg = spe_allocate_available_register(f); + gen_stencil_values(f, stencil->zpass_op, stencil->ref_value, + 0xff, fbS_reg, *zpass_reg); + } +} + +/** + * Note that fbZ_reg may *not* be set on entry, if in fact + * the depth test is not enabled. This function must not use + * the register if depth is not enabled. + */ +static boolean +gen_stencil_depth_test(struct spe_function *f, + const struct pipe_depth_stencil_alpha_state *dsa, + const uint facing, + const int mask_reg, const int fragZ_reg, + const int fbZ_reg, const int fbS_reg) +{ + /* True if we've generated code that could require writeback to the + * depth and/or stencil buffers + */ + boolean modified_buffers = FALSE; + + boolean need_to_calculate_stencil_values; + boolean need_to_writemask_stencil_values; + + struct pipe_stencil_state *stencil; + + /* Registers. 
We may or may not actually allocate these, depending + * on whether the state values indicate that we need them. + */ + int stencil_pass_reg, stencil_fail_reg; + int stencil_fail_values, stencil_pass_depth_fail_values, stencil_pass_depth_pass_values; + int stencil_writemask_reg; + int zmask_reg; + int newS_reg; + + /* Stenciling is quite complex: up to six different configurable stencil + * operations/calculations can be required (three each for front-facing + * and back-facing fragments). Many of those operations will likely + * be identical, so there's good reason to try to avoid calculating + * the same values more than once (which unfortunately makes the code less + * straightforward). + * + * To make register management easier, we start a new + * register set; we can release all the registers in the set at + * once, and avoid having to keep track of exactly which registers + * we allocate. We can still allocate and free registers as + * desired (if we know we no longer need a register), but we don't + * have to spend the complexity to track the more difficult variant + * register usage scenarios. + */ + spe_comment(f, 0, "Allocating stencil register set"); + spe_allocate_register_set(f); + + /* The facing we're given is the fragment facing; it doesn't + * exactly match the stencil facing. If stencil is enabled, + * but two-sided stencil is *not* enabled, we use the same + * stencil settings for both front- and back-facing fragments. + * We only use the "back-facing" stencil for backfacing fragments + * if two-sided stenciling is enabled. + */ + if (facing == CELL_FACING_BACK && dsa->stencil[1].enabled) { + stencil = &dsa->stencil[1]; + } + else { + stencil = &dsa->stencil[0]; + } + + /* Calculate the writemask. If the writemask is trivial (either + * all 0s, meaning that we don't need to calculate any stencil values + * because they're not going to change the stencil anyway, or all 1s, + * meaning that we have to calculate the stencil values but do not + * need to mask them), we can avoid generating code. Don't forget + * that we need to consider backfacing stencil, if enabled. + * + * Note that if the backface stencil is *not* enabled, the backface + * stencil will have the same values as the frontface stencil. + */ + if (stencil->fail_op == PIPE_STENCIL_OP_KEEP && + stencil->zfail_op == PIPE_STENCIL_OP_KEEP && + stencil->zpass_op == PIPE_STENCIL_OP_KEEP) { + need_to_calculate_stencil_values = FALSE; + need_to_writemask_stencil_values = FALSE; + } + else if (stencil->writemask == 0x0) { + /* All changes are writemasked out, so no need to calculate + * what those changes might be, and no need to write anything back. + */ + need_to_calculate_stencil_values = FALSE; + need_to_writemask_stencil_values = FALSE; + } + else if (stencil->writemask == 0xff) { + /* Still trivial, but a little less so. We need to write the stencil + * values, but we don't need to mask them. + */ + need_to_calculate_stencil_values = TRUE; + need_to_writemask_stencil_values = FALSE; + } + else { + /* The general case: calculate, mask, and write */ + need_to_calculate_stencil_values = TRUE; + need_to_writemask_stencil_values = TRUE; + + /* While we're here, generate code that calculates what the + * writemask should be. If backface stenciling is enabled, + * and the backface writemask is not the same as the frontface + * writemask, we'll have to generate code that merges the + * two masks into a single effective mask based on fragment facing. 
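
The writemask analysis above boils down to a small decision table; a compact restatement of it (the struct and helper are hypothetical, used only to make the three cases explicit; assumes PIPE_STENCIL_OP_KEEP from pipe/p_defines.h):

#include "pipe/p_defines.h"

struct stencil_plan { unsigned calc_values:1, apply_writemask:1; };

static struct stencil_plan plan_stencil(unsigned fail_op, unsigned zfail_op,
                                        unsigned zpass_op, unsigned writemask)
{
   struct stencil_plan p = { 1, 1 };            /* general case: calculate and mask */
   if ((fail_op == PIPE_STENCIL_OP_KEEP &&
        zfail_op == PIPE_STENCIL_OP_KEEP &&
        zpass_op == PIPE_STENCIL_OP_KEEP) || writemask == 0x0) {
      p.calc_values = 0;                        /* nothing can change the buffer */
      p.apply_writemask = 0;
   }
   else if (writemask == 0xff) {
      p.apply_writemask = 0;                    /* write all bits, no masking needed */
   }
   return p;
}
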
+ */ + spe_comment(f, 0, "Computing stencil writemask"); + stencil_writemask_reg = spe_allocate_available_register(f); + spe_load_uint(f, stencil_writemask_reg, dsa->stencil[facing].writemask); + } + + /* At least one-sided stenciling must be on. Generate code that + * runs the stencil test on the basic/front-facing stencil, leaving + * the mask of passing stencil bits in stencil_pass_reg. This mask will + * be used both to mask the set of active pixels, and also to + * determine how the stencil buffer changes. + * + * This test will *not* change the value in mask_reg (because we don't + * yet know whether to apply the two-sided stencil or one-sided stencil). + */ + spe_comment(f, 0, "Running basic stencil test"); + stencil_pass_reg = spe_allocate_available_register(f); + gen_stencil_test(f, stencil, 0xff, mask_reg, fbS_reg, stencil_pass_reg); + + /* Generate code that, given the mask of valid fragments and the + * mask of valid fragments that passed the stencil test, computes + * the mask of valid fragments that failed the stencil test. We + * have to do this before we run a depth test (because the + * depth test should not be performed on fragments that failed the + * stencil test, and because the depth test will update the + * mask of valid fragments based on the results of the depth test). + */ + spe_comment(f, 0, "Computing stencil fail mask and updating fragment mask"); + stencil_fail_reg = spe_allocate_available_register(f); + spe_andc(f, stencil_fail_reg, mask_reg, stencil_pass_reg); + /* Now remove the stenciled-out pixels from the valid fragment mask, + * so we can later use the valid fragment mask in the depth test. + */ + spe_and(f, mask_reg, mask_reg, stencil_pass_reg); + + /* We may not need to calculate stencil values, if the writemask is off */ + if (need_to_calculate_stencil_values) { + /* Generate code that calculates exactly which stencil values we need, + * without calculating the same value twice (say, if two different + * stencil ops have the same value). This code will work for one-sided + * and two-sided stenciling (so that we take into account that operations + * may match between front and back stencils), and will also take into + * account whether the depth test is enabled (if the depth test is off, + * we don't need any of the zfail results, because the depth test always + * is considered to pass if it is disabled). Any register value that + * does not need to be calculated will come back with the same value + * that's in fbS_reg. + * + * This function will allocate a variant number of registers that + * will be released as part of the register set. + */ + spe_comment(f, 0, facing == CELL_FACING_FRONT + ? "Computing front-facing stencil values" + : "Computing back-facing stencil values"); + gen_get_stencil_values(f, stencil, dsa->depth.enabled, fbS_reg, + &stencil_fail_values, &stencil_pass_depth_fail_values, + &stencil_pass_depth_pass_values); + } + + /* We now have all the stencil values we need. We also need + * the results of the depth test to figure out which + * stencil values will become the new stencil values. (Even if + * we aren't actually calculating stencil values, we need to apply + * the depth test if it's enabled.) + * + * The code generated by gen_depth_test() returns the results of the + * test in the given register, but also alters the mask_reg based + * on the results of the test. 
+ */ + if (dsa->depth.enabled) { + spe_comment(f, 0, "Running stencil depth test"); + zmask_reg = spe_allocate_available_register(f); + modified_buffers |= gen_depth_test(f, dsa, mask_reg, fragZ_reg, + fbZ_reg, zmask_reg); + } + + if (need_to_calculate_stencil_values) { + + /* If we need to writemask the stencil values before going into + * the stencil buffer, we'll have to use a new register to + * hold the new values. If not, we can just keep using the + * current register. + */ + if (need_to_writemask_stencil_values) { + newS_reg = spe_allocate_available_register(f); + spe_comment(f, 0, "Saving current stencil values for writemasking"); + spe_move(f, newS_reg, fbS_reg); + } + else { + newS_reg = fbS_reg; + } + + /* Merge in the selected stencil fail values */ + if (stencil_fail_values != fbS_reg) { + spe_comment(f, 0, "Loading stencil fail values"); + spe_selb(f, newS_reg, newS_reg, stencil_fail_values, stencil_fail_reg); + modified_buffers = TRUE; + } + + /* Same for the stencil pass/depth fail values. If this calculation + * is not needed (say, if depth test is off), then the + * stencil_pass_depth_fail_values register will be equal to fbS_reg + * and we'll skip the calculation. + */ + if (stencil_pass_depth_fail_values != fbS_reg) { + /* We don't actually have a stencil pass/depth fail mask yet. + * Calculate it here from the stencil passing mask and the + * depth passing mask. Note that zmask_reg *must* have been + * set above if we're here. + */ + uint stencil_pass_depth_fail_mask = + spe_allocate_available_register(f); + + spe_comment(f, 0, "Loading stencil pass/depth fail values"); + spe_andc(f, stencil_pass_depth_fail_mask, stencil_pass_reg, zmask_reg); + + spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_fail_values, + stencil_pass_depth_fail_mask); + + spe_release_register(f, stencil_pass_depth_fail_mask); + modified_buffers = TRUE; + } + + /* Same for the stencil pass/depth pass mask. Note that we + * *can* get here with zmask_reg being unset (if the depth + * test is off but the stencil test is on). In this case, + * we assume the depth test passes, and don't need to mask + * the stencil pass mask with the Z mask. + */ + if (stencil_pass_depth_pass_values != fbS_reg) { + if (dsa->depth.enabled) { + uint stencil_pass_depth_pass_mask = spe_allocate_available_register(f); + /* We'll need a separate register */ + spe_comment(f, 0, "Loading stencil pass/depth pass values"); + spe_and(f, stencil_pass_depth_pass_mask, stencil_pass_reg, zmask_reg); + spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_depth_pass_mask); + spe_release_register(f, stencil_pass_depth_pass_mask); + } + else { + /* We can use the same stencil-pass register */ + spe_comment(f, 0, "Loading stencil pass values"); + spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_reg); + } + modified_buffers = TRUE; + } + + /* Almost done. If we need to writemask, do it now, leaving the + * results in the fbS_reg register passed in. If we don't need + * to writemask, then the results are *already* in the fbS_reg, + * so there's nothing more to do. + */ + + if (need_to_writemask_stencil_values && modified_buffers) { + /* The Select Bytes command makes a fine writemask. Where + * the mask is 0, the first (original) values are retained, + * effectively masking out changes. Where the mask is 1, the + * second (new) values are retained, incorporating changes. 
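
Conceptually, the three candidate stencil value sets are folded into the new stencil word with selects keyed on the fail, pass/z-fail, and pass/z-pass masks. A per-fragment scalar model of that merge (function names are hypothetical):

#include <stdint.h>

static uint32_t sel(uint32_t keep, uint32_t take, uint32_t mask)
{
   return (keep & ~mask) | (take & mask);   /* scalar stand-in for spe_selb() */
}

static uint32_t merge_stencil(uint32_t fbS,
                              uint32_t fail_vals, uint32_t zfail_vals, uint32_t zpass_vals,
                              uint32_t fail_mask, uint32_t zfail_mask, uint32_t zpass_mask)
{
   uint32_t newS = fbS;
   newS = sel(newS, fail_vals, fail_mask);    /* failed the stencil test      */
   newS = sel(newS, zfail_vals, zfail_mask);  /* passed stencil, failed depth */
   newS = sel(newS, zpass_vals, zpass_mask);  /* passed both                  */
   return newS;
}
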
+ */ + spe_comment(f, 0, "Writemasking new stencil values"); + spe_selb(f, fbS_reg, fbS_reg, newS_reg, stencil_writemask_reg); + } + + } /* done calculating stencil values */ + + /* The stencil and/or depth values have been applied, and the + * mask_reg, fbS_reg, and fbZ_reg values have been updated. + * We're all done, except that we've allocated a fair number + * of registers that we didn't bother tracking. Release all + * those registers as part of the register set, and go home. + */ + spe_comment(f, 0, "Releasing stencil register set"); + spe_release_register_set(f); + + /* Return TRUE if we could have modified the stencil and/or + * depth buffers. + */ + return modified_buffers; +} + + +/** + * Generate depth and/or stencil test code. + * \param cell context + * \param dsa depth/stencil/alpha state + * \param f spe function to emit + * \param facing either CELL_FACING_FRONT or CELL_FACING_BACK + * \param mask_reg register containing the pixel alive/dead mask + * \param depth_tile_reg register containing address of z/stencil tile + * \param quad_offset_reg offset to quad from start of tile + * \param fragZ_reg register containg fragment Z values + */ +static void +gen_depth_stencil(struct cell_context *cell, + const struct pipe_depth_stencil_alpha_state *dsa, + struct spe_function *f, + uint facing, + int mask_reg, + int depth_tile_reg, + int quad_offset_reg, + int fragZ_reg) + +{ + const enum pipe_format zs_format = cell->framebuffer.zsbuf->format; + boolean write_depth_stencil; + + /* framebuffer's combined z/stencil values register */ + int fbZS_reg = spe_allocate_available_register(f); + + /* Framebufer Z values register */ + int fbZ_reg = spe_allocate_available_register(f); + + /* Framebuffer stencil values register (may not be used) */ + int fbS_reg = spe_allocate_available_register(f); + + /* 24-bit mask register (may not be used) */ + int zmask_reg = spe_allocate_available_register(f); + + /** + * The following code: + * 1. fetch quad of packed Z/S values from the framebuffer tile. + * 2. extract the separate the Z and S values from packed values + * 3. convert fragment Z values from float in [0,1] to 32/24/16-bit ints + * + * The instructions for doing this are interleaved for better performance. 
+ */ + spe_comment(f, 0, "Fetch Z/stencil quad from tile"); + + switch(zs_format) { + case PIPE_FORMAT_S8Z24_UNORM: /* fall through */ + case PIPE_FORMAT_X8Z24_UNORM: + /* prepare mask to extract Z vals from ZS vals */ + spe_load_uint(f, zmask_reg, 0x00ffffff); + + /* convert fragment Z from [0,1] to 32-bit ints */ + spe_cfltu(f, fragZ_reg, fragZ_reg, 32); + + /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */ + spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg); + + /* right shift 32-bit fragment Z to 24 bits */ + spe_rotmi(f, fragZ_reg, fragZ_reg, -8); + + /* extract 24-bit Z values from ZS values by masking */ + spe_and(f, fbZ_reg, fbZS_reg, zmask_reg); + + /* extract 8-bit stencil values by shifting */ + spe_rotmi(f, fbS_reg, fbZS_reg, -24); + break; + + case PIPE_FORMAT_Z24S8_UNORM: /* fall through */ + case PIPE_FORMAT_Z24X8_UNORM: + /* convert fragment Z from [0,1] to 32-bit ints */ + spe_cfltu(f, fragZ_reg, fragZ_reg, 32); + + /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */ + spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg); + + /* right shift 32-bit fragment Z to 24 bits */ + spe_rotmi(f, fragZ_reg, fragZ_reg, -8); + + /* extract 24-bit Z values from ZS values by shifting */ + spe_rotmi(f, fbZ_reg, fbZS_reg, -8); + + /* extract 8-bit stencil values by masking */ + spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff); + break; + + case PIPE_FORMAT_Z32_UNORM: + /* Load: fbZ_reg = memory[depth_tile_reg + offset_reg] */ + spe_lqx(f, fbZ_reg, depth_tile_reg, quad_offset_reg); + + /* convert fragment Z from [0,1] to 32-bit ints */ + spe_cfltu(f, fragZ_reg, fragZ_reg, 32); + + /* No stencil, so can't do anything there */ + break; + + case PIPE_FORMAT_Z16_UNORM: + /* XXX This code for 16bpp Z is broken! */ + + /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */ + spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg); + + /* Copy over 4 32-bit values */ + spe_move(f, fbZ_reg, fbZS_reg); + + /* convert Z from [0,1] to 16-bit ints */ + spe_cfltu(f, fragZ_reg, fragZ_reg, 32); + spe_rotmi(f, fragZ_reg, fragZ_reg, -16); + /* No stencil */ + break; + + default: + ASSERT(0); /* invalid format */ + } + + /* If stencil is enabled, use the stencil-specific code + * generator to generate both the stencil and depth (if needed) + * tests. Otherwise, if only depth is enabled, generate + * a quick depth test. The test generators themselves will + * report back whether the depth/stencil buffer has to be + * written back. + */ + if (dsa->stencil[0].enabled) { + /* This will perform the stencil and depth tests, and update + * the mask_reg, fbZ_reg, and fbS_reg as required by the + * tests. + */ + ASSERT(fbS_reg >= 0); + spe_comment(f, 0, "Perform stencil test"); + + /* Note that fbZ_reg may not be set on entry, if stenciling + * is enabled but there's no Z-buffer. The + * gen_stencil_depth_test() function must ignore the + * fbZ_reg register if depth is not enabled. + */ + write_depth_stencil = gen_stencil_depth_test(f, dsa, facing, + mask_reg, fragZ_reg, + fbZ_reg, fbS_reg); + } + else if (dsa->depth.enabled) { + int zmask_reg = spe_allocate_available_register(f); + ASSERT(fbZ_reg >= 0); + spe_comment(f, 0, "Perform depth test"); + write_depth_stencil = gen_depth_test(f, dsa, mask_reg, fragZ_reg, + fbZ_reg, zmask_reg); + spe_release_register(f, zmask_reg); + } + else { + write_depth_stencil = FALSE; + } + + if (write_depth_stencil) { + /* Merge latest Z and Stencil values into fbZS_reg. + * fbZ_reg has four Z vals in bits [23..0] or bits [15..0]. 
+ * fbS_reg has four 8-bit Z values in bits [7..0]. + */ + spe_comment(f, 0, "Store quad's depth/stencil values in tile"); + if (zs_format == PIPE_FORMAT_S8Z24_UNORM || + zs_format == PIPE_FORMAT_X8Z24_UNORM) { + spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */ + spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */ + } + else if (zs_format == PIPE_FORMAT_Z24S8_UNORM || + zs_format == PIPE_FORMAT_Z24X8_UNORM) { + spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */ + spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */ + } + else if (zs_format == PIPE_FORMAT_Z32_UNORM) { + spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */ + } + else if (zs_format == PIPE_FORMAT_Z16_UNORM) { + spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */ + } + else if (zs_format == PIPE_FORMAT_S8_UNORM) { + ASSERT(0); /* XXX to do */ + } + else { + ASSERT(0); /* bad zs_format */ + } + + /* Store: memory[depth_tile_reg + quad_offset_reg] = fbZS */ + spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg); + } + + /* Don't need these any more */ + spe_release_register(f, fbZS_reg); + spe_release_register(f, fbZ_reg); + spe_release_register(f, fbS_reg); + spe_release_register(f, zmask_reg); +} + /** @@ -621,14 +2016,21 @@ gen_pack_colors(struct spe_function *f, * should be much faster. * * \param cell the rendering context (in) - * \param f the generated function (out) + * \param facing whether the generated code is for front-facing or + * back-facing fragments + * \param f the generated function (in/out); on input, the function + * must already have been initialized. On exit, whatever + * instructions within the generated function have had + * the fragment ops appended. */ void -cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) +cell_gen_fragment_function(struct cell_context *cell, + const uint facing, + struct spe_function *f) { - const struct pipe_depth_stencil_alpha_state *dsa = - &cell->depth_stencil->base; - const struct pipe_blend_state *blend = &cell->blend->base; + const struct pipe_depth_stencil_alpha_state *dsa = cell->depth_stencil; + const struct pipe_blend_state *blend = cell->blend; + const struct pipe_blend_color *blend_color = &cell->blend_color; const enum pipe_format color_format = cell->framebuffer.cbufs[0]->format; /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */ @@ -643,15 +2045,23 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) const int fragA_reg = 11; /* vector float */ const int mask_reg = 12; /* vector uint */ + ASSERT(facing == CELL_FACING_FRONT || facing == CELL_FACING_BACK); + /* offset of quad from start of tile * XXX assuming 4-byte pixels for color AND Z/stencil!!!! */ int quad_offset_reg; int fbRGBA_reg; /**< framebuffer's RGBA colors for quad */ - int fbZS_reg; /**< framebuffer's combined z/stencil values for quad */ - spe_init_func(f, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE); + if (cell->debug_flags & CELL_DEBUG_ASM) { + spe_print_code(f, TRUE); + spe_indent(f, 8); + spe_comment(f, -4, facing == CELL_FACING_FRONT + ? 
"Begin front-facing per-fragment ops" + : "Begin back-facing per-fragment ops"); + } + spe_allocate_register(f, x_reg); spe_allocate_register(f, y_reg); spe_allocate_register(f, color_tile_reg); @@ -665,7 +2075,6 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) quad_offset_reg = spe_allocate_available_register(f); fbRGBA_reg = spe_allocate_available_register(f); - fbZS_reg = spe_allocate_available_register(f); /* compute offset of quad from start of tile, in bytes */ { @@ -674,8 +2083,9 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) ASSERT(TILE_SIZE == 32); - spe_rotmi(f, x2_reg, x_reg, -1); /* x2 = x / 2 */ + spe_comment(f, 0, "Compute quad offset within tile"); spe_rotmi(f, y2_reg, y_reg, -1); /* y2 = y / 2 */ + spe_rotmi(f, x2_reg, x_reg, -1); /* x2 = x / 2 */ spe_shli(f, y2_reg, y2_reg, 4); /* y2 *= 16 */ spe_a(f, quad_offset_reg, y2_reg, x2_reg); /* offset = y2 + x2 */ spe_shli(f, quad_offset_reg, quad_offset_reg, 4); /* offset *= 16 */ @@ -684,139 +2094,33 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) spe_release_register(f, y2_reg); } - + /* Generate the alpha test, if needed. */ if (dsa->alpha.enabled) { gen_alpha_test(dsa, f, mask_reg, fragA_reg); } + /* generate depth and/or stencil test code */ if (dsa->depth.enabled || dsa->stencil[0].enabled) { - const enum pipe_format zs_format = cell->framebuffer.zsbuf->format; - boolean write_depth_stencil; - - int fbZ_reg = spe_allocate_available_register(f); /* Z values */ - int fbS_reg = spe_allocate_available_register(f); /* Stencil values */ - - /* fetch quad of depth/stencil values from tile at (x,y) */ - /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */ - spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg); - - if (dsa->depth.enabled) { - /* Extract Z bits from fbZS_reg into fbZ_reg */ - if (zs_format == PIPE_FORMAT_S8Z24_UNORM || - zs_format == PIPE_FORMAT_X8Z24_UNORM) { - int mask_reg = spe_allocate_available_register(f); - spe_fsmbi(f, mask_reg, 0x7777); /* mask[0,1,2,3] = 0x00ffffff */ - spe_and(f, fbZ_reg, fbZS_reg, mask_reg); /* fbZ = fbZS & mask */ - spe_release_register(f, mask_reg); - /* OK, fbZ_reg has four 24-bit Z values now */ - } - else { - /* XXX handle other z/stencil formats */ - ASSERT(0); - } - - /* Convert fragZ values from float[4] to uint[4] */ - if (zs_format == PIPE_FORMAT_S8Z24_UNORM || - zs_format == PIPE_FORMAT_X8Z24_UNORM || - zs_format == PIPE_FORMAT_Z24S8_UNORM || - zs_format == PIPE_FORMAT_Z24X8_UNORM) { - /* 24-bit Z values */ - int scale_reg = spe_allocate_available_register(f); - - /* scale_reg[0,1,2,3] = float(2^24-1) */ - spe_load_float(f, scale_reg, (float) 0xffffff); - - /* XXX these two instructions might be combined */ - spe_fm(f, fragZ_reg, fragZ_reg, scale_reg); /* fragZ *= scale */ - spe_cfltu(f, fragZ_reg, fragZ_reg, 0); /* fragZ = (int) fragZ */ - - spe_release_register(f, scale_reg); - } - else { - /* XXX handle 16-bit Z format */ - ASSERT(0); - } - } - - if (dsa->stencil[0].enabled) { - /* Extract Stencil bit sfrom fbZS_reg into fbS_reg */ - if (zs_format == PIPE_FORMAT_S8Z24_UNORM || - zs_format == PIPE_FORMAT_X8Z24_UNORM) { - /* XXX extract with a shift */ - ASSERT(0); - } - else if (zs_format == PIPE_FORMAT_Z24S8_UNORM || - zs_format == PIPE_FORMAT_Z24X8_UNORM) { - /* XXX extract with a mask */ - ASSERT(0); - } - } - - - if (dsa->stencil[0].enabled) { - /* XXX this may involve depth testing too */ - // gen_stencil_test(dsa, f, ... 
); - ASSERT(0); - } - else if (dsa->depth.enabled) { - int zmask_reg = spe_allocate_available_register(f); - gen_depth_test(dsa, f, mask_reg, fragZ_reg, fbZ_reg, zmask_reg); - spe_release_register(f, zmask_reg); - } - - /* do we need to write Z and/or Stencil back into framebuffer? */ - write_depth_stencil = (dsa->depth.writemask | - dsa->stencil[0].write_mask | - dsa->stencil[1].write_mask); - - if (write_depth_stencil) { - /* Merge latest Z and Stencil values into fbZS_reg. - * fbZ_reg has four Z vals in bits [23..0] or bits [15..0]. - * fbS_reg has four 8-bit Z values in bits [7..0]. - */ - if (zs_format == PIPE_FORMAT_S8Z24_UNORM || - zs_format == PIPE_FORMAT_X8Z24_UNORM) { - spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */ - spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */ - } - else if (zs_format == PIPE_FORMAT_S8Z24_UNORM || - zs_format == PIPE_FORMAT_X8Z24_UNORM) { - /* XXX to do */ - ASSERT(0); - } - else if (zs_format == PIPE_FORMAT_Z16_UNORM) { - /* XXX to do */ - ASSERT(0); - } - else if (zs_format == PIPE_FORMAT_S8_UNORM) { - /* XXX to do */ - ASSERT(0); - } - else { - /* bad zs_format */ - ASSERT(0); - } - - /* Store: memory[depth_tile_reg + quad_offset_reg] = fbZS */ - spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg); - } - - spe_release_register(f, fbZ_reg); - spe_release_register(f, fbS_reg); + gen_depth_stencil(cell, dsa, f, + facing, + mask_reg, + depth_tile_reg, + quad_offset_reg, + fragZ_reg); } - /* Get framebuffer quad/colors. We'll need these for blending, * color masking, and to obey the quad/pixel mask. * Load: fbRGBA_reg = memory[color_tile + quad_offset] * Note: if mask={~0,~0,~0,~0} and we're not blending or colormasking * we could skip this load. */ + spe_comment(f, 0, "Fetch quad colors from tile"); spe_lqx(f, fbRGBA_reg, color_tile_reg, quad_offset_reg); - if (blend->blend_enable) { - gen_blend(blend, f, color_format, + spe_comment(f, 0, "Perform blending"); + gen_blend(blend, blend_color, f, color_format, fragR_reg, fragG_reg, fragB_reg, fragA_reg, fbRGBA_reg); } @@ -829,19 +2133,21 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) int rgba_reg = spe_allocate_available_register(f); /* Pack four float colors as four 32-bit int colors */ + spe_comment(f, 0, "Convert float quad colors to packed int framebuffer colors"); gen_pack_colors(f, color_format, fragR_reg, fragG_reg, fragB_reg, fragA_reg, rgba_reg); if (blend->logicop_enable) { + spe_comment(f, 0, "Compute logic op"); gen_logicop(blend, f, rgba_reg, fbRGBA_reg); } - if (blend->colormask != 0xf) { - gen_colormask(blend->colormask, f, rgba_reg, fbRGBA_reg); + if (blend->colormask != PIPE_MASK_RGBA) { + spe_comment(f, 0, "Compute color mask"); + gen_colormask(f, blend->colormask, color_format, rgba_reg, fbRGBA_reg); } - /* Mix fragment colors with framebuffer colors using the quad/pixel mask: * if (mask[i]) * rgba[i] = rgba[i]; @@ -853,6 +2159,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) /* Store updated quad in tile: * memory[color_tile + quad_offset] = rgba_reg; */ + spe_comment(f, 0, "Store quad colors into color tile"); spe_stqx(f, rgba_reg, color_tile_reg, quad_offset_reg); spe_release_register(f, rgba_reg); @@ -862,9 +2169,13 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) spe_bi(f, SPE_REG_RA, 0, 0); /* return from function call */ - spe_release_register(f, fbRGBA_reg); - spe_release_register(f, fbZS_reg); spe_release_register(f, quad_offset_reg); -} + if 
(cell->debug_flags & CELL_DEBUG_ASM) { + char buffer[1024]; + sprintf(buffer, "End %s-facing per-fragment ops: %d instructions", + facing == CELL_FACING_FRONT ? "front" : "back", f->num_inst); + spe_comment(f, -4, buffer); + } +} diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.h b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h index b59de198dc..21b35d1faf 100644 --- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.h +++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h @@ -31,7 +31,7 @@ extern void -cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f); +cell_gen_fragment_function(struct cell_context *cell, const uint facing, struct spe_function *f); #endif /* CELL_GEN_FRAGMENT_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.c b/src/gallium/drivers/cell/ppu/cell_pipe_state.c index 475c6ef0ce..facd9551fe 100644 --- a/src/gallium/drivers/cell/ppu/cell_pipe_state.c +++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.c @@ -35,9 +35,9 @@ #include "draw/draw_context.h" #include "cell_context.h" #include "cell_flush.h" +#include "cell_pipe_state.h" #include "cell_state.h" #include "cell_texture.h" -#include "cell_state_per_fragment.h" @@ -45,24 +45,18 @@ static void * cell_create_blend_state(struct pipe_context *pipe, const struct pipe_blend_state *blend) { - struct cell_blend_state *cb = MALLOC(sizeof(struct cell_blend_state)); - - (void) memcpy(cb, blend, sizeof(*blend)); -#if 0 - cell_generate_alpha_blend(cb); -#endif - return cb; + return mem_dup(blend, sizeof(*blend)); } static void -cell_bind_blend_state(struct pipe_context *pipe, void *state) +cell_bind_blend_state(struct pipe_context *pipe, void *blend) { struct cell_context *cell = cell_context(pipe); draw_flush(cell->draw); - cell->blend = (struct cell_blend_state *) state; + cell->blend = (struct pipe_blend_state *) blend; cell->dirty |= CELL_NEW_BLEND; } @@ -70,10 +64,7 @@ cell_bind_blend_state(struct pipe_context *pipe, void *state) static void cell_delete_blend_state(struct pipe_context *pipe, void *blend) { - struct cell_blend_state *cb = (struct cell_blend_state *) blend; - - spe_release_func(& cb->code); - FREE(cb); + FREE(blend); } @@ -95,41 +86,29 @@ cell_set_blend_color(struct pipe_context *pipe, static void * cell_create_depth_stencil_alpha_state(struct pipe_context *pipe, - const struct pipe_depth_stencil_alpha_state *depth_stencil) + const struct pipe_depth_stencil_alpha_state *dsa) { - struct cell_depth_stencil_alpha_state *cdsa = - MALLOC(sizeof(struct cell_depth_stencil_alpha_state)); - - (void) memcpy(cdsa, depth_stencil, sizeof(*depth_stencil)); -#if 0 - cell_generate_depth_stencil_test(cdsa); -#endif - return cdsa; + return mem_dup(dsa, sizeof(*dsa)); } static void cell_bind_depth_stencil_alpha_state(struct pipe_context *pipe, - void *depth_stencil) + void *dsa) { struct cell_context *cell = cell_context(pipe); draw_flush(cell->draw); - cell->depth_stencil = - (struct cell_depth_stencil_alpha_state *) depth_stencil; + cell->depth_stencil = (struct pipe_depth_stencil_alpha_state *) dsa; cell->dirty |= CELL_NEW_DEPTH_STENCIL; } static void -cell_delete_depth_stencil_alpha_state(struct pipe_context *pipe, void *depth) +cell_delete_depth_stencil_alpha_state(struct pipe_context *pipe, void *dsa) { - struct cell_depth_stencil_alpha_state *cdsa = - (struct cell_depth_stencil_alpha_state *) depth; - - spe_release_func(& cdsa->code); - FREE(cdsa); + FREE(dsa); } @@ -191,24 +170,23 @@ cell_set_polygon_stipple( struct pipe_context *pipe, static void * 
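
With the SPU code generation moved out of the CSO objects, the create hooks above reduce to duplicating the caller's template; mem_dup() behaves roughly like an allocate-and-copy. A sketch of the pattern (a model only, not the util implementation):

#include <stdlib.h>
#include <string.h>

/* Approximate model of mem_dup(): heap-copy a state template of the given size */
static void *mem_dup_model(const void *src, size_t size)
{
   void *p = malloc(size);
   if (p)
      memcpy(p, src, size);
   return p;
}
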
cell_create_rasterizer_state(struct pipe_context *pipe, - const struct pipe_rasterizer_state *setup) + const struct pipe_rasterizer_state *rasterizer) { - struct pipe_rasterizer_state *state - = MALLOC(sizeof(struct pipe_rasterizer_state)); - memcpy(state, setup, sizeof(struct pipe_rasterizer_state)); - return state; + return mem_dup(rasterizer, sizeof(*rasterizer)); } static void -cell_bind_rasterizer_state(struct pipe_context *pipe, void *setup) +cell_bind_rasterizer_state(struct pipe_context *pipe, void *rast) { + struct pipe_rasterizer_state *rasterizer = + (struct pipe_rasterizer_state *) rast; struct cell_context *cell = cell_context(pipe); /* pass-through to draw module */ - draw_set_rasterizer_state(cell->draw, setup); + draw_set_rasterizer_state(cell->draw, rasterizer); - cell->rasterizer = (struct pipe_rasterizer_state *)setup; + cell->rasterizer = rasterizer; cell->dirty |= CELL_NEW_RASTERIZER; } @@ -235,17 +213,24 @@ cell_bind_sampler_states(struct pipe_context *pipe, unsigned num, void **samplers) { struct cell_context *cell = cell_context(pipe); + uint i, changed = 0x0; assert(num <= CELL_MAX_SAMPLERS); draw_flush(cell->draw); - memcpy(cell->sampler, samplers, num * sizeof(void *)); - memset(&cell->sampler[num], 0, (CELL_MAX_SAMPLERS - num) * - sizeof(void *)); - cell->num_samplers = num; + for (i = 0; i < CELL_MAX_SAMPLERS; i++) { + struct pipe_sampler_state *new_samp = i < num ? samplers[i] : NULL; + if (cell->sampler[i] != new_samp) { + cell->sampler[i] = new_samp; + changed |= (1 << i); + } + } - cell->dirty |= CELL_NEW_SAMPLER; + if (changed) { + cell->dirty |= CELL_NEW_SAMPLER; + cell->dirty_samplers |= changed; + } } @@ -263,30 +248,101 @@ cell_set_sampler_textures(struct pipe_context *pipe, unsigned num, struct pipe_texture **texture) { struct cell_context *cell = cell_context(pipe); - uint i; + uint i, changed = 0x0; assert(num <= CELL_MAX_SAMPLERS); - /* Check for no-op */ - if (num == cell->num_textures && - !memcmp(cell->texture, texture, num * sizeof(struct pipe_texture *))) - return; - - draw_flush(cell->draw); - for (i = 0; i < CELL_MAX_SAMPLERS; i++) { - struct pipe_texture *tex = i < num ? texture[i] : NULL; + struct cell_texture *new_tex = cell_texture(i < num ? texture[i] : NULL); + struct cell_texture *old_tex = cell->texture[i]; + if (old_tex != new_tex) { + + pipe_texture_reference((struct pipe_texture **) &cell->texture[i], + (struct pipe_texture *) new_tex); - pipe_texture_reference((struct pipe_texture **) &cell->texture[i], tex); + changed |= (1 << i); + } } + cell->num_textures = num; - cell_update_texture_mapping(cell); + if (changed) { + cell->dirty |= CELL_NEW_TEXTURE; + cell->dirty_textures |= changed; + } +} + - cell->dirty |= CELL_NEW_TEXTURE; +/** + * Map color and z/stencil framebuffer surfaces. 
+ */ +static void +cell_map_surfaces(struct cell_context *cell) +{ + struct pipe_screen *screen = cell->pipe.screen; + uint i; + + for (i = 0; i < 1; i++) { + struct pipe_surface *ps = cell->framebuffer.cbufs[i]; + if (ps) { + cell->cbuf_transfer[i] = + screen->get_tex_transfer(screen, ps->texture, ps->face, + ps->level, ps->zslice, + PIPE_TRANSFER_READ_WRITE, + 0, 0, ps->width, ps->height); + + cell->cbuf_map[i] = + screen->transfer_map(screen, cell->cbuf_transfer[i]); + } + } + + { + struct pipe_surface *ps = cell->framebuffer.zsbuf; + if (ps) { + cell->zsbuf_transfer = + screen->get_tex_transfer(screen, ps->texture, ps->face, + ps->level, ps->zslice, + PIPE_TRANSFER_READ_WRITE, + 0, 0, ps->width, ps->height); + + cell->zsbuf_map = + screen->transfer_map(screen, cell->zsbuf_transfer); + } + } } +/** + * Unmap color and z/stencil framebuffer surfaces. + */ +static void +cell_unmap_surfaces(struct cell_context *cell) +{ + struct pipe_screen *screen = cell->pipe.screen; + uint i; + + for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) { + if (cell->cbuf_transfer[i] && cell->cbuf_map[i]) { + /* unmap color buffer/surface [i] */ + screen->transfer_unmap(screen, cell->cbuf_transfer[i]); + cell->cbuf_map[i] = NULL; + + /* get rid of transfer object [i] */ + screen->tex_transfer_release(screen, &cell->cbuf_transfer[i]); + assert(cell->cbuf_transfer[i] == NULL); + } + } + + if (cell->zsbuf_transfer && cell->zsbuf_map) { + screen->transfer_unmap(screen, cell->zsbuf_transfer); + cell->zsbuf_map = NULL; + + /* get rid of transfer object */ + screen->tex_transfer_release(screen, &cell->zsbuf_transfer); + assert(cell->zsbuf_transfer == NULL); + } +} + static void cell_set_framebuffer_state(struct pipe_context *pipe, @@ -295,24 +351,10 @@ cell_set_framebuffer_state(struct pipe_context *pipe, struct cell_context *cell = cell_context(pipe); if (1 /*memcmp(&cell->framebuffer, fb, sizeof(*fb))*/) { - struct pipe_surface *csurf = fb->cbufs[0]; - struct pipe_surface *zsurf = fb->zsbuf; uint i; - uint flags = (PIPE_BUFFER_USAGE_GPU_WRITE | - PIPE_BUFFER_USAGE_GPU_READ); /* unmap old surfaces */ - for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) { - if (cell->framebuffer.cbufs[i] && cell->cbuf_map[i]) { - pipe_surface_unmap(cell->framebuffer.cbufs[i]); - cell->cbuf_map[i] = NULL; - } - } - - if (cell->framebuffer.zsbuf && cell->zsbuf_map) { - pipe_surface_unmap(cell->framebuffer.zsbuf); - cell->zsbuf_map = NULL; - } + cell_unmap_surfaces(cell); /* Finish any pending rendering to the current surface before * installing a new surface! 
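The cell_map_surfaces()/cell_unmap_surfaces() pair added above follows the general Gallium transfer sequence: create a transfer object for the surface, map it, and later unmap it and release the transfer. The sketch below condenses that sequence for a single surface; the helper names map_one_surface/unmap_one_surface are illustrative and not part of the patch, but the screen callbacks are used exactly as in the code above.

static void *
map_one_surface(struct pipe_screen *screen, struct pipe_surface *ps,
                struct pipe_transfer **transfer)
{
   /* create a read/write transfer covering the whole surface */
   *transfer = screen->get_tex_transfer(screen, ps->texture, ps->face,
                                        ps->level, ps->zslice,
                                        PIPE_TRANSFER_READ_WRITE,
                                        0, 0, ps->width, ps->height);
   if (!*transfer)
      return NULL;
   /* map it for CPU access */
   return screen->transfer_map(screen, *transfer);
}

static void
unmap_one_surface(struct pipe_screen *screen, struct pipe_transfer **transfer)
{
   if (*transfer) {
      screen->transfer_unmap(screen, *transfer);
      /* releasing the transfer NULLs the pointer, as the asserts above expect */
      screen->tex_transfer_release(screen, transfer);
   }
}

Note that cell_map_surfaces() above currently maps only cbufs[0] (its loop runs for i < 1), while cell_unmap_surfaces() walks all PIPE_MAX_COLOR_BUFS entries.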
@@ -324,18 +366,14 @@ cell_set_framebuffer_state(struct pipe_context *pipe, */ cell->framebuffer.width = fb->width; cell->framebuffer.height = fb->height; - cell->framebuffer.num_cbufs = fb->num_cbufs; + cell->framebuffer.nr_cbufs = fb->nr_cbufs; for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) { pipe_surface_reference(&cell->framebuffer.cbufs[i], fb->cbufs[i]); } pipe_surface_reference(&cell->framebuffer.zsbuf, fb->zsbuf); /* map new surfaces */ - if (csurf) - cell->cbuf_map[0] = pipe_surface_map(csurf, flags); - - if (zsurf) - cell->zsbuf_map = pipe_surface_map(zsurf, flags); + cell_map_surfaces(cell); cell->dirty |= CELL_NEW_FRAMEBUFFER; } diff --git a/src/gallium/drivers/cell/ppu/cell_render.c b/src/gallium/drivers/cell/ppu/cell_render.c index dd25ae880e..79cb8df82f 100644 --- a/src/gallium/drivers/cell/ppu/cell_render.c +++ b/src/gallium/drivers/cell/ppu/cell_render.c @@ -152,6 +152,7 @@ cell_flush_prim_buffer(struct cell_context *cell) struct cell_command_render *render = &cell_global.command[i].render; render->prim_type = PIPE_PRIM_TRIANGLES; render->num_verts = cell->prim_buffer.num_verts; + render->front_winding = cell->rasterizer->front_winding; render->vertex_size = cell->vertex_info->size * 4; render->xmin = cell->prim_buffer.xmin; render->ymin = cell->prim_buffer.ymin; diff --git a/src/gallium/drivers/cell/ppu/cell_screen.c b/src/gallium/drivers/cell/ppu/cell_screen.c index 139b3719b6..512d85d352 100644 --- a/src/gallium/drivers/cell/ppu/cell_screen.c +++ b/src/gallium/drivers/cell/ppu/cell_screen.c @@ -27,7 +27,8 @@ #include "util/u_memory.h" -#include "pipe/p_winsys.h" +#include "util/u_simple_screen.h" +#include "pipe/internal/p_winsys_screen.h" #include "pipe/p_defines.h" #include "pipe/p_screen.h" @@ -58,9 +59,9 @@ cell_get_param(struct pipe_screen *screen, int param) case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS: return CELL_MAX_SAMPLERS; case PIPE_CAP_NPOT_TEXTURES: - return 0; + return 1; case PIPE_CAP_TWO_SIDED_STENCIL: - return 0; + return 1; case PIPE_CAP_GLSL: return 1; case PIPE_CAP_S3TC: @@ -68,19 +69,23 @@ cell_get_param(struct pipe_screen *screen, int param) case PIPE_CAP_ANISOTROPIC_FILTER: return 0; case PIPE_CAP_POINT_SPRITE: - return 0; + return 1; case PIPE_CAP_MAX_RENDER_TARGETS: return 1; case PIPE_CAP_OCCLUSION_QUERY: - return 0; + return 1; case PIPE_CAP_TEXTURE_SHADOW_MAP: - return 0; + return 10; case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: - return 12; /* max 2Kx2K */ + return CELL_MAX_TEXTURE_LEVELS; case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: return 8; /* max 128x128x128 */ case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: - return 12; /* max 2Kx2K */ + return CELL_MAX_TEXTURE_LEVELS; + case PIPE_CAP_TEXTURE_MIRROR_REPEAT: + return 1; /* XXX not really true */ + case PIPE_CAP_TEXTURE_MIRROR_CLAMP: + return 0; /* XXX to do */ default: return 0; } @@ -165,6 +170,7 @@ cell_create_screen(struct pipe_winsys *winsys) screen->is_format_supported = cell_is_format_supported; cell_init_screen_texture_funcs(screen); + u_simple_screen_init(screen); return screen; } diff --git a/src/gallium/drivers/cell/ppu/cell_spu.c b/src/gallium/drivers/cell/ppu/cell_spu.c index 9508227e29..28e5e6d706 100644 --- a/src/gallium/drivers/cell/ppu/cell_spu.c +++ b/src/gallium/drivers/cell/ppu/cell_spu.c @@ -36,6 +36,7 @@ #include "cell_spu.h" #include "pipe/p_format.h" #include "pipe/p_state.h" +#include "util/u_memory.h" #include "cell/common.h" @@ -52,6 +53,35 @@ struct cell_global_info cell_global; /** + * Scan /proc/cpuinfo to determine the timebase for the system. 
+ * This is used by the SPUs to convert 'decrementer' ticks to seconds. + * There may be a better way to get this value... + */ +static unsigned +get_timebase(void) +{ + FILE *f = fopen("/proc/cpuinfo", "r"); + unsigned timebase; + + assert(f); + while (!feof(f)) { + char line[80]; + fgets(line, sizeof(line), f); + if (strncmp(line, "timebase", 8) == 0) { + char *colon = strchr(line, ':'); + if (colon) { + timebase = atoi(colon + 2); + break; + } + } + } + fclose(f); + + return timebase; +} + + +/** * Write a 1-word message to the given SPE mailbox. */ void @@ -114,6 +144,7 @@ cell_start_spus(struct cell_context *cell) { static boolean one_time_init = FALSE; uint i, j; + uint timebase = get_timebase(); if (one_time_init) { fprintf(stderr, "PPU: Multiple rendering contexts not yet supported " @@ -123,24 +154,29 @@ cell_start_spus(struct cell_context *cell) one_time_init = TRUE; - assert(cell->num_spus <= MAX_SPUS); - - ASSERT_ALIGN16(&cell_global.command[0]); - ASSERT_ALIGN16(&cell_global.command[1]); + assert(cell->num_spus <= CELL_MAX_SPUS); ASSERT_ALIGN16(&cell_global.inits[0]); ASSERT_ALIGN16(&cell_global.inits[1]); + /* + * Initialize the global 'inits' structure for each SPU. + * A pointer to the init struct will be passed to each SPU. + * The SPUs will then each grab their init info with mfc_get(). + */ for (i = 0; i < cell->num_spus; i++) { cell_global.inits[i].id = i; cell_global.inits[i].num_spus = cell->num_spus; cell_global.inits[i].debug_flags = cell->debug_flags; - cell_global.inits[i].cmd = &cell_global.command[i]; + cell_global.inits[i].inv_timebase = 1000.0f / timebase; + for (j = 0; j < CELL_NUM_BUFFERS; j++) { cell_global.inits[i].buffers[j] = cell->buffer[j]; } cell_global.inits[i].buffer_status = &cell->buffer_status[0][0][0]; + cell_global.inits[i].spu_functions = &cell->spu_functions; + cell_global.spe_contexts[i] = spe_context_create(0, NULL); if (!cell_global.spe_contexts[i]) { fprintf(stderr, "spe_context_create() failed\n"); diff --git a/src/gallium/drivers/cell/ppu/cell_spu.h b/src/gallium/drivers/cell/ppu/cell_spu.h index 137f26612e..c93958a9ed 100644 --- a/src/gallium/drivers/cell/ppu/cell_spu.h +++ b/src/gallium/drivers/cell/ppu/cell_spu.h @@ -30,14 +30,12 @@ #include <libspe2.h> -#include <libmisc.h> +#include <pthread.h> #include "cell/common.h" #include "cell_context.h" -#define MAX_SPUS 8 - /** * Global vars, for now anyway. 
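The 'timebase' value parsed from /proc/cpuinfo above is the tick rate of the PowerPC time base in Hz, so the inv_timebase handed to each SPU (1000.0f / timebase) is milliseconds per tick. A minimal sketch of the conversion the SPUs can then apply to a decrementer tick count; ticks_to_ms() is a hypothetical helper, not part of this patch:

static float
ticks_to_ms(unsigned ticks, float inv_timebase)
{
   /* inv_timebase was computed on the PPU as 1000.0f / timebase,
    * so multiplying a tick count by it yields milliseconds */
   return (float) ticks * inv_timebase;
}

Since the SPE decrementer counts down, the tick count would typically be obtained as start_value minus current_value.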
*/ @@ -46,14 +44,13 @@ struct cell_global_info /** * SPU/SPE handles, etc */ - spe_context_ptr_t spe_contexts[MAX_SPUS]; - pthread_t spe_threads[MAX_SPUS]; + spe_context_ptr_t spe_contexts[CELL_MAX_SPUS]; + pthread_t spe_threads[CELL_MAX_SPUS]; /** - * Data sent to SPUs + * Data sent to SPUs at start-up */ - struct cell_init_info inits[MAX_SPUS]; - struct cell_command command[MAX_SPUS]; + struct cell_init_info inits[CELL_MAX_SPUS]; }; diff --git a/src/gallium/drivers/cell/ppu/cell_state.h b/src/gallium/drivers/cell/ppu/cell_state.h index a7771a55a3..b193170f9c 100644 --- a/src/gallium/drivers/cell/ppu/cell_state.h +++ b/src/gallium/drivers/cell/ppu/cell_state.h @@ -44,8 +44,9 @@ #define CELL_NEW_TEXTURE 0x800 #define CELL_NEW_VERTEX 0x1000 #define CELL_NEW_VS 0x2000 -#define CELL_NEW_CONSTANTS 0x4000 -#define CELL_NEW_VERTEX_INFO 0x8000 +#define CELL_NEW_VS_CONSTANTS 0x4000 +#define CELL_NEW_FS_CONSTANTS 0x8000 +#define CELL_NEW_VERTEX_INFO 0x10000 extern void diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c index 2da3097983..ff529fe22c 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_emit.c +++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c @@ -25,26 +25,163 @@ * **************************************************************************/ +#include "pipe/p_inlines.h" #include "util/u_memory.h" #include "cell_context.h" #include "cell_gen_fragment.h" #include "cell_state.h" #include "cell_state_emit.h" -#include "cell_state_per_fragment.h" #include "cell_batch.h" #include "cell_texture.h" #include "draw/draw_context.h" #include "draw/draw_private.h" +/** + * Find/create a cell_command_fragment_ops object corresponding to the + * current blend/stencil/z/colormask/etc. state. + */ +static struct cell_command_fragment_ops * +lookup_fragment_ops(struct cell_context *cell) +{ + struct cell_fragment_ops_key key; + struct cell_command_fragment_ops *ops; + + /* + * Build key + */ + memset(&key, 0, sizeof(key)); + key.blend = *cell->blend; + key.blend_color = cell->blend_color; + key.dsa = *cell->depth_stencil; + + if (cell->framebuffer.cbufs[0]) + key.color_format = cell->framebuffer.cbufs[0]->format; + else + key.color_format = PIPE_FORMAT_NONE; + + if (cell->framebuffer.zsbuf) + key.zs_format = cell->framebuffer.zsbuf->format; + else + key.zs_format = PIPE_FORMAT_NONE; + + /* + * Look up key in cache. + */ + ops = (struct cell_command_fragment_ops *) + util_keymap_lookup(cell->fragment_ops_cache, &key); + + /* + * If not found, create/save new fragment ops command. + */ + if (!ops) { + struct spe_function spe_code_front, spe_code_back; + unsigned int facing_dependent, total_code_size; + + if (0) + debug_printf("**** Create New Fragment Ops\n"); + + /* Prepare the buffer that will hold the generated code. The + * "0" passed in for the size means that the SPE code will + * use a default size. + */ + spe_init_func(&spe_code_front, 0); + spe_init_func(&spe_code_back, 0); + + /* Generate new code. Always generate new code for both front-facing + * and back-facing fragments, even if it's the same code in both + * cases. + */ + cell_gen_fragment_function(cell, CELL_FACING_FRONT, &spe_code_front); + cell_gen_fragment_function(cell, CELL_FACING_BACK, &spe_code_back); + + /* Make sure the code is a multiple of 8 bytes long; this is + * required to ensure that the dual pipe instruction alignment + * is correct. It's also important for the SPU unpacking, + * which assumes 8-byte boundaries. 
+ */ + unsigned int front_code_size = spe_code_size(&spe_code_front); + while (front_code_size % 8 != 0) { + spe_lnop(&spe_code_front); + front_code_size = spe_code_size(&spe_code_front); + } + unsigned int back_code_size = spe_code_size(&spe_code_back); + while (back_code_size % 8 != 0) { + spe_lnop(&spe_code_back); + back_code_size = spe_code_size(&spe_code_back); + } + + /* Determine whether the code we generated is facing-dependent, by + * determining whether the generated code is different for the front- + * and back-facing fragments. + */ + if (front_code_size == back_code_size && memcmp(spe_code_front.store, spe_code_back.store, front_code_size) == 0) { + /* Code is identical; only need one copy. */ + facing_dependent = 0; + total_code_size = front_code_size; + } + else { + /* Code is different for front-facing and back-facing fragments. + * Need to send both copies. + */ + facing_dependent = 1; + total_code_size = front_code_size + back_code_size; + } + + /* alloc new fragment ops command. Note that this structure + * has variant length based on the total code size required. + */ + ops = CALLOC_VARIANT_LENGTH_STRUCT(cell_command_fragment_ops, total_code_size); + /* populate the new cell_command_fragment_ops object */ + ops->opcode[0] = CELL_CMD_STATE_FRAGMENT_OPS; + ops->total_code_size = total_code_size; + ops->front_code_index = 0; + memcpy(ops->code, spe_code_front.store, front_code_size); + if (facing_dependent) { + /* We have separate front- and back-facing code. Append the + * back-facing code to the buffer. Be careful because the code + * size is in bytes, but the buffer is of unsigned elements. + */ + ops->back_code_index = front_code_size / sizeof(spe_code_front.store[0]); + memcpy(ops->code + ops->back_code_index, spe_code_back.store, back_code_size); + } + else { + /* Use the same code for front- and back-facing fragments */ + ops->back_code_index = ops->front_code_index; + } + + /* Set the fields for the fallback case. Note that these fields + * (and the whole fallback case) will eventually go away. 
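Given the front_code_index/back_code_index bookkeeping above, a consumer of the command only has to offset into the single code[] array to find the instructions for a given facing; when the generated code is not facing-dependent, both indices are equal and both facings resolve to the same instructions. A hypothetical sketch (select_fragment_ops_code() is not part of the patch):

static const unsigned *
select_fragment_ops_code(const struct cell_command_fragment_ops *ops,
                         uint facing)
{
   /* indices are in units of 32-bit words, as set up above */
   const unsigned index = (facing == CELL_FACING_BACK) ? ops->back_code_index
                                                       : ops->front_code_index;
   return ops->code + index;
}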
+ */ + ops->dsa = *cell->depth_stencil; + ops->blend = *cell->blend; + ops->blend_color = cell->blend_color; + + /* insert cell_command_fragment_ops object into keymap/cache */ + util_keymap_insert(cell->fragment_ops_cache, &key, ops, NULL); + + /* release rtasm buffer */ + spe_release_func(&spe_code_front); + spe_release_func(&spe_code_back); + } + else { + if (0) + debug_printf("**** Re-use Fragment Ops\n"); + } + + return ops; +} + + + static void emit_state_cmd(struct cell_context *cell, uint cmd, const void *state, uint state_size) { - uint64_t *dst = (uint64_t *) - cell_batch_alloc(cell, ROUNDUP8(sizeof(uint64_t) + state_size)); + uint32_t *dst = (uint32_t *) + cell_batch_alloc16(cell, ROUNDUP16(sizeof(opcode_t) + state_size)); *dst = cmd; - memcpy(dst + 1, state, state_size); + memcpy(dst + 4, state, state_size); } @@ -58,9 +195,10 @@ cell_emit_state(struct cell_context *cell) if (cell->dirty & CELL_NEW_FRAMEBUFFER) { struct pipe_surface *cbuf = cell->framebuffer.cbufs[0]; struct pipe_surface *zbuf = cell->framebuffer.zsbuf; + STATIC_ASSERT(sizeof(struct cell_command_framebuffer) % 16 == 0); struct cell_command_framebuffer *fb - = cell_batch_alloc(cell, sizeof(*fb)); - fb->opcode = CELL_CMD_STATE_FRAMEBUFFER; + = cell_batch_alloc16(cell, sizeof(*fb)); + fb->opcode[0] = CELL_CMD_STATE_FRAMEBUFFER; fb->color_start = cell->cbuf_map[0]; fb->color_format = cbuf->format; fb->depth_start = cell->zsbuf_map; @@ -73,11 +211,20 @@ cell_emit_state(struct cell_context *cell) #endif } + if (cell->dirty & (CELL_NEW_RASTERIZER)) { + STATIC_ASSERT(sizeof(struct cell_command_rasterizer) % 16 == 0); + struct cell_command_rasterizer *rast = + cell_batch_alloc16(cell, sizeof(*rast)); + rast->opcode[0] = CELL_CMD_STATE_RASTERIZER; + rast->rasterizer = *cell->rasterizer; + } + if (cell->dirty & (CELL_NEW_FS)) { /* Send new fragment program to SPUs */ + STATIC_ASSERT(sizeof(struct cell_command_fragment_program) % 16 == 0); struct cell_command_fragment_program *fp - = cell_batch_alloc(cell, sizeof(*fp)); - fp->opcode = CELL_CMD_STATE_FRAGMENT_PROGRAM; + = cell_batch_alloc16(cell, sizeof(*fp)); + fp->opcode[0] = CELL_CMD_STATE_FRAGMENT_PROGRAM; fp->num_inst = cell->fs->code.num_inst; memcpy(&fp->code, cell->fs->code.store, SPU_MAX_FRAGMENT_PROGRAM_INSTS * SPE_INST_SIZE); @@ -90,59 +237,83 @@ cell_emit_state(struct cell_context *cell) } } + if (cell->dirty & (CELL_NEW_FS_CONSTANTS)) { + const uint shader = PIPE_SHADER_FRAGMENT; + const uint num_const = cell->constants[shader].buffer->size / sizeof(float); + uint i, j; + float *buf = cell_batch_alloc16(cell, ROUNDUP16(32 + num_const * sizeof(float))); + uint32_t *ibuf = (uint32_t *) buf; + const float *constants = pipe_buffer_map(cell->pipe.screen, + cell->constants[shader].buffer, + PIPE_BUFFER_USAGE_CPU_READ); + ibuf[0] = CELL_CMD_STATE_FS_CONSTANTS; + ibuf[4] = num_const; + j = 8; + for (i = 0; i < num_const; i++) { + buf[j++] = constants[i]; + } + pipe_buffer_unmap(cell->pipe.screen, cell->constants[shader].buffer); + } + if (cell->dirty & (CELL_NEW_FRAMEBUFFER | CELL_NEW_DEPTH_STENCIL | CELL_NEW_BLEND)) { - /* XXX we don't want to always do codegen here. We should have - * a hash/lookup table to cache previous results... 
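The fragment-shader constants command above is packed by hand into 32-bit words with quadword alignment: word 0 carries CELL_CMD_STATE_FS_CONSTANTS, word 4 the constant count, and the float values start at word 8, so both the count and the data sit on 16-byte boundaries inside the ROUNDUP16-sized allocation. A minimal sketch of how the receiving side could locate the payload; fs_constants_payload() is illustrative only, the real SPU-side unpacking is not part of this hunk:

static const float *
fs_constants_payload(const uint32_t *cmd, uint *num_const)
{
   *num_const = cmd[4];                /* constant count lives in word 4 */
   return (const float *) (cmd + 8);   /* float data begins at word 8 */
}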
- */ - struct cell_command_fragment_ops *fops - = cell_batch_alloc(cell, sizeof(*fops)); - struct spe_function spe_code; - - /* generate new code */ - cell_gen_fragment_function(cell, &spe_code); - /* put the new code into the batch buffer */ - fops->opcode = CELL_CMD_STATE_FRAGMENT_OPS; - memcpy(&fops->code, spe_code.store, - SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE); - fops->dsa = cell->depth_stencil->base; - fops->blend = cell->blend->base; - /* free codegen buffer */ - spe_release_func(&spe_code); + struct cell_command_fragment_ops *fops, *fops_cmd; + /* Note that cell_command_fragment_ops is a variant-sized record */ + fops = lookup_fragment_ops(cell); + fops_cmd = cell_batch_alloc16(cell, ROUNDUP16(sizeof(*fops_cmd) + fops->total_code_size)); + memcpy(fops_cmd, fops, sizeof(*fops) + fops->total_code_size); } if (cell->dirty & CELL_NEW_SAMPLER) { uint i; for (i = 0; i < CELL_MAX_SAMPLERS; i++) { - if (cell->sampler[i]) { - struct cell_command_sampler *sampler - = cell_batch_alloc(cell, sizeof(*sampler)); - sampler->opcode = CELL_CMD_STATE_SAMPLER; - sampler->unit = i; - sampler->state = *cell->sampler[i]; + if (cell->dirty_samplers & (1 << i)) { + if (cell->sampler[i]) { + STATIC_ASSERT(sizeof(struct cell_command_sampler) % 16 == 0); + struct cell_command_sampler *sampler + = cell_batch_alloc16(cell, sizeof(*sampler)); + sampler->opcode[0] = CELL_CMD_STATE_SAMPLER; + sampler->unit = i; + sampler->state = *cell->sampler[i]; + } } } + cell->dirty_samplers = 0x0; } if (cell->dirty & CELL_NEW_TEXTURE) { uint i; for (i = 0;i < CELL_MAX_SAMPLERS; i++) { - struct cell_command_texture *texture - = cell_batch_alloc(cell, sizeof(*texture)); - texture->opcode = CELL_CMD_STATE_TEXTURE; - texture->unit = i; - if (cell->texture[i]) { - texture->start = cell->texture[i]->tiled_data; - texture->width = cell->texture[i]->base.width[0]; - texture->height = cell->texture[i]->base.height[0]; - } - else { - texture->start = NULL; - texture->width = 1; - texture->height = 1; + if (cell->dirty_textures & (1 << i)) { + STATIC_ASSERT(sizeof(struct cell_command_texture) % 16 == 0); + struct cell_command_texture *texture + = (struct cell_command_texture *)cell_batch_alloc16(cell, sizeof(*texture)); + texture->opcode[0] = CELL_CMD_STATE_TEXTURE; + texture->unit = i; + if (cell->texture[i]) { + uint level; + for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) { + texture->start[level] = cell->texture[i]->tiled_mapped[level]; + texture->width[level] = cell->texture[i]->base.width[level]; + texture->height[level] = cell->texture[i]->base.height[level]; + texture->depth[level] = cell->texture[i]->base.depth[level]; + } + texture->target = cell->texture[i]->base.target; + } + else { + uint level; + for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) { + texture->start[level] = NULL; + texture->width[level] = 0; + texture->height[level] = 0; + texture->depth[level] = 0; + } + texture->target = 0; + } } } + cell->dirty_textures = 0x0; } if (cell->dirty & CELL_NEW_VERTEX_INFO) { diff --git a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c index 78cb446c14..d97c22b2ef 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c +++ b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c @@ -297,7 +297,7 @@ emit_stencil_test(struct pipe_depth_stencil_alpha_state *dsa, int face_stencil = spe_allocate_available_register(f); int stencil_src = stencil; const unsigned ref = (dsa->stencil[face].ref_value - & dsa->stencil[face].value_mask); + & 
dsa->stencil[face].valuemask); boolean complement = FALSE; int stored; int tmp = spe_allocate_available_register(f); @@ -305,9 +305,9 @@ emit_stencil_test(struct pipe_depth_stencil_alpha_state *dsa, if ((dsa->stencil[face].func != PIPE_FUNC_NEVER) && (dsa->stencil[face].func != PIPE_FUNC_ALWAYS) - && (dsa->stencil[face].value_mask != 0x0ff)) { + && (dsa->stencil[face].valuemask != 0x0ff)) { stored = spe_allocate_available_register(f); - spe_andi(f, stored, stencil, dsa->stencil[face].value_mask); + spe_andi(f, stored, stencil, dsa->stencil[face].valuemask); } else { stored = stencil; } @@ -395,7 +395,7 @@ emit_stencil_test(struct pipe_depth_stencil_alpha_state *dsa, * - For depth-pass if the stencil test is NEVER * - Any of the 3 conditions if the operation is KEEP */ - if (dsa->stencil[face].write_mask != 0) { + if (dsa->stencil[face].writemask != 0) { if ((dsa->stencil[face].func != PIPE_FUNC_ALWAYS) && (dsa->stencil[face].fail_op != PIPE_STENCIL_OP_KEEP)) { if (complement) { @@ -449,10 +449,10 @@ emit_stencil_test(struct pipe_depth_stencil_alpha_state *dsa, */ if (stencil_src == stencil) { spe_release_register(f, face_stencil); - } else if (dsa->stencil[face].write_mask != 0x0ff) { + } else if (dsa->stencil[face].writemask != 0x0ff) { int tmp = spe_allocate_available_register(f); - spe_il(f, tmp, dsa->stencil[face].write_mask); + spe_il(f, tmp, dsa->stencil[face].writemask); spe_selb(f, stencil_src, stencil, stencil_src, tmp); spe_release_register(f, tmp); @@ -580,8 +580,8 @@ cell_generate_depth_stencil_test(struct cell_depth_stencil_alpha_state *cdsa) dsa->stencil[i].zpass_op); printf("# ref value / value mask / write mask: %02x %02x %02x\n", dsa->stencil[i].ref_value, - dsa->stencil[i].value_mask, - dsa->stencil[i].write_mask); + dsa->stencil[i].valuemask, + dsa->stencil[i].writemask); } printf("\t.text\n"); diff --git a/src/gallium/drivers/cell/ppu/cell_state_shader.c b/src/gallium/drivers/cell/ppu/cell_state_shader.c index 3a0d066da2..bf517ea563 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_shader.c +++ b/src/gallium/drivers/cell/ppu/cell_state_shader.c @@ -28,7 +28,7 @@ #include "pipe/p_defines.h" #include "util/u_memory.h" #include "pipe/p_inlines.h" -#include "pipe/p_winsys.h" +#include "pipe/internal/p_winsys_screen.h" #include "draw/draw_context.h" #include "tgsi/tgsi_parse.h" @@ -186,18 +186,21 @@ cell_set_constant_buffer(struct pipe_context *pipe, const struct pipe_constant_buffer *buf) { struct cell_context *cell = cell_context(pipe); - struct pipe_winsys *ws = pipe->winsys; assert(shader < PIPE_SHADER_TYPES); assert(index == 0); - /* note: reference counting */ - winsys_buffer_reference(ws, - &cell->constants[shader].buffer, - buf->buffer); - cell->constants[shader].size = buf->size; + draw_flush(cell->draw); - cell->dirty |= CELL_NEW_CONSTANTS; + /* note: reference counting */ + pipe_buffer_reference(pipe->screen, + &cell->constants[shader].buffer, + buf->buffer); + + if (shader == PIPE_SHADER_VERTEX) + cell->dirty |= CELL_NEW_VS_CONSTANTS; + else if (shader == PIPE_SHADER_FRAGMENT) + cell->dirty |= CELL_NEW_FS_CONSTANTS; } diff --git a/src/gallium/drivers/cell/ppu/cell_surface.c b/src/gallium/drivers/cell/ppu/cell_surface.c index 732c64082e..c9203fee08 100644 --- a/src/gallium/drivers/cell/ppu/cell_surface.c +++ b/src/gallium/drivers/cell/ppu/cell_surface.c @@ -27,6 +27,7 @@ #include "util/u_rect.h" #include "cell_context.h" +#include "cell_surface.h" void diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c index 
b6590dfb86..fa52e2cbea 100644 --- a/src/gallium/drivers/cell/ppu/cell_texture.c +++ b/src/gallium/drivers/cell/ppu/cell_texture.c @@ -28,12 +28,13 @@ * Authors: * Keith Whitwell <keith@tungstengraphics.com> * Michel Dänzer <michel@tungstengraphics.com> + * Brian Paul */ #include "pipe/p_context.h" #include "pipe/p_defines.h" #include "pipe/p_inlines.h" -#include "pipe/p_winsys.h" +#include "pipe/internal/p_winsys_screen.h" #include "util/u_math.h" #include "util/u_memory.h" @@ -42,30 +43,31 @@ #include "cell_texture.h" -/* Simple, maximally packed layout. - */ -static unsigned minify( unsigned d ) +static unsigned +minify(unsigned d) { return MAX2(1, d>>1); } static void -cell_texture_layout(struct cell_texture * spt) +cell_texture_layout(struct cell_texture *ct) { - struct pipe_texture *pt = &spt->base; + struct pipe_texture *pt = &ct->base; unsigned level; unsigned width = pt->width[0]; unsigned height = pt->height[0]; unsigned depth = pt->depth[0]; - spt->buffer_size = 0; + ct->buffer_size = 0; for ( level = 0 ; level <= pt->last_level ; level++ ) { unsigned size; unsigned w_tile, h_tile; + assert(level < CELL_MAX_TEXTURE_LEVELS); + /* width, height, rounded up to tile size */ w_tile = align(width, TILE_SIZE); h_tile = align(height, TILE_SIZE); @@ -76,9 +78,9 @@ cell_texture_layout(struct cell_texture * spt) pt->nblocksx[level] = pf_get_nblocksx(&pt->block, w_tile); pt->nblocksy[level] = pf_get_nblocksy(&pt->block, h_tile); - spt->stride[level] = pt->nblocksx[level] * pt->block.size; + ct->stride[level] = pt->nblocksx[level] * pt->block.size; - spt->level_offset[level] = spt->buffer_size; + ct->level_offset[level] = ct->buffer_size; size = pt->nblocksx[level] * pt->nblocksy[level] * pt->block.size; if (pt->target == PIPE_TEXTURE_CUBE) @@ -86,7 +88,7 @@ cell_texture_layout(struct cell_texture * spt) else size *= depth; - spt->buffer_size += size; + ct->buffer_size += size; width = minify(width); height = minify(height); @@ -100,26 +102,25 @@ cell_texture_create(struct pipe_screen *screen, const struct pipe_texture *templat) { struct pipe_winsys *ws = screen->winsys; - struct cell_texture *spt = CALLOC_STRUCT(cell_texture); - if (!spt) + struct cell_texture *ct = CALLOC_STRUCT(cell_texture); + if (!ct) return NULL; - spt->base = *templat; - spt->base.refcount = 1; - spt->base.screen = screen; + ct->base = *templat; + ct->base.refcount = 1; + ct->base.screen = screen; - cell_texture_layout(spt); + cell_texture_layout(ct); - spt->buffer = ws->buffer_create(ws, 32, - PIPE_BUFFER_USAGE_PIXEL, - spt->buffer_size); + ct->buffer = ws->buffer_create(ws, 32, PIPE_BUFFER_USAGE_PIXEL, + ct->buffer_size); - if (!spt->buffer) { - FREE(spt); + if (!ct->buffer) { + FREE(ct); return NULL; } - return &spt->base; + return &ct->base; } @@ -135,244 +136,514 @@ cell_texture_release(struct pipe_screen *screen, __FUNCTION__, (void *) *pt, (*pt)->refcount - 1); */ if (--(*pt)->refcount <= 0) { - struct cell_texture *spt = cell_texture(*pt); + /* Delete this texture now. + * But note that the underlying pipe_buffer may linger... + */ + struct cell_texture *ct = cell_texture(*pt); + uint i; /* - DBG("%s deleting %p\n", __FUNCTION__, (void *) spt); + DBG("%s deleting %p\n", __FUNCTION__, (void *) ct); */ - pipe_buffer_reference(screen, &spt->buffer, NULL); + pipe_buffer_reference(screen, &ct->buffer, NULL); + + for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) { + /* Unreference the tiled image buffer. + * It may not actually be deleted until a fence is hit. 
+ */ + if (ct->tiled_buffer[i]) { + ct->tiled_mapped[i] = NULL; + pipe_buffer_reference(screen, &ct->tiled_buffer[i], NULL); + } + } - FREE(spt); + FREE(ct); } *pt = NULL; } -#if 0 + +/** + * Convert image from linear layout to tiled layout. 4-byte pixels. + */ static void -cell_texture_update(struct pipe_context *pipe, struct pipe_texture *texture, - uint face, uint levelsMask) +twiddle_image_uint(uint w, uint h, uint tile_size, uint *dst, + uint src_stride, const uint *src) { - /* XXX TO DO: re-tile the texture data ... */ + const uint tile_size2 = tile_size * tile_size; + const uint h_t = (h + tile_size - 1) / tile_size; + const uint w_t = (w + tile_size - 1) / tile_size; -} -#endif + uint it, jt; /* tile counters */ + uint i, j; /* intra-tile counters */ + src_stride /= 4; /* convert from bytes to pixels */ -static struct pipe_surface * -cell_get_tex_surface(struct pipe_screen *screen, - struct pipe_texture *pt, - unsigned face, unsigned level, unsigned zslice, - unsigned usage) -{ - struct pipe_winsys *ws = screen->winsys; - struct cell_texture *spt = cell_texture(pt); - struct pipe_surface *ps; + /* loop over dest tiles */ + for (it = 0; it < h_t; it++) { + for (jt = 0; jt < w_t; jt++) { + /* start of dest tile: */ + uint *tdst = dst + (it * w_t + jt) * tile_size2; - ps = ws->surface_alloc(ws); - if (ps) { - assert(ps->refcount); - assert(ps->winsys); - winsys_buffer_reference(ws, &ps->buffer, spt->buffer); - ps->format = pt->format; - ps->block = pt->block; - ps->width = pt->width[level]; - ps->height = pt->height[level]; - ps->nblocksx = pt->nblocksx[level]; - ps->nblocksy = pt->nblocksy[level]; - ps->stride = spt->stride[level]; - ps->offset = spt->level_offset[level]; - ps->usage = usage; + /* compute size of this tile (may be smaller than tile_size) */ + /* XXX note: a compiler bug was found here. That's why the code + * looks as it does. + */ + uint tile_width = w - jt * tile_size; + tile_width = MIN2(tile_width, tile_size); + uint tile_height = h - it * tile_size; + tile_height = MIN2(tile_height, tile_size); - /* XXX may need to override usage flags (see sp_texture.c) */ + /* loop over texels in the tile */ + for (i = 0; i < tile_height; i++) { + for (j = 0; j < tile_width; j++) { + const uint srci = it * tile_size + i; + const uint srcj = jt * tile_size + j; + ASSERT(srci < h); + ASSERT(srcj < w); + tdst[i * tile_size + j] = src[srci * src_stride + srcj]; + } + } + } + } +} - pipe_texture_reference(&ps->texture, pt); - ps->face = face; - ps->level = level; - ps->zslice = zslice; - if (pt->target == PIPE_TEXTURE_CUBE || pt->target == PIPE_TEXTURE_3D) { - ps->offset += ((pt->target == PIPE_TEXTURE_CUBE) ? face : zslice) * - ps->nblocksy * - ps->stride; - } - else { - assert(face == 0); - assert(zslice == 0); +/** + * For Cell. Basically, rearrange the pixels/quads from this layout: + * +--+--+--+--+ + * |p0|p1|p2|p3|.... + * +--+--+--+--+ + * + * to this layout: + * +--+--+ + * |p0|p1|.... + * +--+--+ + * |p2|p3| + * +--+--+ + */ +static void +twiddle_tile(const uint *tileIn, uint *tileOut) +{ + int y, x; + + for (y = 0; y < TILE_SIZE; y+=2) { + for (x = 0; x < TILE_SIZE; x+=2) { + int k = 4 * (y/2 * TILE_SIZE/2 + x/2); + tileOut[y * TILE_SIZE + (x + 0)] = tileIn[k]; + tileOut[y * TILE_SIZE + (x + 1)] = tileIn[k+1]; + tileOut[(y + 1) * TILE_SIZE + (x + 0)] = tileIn[k+2]; + tileOut[(y + 1) * TILE_SIZE + (x + 1)] = tileIn[k+3]; } } - return ps; } - /** - * Copy tile data from linear layout to tiled layout. - * XXX this should be rolled into the future surface-creation code. 
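twiddle_tile() above converts from a layout in which each 2x2 quad of pixels is stored contiguously (quads in row-major order of quads) to an ordinary row-major TILE_SIZE x TILE_SIZE tile. A small worked check of that mapping for TILE_SIZE = 32; check_twiddle_mapping() is illustrative only and not part of the patch:

#include <assert.h>

static void
check_twiddle_mapping(const unsigned *tileIn, const unsigned *tileOut)
{
   /* quad (qy, qx) = (1, 2) starts at input word k = 4 * (1*16 + 2) = 72
    * and lands at output pixels (y, x) = (2..3, 4..5) */
   assert(tileOut[2 * 32 + 4] == tileIn[72]);
   assert(tileOut[2 * 32 + 5] == tileIn[73]);
   assert(tileOut[3 * 32 + 4] == tileIn[74]);
   assert(tileOut[3 * 32 + 5] == tileIn[75]);
}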
- * XXX also need "untile" code... + * Convert image from tiled layout to linear layout. 4-byte pixels. */ static void -tile_copy_data(uint w, uint h, uint tile_size, uint *dst, const uint *src) +untwiddle_image_uint(uint w, uint h, uint tile_size, uint *dst, + uint dst_stride, const uint *src) { const uint tile_size2 = tile_size * tile_size; - const uint h_t = h / tile_size, w_t = w / tile_size; - + const uint h_t = (h + tile_size - 1) / tile_size; + const uint w_t = (w + tile_size - 1) / tile_size; + uint *tile_buf; uint it, jt; /* tile counters */ uint i, j; /* intra-tile counters */ - /* loop over dest tiles */ + dst_stride /= 4; /* convert from bytes to pixels */ + + tile_buf = align_malloc(tile_size * tile_size * 4, 16); + + /* loop over src tiles */ for (it = 0; it < h_t; it++) { for (jt = 0; jt < w_t; jt++) { - /* start of dest tile: */ - uint *tdst = dst + (it * w_t + jt) * tile_size2; + /* start of src tile: */ + const uint *tsrc = src + (it * w_t + jt) * tile_size2; + + twiddle_tile(tsrc, tile_buf); + tsrc = tile_buf; + + /* compute size of this tile (may be smaller than tile_size) */ + /* XXX note: a compiler bug was found here. That's why the code + * looks as it does. + */ + uint tile_width = w - jt * tile_size; + tile_width = MIN2(tile_width, tile_size); + uint tile_height = h - it * tile_size; + tile_height = MIN2(tile_height, tile_size); + /* loop over texels in the tile */ - for (i = 0; i < tile_size; i++) { - for (j = 0; j < tile_size; j++) { - const uint srci = it * tile_size + i; - const uint srcj = jt * tile_size + j; - *tdst++ = src[srci * w + srcj]; + for (i = 0; i < tile_height; i++) { + for (j = 0; j < tile_width; j++) { + uint dsti = it * tile_size + i; + uint dstj = jt * tile_size + j; + ASSERT(dsti < h); + ASSERT(dstj < w); + dst[dsti * dst_stride + dstj] = tsrc[i * tile_size + j]; } } } } -} + align_free(tile_buf); +} /** * Convert linear texture image data to tiled format for SPU usage. - * XXX recast this in terms of pipe_surfaces (aka texture views). */ static void -cell_tile_texture(struct cell_context *cell, - struct cell_texture *texture) +cell_twiddle_texture(struct pipe_screen *screen, + struct pipe_surface *surface) { - struct pipe_screen *screen = cell->pipe.screen; - uint face = 0, level = 0, zslice = 0; - struct pipe_surface *surf; - const uint w = texture->base.width[0], h = texture->base.height[0]; - const uint *src; - - /* temporary restrictions: */ - assert(w >= TILE_SIZE); - assert(h >= TILE_SIZE); - assert(w % TILE_SIZE == 0); - assert(h % TILE_SIZE == 0); - - surf = screen->get_tex_surface(screen, &texture->base, face, level, zslice, - PIPE_BUFFER_USAGE_CPU_WRITE); - ASSERT(surf); - - src = (const uint *) pipe_surface_map(surf, PIPE_BUFFER_USAGE_CPU_WRITE); - - if (texture->tiled_data) { - align_free(texture->tiled_data); +#if 0 // XXX fix me + struct cell_texture *ct = cell_texture(surface->texture); + const uint level = surface->level; + const uint texWidth = ct->base.width[level]; + const uint texHeight = ct->base.height[level]; + const uint bufWidth = align(texWidth, TILE_SIZE); + const uint bufHeight = align(texHeight, TILE_SIZE); + const void *map = screen->surface_map(screen, surface, PIPE_BUFFER_USAGE_CPU_READ); + const uint *src = (const uint *) map; + + switch (ct->base.format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_B8G8R8A8_UNORM: + case PIPE_FORMAT_S8Z24_UNORM: + { + int numFaces = ct->base.target == PIPE_TEXTURE_CUBE ? 
6 : 1; + int offset = bufWidth * bufHeight * 4 * surface->face; + uint *dst; + + if (!ct->tiled_buffer[level]) { + /* allocate buffer for tiled data now */ + struct pipe_winsys *ws = screen->winsys; + uint bytes = bufWidth * bufHeight * 4 * numFaces; + ct->tiled_buffer[level] = + ws->buffer_create(ws, 16, PIPE_BUFFER_USAGE_PIXEL, bytes); + /* and map it */ + ct->tiled_mapped[level] = + ws->buffer_map(ws, ct->tiled_buffer[level], + PIPE_BUFFER_USAGE_GPU_READ); + } + dst = (uint *) ((ubyte *) ct->tiled_mapped[level] + offset); + + twiddle_image_uint(texWidth, texHeight, TILE_SIZE, dst, + surface->stride, src); + } + break; + default: + printf("Cell: twiddle unsupported texture format %s\n", + pf_name(ct->base.format)); } - texture->tiled_data = align_malloc(w * h * 4, 16); - tile_copy_data(w, h, TILE_SIZE, texture->tiled_data, src); + screen->surface_unmap(screen, surface); +#endif +} + - pipe_surface_unmap(surf); +/** + * Convert SPU tiled texture image data to linear format for app usage. + */ +static void +cell_untwiddle_texture(struct pipe_screen *screen, + struct pipe_surface *surface) +{ +#if 0 // XXX fix me + struct cell_texture *ct = cell_texture(surface->texture); + const uint level = surface->level; + const uint texWidth = ct->base.width[level]; + const uint texHeight = ct->base.height[level]; + const void *map = screen->surface_map(screen, surface, PIPE_BUFFER_USAGE_CPU_READ); + const uint *src = (const uint *) ((const ubyte *) map + surface->offset); + + switch (ct->base.format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_B8G8R8A8_UNORM: + case PIPE_FORMAT_S8Z24_UNORM: + { + int numFaces = ct->base.target == PIPE_TEXTURE_CUBE ? 6 : 1; + int offset = surface->stride * texHeight * 4 * surface->face; + uint *dst; + + if (!ct->untiled_data[level]) { + ct->untiled_data[level] = + align_malloc(surface->stride * texHeight * 4 * numFaces, 16); + } - pipe_surface_reference(&surf, NULL); + dst = (uint *) ((ubyte *) ct->untiled_data[level] + offset); + + untwiddle_image_uint(texWidth, texHeight, TILE_SIZE, dst, + surface->stride, src); + } + break; + default: + { + ct->untiled_data[level] = NULL; + printf("Cell: untwiddle unsupported texture format %s\n", + pf_name(ct->base.format)); + } + } + + screen->surface_unmap(screen, surface); +#endif } -void -cell_update_texture_mapping(struct cell_context *cell) +static struct pipe_surface * +cell_get_tex_surface(struct pipe_screen *screen, + struct pipe_texture *pt, + unsigned face, unsigned level, unsigned zslice, + unsigned usage) { -#if 0 - uint face = 0, level = 0, zslice = 0; + struct cell_texture *ct = cell_texture(pt); + struct pipe_surface *ps; + + ps = CALLOC_STRUCT(pipe_surface); + if (ps) { + ps->refcount = 1; + pipe_texture_reference(&ps->texture, pt); + ps->format = pt->format; + //ps->block = pt->block; + ps->width = pt->width[level]; + ps->height = pt->height[level]; + //ps->nblocksx = pt->nblocksx[level]; + //ps->nblocksy = pt->nblocksy[level]; + //ps->stride = ct->stride[level]; + ps->offset = ct->level_offset[level]; + ps->usage = usage; + + /* XXX may need to override usage flags (see sp_texture.c) */ + + pipe_texture_reference(&ps->texture, pt); + ps->face = face; + ps->level = level; + ps->zslice = zslice; + + if (pt->target == PIPE_TEXTURE_CUBE || pt->target == PIPE_TEXTURE_3D) { +#if 0 // XXX fix me + ps->offset += ((pt->target == PIPE_TEXTURE_CUBE) ? 
face : zslice) * + ps->nblocksy * + ps->stride; #endif - uint i; + } + else { + assert(face == 0); + assert(zslice == 0); + } - for (i = 0; i < CELL_MAX_SAMPLERS; i++) { - if (cell->texture[i]) - cell_tile_texture(cell, cell->texture[i]); + if (ps->usage & PIPE_BUFFER_USAGE_CPU_READ) { + /* convert from tiled to linear layout */ + cell_untwiddle_texture(screen, ps); + } } + return ps; +} -#if 0 - if (cell->tex_surf && cell->tex_map) { - pipe_surface_unmap(cell->tex_surf); - cell->tex_map = NULL; + +static void +cell_tex_surface_release(struct pipe_screen *screen, + struct pipe_surface **s) +{ + struct cell_texture *ct = cell_texture((*s)->texture); + const uint level = (*s)->level; + struct pipe_surface *surf = *s; + + if ((surf->usage & PIPE_BUFFER_USAGE_CPU_READ) && (ct->untiled_data[level])) + { + align_free(ct->untiled_data[level]); + ct->untiled_data[level] = NULL; } - /* XXX free old surface */ + if ((ct->base.tex_usage & PIPE_TEXTURE_USAGE_SAMPLER) && + (surf->usage & PIPE_BUFFER_USAGE_CPU_WRITE)) { + /* convert from linear to tiled layout */ + cell_twiddle_texture(screen, surf); + } - cell->tex_surf = cell_get_tex_surface(&cell->pipe, - &cell->texture[0]->base, - face, level, zslice); + /* XXX if done rendering to teximage, re-tile */ - cell->tex_map = pipe_surface_map(cell->tex_surf); -#endif + if (--surf->refcount == 0) { + pipe_texture_reference(&surf->texture, NULL); + FREE(surf); + } + *s = NULL; +} + + +static struct pipe_transfer * +cell_get_tex_transfer(struct pipe_screen *screen, + struct pipe_texture *texture, + unsigned face, unsigned level, unsigned zslice, + enum pipe_transfer_usage usage, + unsigned x, unsigned y, unsigned w, unsigned h) +{ + struct cell_texture *ct = cell_texture(texture); + struct cell_transfer *ctrans; + struct pipe_transfer *pt; + + assert(texture); + assert(level <= texture->last_level); + + ctrans = CALLOC_STRUCT(cell_transfer); + pt = &ctrans->base; + if (ctrans) { + pt->refcount = 1; + pipe_texture_reference(&pt->texture, texture); + pt->format = texture->format; + pt->block = texture->block; + pt->x = x; + pt->y = y; + pt->width = w; + pt->height = h; + pt->nblocksx = texture->nblocksx[level]; + pt->nblocksy = texture->nblocksy[level]; + pt->stride = ct->stride[level]; + ctrans->offset = ct->level_offset[level]; + pt->usage = usage; + pt->face = face; + pt->level = level; + pt->zslice = zslice; + + if (texture->target == PIPE_TEXTURE_CUBE || + texture->target == PIPE_TEXTURE_3D) { + ctrans->offset += ((texture->target == PIPE_TEXTURE_CUBE) ? face : + zslice) * pt->nblocksy * pt->stride; + } + else { + assert(face == 0); + assert(zslice == 0); + } + } + return pt; } static void -cell_tex_surface_release(struct pipe_screen *screen, - struct pipe_surface **s) +cell_tex_transfer_release(struct pipe_screen *screen, + struct pipe_transfer **t) { + struct cell_transfer *transfer = cell_transfer(*t); /* Effectively do the texture_update work here - if texture images * needed post-processing to put them into hardware layout, this is - * where it would happen. For softpipe, nothing to do. + * where it would happen. For cell, nothing to do. 
*/ - assert ((*s)->texture); - pipe_texture_reference(&(*s)->texture, NULL); - - screen->winsys->surface_release(screen->winsys, s); + assert (transfer->base.texture); + if (--transfer->base.refcount == 0) { + pipe_texture_reference(&transfer->base.texture, NULL); + FREE(transfer); + } + *t = NULL; } static void * -cell_surface_map( struct pipe_screen *screen, - struct pipe_surface *surface, - unsigned flags ) +cell_transfer_map( struct pipe_screen *screen, + struct pipe_transfer *transfer ) { ubyte *map; + struct cell_texture *spt; + unsigned flags = 0; - if (flags & ~surface->usage) { - assert(0); - return NULL; + assert(transfer->texture); + spt = cell_texture(transfer->texture); + + if (transfer->usage != PIPE_TRANSFER_READ) { + flags |= PIPE_BUFFER_USAGE_CPU_WRITE; } - map = pipe_buffer_map( screen, surface->buffer, flags ); + if (transfer->usage != PIPE_TRANSFER_WRITE) { + flags |= PIPE_BUFFER_USAGE_CPU_READ; + } + + map = pipe_buffer_map(screen, spt->buffer, flags); if (map == NULL) return NULL; /* May want to different things here depending on read/write nature * of the map: */ - if (surface->texture && - (flags & PIPE_BUFFER_USAGE_CPU_WRITE)) + if (transfer->texture && transfer->usage != PIPE_TRANSFER_READ) { /* Do something to notify sharing contexts of a texture change. - * In softpipe, that would mean flushing the texture cache. + * In cell, that would mean flushing the texture cache. */ -#if 0 +#if 00 cell_screen(screen)->timestamp++; #endif } - return map + surface->offset; + return map + cell_transfer(transfer)->offset + + transfer->y / transfer->block.height * transfer->stride + + transfer->x / transfer->block.width * transfer->block.size; } static void -cell_surface_unmap(struct pipe_screen *screen, - struct pipe_surface *surface) +cell_transfer_unmap(struct pipe_screen *screen, + struct pipe_transfer *transfer) { - pipe_buffer_unmap( screen, surface->buffer ); + struct cell_texture *spt; + + assert(transfer->texture); + spt = cell_texture(transfer->texture); + + pipe_buffer_unmap( screen, spt->buffer ); } -void -cell_init_texture_functions(struct cell_context *cell) +static void * +cell_surface_map(struct pipe_screen *screen, + struct pipe_surface *surface, + unsigned flags) { - /*cell->pipe.texture_update = cell_texture_update;*/ + ubyte *map; + struct cell_texture *ct = cell_texture(surface->texture); + const uint level = surface->level; + + assert(ct); + +#if 0 + if (flags & ~surface->usage) { + assert(0); + return NULL; + } +#endif + + map = pipe_buffer_map( screen, ct->buffer, flags ); + if (map == NULL) { + return NULL; + } + else { + if ((surface->usage & PIPE_BUFFER_USAGE_CPU_READ) && + (ct->untiled_data[level])) { + return (void *) ((ubyte *) ct->untiled_data[level] + surface->offset); + } + else { + return (void *) (map + surface->offset); + } + } } +static void +cell_surface_unmap(struct pipe_screen *screen, + struct pipe_surface *surface) +{ + struct cell_texture *ct = cell_texture(surface->texture); + + assert(ct); + + pipe_buffer_unmap( screen, ct->buffer ); +} + + + void cell_init_screen_texture_funcs(struct pipe_screen *screen) { @@ -382,6 +653,8 @@ cell_init_screen_texture_funcs(struct pipe_screen *screen) screen->get_tex_surface = cell_get_tex_surface; screen->tex_surface_release = cell_tex_surface_release; - screen->surface_map = cell_surface_map; - screen->surface_unmap = cell_surface_unmap; + screen->get_tex_transfer = cell_get_tex_transfer; + screen->tex_transfer_release = cell_tex_transfer_release; + screen->transfer_map = cell_transfer_map; + 
screen->transfer_unmap = cell_transfer_unmap; } diff --git a/src/gallium/drivers/cell/ppu/cell_texture.h b/src/gallium/drivers/cell/ppu/cell_texture.h index 6d37e95ebc..fc6486adbe 100644 --- a/src/gallium/drivers/cell/ppu/cell_texture.h +++ b/src/gallium/drivers/cell/ppu/cell_texture.h @@ -40,15 +40,31 @@ struct cell_texture { struct pipe_texture base; - unsigned long level_offset[PIPE_MAX_TEXTURE_LEVELS]; - unsigned long stride[PIPE_MAX_TEXTURE_LEVELS]; + unsigned long level_offset[CELL_MAX_TEXTURE_LEVELS]; + unsigned long stride[CELL_MAX_TEXTURE_LEVELS]; /* The data is held here: */ struct pipe_buffer *buffer; unsigned long buffer_size; - void *tiled_data; /* XXX this may be temporary */ /*ALIGN16*/ + /** Texture data in tiled layout is held here */ + struct pipe_buffer *tiled_buffer[CELL_MAX_TEXTURE_LEVELS]; + /** Mapped, tiled texture data */ + void *tiled_mapped[CELL_MAX_TEXTURE_LEVELS]; + + struct pipe_transfer *transfer; + + /** The original, linear texture data */ + void *untiled_data[CELL_MAX_TEXTURE_LEVELS]; +}; + + +struct cell_transfer +{ + struct pipe_transfer base; + + unsigned long offset; }; @@ -60,13 +76,12 @@ cell_texture(struct pipe_texture *pt) } - -extern void -cell_update_texture_mapping(struct cell_context *cell); - - -extern void -cell_init_texture_functions(struct cell_context *cell); +/** cast wrapper */ +static INLINE struct cell_transfer * +cell_transfer(struct pipe_transfer *pt) +{ + return (struct cell_transfer *) pt; +} extern void diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.c b/src/gallium/drivers/cell/ppu/cell_vbuf.c index aa63435b93..cfaffb52a8 100644 --- a/src/gallium/drivers/cell/ppu/cell_vbuf.c +++ b/src/gallium/drivers/cell/ppu/cell_vbuf.c @@ -38,6 +38,7 @@ #include "cell_batch.h" #include "cell_context.h" +#include "cell_fence.h" #include "cell_flush.h" #include "cell_spu.h" #include "cell_vbuf.h" @@ -61,6 +62,7 @@ struct cell_vbuf_render uint vertex_size; /**< in bytes */ void *vertex_buffer; /**< just for debug, really */ uint vertex_buf; /**< in [0, CELL_NUM_BUFFERS-1] */ + uint vertex_buffer_size; /**< size in bytes */ }; @@ -81,24 +83,26 @@ cell_vbuf_get_vertex_info(struct vbuf_render *vbr) } -static void * +static boolean cell_vbuf_allocate_vertices(struct vbuf_render *vbr, ushort vertex_size, ushort nr_vertices) { struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr); + unsigned size = vertex_size * nr_vertices; /*printf("Alloc verts %u * %u\n", vertex_size, nr_vertices);*/ assert(cvbr->vertex_buf == ~0); cvbr->vertex_buf = cell_get_empty_buffer(cvbr->cell); cvbr->vertex_buffer = cvbr->cell->buffer[cvbr->vertex_buf]; + cvbr->vertex_buffer_size = size; cvbr->vertex_size = vertex_size; - return cvbr->vertex_buffer; + + return cvbr->vertex_buffer != NULL; } static void -cell_vbuf_release_vertices(struct vbuf_render *vbr, void *vertices, - unsigned vertex_size, unsigned vertices_used) +cell_vbuf_release_vertices(struct vbuf_render *vbr) { struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr); struct cell_context *cell = cvbr->cell; @@ -108,23 +112,47 @@ cell_vbuf_release_vertices(struct vbuf_render *vbr, void *vertices, __FUNCTION__, cvbr->vertex_buf, vertices_used); */ + /* Make sure texture buffers aren't released until we're done rendering + * with them. 
+ */ + cell_add_fenced_textures(cell); + /* Tell SPUs they can release the vert buf */ if (cvbr->vertex_buf != ~0U) { + STATIC_ASSERT(sizeof(struct cell_command_release_verts) % 16 == 0); struct cell_command_release_verts *release = (struct cell_command_release_verts *) - cell_batch_alloc(cell, sizeof(struct cell_command_release_verts)); - release->opcode = CELL_CMD_RELEASE_VERTS; + cell_batch_alloc16(cell, sizeof(struct cell_command_release_verts)); + release->opcode[0] = CELL_CMD_RELEASE_VERTS; release->vertex_buf = cvbr->vertex_buf; } cvbr->vertex_buf = ~0; cell_flush_int(cell, 0x0); - assert(vertices == cvbr->vertex_buffer); cvbr->vertex_buffer = NULL; } +static void * +cell_vbuf_map_vertices(struct vbuf_render *vbr) +{ + struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr); + return cvbr->vertex_buffer; +} + + +static void +cell_vbuf_unmap_vertices(struct vbuf_render *vbr, + ushort min_index, + ushort max_index ) +{ + struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr); + assert( cvbr->vertex_buffer_size >= (max_index+1) * cvbr->vertex_size ); + /* do nothing */ +} + + static boolean cell_vbuf_set_primitive(struct vbuf_render *vbr, unsigned prim) @@ -204,15 +232,16 @@ cell_vbuf_draw(struct vbuf_render *vbr, /* build/insert batch RENDER command */ { - const uint index_bytes = ROUNDUP8(nr_indices * 2); - const uint vertex_bytes = nr_vertices * 4 * cell->vertex_info.size; + const uint index_bytes = ROUNDUP16(nr_indices * 2); + const uint vertex_bytes = ROUNDUP16(nr_vertices * 4 * cell->vertex_info.size); + STATIC_ASSERT(sizeof(struct cell_command_render) % 16 == 0); const uint batch_size = sizeof(struct cell_command_render) + index_bytes; struct cell_command_render *render = (struct cell_command_render *) - cell_batch_alloc(cell, batch_size); + cell_batch_alloc16(cell, batch_size); - render->opcode = CELL_CMD_RENDER; + render->opcode[0] = CELL_CMD_RENDER; render->prim_type = cvbr->prim; render->num_indexes = nr_indices; @@ -230,7 +259,7 @@ cell_vbuf_draw(struct vbuf_render *vbr, min_index == 0 && vertex_bytes + 16 <= cell_batch_free_space(cell)) { /* vertex data inlined, after indices, at 16-byte boundary */ - void *dst = cell_batch_alloc_aligned(cell, vertex_bytes, 16); + void *dst = cell_batch_alloc16(cell, vertex_bytes); memcpy(dst, vertices, vertex_bytes); render->inline_verts = TRUE; render->vertex_buf = ~0; @@ -287,6 +316,8 @@ cell_init_vbuf(struct cell_context *cell) cell->vbuf_render->base.get_vertex_info = cell_vbuf_get_vertex_info; cell->vbuf_render->base.allocate_vertices = cell_vbuf_allocate_vertices; + cell->vbuf_render->base.map_vertices = cell_vbuf_map_vertices; + cell->vbuf_render->base.unmap_vertices = cell_vbuf_unmap_vertices; cell->vbuf_render->base.set_primitive = cell_vbuf_set_primitive; cell->vbuf_render->base.draw = cell_vbuf_draw; cell->vbuf_render->base.release_vertices = cell_vbuf_release_vertices; diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c index 566df7f59e..9cba537d9e 100644 --- a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c +++ b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c @@ -73,8 +73,8 @@ emit_matrix_transpose(struct spe_function *p, int col3; - spe_lqd(p, shuf_hi, shuf_ptr, 3); - spe_lqd(p, shuf_lo, shuf_ptr, 4); + spe_lqd(p, shuf_hi, shuf_ptr, 3*16); + spe_lqd(p, shuf_lo, shuf_ptr, 4*16); spe_shufb(p, t1, row0, row2, shuf_hi); spe_shufb(p, t2, row0, row2, shuf_lo); @@ -122,13 +122,13 @@ emit_matrix_transpose(struct spe_function *p, */ switch (count) { case 4: - spe_stqd(p, 
col3, dest_ptr, 3); + spe_stqd(p, col3, dest_ptr, 3 * 16); case 3: - spe_stqd(p, col2, dest_ptr, 2); + spe_stqd(p, col2, dest_ptr, 2 * 16); case 2: - spe_stqd(p, col1, dest_ptr, 1); + spe_stqd(p, col1, dest_ptr, 1 * 16); case 1: - spe_stqd(p, col0, dest_ptr, 0); + spe_stqd(p, col0, dest_ptr, 0 * 16); } @@ -145,6 +145,8 @@ emit_matrix_transpose(struct spe_function *p, } +#if 0 +/* This appears to not be used currently */ static void emit_fetch(struct spe_function *p, unsigned in_ptr, unsigned *offset, @@ -166,17 +168,17 @@ emit_fetch(struct spe_function *p, float scale_signed = 0.0; float scale_unsigned = 0.0; - spe_lqd(p, v0, in_ptr, 0 + offset[0]); - spe_lqd(p, v1, in_ptr, 1 + offset[0]); - spe_lqd(p, v2, in_ptr, 2 + offset[0]); - spe_lqd(p, v3, in_ptr, 3 + offset[0]); + spe_lqd(p, v0, in_ptr, (0 + offset[0]) * 16); + spe_lqd(p, v1, in_ptr, (1 + offset[0]) * 16); + spe_lqd(p, v2, in_ptr, (2 + offset[0]) * 16); + spe_lqd(p, v3, in_ptr, (3 + offset[0]) * 16); offset[0] += 4; switch (bytes) { case 1: scale_signed = 1.0f / 127.0f; scale_unsigned = 1.0f / 255.0f; - spe_lqd(p, tmp, shuf_ptr, 1); + spe_lqd(p, tmp, shuf_ptr, 1 * 16); spe_shufb(p, v0, v0, v0, tmp); spe_shufb(p, v1, v1, v1, tmp); spe_shufb(p, v2, v2, v2, tmp); @@ -185,7 +187,7 @@ emit_fetch(struct spe_function *p, case 2: scale_signed = 1.0f / 32767.0f; scale_unsigned = 1.0f / 65535.0f; - spe_lqd(p, tmp, shuf_ptr, 2); + spe_lqd(p, tmp, shuf_ptr, 2 * 16); spe_shufb(p, v0, v0, v0, tmp); spe_shufb(p, v1, v1, v1, tmp); spe_shufb(p, v2, v2, v2, tmp); @@ -241,11 +243,11 @@ emit_fetch(struct spe_function *p, switch (count) { case 1: - spe_stqd(p, float_zero, out_ptr, 1); + spe_stqd(p, float_zero, out_ptr, 1 * 16); case 2: - spe_stqd(p, float_zero, out_ptr, 2); + spe_stqd(p, float_zero, out_ptr, 2 * 16); case 3: - spe_stqd(p, float_one, out_ptr, 3); + spe_stqd(p, float_one, out_ptr, 3 * 16); } if (float_zero != -1) { @@ -256,6 +258,7 @@ emit_fetch(struct spe_function *p, spe_release_register(p, float_one); } } +#endif void cell_update_vertex_fetch(struct draw_context *draw) diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_shader.c b/src/gallium/drivers/cell/ppu/cell_vertex_shader.c index 2b10c116fa..403cf6d50f 100644 --- a/src/gallium/drivers/cell/ppu/cell_vertex_shader.c +++ b/src/gallium/drivers/cell/ppu/cell_vertex_shader.c @@ -31,7 +31,7 @@ #include "pipe/p_defines.h" #include "pipe/p_context.h" -#include "pipe/p_winsys.h" +#include "pipe/internal/p_winsys_screen.h" #include "util/u_math.h" #include "cell_context.h" diff --git a/src/gallium/drivers/cell/spu/.gitignore b/src/gallium/drivers/cell/spu/.gitignore new file mode 100644 index 0000000000..2be9a2d324 --- /dev/null +++ b/src/gallium/drivers/cell/spu/.gitignore @@ -0,0 +1 @@ +g3d_spu diff --git a/src/gallium/drivers/cell/spu/Makefile b/src/gallium/drivers/cell/spu/Makefile index 1ae0dfb8c1..3cc52301da 100644 --- a/src/gallium/drivers/cell/spu/Makefile +++ b/src/gallium/drivers/cell/spu/Makefile @@ -16,8 +16,10 @@ PROG_SPU_EMBED_O = $(PROG)_spu-embed.o SOURCES = \ - spu_main.c \ + spu_command.c \ spu_dcache.c \ + spu_funcs.c \ + spu_main.c \ spu_per_fragment_op.c \ spu_render.c \ spu_texture.c \ @@ -31,9 +33,10 @@ OLD_SOURCES = \ spu_vertex_shader.c -SPU_OBJECTS = $(SOURCES:.c=.o) \ +SPU_OBJECTS = $(SOURCES:.c=.o) + +SPU_ASM_OUT = $(SOURCES:.c=.s) -SPU_ASM_OUT = $(SOURCES:.c=.s) \ INCLUDE_DIRS = \ -I$(TOP)/src/mesa \ diff --git a/src/gallium/drivers/cell/spu/spu_colorpack.h b/src/gallium/drivers/cell/spu/spu_colorpack.h index fd8dc6ded3..d7ce005524 100644 --- 
a/src/gallium/drivers/cell/spu/spu_colorpack.h +++ b/src/gallium/drivers/cell/spu/spu_colorpack.h @@ -31,6 +31,7 @@ #define SPU_COLORPACK_H +#include <transpose_matrix4x4.h> #include <spu_intrinsics.h> @@ -84,10 +85,10 @@ spu_unpack_B8G8R8A8(uint color) vector unsigned int color_u4 = spu_splats(color); color_u4 = spu_shuffle(color_u4, color_u4, ((vector unsigned char) { - 10, 10, 10, 10, - 5, 5, 5, 5, + 2, 2, 2, 2, + 1, 1, 1, 1, 0, 0, 0, 0, - 15, 15, 15, 15}) ); + 3, 3, 3, 3}) ); return spu_convtf(color_u4, 32); } @@ -98,13 +99,47 @@ spu_unpack_A8R8G8B8(uint color) vector unsigned int color_u4 = spu_splats(color); color_u4 = spu_shuffle(color_u4, color_u4, ((vector unsigned char) { - 5, 5, 5, 5, - 10, 10, 10, 10, - 15, 15, 15, 15, + 1, 1, 1, 1, + 2, 2, 2, 2, + 3, 3, 3, 3, 0, 0, 0, 0}) ); - return spu_convtf(color_u4, 32); } +/** + * \param color_in - array of 32-bit packed ARGB colors + * \param color_out - returns float colors in RRRR, GGGG, BBBB, AAAA order + */ +static INLINE void +spu_unpack_A8R8G8B8_transpose4(const vector unsigned int color_in[4], + vector float color_out[4]) +{ + vector unsigned int c0; + + c0 = spu_shuffle(color_in[0], color_in[0], + ((vector unsigned char) { + 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 0, 0, 0, 0}) ); + color_out[0] = spu_convtf(c0, 32); + + c0 = spu_shuffle(color_in[1], color_in[1], + ((vector unsigned char) { + 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 0, 0, 0, 0}) ); + color_out[1] = spu_convtf(c0, 32); + + c0 = spu_shuffle(color_in[2], color_in[2], + ((vector unsigned char) { + 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 0, 0, 0, 0}) ); + color_out[2] = spu_convtf(c0, 32); + + c0 = spu_shuffle(color_in[3], color_in[3], + ((vector unsigned char) { + 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 0, 0, 0, 0}) ); + color_out[3] = spu_convtf(c0, 32); + + _transpose_matrix4x4(color_out, color_out); +} + + + #endif /* SPU_COLORPACK_H */ diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c new file mode 100644 index 0000000000..5c0179d954 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_command.c @@ -0,0 +1,815 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + + +/** + * SPU command processing code + */ + + +#include <stdio.h> +#include <libmisc.h> + +#include "pipe/p_defines.h" + +#include "spu_command.h" +#include "spu_main.h" +#include "spu_render.h" +#include "spu_per_fragment_op.h" +#include "spu_texture.h" +#include "spu_tile.h" +#include "spu_vertex_shader.h" +#include "spu_dcache.h" +#include "cell/common.h" + + +struct spu_vs_context draw; + + +/** + * Buffers containing dynamically generated SPU code: + */ +static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS] + ALIGN16_ATTRIB; + + + +static INLINE int +align(int value, int alignment) +{ + return (value + alignment - 1) & ~(alignment - 1); +} + + + +/** + * Tell the PPU that this SPU has finished copying a buffer to + * local store and that it may be reused by the PPU. + * This is done by writting a 16-byte batch-buffer-status block back into + * main memory (in cell_context->buffer_status[]). + */ +static void +release_buffer(uint buffer) +{ + /* Evidently, using less than a 16-byte status doesn't work reliably */ + static const vector unsigned int status = {CELL_BUFFER_STATUS_FREE, + CELL_BUFFER_STATUS_FREE, + CELL_BUFFER_STATUS_FREE, + CELL_BUFFER_STATUS_FREE}; + const uint index = 4 * (spu.init.id * CELL_NUM_BUFFERS + buffer); + uint *dst = spu.init.buffer_status + index; + + ASSERT(buffer < CELL_NUM_BUFFERS); + + mfc_put((void *) &status, /* src in local memory */ + (unsigned int) dst, /* dst in main memory */ + sizeof(status), /* size */ + TAG_MISC, /* tag is unimportant */ + 0, /* tid */ + 0 /* rid */); +} + + +/** + * Write CELL_FENCE_SIGNALLED back to the fence status qword in main memory. + * There's a qword of status per SPU. + */ +static void +cmd_fence(struct cell_command_fence *fence_cmd) +{ + static const vector unsigned int status = {CELL_FENCE_SIGNALLED, + CELL_FENCE_SIGNALLED, + CELL_FENCE_SIGNALLED, + CELL_FENCE_SIGNALLED}; + uint *dst = (uint *) fence_cmd->fence; + dst += 4 * spu.init.id; /* main store/memory address, not local store */ + ASSERT_ALIGN16(dst); + mfc_put((void *) &status, /* src in local memory */ + (unsigned int) dst, /* dst in main memory */ + sizeof(status), /* size */ + TAG_FENCE, /* tag */ + 0, /* tid */ + 0 /* rid */); +} + + +static void +cmd_clear_surface(const struct cell_command_clear_surface *clear) +{ + D_PRINTF(CELL_DEBUG_CMD, "CLEAR SURF %u to 0x%08x\n", clear->surface, clear->value); + + if (clear->surface == 0) { + spu.fb.color_clear_value = clear->value; + if (spu.init.debug_flags & CELL_DEBUG_CHECKER) { + uint x = (spu.init.id << 4) | (spu.init.id << 12) | + (spu.init.id << 20) | (spu.init.id << 28); + spu.fb.color_clear_value ^= x; + } + } + else { + spu.fb.depth_clear_value = clear->value; + } + +#define CLEAR_OPT 1 +#if CLEAR_OPT + + /* Simply set all tiles' status to CLEAR. + * When we actually begin rendering into a tile, we'll initialize it to + * the clear value. If any tiles go untouched during the frame, + * really_clear_tiles() will set them to the clear value. + */ + if (clear->surface == 0) { + memset(spu.ctile_status, TILE_STATUS_CLEAR, sizeof(spu.ctile_status)); + } + else { + memset(spu.ztile_status, TILE_STATUS_CLEAR, sizeof(spu.ztile_status)); + } + +#else + + /* + * This path clears the whole framebuffer to the clear color right now. 
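/*
 * Hedged sketch of how the PPU side might wait on the fence written by
 * cmd_fence() above: each SPU writes one 16-byte status qword into the
 * fence's slot, so the waiter can poll the first word of every SPU's slot
 * until it reads the "signalled" value.  The function name, parameter
 * list and the plain volatile spin-poll are assumptions for illustration;
 * the real PPU-side fence code is not shown in this patch hunk.
 */
#include <stdint.h>

static void
wait_fence_signalled(volatile uint32_t status[][4], unsigned num_spus,
                     uint32_t signalled_value)
{
   unsigned i;
   for (i = 0; i < num_spus; i++) {
      while (status[i][0] != signalled_value) {
         /* spin; a real implementation might yield or sleep here */
      }
   }
}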
+ */ + + /* + printf("SPU: %s num=%d w=%d h=%d\n", + __FUNCTION__, num_tiles, spu.fb.width_tiles, spu.fb.height_tiles); + */ + + /* init a single tile to the clear value */ + if (clear->surface == 0) { + clear_c_tile(&spu.ctile); + } + else { + clear_z_tile(&spu.ztile); + } + + /* walk over my tiles, writing the 'clear' tile's data */ + { + const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles; + uint i; + for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) { + uint tx = i % spu.fb.width_tiles; + uint ty = i / spu.fb.width_tiles; + if (clear->surface == 0) + put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0); + else + put_tile(tx, ty, &spu.ztile, TAG_SURFACE_CLEAR, 1); + } + } + + if (spu.init.debug_flags & CELL_DEBUG_SYNC) { + wait_on_mask(1 << TAG_SURFACE_CLEAR); + } + +#endif /* CLEAR_OPT */ + + D_PRINTF(CELL_DEBUG_CMD, "CLEAR SURF done\n"); +} + + +static void +cmd_release_verts(const struct cell_command_release_verts *release) +{ + D_PRINTF(CELL_DEBUG_CMD, "RELEASE VERTS %u\n", release->vertex_buf); + ASSERT(release->vertex_buf != ~0U); + release_buffer(release->vertex_buf); +} + + +/** + * Process a CELL_CMD_STATE_FRAGMENT_OPS command. + * This involves installing new fragment ops SPU code. + * If this function is never called, we'll use a regular C fallback function + * for fragment processing. + */ +static void +cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops) +{ + D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FRAGMENT_OPS\n"); + + /* Copy state info (for fallback case only - this will eventually + * go away when the fallback case goes away) + */ + memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa)); + memcpy(&spu.blend, &fops->blend, sizeof(fops->blend)); + memcpy(&spu.blend_color, &fops->blend_color, sizeof(fops->blend_color)); + + /* Make sure the SPU knows which buffers it's expected to read when + * it's told to pull tiles. + */ + spu.read_depth_stencil = (spu.depth_stencil_alpha.depth.enabled || spu.depth_stencil_alpha.stencil[0].enabled); + + /* If we're forcing the fallback code to be used (for debug purposes), + * install that. Otherwise install the incoming SPU code. + */ + if ((spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) != 0) { + static unsigned int warned = 0; + if (!warned) { + fprintf(stderr, "Cell Warning: using fallback per-fragment code\n"); + warned = 1; + } + /* The following two lines aren't really necessary if you + * know the debug flags won't change during a run, and if you + * know that the function pointers are initialized correctly. + * We set them here to allow a person to change the debug + * flags during a run (from inside a debugger). + */ + spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops; + spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops; + return; + } + + /* Make sure the SPU code buffer is large enough to hold the incoming code. + * Note that we *don't* use align_malloc() and align_free(), because + * those utility functions are *not* available in SPU code. + * */ + if (spu.fragment_ops_code_size < fops->total_code_size) { + if (spu.fragment_ops_code != NULL) { + free(spu.fragment_ops_code); + } + spu.fragment_ops_code_size = fops->total_code_size; + spu.fragment_ops_code = malloc(fops->total_code_size); + if (spu.fragment_ops_code == NULL) { + /* Whoops. 
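/*
 * Minimal sketch of the tile-distribution pattern used in the clear path
 * above: tiles are numbered row-major across the framebuffer and handed
 * out round-robin, so the SPU with a given id touches tiles id, id+n,
 * id+2n, ...  Names and the printf are illustrative only.
 */
#include <stdio.h>

static void
walk_my_tiles(unsigned spu_id, unsigned num_spus,
              unsigned width_tiles, unsigned height_tiles)
{
   const unsigned num_tiles = width_tiles * height_tiles;
   unsigned i;
   for (i = spu_id; i < num_tiles; i += num_spus) {
      unsigned tx = i % width_tiles;   /* column of this tile */
      unsigned ty = i / width_tiles;   /* row of this tile */
      printf("SPU %u handles tile (%u, %u)\n", spu_id, tx, ty);
   }
}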
*/ + fprintf(stderr, "CELL Warning: failed to allocate fragment ops code (%d bytes) - using fallback\n", fops->total_code_size); + spu.fragment_ops_code = NULL; + spu.fragment_ops_code_size = 0; + spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops; + spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops; + return; + } + } + + /* Copy the SPU code from the command buffer to the spu buffer */ + memcpy(spu.fragment_ops_code, fops->code, fops->total_code_size); + + /* Set the pointers for the front-facing and back-facing fragments + * to the specified offsets within the code. Note that if the + * front-facing and back-facing code are the same, they'll have + * the same offset. + */ + spu.fragment_ops[CELL_FACING_FRONT] = (spu_fragment_ops_func) &spu.fragment_ops_code[fops->front_code_index]; + spu.fragment_ops[CELL_FACING_BACK] = (spu_fragment_ops_func) &spu.fragment_ops_code[fops->back_code_index]; +} + +static void +cmd_state_fragment_program(const struct cell_command_fragment_program *fp) +{ + D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FRAGMENT_PROGRAM\n"); + /* Copy SPU code from batch buffer to spu buffer */ + memcpy(spu.fragment_program_code, fp->code, + SPU_MAX_FRAGMENT_PROGRAM_INSTS * 4); +#if 01 + /* Point function pointer at new code */ + spu.fragment_program = (spu_fragment_program_func)spu.fragment_program_code; +#endif +} + + +static uint +cmd_state_fs_constants(const qword *buffer, uint pos) +{ + const uint num_const = spu_extract((vector unsigned int)buffer[pos+1], 0); + const float *constants = (const float *) &buffer[pos+2]; + uint i; + + D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FS_CONSTANTS (%u)\n", num_const); + + /* Expand each float to float[4] for SOA execution */ + for (i = 0; i < num_const; i++) { + D_PRINTF(CELL_DEBUG_CMD, " const[%u] = %f\n", i, constants[i]); + spu.constants[i] = spu_splats(constants[i]); + } + + /* return new buffer pos (in 16-byte words) */ + return pos + 2 + (ROUNDUP16(num_const * sizeof(float)) / 16); +} + + +static void +cmd_state_framebuffer(const struct cell_command_framebuffer *cmd) +{ + D_PRINTF(CELL_DEBUG_CMD, "FRAMEBUFFER: %d x %d at %p, cformat 0x%x zformat 0x%x\n", + cmd->width, + cmd->height, + cmd->color_start, + cmd->color_format, + cmd->depth_format); + + ASSERT_ALIGN16(cmd->color_start); + ASSERT_ALIGN16(cmd->depth_start); + + spu.fb.color_start = cmd->color_start; + spu.fb.depth_start = cmd->depth_start; + spu.fb.color_format = cmd->color_format; + spu.fb.depth_format = cmd->depth_format; + spu.fb.width = cmd->width; + spu.fb.height = cmd->height; + spu.fb.width_tiles = (spu.fb.width + TILE_SIZE - 1) / TILE_SIZE; + spu.fb.height_tiles = (spu.fb.height + TILE_SIZE - 1) / TILE_SIZE; + + switch (spu.fb.depth_format) { + case PIPE_FORMAT_Z32_UNORM: + spu.fb.zsize = 4; + spu.fb.zscale = (float) 0xffffffffu; + break; + case PIPE_FORMAT_Z24S8_UNORM: + case PIPE_FORMAT_S8Z24_UNORM: + case PIPE_FORMAT_Z24X8_UNORM: + case PIPE_FORMAT_X8Z24_UNORM: + spu.fb.zsize = 4; + spu.fb.zscale = (float) 0x00ffffffu; + break; + case PIPE_FORMAT_Z16_UNORM: + spu.fb.zsize = 2; + spu.fb.zscale = (float) 0xffffu; + break; + default: + spu.fb.zsize = 0; + break; + } +} + + +/** + * Tex texture mask_s/t and scale_s/t fields depend on the texture size and + * sampler wrap modes. 
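/*
 * Sketch of the SOA constant expansion performed by cmd_state_fs_constants()
 * above: each scalar constant is replicated into a 4-wide vector so the
 * fragment program can process four pixels at once.  Plain C stands in for
 * spu_splats() here, and the return value mirrors the
 * ROUNDUP16(...) / 16 quadword arithmetic in the command handler.
 */
#include <stddef.h>

typedef struct { float v[4]; } vec4f;

static size_t
expand_constants(const float *scalars, size_t num_const, vec4f *out)
{
   size_t i, j;
   for (i = 0; i < num_const; i++)
      for (j = 0; j < 4; j++)
         out[i].v[j] = scalars[i];        /* splat scalar into all lanes */

   /* payload size rounded up to whole 16-byte quadwords */
   return (num_const * sizeof(float) + 15) / 16;
}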
+ */ +static void +update_tex_masks(struct spu_texture *texture, + const struct pipe_sampler_state *sampler) +{ + uint i; + + for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) { + int width = texture->level[i].width; + int height = texture->level[i].height; + + if (sampler->wrap_s == PIPE_TEX_WRAP_REPEAT) + texture->level[i].mask_s = spu_splats(width - 1); + else + texture->level[i].mask_s = spu_splats(~0); + + if (sampler->wrap_t == PIPE_TEX_WRAP_REPEAT) + texture->level[i].mask_t = spu_splats(height - 1); + else + texture->level[i].mask_t = spu_splats(~0); + + if (sampler->normalized_coords) { + texture->level[i].scale_s = spu_splats((float) width); + texture->level[i].scale_t = spu_splats((float) height); + } + else { + texture->level[i].scale_s = spu_splats(1.0f); + texture->level[i].scale_t = spu_splats(1.0f); + } + } +} + + +static void +cmd_state_sampler(const struct cell_command_sampler *sampler) +{ + uint unit = sampler->unit; + + D_PRINTF(CELL_DEBUG_CMD, "SAMPLER [%u]\n", unit); + + spu.sampler[unit] = sampler->state; + + switch (spu.sampler[unit].min_img_filter) { + case PIPE_TEX_FILTER_LINEAR: + spu.min_sample_texture_2d[unit] = sample_texture_2d_bilinear; + break; + case PIPE_TEX_FILTER_ANISO: + /* fall-through, for now */ + case PIPE_TEX_FILTER_NEAREST: + spu.min_sample_texture_2d[unit] = sample_texture_2d_nearest; + break; + default: + ASSERT(0); + } + + switch (spu.sampler[sampler->unit].mag_img_filter) { + case PIPE_TEX_FILTER_LINEAR: + spu.mag_sample_texture_2d[unit] = sample_texture_2d_bilinear; + break; + case PIPE_TEX_FILTER_ANISO: + /* fall-through, for now */ + case PIPE_TEX_FILTER_NEAREST: + spu.mag_sample_texture_2d[unit] = sample_texture_2d_nearest; + break; + default: + ASSERT(0); + } + + switch (spu.sampler[sampler->unit].min_mip_filter) { + case PIPE_TEX_MIPFILTER_NEAREST: + case PIPE_TEX_MIPFILTER_LINEAR: + spu.sample_texture_2d[unit] = sample_texture_2d_lod; + break; + case PIPE_TEX_MIPFILTER_NONE: + spu.sample_texture_2d[unit] = spu.mag_sample_texture_2d[unit]; + break; + default: + ASSERT(0); + } + + update_tex_masks(&spu.texture[unit], &spu.sampler[unit]); +} + + +static void +cmd_state_texture(const struct cell_command_texture *texture) +{ + const uint unit = texture->unit; + uint i; + + D_PRINTF(CELL_DEBUG_CMD, "TEXTURE [%u]\n", texture->unit); + + spu.texture[unit].max_level = 0; + spu.texture[unit].target = texture->target; + + for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) { + uint width = texture->width[i]; + uint height = texture->height[i]; + uint depth = texture->depth[i]; + + D_PRINTF(CELL_DEBUG_CMD, " LEVEL %u: at %p size[0] %u x %u\n", i, + texture->start[i], texture->width[i], texture->height[i]); + + spu.texture[unit].level[i].start = texture->start[i]; + spu.texture[unit].level[i].width = width; + spu.texture[unit].level[i].height = height; + spu.texture[unit].level[i].depth = depth; + + spu.texture[unit].level[i].tiles_per_row = + (width + TILE_SIZE - 1) / TILE_SIZE; + + spu.texture[unit].level[i].bytes_per_image = + 4 * align(width, TILE_SIZE) * align(height, TILE_SIZE) * depth; + + spu.texture[unit].level[i].max_s = spu_splats((int) width - 1); + spu.texture[unit].level[i].max_t = spu_splats((int) height - 1); + + if (texture->start[i]) + spu.texture[unit].max_level = i; + } + + update_tex_masks(&spu.texture[unit], &spu.sampler[unit]); +} + + +static void +cmd_state_vertex_info(const struct vertex_info *vinfo) +{ + D_PRINTF(CELL_DEBUG_CMD, "VERTEX_INFO num_attribs=%u\n", vinfo->num_attribs); + ASSERT(vinfo->num_attribs >= 1); + 
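/*
 * Scalar sketch of how the per-level scale/mask values computed in
 * update_tex_masks() above are meant to be used: normalized coordinates
 * are scaled up to texel space, then either wrapped with (size - 1) as a
 * bitmask (REPEAT, which assumes power-of-two level sizes) or clamped to
 * the level's last texel.  This is an illustration, not the driver's
 * actual sampling code.
 */
static int
texel_coord(float s, int size, int repeat, int normalized)
{
   float scale = normalized ? (float) size : 1.0f;
   int i = (int) (s * scale);

   if (repeat)
      return i & (size - 1);       /* wrap; size must be a power of two */

   if (i < 0)                      /* otherwise clamp to [0, size-1] */
      return 0;
   return i > size - 1 ? size - 1 : i;
}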
ASSERT(vinfo->num_attribs <= 8); + memcpy(&spu.vertex_info, vinfo, sizeof(*vinfo)); +} + + +static void +cmd_state_vs_array_info(const struct cell_array_info *vs_info) +{ + const unsigned attr = vs_info->attr; + + ASSERT(attr < PIPE_MAX_ATTRIBS); + draw.vertex_fetch.src_ptr[attr] = vs_info->base; + draw.vertex_fetch.pitch[attr] = vs_info->pitch; + draw.vertex_fetch.size[attr] = vs_info->size; + draw.vertex_fetch.code_offset[attr] = vs_info->function_offset; + draw.vertex_fetch.dirty = 1; +} + + +static void +cmd_state_attrib_fetch(const struct cell_attribute_fetch_code *code) +{ + mfc_get(attribute_fetch_code_buffer, + (unsigned int) code->base, /* src */ + code->size, + TAG_BATCH_BUFFER, + 0, /* tid */ + 0 /* rid */); + wait_on_mask(1 << TAG_BATCH_BUFFER); + + draw.vertex_fetch.code = attribute_fetch_code_buffer; +} + + +static void +cmd_finish(void) +{ + D_PRINTF(CELL_DEBUG_CMD, "FINISH\n"); + really_clear_tiles(0); + /* wait for all outstanding DMAs to finish */ + mfc_write_tag_mask(~0); + mfc_read_tag_status_all(); + /* send mbox message to PPU */ + spu_write_out_mbox(CELL_CMD_FINISH); +} + + +/** + * Execute a batch of commands which was sent to us by the PPU. + * See the cell_emit_state.c code to see where the commands come from. + * + * The opcode param encodes the location of the buffer and its size. + */ +static void +cmd_batch(uint opcode) +{ + const uint buf = (opcode >> 8) & 0xff; + uint size = (opcode >> 16); + qword buffer[CELL_BUFFER_SIZE / 16] ALIGN16_ATTRIB; + const unsigned usize = ROUNDUP16(size) / sizeof(buffer[0]); + uint pos; + + D_PRINTF(CELL_DEBUG_CMD, "BATCH buffer %u, len %u, from %p\n", + buf, size, spu.init.buffers[buf]); + + ASSERT((opcode & CELL_CMD_OPCODE_MASK) == CELL_CMD_BATCH); + + ASSERT_ALIGN16(spu.init.buffers[buf]); + + size = ROUNDUP16(size); + + ASSERT_ALIGN16(spu.init.buffers[buf]); + + mfc_get(buffer, /* dest */ + (unsigned int) spu.init.buffers[buf], /* src */ + size, + TAG_BATCH_BUFFER, + 0, /* tid */ + 0 /* rid */); + wait_on_mask(1 << TAG_BATCH_BUFFER); + + /* Tell PPU we're done copying the buffer to local store */ + D_PRINTF(CELL_DEBUG_CMD, "release batch buf %u\n", buf); + release_buffer(buf); + + /* + * Loop over commands in the batch buffer + */ + for (pos = 0; pos < usize; /* no incr */) { + switch (si_to_uint(buffer[pos])) { + /* + * rendering commands + */ + case CELL_CMD_CLEAR_SURFACE: + { + struct cell_command_clear_surface *clr + = (struct cell_command_clear_surface *) &buffer[pos]; + cmd_clear_surface(clr); + pos += sizeof(*clr) / 16; + } + break; + case CELL_CMD_RENDER: + { + struct cell_command_render *render + = (struct cell_command_render *) &buffer[pos]; + uint pos_incr; + cmd_render(render, &pos_incr); + pos += ((pos_incr+1)&~1) / 2; // should 'fix' cmd_render return + } + break; + /* + * state-update commands + */ + case CELL_CMD_STATE_FRAMEBUFFER: + { + struct cell_command_framebuffer *fb + = (struct cell_command_framebuffer *) &buffer[pos]; + cmd_state_framebuffer(fb); + pos += sizeof(*fb) / 16; + } + break; + case CELL_CMD_STATE_FRAGMENT_OPS: + { + struct cell_command_fragment_ops *fops + = (struct cell_command_fragment_ops *) &buffer[pos]; + cmd_state_fragment_ops(fops); + /* This is a variant-sized command */ + pos += ROUNDUP16(sizeof(*fops) + fops->total_code_size) / 16; + } + break; + case CELL_CMD_STATE_FRAGMENT_PROGRAM: + { + struct cell_command_fragment_program *fp + = (struct cell_command_fragment_program *) &buffer[pos]; + cmd_state_fragment_program(fp); + pos += sizeof(*fp) / 16; + } + break; + case 
CELL_CMD_STATE_FS_CONSTANTS: + pos = cmd_state_fs_constants(buffer, pos); + break; + case CELL_CMD_STATE_RASTERIZER: + { + struct cell_command_rasterizer *rast = + (struct cell_command_rasterizer *) &buffer[pos]; + spu.rasterizer = rast->rasterizer; + pos += sizeof(*rast) / 16; + } + break; + case CELL_CMD_STATE_SAMPLER: + { + struct cell_command_sampler *sampler + = (struct cell_command_sampler *) &buffer[pos]; + cmd_state_sampler(sampler); + pos += sizeof(*sampler) / 16; + } + break; + case CELL_CMD_STATE_TEXTURE: + { + struct cell_command_texture *texture + = (struct cell_command_texture *) &buffer[pos]; + cmd_state_texture(texture); + pos += sizeof(*texture) / 16; + } + break; + case CELL_CMD_STATE_VERTEX_INFO: + cmd_state_vertex_info((struct vertex_info *) &buffer[pos+1]); + pos += 1 + ROUNDUP16(sizeof(struct vertex_info)) / 16; + break; + case CELL_CMD_STATE_VIEWPORT: + (void) memcpy(& draw.viewport, &buffer[pos+1], + sizeof(struct pipe_viewport_state)); + pos += 1 + ROUNDUP16(sizeof(struct pipe_viewport_state)) / 16; + break; + case CELL_CMD_STATE_UNIFORMS: + draw.constants = (const float (*)[4]) (uintptr_t)spu_extract((vector unsigned int)buffer[pos+1],0); + pos += 2; + break; + case CELL_CMD_STATE_VS_ARRAY_INFO: + cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]); + pos += 1 + ROUNDUP16(sizeof(struct cell_array_info)) / 16; + break; + case CELL_CMD_STATE_BIND_VS: +#if 0 + spu_bind_vertex_shader(&draw, + (struct cell_shader_info *) &buffer[pos+1]); +#endif + pos += 1 + ROUNDUP16(sizeof(struct cell_shader_info)) / 16; + break; + case CELL_CMD_STATE_ATTRIB_FETCH: + cmd_state_attrib_fetch((struct cell_attribute_fetch_code *) + &buffer[pos+1]); + pos += 1 + ROUNDUP16(sizeof(struct cell_attribute_fetch_code)) / 16; + break; + /* + * misc commands + */ + case CELL_CMD_FINISH: + cmd_finish(); + pos += 1; + break; + case CELL_CMD_FENCE: + { + struct cell_command_fence *fence_cmd = + (struct cell_command_fence *) &buffer[pos]; + cmd_fence(fence_cmd); + pos += sizeof(*fence_cmd) / 16; + } + break; + case CELL_CMD_RELEASE_VERTS: + { + struct cell_command_release_verts *release + = (struct cell_command_release_verts *) &buffer[pos]; + cmd_release_verts(release); + pos += sizeof(*release) / 16; + } + break; + case CELL_CMD_FLUSH_BUFFER_RANGE: { + struct cell_buffer_range *br = (struct cell_buffer_range *) + &buffer[pos+1]; + + spu_dcache_mark_dirty((unsigned) br->base, br->size); + pos += 1 + ROUNDUP16(sizeof(struct cell_buffer_range)) / 16; + break; + } + default: + printf("SPU %u: bad opcode: 0x%x\n", spu.init.id, si_to_uint(buffer[pos])); + ASSERT(0); + break; + } + } + + D_PRINTF(CELL_DEBUG_CMD, "BATCH complete\n"); +} + + +#define PERF 0 + + +/** + * Main loop for SPEs: Get a command, execute it, repeat. 
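/*
 * Sketch of the decoding pattern used by cmd_batch() above: the batch is
 * treated as an array of 16-byte quadwords, every command starts on a
 * quadword boundary, and the cursor advances by the command's size in
 * quadwords, rounded up for variant-length commands such as the fragment
 * ops command.  The opcode values and struct layouts below are invented
 * for illustration and do not correspond to real CELL_CMD_* commands.
 */
#include <stdint.h>
#include <string.h>

#define QWORD 16

struct fixed_cmd   { uint32_t opcode[4]; uint32_t value; uint32_t pad[3]; };
struct variant_cmd { uint32_t opcode[4]; uint32_t payload_size; uint32_t pad[3];
                     /* variable-length payload follows */ };

static void
decode_batch(const uint8_t *buf, unsigned size_in_qwords)
{
   unsigned pos = 0;
   while (pos < size_in_qwords) {
      uint32_t opcode;
      memcpy(&opcode, buf + pos * QWORD, sizeof(opcode));
      if (opcode == 1) {                       /* hypothetical fixed-size command */
         pos += sizeof(struct fixed_cmd) / QWORD;
      }
      else if (opcode == 2) {                  /* hypothetical variant-size command */
         struct variant_cmd cmd;
         memcpy(&cmd, buf + pos * QWORD, sizeof(cmd));
         pos += (sizeof(cmd) + cmd.payload_size + QWORD - 1) / QWORD;
      }
      else {
         break;                                /* unknown opcode: stop decoding */
      }
   }
}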
+ */ +void +command_loop(void) +{ + int exitFlag = 0; + uint t0, t1; + + D_PRINTF(CELL_DEBUG_CMD, "Enter command loop\n"); + + while (!exitFlag) { + unsigned opcode; + + D_PRINTF(CELL_DEBUG_CMD, "Wait for cmd...\n"); + + if (PERF) + spu_write_decrementer(~0); + + /* read/wait from mailbox */ + opcode = (unsigned int) spu_read_in_mbox(); + D_PRINTF(CELL_DEBUG_CMD, "got cmd 0x%x\n", opcode); + + if (PERF) + t0 = spu_read_decrementer(); + + switch (opcode & CELL_CMD_OPCODE_MASK) { + case CELL_CMD_EXIT: + D_PRINTF(CELL_DEBUG_CMD, "EXIT\n"); + exitFlag = 1; + break; + case CELL_CMD_VS_EXECUTE: +#if 0 + spu_execute_vertex_shader(&draw, &cmd.vs); +#endif + break; + case CELL_CMD_BATCH: + cmd_batch(opcode); + break; + default: + printf("Bad opcode 0x%x!\n", opcode & CELL_CMD_OPCODE_MASK); + } + + if (PERF) { + t1 = spu_read_decrementer(); + printf("wait mbox time: %gms batch time: %gms\n", + (~0u - t0) * spu.init.inv_timebase, + (t0 - t1) * spu.init.inv_timebase); + } + } + + D_PRINTF(CELL_DEBUG_CMD, "Exit command loop\n"); + + if (spu.init.debug_flags & CELL_DEBUG_CACHE) + spu_dcache_report(); +} + +/* Initialize this module; we manage the fragment ops buffer here. */ +void +spu_command_init(void) +{ + /* Install default/fallback fragment processing function. + * This will normally be overriden by a code-gen'd function + * unless CELL_FORCE_FRAGMENT_OPS_FALLBACK is set. + */ + spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops; + spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops; + + /* Set up the basic empty buffer for code-gen'ed fragment ops */ + spu.fragment_ops_code = NULL; + spu.fragment_ops_code_size = 0; +} + +void +spu_command_close(void) +{ + /* Deallocate the code-gen buffer for fragment ops, and reset the + * fragment ops functions to their initial setting (just to leave + * things in a good state). + */ + if (spu.fragment_ops_code != NULL) { + free(spu.fragment_ops_code); + } + spu_command_init(); +} diff --git a/src/gallium/drivers/cell/spu/spu_command.h b/src/gallium/drivers/cell/spu/spu_command.h new file mode 100644 index 0000000000..83dcdade28 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_command.h @@ -0,0 +1,35 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
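/*
 * The PERF path in command_loop() above times work with the SPU
 * decrementer, which counts down from the value written to it.  This
 * standalone sketch shows only the arithmetic: elapsed ticks are
 * "start - end", and inv_timebase is assumed to convert ticks to
 * milliseconds, matching the printf() in the loop.  The variable below
 * merely stands in for the hardware register.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t fake_decrementer = ~0u;   /* stand-in for spu_read_decrementer() */

static void
report_elapsed(uint32_t start_ticks, uint32_t end_ticks, float inv_timebase)
{
   /* the counter counts down, so start >= end and the difference is the cost */
   printf("elapsed: %gms\n", (start_ticks - end_ticks) * inv_timebase);
}

int main(void)
{
   uint32_t t0 = fake_decrementer;
   uint32_t t1 = fake_decrementer - 1000;   /* pretend 1000 ticks elapsed */
   report_elapsed(t0, t1, 0.0001f);         /* made-up timebase factor */
   return 0;
}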
+ * + **************************************************************************/ + +extern void +command_loop(void); + +extern void +spu_command_init(void); + +extern void +spu_command_close(void); diff --git a/src/gallium/drivers/cell/spu/spu_dcache.c b/src/gallium/drivers/cell/spu/spu_dcache.c index 167404cdc5..a6d67634fd 100644 --- a/src/gallium/drivers/cell/spu/spu_dcache.c +++ b/src/gallium/drivers/cell/spu/spu_dcache.c @@ -36,7 +36,9 @@ #define CACHE_SET_TAGID(set) (((set) & 0x03) + TAG_DCACHE0) #define CACHE_LOG2NNWAY 2 #define CACHE_LOG2NSETS 6 -/*#define CACHE_STATS 1*/ +#ifdef DEBUG +#define CACHE_STATS 1 +#endif #include <cache-api.h> /* Yes folks, this is ugly. diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c new file mode 100644 index 0000000000..ff3d609d25 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_funcs.c @@ -0,0 +1,173 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +/** + * SPU functions accessed by shaders. 
+ * + * Authors: Brian Paul + */ + + +#include <string.h> +#include <libmisc.h> +#include <math.h> +#include <cos14_v.h> +#include <sin14_v.h> +#include <simdmath/exp2f4.h> +#include <simdmath/log2f4.h> +#include <simdmath/powf4.h> + +#include "cell/common.h" +#include "spu_main.h" +#include "spu_funcs.h" +#include "spu_texture.h" + + +/** For "return"-ing four vectors */ +struct vec_4x4 +{ + vector float v[4]; +}; + + +static vector float +spu_cos(vector float x) +{ + return _cos14_v(x); +} + +static vector float +spu_sin(vector float x) +{ + return _sin14_v(x); +} + +static vector float +spu_pow(vector float x, vector float y) +{ + return _powf4(x, y); +} + +static vector float +spu_exp2(vector float x) +{ + return _exp2f4(x); +} + +static vector float +spu_log2(vector float x) +{ + return _log2f4(x); +} + + +static struct vec_4x4 +spu_tex_2d(vector float s, vector float t, vector float r, vector float q, + unsigned unit) +{ + struct vec_4x4 colors; + (void) r; + (void) q; + spu.sample_texture_2d[unit](s, t, unit, 0, 0, colors.v); + return colors; +} + +static struct vec_4x4 +spu_tex_3d(vector float s, vector float t, vector float r, vector float q, + unsigned unit) +{ + struct vec_4x4 colors; + (void) r; + (void) q; + spu.sample_texture_2d[unit](s, t, unit, 0, 0, colors.v); + return colors; +} + +static struct vec_4x4 +spu_tex_cube(vector float s, vector float t, vector float r, vector float q, + unsigned unit) +{ + struct vec_4x4 colors; + (void) q; + sample_texture_cube(s, t, r, unit, colors.v); + return colors; +} + + +/** + * Add named function to list of "exported" functions that will be + * made available to the PPU-hosted code generator. + */ +static void +export_func(struct cell_spu_function_info *spu_functions, + const char *name, void *addr) +{ + uint n = spu_functions->num; + ASSERT(strlen(name) < 16); + strcpy(spu_functions->names[n], name); + spu_functions->addrs[n] = (uint) addr; + spu_functions->num++; + ASSERT(spu_functions->num <= 16); +} + + +/** + * Return info about the SPU's function to the PPU / main memory. + * The PPU needs to know the address of some SPU-side functions so + * that we can generate shader code with function calls. + */ +void +return_function_info(void) +{ + struct cell_spu_function_info funcs ALIGN16_ATTRIB; + int tag = TAG_MISC; + + ASSERT(sizeof(funcs) == 256); /* must be multiple of 16 bytes */ + + funcs.num = 0; + export_func(&funcs, "spu_cos", &spu_cos); + export_func(&funcs, "spu_sin", &spu_sin); + export_func(&funcs, "spu_pow", &spu_pow); + export_func(&funcs, "spu_exp2", &spu_exp2); + export_func(&funcs, "spu_log2", &spu_log2); + export_func(&funcs, "spu_tex_2d", &spu_tex_2d); + export_func(&funcs, "spu_tex_3d", &spu_tex_3d); + export_func(&funcs, "spu_tex_cube", &spu_tex_cube); + + /* Send the function info back to the PPU / main memory */ + mfc_put((void *) &funcs, /* src in local store */ + (unsigned int) spu.init.spu_functions, /* dst in main memory */ + sizeof(funcs), /* bytes */ + tag, + 0, /* tid */ + 0 /* rid */); + wait_on_mask(1 << tag); +} + + + diff --git a/src/gallium/drivers/cell/ppu/cell_winsys.c b/src/gallium/drivers/cell/spu/spu_funcs.h index d570bbd2f9..3adb6ae99f 100644 --- a/src/gallium/drivers/cell/ppu/cell_winsys.c +++ b/src/gallium/drivers/cell/spu/spu_funcs.h @@ -1,6 +1,6 @@ /************************************************************************** * - * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. * All Rights Reserved. 
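/*
 * Sketch of the name-to-address table that return_function_info() above
 * sends back to the PPU: once the PPU has it, the code generator can look
 * up an SPU-side function by name and emit a call to its local-store
 * address.  The struct layout, limits and lookup helper here are
 * simplified assumptions, not the real cell_spu_function_info definition.
 */
#include <stdint.h>
#include <string.h>

#define MAX_FUNCS     16
#define MAX_FUNC_NAME 16

struct func_table {
   unsigned num;
   char     names[MAX_FUNCS][MAX_FUNC_NAME];
   uint32_t addrs[MAX_FUNCS];
};

static uint32_t
lookup_func(const struct func_table *t, const char *name)
{
   unsigned i;
   for (i = 0; i < t->num; i++) {
      if (strcmp(t->names[i], name) == 0)
         return t->addrs[i];      /* local-store address of the function */
   }
   return 0;                      /* not found */
}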
* * Permission is hereby granted, free of charge, to any person obtaining a @@ -25,16 +25,11 @@ * **************************************************************************/ +#ifndef SPU_FUNCS_H +#define SPU_FUNCS_H -#include "util/u_memory.h" -#include "cell_winsys.h" +extern void +return_function_info(void); +#endif -struct cell_winsys * -cell_get_winsys(uint format) -{ - struct cell_winsys *cws = CALLOC_STRUCT(cell_winsys); - if (cws) - cws->preferredFormat = format; - return cws; -} diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c index 78260c4259..97c86d194d 100644 --- a/src/gallium/drivers/cell/spu/spu_main.c +++ b/src/gallium/drivers/cell/spu/spu_main.c @@ -32,16 +32,15 @@ #include <stdio.h> #include <libmisc.h> +#include "pipe/p_defines.h" + +#include "spu_funcs.h" +#include "spu_command.h" #include "spu_main.h" -#include "spu_render.h" #include "spu_per_fragment_op.h" #include "spu_texture.h" -#include "spu_tile.h" //#include "spu_test.h" -#include "spu_vertex_shader.h" -#include "spu_dcache.h" #include "cell/common.h" -#include "pipe/p_defines.h" /* @@ -50,600 +49,8 @@ helpful headers: /opt/cell/sdk/usr/include/libmisc.h */ -boolean Debug = FALSE; - struct spu_global spu; -struct spu_vs_context draw; - - -/** - * Buffers containing dynamically generated SPU code: - */ -static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS] - ALIGN16_ATTRIB; - - - -/** - * Tell the PPU that this SPU has finished copying a buffer to - * local store and that it may be reused by the PPU. - * This is done by writting a 16-byte batch-buffer-status block back into - * main memory (in cell_context->buffer_status[]). - */ -static void -release_buffer(uint buffer) -{ - /* Evidently, using less than a 16-byte status doesn't work reliably */ - static const uint status[4] ALIGN16_ATTRIB - = {CELL_BUFFER_STATUS_FREE, 0, 0, 0}; - - const uint index = 4 * (spu.init.id * CELL_NUM_BUFFERS + buffer); - uint *dst = spu.init.buffer_status + index; - - ASSERT(buffer < CELL_NUM_BUFFERS); - - mfc_put((void *) &status, /* src in local memory */ - (unsigned int) dst, /* dst in main memory */ - sizeof(status), /* size */ - TAG_MISC, /* tag is unimportant */ - 0, /* tid */ - 0 /* rid */); -} - - -/** - * For tiles whose status is TILE_STATUS_CLEAR, write solid-filled - * tiles back to the main framebuffer. 
- */ -static void -really_clear_tiles(uint surfaceIndex) -{ - const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles; - uint i; - - if (surfaceIndex == 0) { - clear_c_tile(&spu.ctile); - - for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) { - uint tx = i % spu.fb.width_tiles; - uint ty = i / spu.fb.width_tiles; - if (spu.ctile_status[ty][tx] == TILE_STATUS_CLEAR) { - put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0); - } - } - } - else { - clear_z_tile(&spu.ztile); - - for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) { - uint tx = i % spu.fb.width_tiles; - uint ty = i / spu.fb.width_tiles; - if (spu.ztile_status[ty][tx] == TILE_STATUS_CLEAR) - put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 1); - } - } - -#if 0 - wait_on_mask(1 << TAG_SURFACE_CLEAR); -#endif -} - - -static void -cmd_clear_surface(const struct cell_command_clear_surface *clear) -{ - if (Debug) - printf("SPU %u: CLEAR SURF %u to 0x%08x\n", spu.init.id, - clear->surface, clear->value); - - if (clear->surface == 0) { - spu.fb.color_clear_value = clear->value; - if (spu.init.debug_flags & CELL_DEBUG_CHECKER) { - uint x = (spu.init.id << 4) | (spu.init.id << 12) | - (spu.init.id << 20) | (spu.init.id << 28); - spu.fb.color_clear_value ^= x; - } - } - else { - spu.fb.depth_clear_value = clear->value; - } - -#define CLEAR_OPT 1 -#if CLEAR_OPT - - /* Simply set all tiles' status to CLEAR. - * When we actually begin rendering into a tile, we'll initialize it to - * the clear value. If any tiles go untouched during the frame, - * really_clear_tiles() will set them to the clear value. - */ - if (clear->surface == 0) { - memset(spu.ctile_status, TILE_STATUS_CLEAR, sizeof(spu.ctile_status)); - } - else { - memset(spu.ztile_status, TILE_STATUS_CLEAR, sizeof(spu.ztile_status)); - } - -#else - - /* - * This path clears the whole framebuffer to the clear color right now. - */ - - /* - printf("SPU: %s num=%d w=%d h=%d\n", - __FUNCTION__, num_tiles, spu.fb.width_tiles, spu.fb.height_tiles); - */ - - /* init a single tile to the clear value */ - if (clear->surface == 0) { - clear_c_tile(&spu.ctile); - } - else { - clear_z_tile(&spu.ztile); - } - - /* walk over my tiles, writing the 'clear' tile's data */ - { - const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles; - uint i; - for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) { - uint tx = i % spu.fb.width_tiles; - uint ty = i / spu.fb.width_tiles; - if (clear->surface == 0) - put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0); - else - put_tile(tx, ty, &spu.ztile, TAG_SURFACE_CLEAR, 1); - } - } - - if (spu.init.debug_flags & CELL_DEBUG_SYNC) { - wait_on_mask(1 << TAG_SURFACE_CLEAR); - } - -#endif /* CLEAR_OPT */ - - if (Debug) - printf("SPU %u: CLEAR SURF done\n", spu.init.id); -} - - -static void -cmd_release_verts(const struct cell_command_release_verts *release) -{ - if (Debug) - printf("SPU %u: RELEASE VERTS %u\n", - spu.init.id, release->vertex_buf); - ASSERT(release->vertex_buf != ~0U); - release_buffer(release->vertex_buf); -} - - -/** - * Process a CELL_CMD_STATE_FRAGMENT_OPS command. - * This involves installing new fragment ops SPU code. - * If this function is never called, we'll use a regular C fallback function - * for fragment processing. 
- */ -static void -cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops) -{ - if (Debug) - printf("SPU %u: CMD_STATE_FRAGMENT_OPS\n", spu.init.id); - /* Copy SPU code from batch buffer to spu buffer */ - memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4); - /* Copy state info (for fallback case only) */ - memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa)); - memcpy(&spu.blend, &fops->blend, sizeof(fops->blend)); - - /* Point function pointer at new code */ - spu.fragment_ops = (spu_fragment_ops_func) spu.fragment_ops_code; - - spu.read_depth = spu.depth_stencil_alpha.depth.enabled; - spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled; -} - - -static void -cmd_state_fragment_program(const struct cell_command_fragment_program *fp) -{ - if (Debug) - printf("SPU %u: CMD_STATE_FRAGMENT_PROGRAM\n", spu.init.id); - /* Copy SPU code from batch buffer to spu buffer */ - memcpy(spu.fragment_program_code, fp->code, - SPU_MAX_FRAGMENT_PROGRAM_INSTS * 4); -#if 01 - /* Point function pointer at new code */ - spu.fragment_program = (spu_fragment_program_func)spu.fragment_program_code; -#endif -} - - -static void -cmd_state_framebuffer(const struct cell_command_framebuffer *cmd) -{ - if (Debug) - printf("SPU %u: FRAMEBUFFER: %d x %d at %p, cformat 0x%x zformat 0x%x\n", - spu.init.id, - cmd->width, - cmd->height, - cmd->color_start, - cmd->color_format, - cmd->depth_format); - - ASSERT_ALIGN16(cmd->color_start); - ASSERT_ALIGN16(cmd->depth_start); - - spu.fb.color_start = cmd->color_start; - spu.fb.depth_start = cmd->depth_start; - spu.fb.color_format = cmd->color_format; - spu.fb.depth_format = cmd->depth_format; - spu.fb.width = cmd->width; - spu.fb.height = cmd->height; - spu.fb.width_tiles = (spu.fb.width + TILE_SIZE - 1) / TILE_SIZE; - spu.fb.height_tiles = (spu.fb.height + TILE_SIZE - 1) / TILE_SIZE; - - switch (spu.fb.depth_format) { - case PIPE_FORMAT_Z32_UNORM: - spu.fb.zsize = 4; - spu.fb.zscale = (float) 0xffffffffu; - break; - case PIPE_FORMAT_Z24S8_UNORM: - case PIPE_FORMAT_S8Z24_UNORM: - case PIPE_FORMAT_Z24X8_UNORM: - case PIPE_FORMAT_X8Z24_UNORM: - spu.fb.zsize = 4; - spu.fb.zscale = (float) 0x00ffffffu; - break; - case PIPE_FORMAT_Z16_UNORM: - spu.fb.zsize = 2; - spu.fb.zscale = (float) 0xffffu; - break; - default: - spu.fb.zsize = 0; - break; - } -} - - -static void -cmd_state_sampler(const struct cell_command_sampler *sampler) -{ - if (Debug) - printf("SPU %u: SAMPLER [%u]\n", - spu.init.id, sampler->unit); - - spu.sampler[sampler->unit] = sampler->state; - if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) - spu.sample_texture[sampler->unit] = sample_texture_bilinear; - else - spu.sample_texture[sampler->unit] = sample_texture_nearest; -} - - -static void -cmd_state_texture(const struct cell_command_texture *texture) -{ - const uint unit = texture->unit; - const uint width = texture->width; - const uint height = texture->height; - - if (Debug) { - printf("SPU %u: TEXTURE [%u] at %p size %u x %u\n", spu.init.id, - texture->unit, texture->start, - texture->width, texture->height); - } - - spu.texture[unit].start = texture->start; - spu.texture[unit].width = width; - spu.texture[unit].height = height; - - spu.texture[unit].tiles_per_row = width / TILE_SIZE; - - spu.texture[unit].tex_size = (vector float) { width, height, 0.0, 0.0}; - spu.texture[unit].tex_size_mask = (vector unsigned int) - { width - 1, height - 1, 0, 0 }; - spu.texture[unit].tex_size_x_mask = spu_splats(width - 1); - 
spu.texture[unit].tex_size_y_mask = spu_splats(height - 1); -} - - -static void -cmd_state_vertex_info(const struct vertex_info *vinfo) -{ - if (Debug) { - printf("SPU %u: VERTEX_INFO num_attribs=%u\n", spu.init.id, - vinfo->num_attribs); - } - ASSERT(vinfo->num_attribs >= 1); - ASSERT(vinfo->num_attribs <= 8); - memcpy(&spu.vertex_info, vinfo, sizeof(*vinfo)); -} - - -static void -cmd_state_vs_array_info(const struct cell_array_info *vs_info) -{ - const unsigned attr = vs_info->attr; - - ASSERT(attr < PIPE_MAX_ATTRIBS); - draw.vertex_fetch.src_ptr[attr] = vs_info->base; - draw.vertex_fetch.pitch[attr] = vs_info->pitch; - draw.vertex_fetch.size[attr] = vs_info->size; - draw.vertex_fetch.code_offset[attr] = vs_info->function_offset; - draw.vertex_fetch.dirty = 1; -} - - -static void -cmd_state_attrib_fetch(const struct cell_attribute_fetch_code *code) -{ - mfc_get(attribute_fetch_code_buffer, - (unsigned int) code->base, /* src */ - code->size, - TAG_BATCH_BUFFER, - 0, /* tid */ - 0 /* rid */); - wait_on_mask(1 << TAG_BATCH_BUFFER); - - draw.vertex_fetch.code = attribute_fetch_code_buffer; -} - - -static void -cmd_finish(void) -{ - if (Debug) - printf("SPU %u: FINISH\n", spu.init.id); - really_clear_tiles(0); - /* wait for all outstanding DMAs to finish */ - mfc_write_tag_mask(~0); - mfc_read_tag_status_all(); - /* send mbox message to PPU */ - spu_write_out_mbox(CELL_CMD_FINISH); -} - - -/** - * Execute a batch of commands which was sent to us by the PPU. - * See the cell_emit_state.c code to see where the commands come from. - * - * The opcode param encodes the location of the buffer and its size. - */ -static void -cmd_batch(uint opcode) -{ - const uint buf = (opcode >> 8) & 0xff; - uint size = (opcode >> 16); - uint64_t buffer[CELL_BUFFER_SIZE / 8] ALIGN16_ATTRIB; - const unsigned usize = size / sizeof(buffer[0]); - uint pos; - - if (Debug) - printf("SPU %u: BATCH buffer %u, len %u, from %p\n", - spu.init.id, buf, size, spu.init.buffers[buf]); - - ASSERT((opcode & CELL_CMD_OPCODE_MASK) == CELL_CMD_BATCH); - - ASSERT_ALIGN16(spu.init.buffers[buf]); - - size = ROUNDUP16(size); - - ASSERT_ALIGN16(spu.init.buffers[buf]); - - mfc_get(buffer, /* dest */ - (unsigned int) spu.init.buffers[buf], /* src */ - size, - TAG_BATCH_BUFFER, - 0, /* tid */ - 0 /* rid */); - wait_on_mask(1 << TAG_BATCH_BUFFER); - - /* Tell PPU we're done copying the buffer to local store */ - if (Debug) - printf("SPU %u: release batch buf %u\n", spu.init.id, buf); - release_buffer(buf); - - /* - * Loop over commands in the batch buffer - */ - for (pos = 0; pos < usize; /* no incr */) { - switch (buffer[pos]) { - /* - * rendering commands - */ - case CELL_CMD_CLEAR_SURFACE: - { - struct cell_command_clear_surface *clr - = (struct cell_command_clear_surface *) &buffer[pos]; - cmd_clear_surface(clr); - pos += sizeof(*clr) / 8; - } - break; - case CELL_CMD_RENDER: - { - struct cell_command_render *render - = (struct cell_command_render *) &buffer[pos]; - uint pos_incr; - cmd_render(render, &pos_incr); - pos += pos_incr; - } - break; - /* - * state-update commands - */ - case CELL_CMD_STATE_FRAMEBUFFER: - { - struct cell_command_framebuffer *fb - = (struct cell_command_framebuffer *) &buffer[pos]; - cmd_state_framebuffer(fb); - pos += sizeof(*fb) / 8; - } - break; - case CELL_CMD_STATE_FRAGMENT_OPS: - { - struct cell_command_fragment_ops *fops - = (struct cell_command_fragment_ops *) &buffer[pos]; - cmd_state_fragment_ops(fops); - pos += sizeof(*fops) / 8; - } - break; - case CELL_CMD_STATE_FRAGMENT_PROGRAM: - { - struct 
cell_command_fragment_program *fp - = (struct cell_command_fragment_program *) &buffer[pos]; - cmd_state_fragment_program(fp); - pos += sizeof(*fp) / 8; - } - break; - case CELL_CMD_STATE_SAMPLER: - { - struct cell_command_sampler *sampler - = (struct cell_command_sampler *) &buffer[pos]; - cmd_state_sampler(sampler); - pos += sizeof(*sampler) / 8; - } - break; - case CELL_CMD_STATE_TEXTURE: - { - struct cell_command_texture *texture - = (struct cell_command_texture *) &buffer[pos]; - cmd_state_texture(texture); - pos += sizeof(*texture) / 8; - } - break; - case CELL_CMD_STATE_VERTEX_INFO: - cmd_state_vertex_info((struct vertex_info *) &buffer[pos+1]); - pos += (1 + ROUNDUP8(sizeof(struct vertex_info)) / 8); - break; - case CELL_CMD_STATE_VIEWPORT: - (void) memcpy(& draw.viewport, &buffer[pos+1], - sizeof(struct pipe_viewport_state)); - pos += (1 + ROUNDUP8(sizeof(struct pipe_viewport_state)) / 8); - break; - case CELL_CMD_STATE_UNIFORMS: - draw.constants = (const float (*)[4]) (uintptr_t) buffer[pos + 1]; - pos += 2; - break; - case CELL_CMD_STATE_VS_ARRAY_INFO: - cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]); - pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8); - break; - case CELL_CMD_STATE_BIND_VS: -#if 0 - spu_bind_vertex_shader(&draw, - (struct cell_shader_info *) &buffer[pos+1]); -#endif - pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8); - break; - case CELL_CMD_STATE_ATTRIB_FETCH: - cmd_state_attrib_fetch((struct cell_attribute_fetch_code *) - &buffer[pos+1]); - pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8); - break; - /* - * misc commands - */ - case CELL_CMD_FINISH: - cmd_finish(); - pos += 1; - break; - case CELL_CMD_RELEASE_VERTS: - { - struct cell_command_release_verts *release - = (struct cell_command_release_verts *) &buffer[pos]; - cmd_release_verts(release); - pos += sizeof(*release) / 8; - } - break; - case CELL_CMD_FLUSH_BUFFER_RANGE: { - struct cell_buffer_range *br = (struct cell_buffer_range *) - &buffer[pos+1]; - - spu_dcache_mark_dirty((unsigned) br->base, br->size); - pos += (1 + ROUNDUP8(sizeof(struct cell_buffer_range)) / 8); - break; - } - default: - printf("SPU %u: bad opcode: 0x%llx\n", spu.init.id, buffer[pos]); - ASSERT(0); - break; - } - } - - if (Debug) - printf("SPU %u: BATCH complete\n", spu.init.id); -} - - -/** - * Temporary/simple main loop for SPEs: Get a command, execute it, repeat. 
- */ -static void -main_loop(void) -{ - struct cell_command cmd; - int exitFlag = 0; - - if (Debug) - printf("SPU %u: Enter main loop\n", spu.init.id); - - ASSERT((sizeof(struct cell_command) & 0xf) == 0); - ASSERT_ALIGN16(&cmd); - - while (!exitFlag) { - unsigned opcode; - int tag = 0; - - if (Debug) - printf("SPU %u: Wait for cmd...\n", spu.init.id); - - /* read/wait from mailbox */ - opcode = (unsigned int) spu_read_in_mbox(); - - if (Debug) - printf("SPU %u: got cmd 0x%x\n", spu.init.id, opcode); - - /* command payload */ - mfc_get(&cmd, /* dest */ - (unsigned int) spu.init.cmd, /* src */ - sizeof(struct cell_command), /* bytes */ - tag, - 0, /* tid */ - 0 /* rid */); - wait_on_mask( 1 << tag ); - - /* - * NOTE: most commands should be contained in a batch buffer - */ - - switch (opcode & CELL_CMD_OPCODE_MASK) { - case CELL_CMD_EXIT: - if (Debug) - printf("SPU %u: EXIT\n", spu.init.id); - exitFlag = 1; - break; - case CELL_CMD_VS_EXECUTE: -#if 0 - spu_execute_vertex_shader(&draw, &cmd.vs); -#endif - break; - case CELL_CMD_BATCH: - cmd_batch(opcode); - break; - default: - printf("Bad opcode!\n"); - } - - } - - if (Debug) - printf("SPU %u: Exit main loop\n", spu.init.id); - - spu_dcache_report(); -} - - static void one_time_init(void) @@ -651,15 +58,8 @@ one_time_init(void) memset(spu.ctile_status, TILE_STATUS_DEFINED, sizeof(spu.ctile_status)); memset(spu.ztile_status, TILE_STATUS_DEFINED, sizeof(spu.ztile_status)); invalidate_tex_cache(); - - /* Install default/fallback fragment processing function. - * This will normally be overriden by a code-gen'd function. - */ - spu.fragment_ops = spu_fallback_fragment_ops; } - - /* In some versions of the SDK the SPE main takes 'unsigned long' as a * parameter. In others it takes 'unsigned long long'. Use a define to * select between the two. @@ -682,12 +82,16 @@ main(main_param_t speid, main_param_t argp) ASSERT(sizeof(tile_t) == TILE_SIZE * TILE_SIZE * 4); ASSERT(sizeof(struct cell_command_render) % 8 == 0); + ASSERT(sizeof(struct cell_command_fragment_ops) % 8 == 0); + ASSERT(((unsigned long) &spu.fragment_program_code) % 8 == 0); one_time_init(); + spu_command_init(); - if (Debug) - printf("SPU: main() speid=%lu\n", (unsigned long) speid); + D_PRINTF(CELL_DEBUG_CMD, "main() speid=%lu\n", (unsigned long) speid); + D_PRINTF(CELL_DEBUG_FRAGMENT_OP_FALLBACK, "using fragment op fallback\n"); + /* get initialization data */ mfc_get(&spu.init, /* dest */ (unsigned int) argp, /* src */ sizeof(struct cell_init_info), /* bytes */ @@ -696,12 +100,18 @@ main(main_param_t speid, main_param_t argp) 0 /* rid */); wait_on_mask( 1 << tag ); + if (spu.init.id == 0) { + return_function_info(); + } + #if 0 if (spu.init.id==0) - spu_test_misc(); + spu_test_misc(spu.init.id); #endif - main_loop(); + command_loop(); + + spu_command_close(); return 0; } diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h index 2c7b625840..33767e7c51 100644 --- a/src/gallium/drivers/cell/spu/spu_main.h +++ b/src/gallium/drivers/cell/spu/spu_main.h @@ -36,9 +36,18 @@ #include "pipe/p_state.h" - -#define MAX_WIDTH 1024 -#define MAX_HEIGHT 1024 +#if DEBUG +/* These debug macros use the unusual construction ", ##__VA_ARGS__" + * which expands to the expected comma + args if variadic arguments + * are supplied, but swallows the comma if there are no variadic + * arguments (which avoids syntax errors that would otherwise occur). + */ +#define D_PRINTF(flag, format,...) 
\ + if (spu.init.debug_flags & (flag)) \ + printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__) +#else +#define D_PRINTF(...) +#endif /** @@ -61,8 +70,11 @@ typedef union { /** Function for sampling textures */ -typedef vector float (*spu_sample_texture_func)(uint unit, - vector float texcoord); +typedef void (*spu_sample_texture_2d_func)(vector float s, + vector float t, + uint unit, uint level, uint face, + vector float colors[4]); + /** Function for performing per-fragment ops */ typedef void (*spu_fragment_ops_func)(uint x, uint y, @@ -76,9 +88,9 @@ typedef void (*spu_fragment_ops_func)(uint x, uint y, vector unsigned int mask); /** Function for running fragment program */ -typedef void (*spu_fragment_program_func)(vector float *inputs, - vector float *outputs, - vector float *constants); +typedef vector unsigned int (*spu_fragment_program_func)(vector float *inputs, + vector float *outputs, + vector float *constants); struct spu_framebuffer @@ -98,15 +110,27 @@ struct spu_framebuffer } ALIGN16_ATTRIB; -struct spu_texture +/** per-texture level info */ +struct spu_texture_level { void *start; - ushort width, height; + ushort width, height, depth; ushort tiles_per_row; - vector float tex_size; - vector unsigned int tex_size_mask; /**< == int(size - 1) */ - vector unsigned int tex_size_x_mask; /**< == int(size - 1) */ - vector unsigned int tex_size_y_mask; /**< == int(size - 1) */ + uint bytes_per_image; + /** texcoord scale factors */ + vector float scale_s, scale_t, scale_r; + /** texcoord masks (if REPEAT then size-1, else ~0) */ + vector signed int mask_s, mask_t, mask_r; + /** texcoord clamp limits */ + vector signed int max_s, max_t, max_r; +} ALIGN16_ATTRIB; + + +struct spu_texture +{ + struct spu_texture_level level[CELL_MAX_TEXTURE_LEVELS]; + uint max_level; + uint target; /**< PIPE_TEXTURE_x */ } ALIGN16_ATTRIB; @@ -124,7 +148,9 @@ struct spu_global struct spu_framebuffer fb; struct pipe_depth_stencil_alpha_state depth_stencil_alpha; struct pipe_blend_state blend; + struct pipe_blend_color blend_color; struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS]; + struct pipe_rasterizer_state rasterizer; struct spu_texture texture[PIPE_MAX_SAMPLERS]; struct vertex_info vertex_info; @@ -133,39 +159,38 @@ struct spu_global tile_t ztile ALIGN16_ATTRIB; /** Read depth/stencil tiles? 
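/*
 * Small standalone demo of the ", ##__VA_ARGS__" idiom described in the
 * D_PRINTF comment above: the ## pastes away the leading comma when the
 * macro is invoked with only a format string, so both call forms compile.
 * This relies on the GNU C extension used throughout this driver.
 */
#include <stdio.h>

#define DBG(format, ...) \
   printf("debug: " format, ##__VA_ARGS__)

int main(void)
{
   DBG("no arguments\n");            /* trailing comma is swallowed */
   DBG("value = %d\n", 42);          /* normal variadic use */
   return 0;
}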
*/ - boolean read_depth; - boolean read_stencil; + boolean read_depth_stencil; /** Current tiles' status */ ubyte cur_ctile_status, cur_ztile_status; /** Status of all tiles in framebuffer */ - ubyte ctile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB; - ubyte ztile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB; + ubyte ctile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB; + ubyte ztile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB; - /** Current fragment ops machine code */ - uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS]; - /** Current fragment ops function */ - spu_fragment_ops_func fragment_ops; + /** Current fragment ops machine code, at 8-byte boundary */ + uint *fragment_ops_code; + uint fragment_ops_code_size; + /** Current fragment ops functions, 0 = frontfacing, 1 = backfacing */ + spu_fragment_ops_func fragment_ops[2]; - /** Current fragment program machine code */ - uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS]; + /** Current fragment program machine code, at 8-byte boundary */ + uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS] ALIGN8_ATTRIB; /** Current fragment ops function */ spu_fragment_program_func fragment_program; /** Current texture sampler function */ - spu_sample_texture_func sample_texture[CELL_MAX_SAMPLERS]; + spu_sample_texture_2d_func sample_texture_2d[CELL_MAX_SAMPLERS]; + spu_sample_texture_2d_func min_sample_texture_2d[CELL_MAX_SAMPLERS]; + spu_sample_texture_2d_func mag_sample_texture_2d[CELL_MAX_SAMPLERS]; - /** Fragment program constants (XXX preliminary/used) */ -#define MAX_CONSTANTS 32 - vector float constants[MAX_CONSTANTS]; + /** Fragment program constants */ + vector float constants[4 * CELL_MAX_CONSTANTS]; } ALIGN16_ATTRIB; extern struct spu_global spu; -extern boolean Debug; - @@ -184,7 +209,7 @@ extern boolean Debug; #define TAG_DCACHE1 21 #define TAG_DCACHE2 22 #define TAG_DCACHE3 23 - +#define TAG_FENCE 24 static INLINE void diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c index 03dd547845..eba9f95cf1 100644 --- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c +++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c @@ -40,6 +40,24 @@ #define LINEAR_QUAD_LAYOUT 1 +static INLINE vector float +spu_min(vector float a, vector float b) +{ + vector unsigned int m; + m = spu_cmpgt(a, b); /* m = a > b ? ~0 : 0 */ + return spu_sel(a, b, m); +} + + +static INLINE vector float +spu_max(vector float a, vector float b) +{ + vector unsigned int m; + m = spu_cmpgt(a, b); /* m = a > b ? ~0 : 0 */ + return spu_sel(b, a, m); +} + + /** * Called by rasterizer for each quad after the shader has run. Do * all the per-fragment operations including alpha test, z test, @@ -60,11 +78,14 @@ spu_fallback_fragment_ops(uint x, uint y, vector unsigned int mask) { vector float frag_aos[4]; - unsigned int c0, c1, c2, c3; + unsigned int fbc0, fbc1, fbc2, fbc3 ; /* framebuffer/tile colors */ + unsigned int fragc0, fragc1, fragc2, fragc3; /* fragment colors */ - /* do alpha test */ + /* + * Do alpha test + */ if (spu.depth_stencil_alpha.alpha.enabled) { - vector float ref = spu_splats(spu.depth_stencil_alpha.alpha.ref); + vector float ref = spu_splats(spu.depth_stencil_alpha.alpha.ref_value); vector unsigned int amask; switch (spu.depth_stencil_alpha.alpha.func) { @@ -102,7 +123,10 @@ spu_fallback_fragment_ops(uint x, uint y, mask = spu_and(mask, amask); } - /* Z and/or stencil testing... 
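/*
 * Scalar sketch of the compare-and-select idiom behind spu_min()/spu_max()
 * above: the comparison produces an all-ones or all-zero mask, and the
 * select keeps bits from one operand where the mask is set and from the
 * other where it is clear.  Integer code stands in for the SPU float
 * intrinsics purely for illustration.
 */
#include <stdint.h>

static int32_t
select_bits(int32_t a, int32_t b, uint32_t mask)
{
   /* bits of b where mask is set, bits of a elsewhere (like spu_sel(a, b, m)) */
   return (int32_t) (((uint32_t) a & ~mask) | ((uint32_t) b & mask));
}

static int32_t
branchless_min(int32_t a, int32_t b)
{
   uint32_t m = (a > b) ? 0xffffffffu : 0x0u;   /* mask is all-ones when a > b */
   return select_bits(a, b, m);                 /* take b where a > b, else a */
}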
*/ + + /* + * Z and/or stencil testing... + */ if (spu.depth_stencil_alpha.depth.enabled || spu.depth_stencil_alpha.stencil[0].enabled) { @@ -178,6 +202,32 @@ spu_fallback_fragment_ops(uint x, uint y, } } + + /* + * If we'll need the current framebuffer/tile colors for blending + * or logicop or colormask, fetch them now. + */ + if (spu.blend.blend_enable || + spu.blend.logicop_enable || + spu.blend.colormask != 0xf) { + +#if LINEAR_QUAD_LAYOUT /* See comments/diagram below */ + fbc0 = colorTile->ui[y][x*2+0]; + fbc1 = colorTile->ui[y][x*2+1]; + fbc2 = colorTile->ui[y][x*2+2]; + fbc3 = colorTile->ui[y][x*2+3]; +#else + fbc0 = colorTile->ui[y+0][x+0]; + fbc1 = colorTile->ui[y+0][x+1]; + fbc2 = colorTile->ui[y+1][x+0]; + fbc3 = colorTile->ui[y+1][x+1]; +#endif + } + + + /* + * Do blending + */ if (spu.blend.blend_enable) { /* blending terms, misc regs */ vector float term1r, term1g, term1b, term1a; @@ -186,43 +236,30 @@ spu_fallback_fragment_ops(uint x, uint y, vector float fbRGBA[4]; /* current framebuffer colors */ - /* get colors from framebuffer/tile */ + /* convert framebuffer colors from packed int to vector float */ { - vector float fc[4]; - uint c0, c1, c2, c3; - -#if LINEAR_QUAD_LAYOUT /* See comments/diagram below */ - c0 = colorTile->ui[y][x*2+0]; - c1 = colorTile->ui[y][x*2+1]; - c2 = colorTile->ui[y][x*2+2]; - c3 = colorTile->ui[y][x*2+3]; -#else - c0 = colorTile->ui[y+0][x+0]; - c1 = colorTile->ui[y+0][x+1]; - c2 = colorTile->ui[y+1][x+0]; - c3 = colorTile->ui[y+1][x+1]; -#endif + vector float temp[4]; /* float colors in AOS form */ switch (spu.fb.color_format) { case PIPE_FORMAT_B8G8R8A8_UNORM: - fc[0] = spu_unpack_B8G8R8A8(c0); - fc[1] = spu_unpack_B8G8R8A8(c1); - fc[2] = spu_unpack_B8G8R8A8(c2); - fc[3] = spu_unpack_B8G8R8A8(c3); + temp[0] = spu_unpack_B8G8R8A8(fbc0); + temp[1] = spu_unpack_B8G8R8A8(fbc1); + temp[2] = spu_unpack_B8G8R8A8(fbc2); + temp[3] = spu_unpack_B8G8R8A8(fbc3); break; case PIPE_FORMAT_A8R8G8B8_UNORM: - fc[0] = spu_unpack_A8R8G8B8(c0); - fc[1] = spu_unpack_A8R8G8B8(c1); - fc[2] = spu_unpack_A8R8G8B8(c2); - fc[3] = spu_unpack_A8R8G8B8(c3); + temp[0] = spu_unpack_A8R8G8B8(fbc0); + temp[1] = spu_unpack_A8R8G8B8(fbc1); + temp[2] = spu_unpack_A8R8G8B8(fbc2); + temp[3] = spu_unpack_A8R8G8B8(fbc3); break; default: ASSERT(0); } - _transpose_matrix4x4(fbRGBA, fc); + _transpose_matrix4x4(fbRGBA, temp); /* fbRGBA = transpose(temp) */ } /* - * Compute Src RGB terms + * Compute Src RGB terms (fragment color * factor) */ switch (spu.blend.rgb_src_factor) { case PIPE_BLENDFACTOR_ONE: @@ -245,13 +282,33 @@ spu_fallback_fragment_ops(uint x, uint y, term1g = spu_mul(fragG, fragA); term1b = spu_mul(fragB, fragA); break; + case PIPE_BLENDFACTOR_DST_COLOR: + term1r = spu_mul(fragR, fbRGBA[0]); + term1g = spu_mul(fragG, fbRGBA[1]); + term1b = spu_mul(fragB, fbRGBA[1]); + break; + case PIPE_BLENDFACTOR_DST_ALPHA: + term1r = spu_mul(fragR, fbRGBA[3]); + term1g = spu_mul(fragG, fbRGBA[3]); + term1b = spu_mul(fragB, fbRGBA[3]); + break; + case PIPE_BLENDFACTOR_CONST_COLOR: + term1r = spu_mul(fragR, spu_splats(spu.blend_color.color[0])); + term1g = spu_mul(fragG, spu_splats(spu.blend_color.color[1])); + term1b = spu_mul(fragB, spu_splats(spu.blend_color.color[2])); + break; + case PIPE_BLENDFACTOR_CONST_ALPHA: + term1r = spu_mul(fragR, spu_splats(spu.blend_color.color[3])); + term1g = spu_mul(fragG, spu_splats(spu.blend_color.color[3])); + term1b = spu_mul(fragB, spu_splats(spu.blend_color.color[3])); + break; /* XXX more cases */ default: ASSERT(0); } /* - * Compute Src Alpha 
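/*
 * One-channel scalar sketch of the blending math being filled in above:
 * each term is "color * factor", with the factor drawn from the source
 * alpha, the destination (framebuffer) alpha, or the constant blend
 * color, and the two terms are then combined (addition shown here as the
 * common case).  This mirrors the structure of the switch statements, not
 * their exact factor coverage; the enum names are illustrative.
 */
enum blend_factor {
   F_ONE, F_ZERO, F_SRC_ALPHA, F_INV_SRC_ALPHA, F_DST_ALPHA, F_CONST
};

static float
apply_factor(float color, enum blend_factor f,
             float src_alpha, float dst_alpha, float const_value)
{
   switch (f) {
   case F_ONE:           return color;
   case F_ZERO:          return 0.0f;
   case F_SRC_ALPHA:     return color * src_alpha;
   case F_INV_SRC_ALPHA: return color * (1.0f - src_alpha);
   case F_DST_ALPHA:     return color * dst_alpha;
   case F_CONST:         return color * const_value;
   }
   return color;
}

static float
blend_add(float src, enum blend_factor sf, float dst, enum blend_factor df,
          float src_alpha, float dst_alpha, float const_value)
{
   return apply_factor(src, sf, src_alpha, dst_alpha, const_value) +
          apply_factor(dst, df, src_alpha, dst_alpha, const_value);
}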
term + * Compute Src Alpha term (fragment alpha * factor) */ switch (spu.blend.alpha_src_factor) { case PIPE_BLENDFACTOR_ONE: @@ -263,19 +320,29 @@ spu_fallback_fragment_ops(uint x, uint y, case PIPE_BLENDFACTOR_SRC_ALPHA: term1a = spu_mul(fragA, fragA); break; + case PIPE_BLENDFACTOR_DST_COLOR: + /* fall-through */ + case PIPE_BLENDFACTOR_DST_ALPHA: + term1a = spu_mul(fragA, fbRGBA[3]); + break; + case PIPE_BLENDFACTOR_CONST_COLOR: + /* fall-through */ + case PIPE_BLENDFACTOR_CONST_ALPHA: + term1a = spu_mul(fragA, spu_splats(spu.blend_color.color[3])); + break; /* XXX more cases */ default: ASSERT(0); } /* - * Compute Dest RGB terms + * Compute Dest RGB terms (framebuffer color * factor) */ switch (spu.blend.rgb_dst_factor) { case PIPE_BLENDFACTOR_ONE: - term2r = fragR; - term2g = fragG; - term2b = fragB; + term2r = fbRGBA[0]; + term2g = fbRGBA[1]; + term2b = fbRGBA[2]; break; case PIPE_BLENDFACTOR_ZERO: term2r = @@ -299,17 +366,37 @@ spu_fallback_fragment_ops(uint x, uint y, term2g = spu_mul(fbRGBA[1], tmp); term2b = spu_mul(fbRGBA[2], tmp); break; - /* XXX more cases */ + case PIPE_BLENDFACTOR_DST_COLOR: + term2r = spu_mul(fbRGBA[0], fbRGBA[0]); + term2g = spu_mul(fbRGBA[1], fbRGBA[1]); + term2b = spu_mul(fbRGBA[2], fbRGBA[2]); + break; + case PIPE_BLENDFACTOR_DST_ALPHA: + term2r = spu_mul(fbRGBA[0], fbRGBA[3]); + term2g = spu_mul(fbRGBA[1], fbRGBA[3]); + term2b = spu_mul(fbRGBA[2], fbRGBA[3]); + break; + case PIPE_BLENDFACTOR_CONST_COLOR: + term2r = spu_mul(fbRGBA[0], spu_splats(spu.blend_color.color[0])); + term2g = spu_mul(fbRGBA[1], spu_splats(spu.blend_color.color[1])); + term2b = spu_mul(fbRGBA[2], spu_splats(spu.blend_color.color[2])); + break; + case PIPE_BLENDFACTOR_CONST_ALPHA: + term2r = spu_mul(fbRGBA[0], spu_splats(spu.blend_color.color[3])); + term2g = spu_mul(fbRGBA[1], spu_splats(spu.blend_color.color[3])); + term2b = spu_mul(fbRGBA[2], spu_splats(spu.blend_color.color[3])); + break; + /* XXX more cases */ default: ASSERT(0); } /* - * Compute Dest Alpha term + * Compute Dest Alpha term (framebuffer alpha * factor) */ switch (spu.blend.alpha_dst_factor) { case PIPE_BLENDFACTOR_ONE: - term2a = fragA; + term2a = fbRGBA[3]; break; case PIPE_BLENDFACTOR_SRC_COLOR: term2a = spu_splats(0.0f); @@ -322,6 +409,16 @@ spu_fallback_fragment_ops(uint x, uint y, tmp = spu_sub(one, fragA); term2a = spu_mul(fbRGBA[3], tmp); break; + case PIPE_BLENDFACTOR_DST_COLOR: + /* fall-through */ + case PIPE_BLENDFACTOR_DST_ALPHA: + term2a = spu_mul(fbRGBA[3], fbRGBA[3]); + break; + case PIPE_BLENDFACTOR_CONST_COLOR: + /* fall-through */ + case PIPE_BLENDFACTOR_CONST_ALPHA: + term2a = spu_mul(fbRGBA[3], spu_splats(spu.blend_color.color[3])); + break; /* XXX more cases */ default: ASSERT(0); @@ -341,7 +438,21 @@ spu_fallback_fragment_ops(uint x, uint y, fragG = spu_sub(term1g, term2g); fragB = spu_sub(term1b, term2b); break; - /* XXX more cases */ + case PIPE_BLEND_REVERSE_SUBTRACT: + fragR = spu_sub(term2r, term1r); + fragG = spu_sub(term2g, term1g); + fragB = spu_sub(term2b, term1b); + break; + case PIPE_BLEND_MIN: + fragR = spu_min(term1r, term2r); + fragG = spu_min(term1g, term2g); + fragB = spu_min(term1b, term2b); + break; + case PIPE_BLEND_MAX: + fragR = spu_max(term1r, term2r); + fragG = spu_max(term1g, term2g); + fragB = spu_max(term1b, term2b); + break; default: ASSERT(0); } @@ -356,7 +467,15 @@ spu_fallback_fragment_ops(uint x, uint y, case PIPE_BLEND_SUBTRACT: fragA = spu_sub(term1a, term2a); break; - /* XXX more cases */ + case PIPE_BLEND_REVERSE_SUBTRACT: + fragA = spu_sub(term2a,
term1a); + break; + case PIPE_BLEND_MIN: + fragA = spu_min(term1a, term2a); + break; + case PIPE_BLEND_MAX: + fragA = spu_max(term1a, term2a); + break; default: ASSERT(0); } @@ -384,21 +503,20 @@ spu_fallback_fragment_ops(uint x, uint y, #endif /* - * Pack float colors into 32-bit RGBA words. + * Pack fragment float colors into 32-bit RGBA words. */ switch (spu.fb.color_format) { case PIPE_FORMAT_A8R8G8B8_UNORM: - c0 = spu_pack_A8R8G8B8(frag_aos[0]); - c1 = spu_pack_A8R8G8B8(frag_aos[1]); - c2 = spu_pack_A8R8G8B8(frag_aos[2]); - c3 = spu_pack_A8R8G8B8(frag_aos[3]); + fragc0 = spu_pack_A8R8G8B8(frag_aos[0]); + fragc1 = spu_pack_A8R8G8B8(frag_aos[1]); + fragc2 = spu_pack_A8R8G8B8(frag_aos[2]); + fragc3 = spu_pack_A8R8G8B8(frag_aos[3]); break; - case PIPE_FORMAT_B8G8R8A8_UNORM: - c0 = spu_pack_B8G8R8A8(frag_aos[0]); - c1 = spu_pack_B8G8R8A8(frag_aos[1]); - c2 = spu_pack_B8G8R8A8(frag_aos[2]); - c3 = spu_pack_B8G8R8A8(frag_aos[3]); + fragc0 = spu_pack_B8G8R8A8(frag_aos[0]); + fragc1 = spu_pack_B8G8R8A8(frag_aos[1]); + fragc2 = spu_pack_B8G8R8A8(frag_aos[2]); + fragc3 = spu_pack_B8G8R8A8(frag_aos[3]); break; default: fprintf(stderr, "SPU: Bad pixel format in spu_default_fragment_ops\n"); @@ -407,20 +525,57 @@ spu_fallback_fragment_ops(uint x, uint y, /* - * Color masking + * Do color masking */ if (spu.blend.colormask != 0xf) { - /* XXX to do */ - /* apply color mask to 32-bit packed colors */ + uint cmask = 0x0; /* each byte corresponds to a color channel */ + + /* Form bitmask depending on color buffer format and colormask bits */ + switch (spu.fb.color_format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + if (spu.blend.colormask & PIPE_MASK_R) + cmask |= 0x00ff0000; /* red */ + if (spu.blend.colormask & PIPE_MASK_G) + cmask |= 0x0000ff00; /* green */ + if (spu.blend.colormask & PIPE_MASK_B) + cmask |= 0x000000ff; /* blue */ + if (spu.blend.colormask & PIPE_MASK_A) + cmask |= 0xff000000; /* alpha */ + break; + case PIPE_FORMAT_B8G8R8A8_UNORM: + if (spu.blend.colormask & PIPE_MASK_R) + cmask |= 0x0000ff00; /* red */ + if (spu.blend.colormask & PIPE_MASK_G) + cmask |= 0x00ff0000; /* green */ + if (spu.blend.colormask & PIPE_MASK_B) + cmask |= 0xff000000; /* blue */ + if (spu.blend.colormask & PIPE_MASK_A) + cmask |= 0x000000ff; /* alpha */ + break; + default: + ASSERT(0); + } + + /* + * Apply color mask to the 32-bit packed colors. + * if (cmask[i]) + * frag color[i] = frag color[i]; + * else + * frag color[i] = framebuffer color[i]; + */ + fragc0 = (fragc0 & cmask) | (fbc0 & ~cmask); + fragc1 = (fragc1 & cmask) | (fbc1 & ~cmask); + fragc2 = (fragc2 & cmask) | (fbc2 & ~cmask); + fragc3 = (fragc3 & cmask) | (fbc3 & ~cmask); } /* - * Logic Ops + * Do logic ops */ if (spu.blend.logicop_enable) { /* XXX to do */ - /* apply logicop to 32-bit packed colors */ + /* apply logicop to 32-bit packed colors (fragcx and fbcx) */ } @@ -431,45 +586,46 @@ spu_fallback_fragment_ops(uint x, uint y, spu.cur_ctile_status = TILE_STATUS_DIRTY; } else { + /* write no fragments */ return; } /* - * Write new quad colors to the framebuffer/tile. + * Write new fragment/quad colors to the framebuffer/tile. * Only write pixels where the corresponding mask word is set. */ #if LINEAR_QUAD_LAYOUT /* * Quad layout: * +--+--+--+--+ - * |p0|p1|p2|p3| + * |p0|p1|p2|p3|... 
* +--+--+--+--+ */ if (spu_extract(mask, 0)) - colorTile->ui[y][x*2] = c0; + colorTile->ui[y][x*2] = fragc0; if (spu_extract(mask, 1)) - colorTile->ui[y][x*2+1] = c1; + colorTile->ui[y][x*2+1] = fragc1; if (spu_extract(mask, 2)) - colorTile->ui[y][x*2+2] = c2; + colorTile->ui[y][x*2+2] = fragc2; if (spu_extract(mask, 3)) - colorTile->ui[y][x*2+3] = c3; + colorTile->ui[y][x*2+3] = fragc3; #else /* * Quad layout: * +--+--+ - * |p0|p1| + * |p0|p1|... * +--+--+ - * |p2|p3| + * |p2|p3|... * +--+--+ */ if (spu_extract(mask, 0)) - colorTile->ui[y+0][x+0] = c0; + colorTile->ui[y+0][x+0] = fragc0; if (spu_extract(mask, 1)) - colorTile->ui[y+0][x+1] = c1; + colorTile->ui[y+0][x+1] = fragc1; if (spu_extract(mask, 2)) - colorTile->ui[y+1][x+0] = c2; + colorTile->ui[y+1][x+0] = fragc2; if (spu_extract(mask, 3)) - colorTile->ui[y+1][x+1] = c3; + colorTile->ui[y+1][x+1] = fragc3; #endif } diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c index 305dc98881..7c225e2f27 100644 --- a/src/gallium/drivers/cell/spu/spu_render.c +++ b/src/gallium/drivers/cell/spu/spu_render.c @@ -98,7 +98,7 @@ my_tile(uint tx, uint ty) static INLINE void get_cz_tiles(uint tx, uint ty) { - if (spu.read_depth) { + if (spu.read_depth_stencil) { if (spu.cur_ztile_status != TILE_STATUS_CLEAR) { //printf("SPU %u: getting Z tile %u, %u\n", spu.init.id, tx, ty); get_tile(tx, ty, &spu.ztile, TAG_READ_TILE_Z, 1); @@ -153,7 +153,7 @@ static INLINE void wait_put_cz_tiles(void) { wait_on_mask(1 << TAG_WRITE_TILE_COLOR); - if (spu.read_depth) { + if (spu.read_depth_stencil) { wait_on_mask(1 << TAG_WRITE_TILE_Z); } } @@ -175,22 +175,14 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr) const ubyte *vertices; const ushort *indexes; uint i, j; + uint num_tiles; - - if (Debug) { - printf("SPU %u: RENDER prim %u, num_vert=%u num_ind=%u " - "inline_vert=%u\n", - spu.init.id, - render->prim_type, - render->num_verts, - render->num_indexes, - render->inline_verts); - - /* - printf(" bound: %g, %g .. %g, %g\n", - render->xmin, render->ymin, render->xmax, render->ymax); - */ - } + D_PRINTF(CELL_DEBUG_CMD, + "RENDER prim=%u num_vert=%u num_ind=%u inline_vert=%u\n", + render->prim_type, + render->num_verts, + render->num_indexes, + render->inline_verts); ASSERT(sizeof(*render) % 4 == 0); ASSERT(total_vertex_bytes % 16 == 0); @@ -251,6 +243,8 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr) wait_on_mask(1 << TAG_SURFACE_CLEAR); /* XXX temporary */ + num_tiles = 0; + /** ** loop over tiles, rendering tris **/ @@ -264,6 +258,8 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr) if (!my_tile(tx, ty)) continue; + num_tiles++; + spu.cur_ctile_status = spu.ctile_status[ty][tx]; spu.cur_ztile_status = spu.ztile_status[ty][tx]; @@ -293,9 +289,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr) spu.ztile_status[ty][tx] = spu.cur_ztile_status; } - if (Debug) - printf("SPU %u: RENDER done\n", - spu.init.id); + D_PRINTF(CELL_DEBUG_CMD, + "RENDER done (%u tiles hit)\n", + num_tiles); } - - diff --git a/src/gallium/drivers/cell/spu/spu_shuffle.h b/src/gallium/drivers/cell/spu/spu_shuffle.h new file mode 100644 index 0000000000..74f2a0b6d2 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_shuffle.h @@ -0,0 +1,186 @@ +#ifndef SPU_SHUFFLE_H +#define SPU_SHUFFLE_H + +/* + * Generate shuffle patterns with minimal fuss. 
+ * + * Based on ideas from + * http://www.insomniacgames.com/tech/articles/0408/files/shuffles.pdf + * + * A-P indicates 0-15th position in first vector + * a-p indicates 0-15th position in second vector + * + * +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + * |00|01|02|03|04|05|06|07|08|09|0a|0b|0c|0d|0e|0f| + * +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + * | A| B| C| D| + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * | A| B| C| D| E| F| G| H| + * +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + * | A| B| C| D| E| F| G| H| I| J| K| L| M| N| O| P| + * +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + * + * x or X indicates 0xff + * 8 indicates 0x80 + * 0 indicates 0x00 + * + * The macros SHUFFLE4() SHUFFLE8() and SHUFFLE16() provide a const vector + * unsigned char literal suitable for use with spu_shuffle(). + * + * The macros SHUFB4() SHUFB8() and SHUFB16() provide a const qword vector + * literal suitable for use with si_shufb(). + * + * + * For example : + * SHUFB4(A,A,A,A) + * expands to : + * ((const qword){0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3}) + * + * SHUFFLE8(A,B,a,b,C,c,8,8) + * expands to : + * ((const vector unsigned char){0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13, + * 0x04,0x05,0x14,0x15,0xe0,0xe0,0xe0,0xe0}) + * + */ + +#include <spu_intrinsics.h> + +#define SHUFFLE_PATTERN_4_A__ 0x00, 0x01, 0x02, 0x03 +#define SHUFFLE_PATTERN_4_B__ 0x04, 0x05, 0x06, 0x07 +#define SHUFFLE_PATTERN_4_C__ 0x08, 0x09, 0x0a, 0x0b +#define SHUFFLE_PATTERN_4_D__ 0x0c, 0x0d, 0x0e, 0x0f +#define SHUFFLE_PATTERN_4_a__ 0x10, 0x11, 0x12, 0x13 +#define SHUFFLE_PATTERN_4_b__ 0x14, 0x15, 0x16, 0x17 +#define SHUFFLE_PATTERN_4_c__ 0x18, 0x19, 0x1a, 0x1b +#define SHUFFLE_PATTERN_4_d__ 0x1c, 0x1d, 0x1e, 0x1f +#define SHUFFLE_PATTERN_4_X__ 0xc0, 0xc0, 0xc0, 0xc0 +#define SHUFFLE_PATTERN_4_x__ 0xc0, 0xc0, 0xc0, 0xc0 +#define SHUFFLE_PATTERN_4_0__ 0x80, 0x80, 0x80, 0x80 +#define SHUFFLE_PATTERN_4_8__ 0xe0, 0xe0, 0xe0, 0xe0 + +#define SHUFFLE_VECTOR_4__(A, B, C, D) \ + SHUFFLE_PATTERN_4_##A##__, \ + SHUFFLE_PATTERN_4_##B##__, \ + SHUFFLE_PATTERN_4_##C##__, \ + SHUFFLE_PATTERN_4_##D##__ + +#define SHUFFLE4(A, B, C, D) \ + ((const vector unsigned char){ \ + SHUFFLE_VECTOR_4__(A, B, C, D) \ + }) + +#define SHUFB4(A, B, C, D) \ + ((const qword){ \ + SHUFFLE_VECTOR_4__(A, B, C, D) \ + }) + + +#define SHUFFLE_PATTERN_8_A__ 0x00, 0x01 +#define SHUFFLE_PATTERN_8_B__ 0x02, 0x03 +#define SHUFFLE_PATTERN_8_C__ 0x04, 0x05 +#define SHUFFLE_PATTERN_8_D__ 0x06, 0x07 +#define SHUFFLE_PATTERN_8_E__ 0x08, 0x09 +#define SHUFFLE_PATTERN_8_F__ 0x0a, 0x0b +#define SHUFFLE_PATTERN_8_G__ 0x0c, 0x0d +#define SHUFFLE_PATTERN_8_H__ 0x0e, 0x0f +#define SHUFFLE_PATTERN_8_a__ 0x10, 0x11 +#define SHUFFLE_PATTERN_8_b__ 0x12, 0x13 +#define SHUFFLE_PATTERN_8_c__ 0x14, 0x15 +#define SHUFFLE_PATTERN_8_d__ 0x16, 0x17 +#define SHUFFLE_PATTERN_8_e__ 0x18, 0x19 +#define SHUFFLE_PATTERN_8_f__ 0x1a, 0x1b +#define SHUFFLE_PATTERN_8_g__ 0x1c, 0x1d +#define SHUFFLE_PATTERN_8_h__ 0x1e, 0x1f +#define SHUFFLE_PATTERN_8_X__ 0xc0, 0xc0 +#define SHUFFLE_PATTERN_8_x__ 0xc0, 0xc0 +#define SHUFFLE_PATTERN_8_0__ 0x80, 0x80 +#define SHUFFLE_PATTERN_8_8__ 0xe0, 0xe0 + + +#define SHUFFLE_VECTOR_8__(A, B, C, D, E, F, G, H) \ + SHUFFLE_PATTERN_8_##A##__, \ + SHUFFLE_PATTERN_8_##B##__, \ + SHUFFLE_PATTERN_8_##C##__, \ + SHUFFLE_PATTERN_8_##D##__, \ + SHUFFLE_PATTERN_8_##E##__, \ + SHUFFLE_PATTERN_8_##F##__, \ + SHUFFLE_PATTERN_8_##G##__, \ + SHUFFLE_PATTERN_8_##H##__ + +#define SHUFFLE8(A, B, C, D, E, F, G, H) \ + ((const vector unsigned char){ \ + 
SHUFFLE_VECTOR_8__(A, B, C, D, E, F, G, H) \ + }) + +#define SHUFB8(A, B, C, D, E, F, G, H) \ + ((const qword){ \ + SHUFFLE_VECTOR_8__(A, B, C, D, E, F, G, H) \ + }) + + +#define SHUFFLE_PATTERN_16_A__ 0x00 +#define SHUFFLE_PATTERN_16_B__ 0x01 +#define SHUFFLE_PATTERN_16_C__ 0x02 +#define SHUFFLE_PATTERN_16_D__ 0x03 +#define SHUFFLE_PATTERN_16_E__ 0x04 +#define SHUFFLE_PATTERN_16_F__ 0x05 +#define SHUFFLE_PATTERN_16_G__ 0x06 +#define SHUFFLE_PATTERN_16_H__ 0x07 +#define SHUFFLE_PATTERN_16_I__ 0x08 +#define SHUFFLE_PATTERN_16_J__ 0x09 +#define SHUFFLE_PATTERN_16_K__ 0x0a +#define SHUFFLE_PATTERN_16_L__ 0x0b +#define SHUFFLE_PATTERN_16_M__ 0x0c +#define SHUFFLE_PATTERN_16_N__ 0x0d +#define SHUFFLE_PATTERN_16_O__ 0x0e +#define SHUFFLE_PATTERN_16_P__ 0x0f +#define SHUFFLE_PATTERN_16_a__ 0x10 +#define SHUFFLE_PATTERN_16_b__ 0x11 +#define SHUFFLE_PATTERN_16_c__ 0x12 +#define SHUFFLE_PATTERN_16_d__ 0x13 +#define SHUFFLE_PATTERN_16_e__ 0x14 +#define SHUFFLE_PATTERN_16_f__ 0x15 +#define SHUFFLE_PATTERN_16_g__ 0x16 +#define SHUFFLE_PATTERN_16_h__ 0x17 +#define SHUFFLE_PATTERN_16_i__ 0x18 +#define SHUFFLE_PATTERN_16_j__ 0x19 +#define SHUFFLE_PATTERN_16_k__ 0x1a +#define SHUFFLE_PATTERN_16_l__ 0x1b +#define SHUFFLE_PATTERN_16_m__ 0x1c +#define SHUFFLE_PATTERN_16_n__ 0x1d +#define SHUFFLE_PATTERN_16_o__ 0x1e +#define SHUFFLE_PATTERN_16_p__ 0x1f +#define SHUFFLE_PATTERN_16_X__ 0xc0 +#define SHUFFLE_PATTERN_16_x__ 0xc0 +#define SHUFFLE_PATTERN_16_0__ 0x80 +#define SHUFFLE_PATTERN_16_8__ 0xe0 + +#define SHUFFLE_VECTOR_16__(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \ + SHUFFLE_PATTERN_16_##A##__, \ + SHUFFLE_PATTERN_16_##B##__, \ + SHUFFLE_PATTERN_16_##C##__, \ + SHUFFLE_PATTERN_16_##D##__, \ + SHUFFLE_PATTERN_16_##E##__, \ + SHUFFLE_PATTERN_16_##F##__, \ + SHUFFLE_PATTERN_16_##G##__, \ + SHUFFLE_PATTERN_16_##H##__, \ + SHUFFLE_PATTERN_16_##I##__, \ + SHUFFLE_PATTERN_16_##J##__, \ + SHUFFLE_PATTERN_16_##K##__, \ + SHUFFLE_PATTERN_16_##L##__, \ + SHUFFLE_PATTERN_16_##M##__, \ + SHUFFLE_PATTERN_16_##N##__, \ + SHUFFLE_PATTERN_16_##O##__, \ + SHUFFLE_PATTERN_16_##P##__ + +#define SHUFFLE16(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \ + ((const vector unsigned char){ \ + SHUFFLE_VECTOR_16__(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \ + }) + +#define SHUFB16(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \ + ((const qword){ \ + SHUFFLE_VECTOR_16__(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \ + }) + +#endif diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c index 117b8a36f8..69784c8978 100644 --- a/src/gallium/drivers/cell/spu/spu_texture.c +++ b/src/gallium/drivers/cell/spu/spu_texture.c @@ -26,6 +26,8 @@ **************************************************************************/ +#include <math.h> + #include "pipe/p_compiler.h" #include "spu_main.h" #include "spu_texture.h" @@ -40,37 +42,19 @@ void invalidate_tex_cache(void) { - uint unit = 0; - uint bytes = 4 * spu.texture[unit].width - * spu.texture[unit].height; - - spu_dcache_mark_dirty((unsigned) spu.texture[unit].start, bytes); -} + uint lvl; + for (lvl = 0; lvl < CELL_MAX_TEXTURE_LEVELS; lvl++) { + uint unit = 0; + uint bytes = 4 * spu.texture[unit].level[lvl].width + * spu.texture[unit].level[lvl].height; + if (spu.texture[unit].target == PIPE_TEXTURE_CUBE) + bytes *= 6; + else if (spu.texture[unit].target == PIPE_TEXTURE_3D) + bytes *= spu.texture[unit].level[lvl].depth; -/** - * XXX look into getting texels for all four pixels in a quad at once. 
- */ -static uint -get_texel(uint unit, vec_uint4 coordinate) -{ - /* - * XXX we could do the "/ TILE_SIZE" and "% TILE_SIZE" operations as - * SIMD since X and Y are already in a SIMD register. - */ - const unsigned texture_ea = (uintptr_t) spu.texture[unit].start; - ushort x = spu_extract(coordinate, 0); - ushort y = spu_extract(coordinate, 1); - unsigned tile_offset = sizeof(tile_t) - * ((y / TILE_SIZE * spu.texture[unit].tiles_per_row) + (x / TILE_SIZE)); - ushort texel_offset = (ushort) 4 - * (ushort) (((ushort) (y % TILE_SIZE) * (ushort) TILE_SIZE) + (x % TILE_SIZE)); - vec_uint4 tmp; - - spu_dcache_fetch_unaligned((qword *) & tmp, - texture_ea + tile_offset + texel_offset, - 4); - return spu_extract(tmp, 0); + spu_dcache_mark_dirty((unsigned) spu.texture[unit].level[lvl].start, bytes); + } } @@ -88,15 +72,17 @@ get_texel(uint unit, vec_uint4 coordinate) * a time. */ static void -get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels) +get_four_texels(const struct spu_texture_level *tlevel, uint face, + vec_int4 x, vec_int4 y, + vec_uint4 *texels) { - const unsigned texture_ea = (uintptr_t) spu.texture[unit].start; - vec_uint4 tile_x = spu_rlmask(x, -5); - vec_uint4 tile_y = spu_rlmask(y, -5); - const qword offset_x = si_andi((qword) x, 0x1f); - const qword offset_y = si_andi((qword) y, 0x1f); + unsigned texture_ea = (uintptr_t) tlevel->start; + const vec_int4 tile_x = spu_rlmask(x, -5); /* tile_x = x / 32 */ + const vec_int4 tile_y = spu_rlmask(y, -5); /* tile_y = y / 32 */ + const qword offset_x = si_andi((qword) x, 0x1f); /* offset_x = x & 0x1f */ + const qword offset_y = si_andi((qword) y, 0x1f); /* offset_y = y & 0x1f */ - const qword tiles_per_row = (qword) spu_splats(spu.texture[unit].tiles_per_row); + const qword tiles_per_row = (qword) spu_splats(tlevel->tiles_per_row); const qword tile_size = (qword) spu_splats((unsigned) sizeof(tile_t)); qword tile_offset = si_mpya((qword) tile_y, tiles_per_row, (qword) tile_x); @@ -107,6 +93,8 @@ get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels) vec_uint4 offset = (vec_uint4) si_a(tile_offset, texel_offset); + texture_ea = texture_ea + face * tlevel->bytes_per_image; + spu_dcache_fetch_unaligned((qword *) & texels[0], texture_ea + spu_extract(offset, 0), 4); spu_dcache_fetch_unaligned((qword *) & texels[1], @@ -118,83 +106,536 @@ get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels) } +/** clamp vec to [0, max] */ +static INLINE vector signed int +spu_clamp(vector signed int vec, vector signed int max) +{ + static const vector signed int zero = {0,0,0,0}; + vector unsigned int c; + c = spu_cmpgt(vec, zero); /* c = vec > zero ? ~0 : 0 */ + vec = spu_sel(zero, vec, c); + c = spu_cmpgt(vec, max); /* c = vec > max ? ~0 : 0 */ + vec = spu_sel(vec, max, c); + return vec; +} + + + /** - * Get texture sample at texcoord. + * Do nearest texture sampling for four pixels. + * \param colors returned colors in SOA format (rrrr, gggg, bbbb, aaaa). 
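+ * As a rough example (assuming scale_s is simply the level's width in
+ * texels): on a 64-texel-wide level, s = 1.3 scales to 83.2, truncates
+ * to 83, and REPEAT wrapping (83 & 63) selects texel 19.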
*/ -vector float -sample_texture_nearest(uint unit, vector float texcoord) +void +sample_texture_2d_nearest(vector float s, vector float t, + uint unit, uint level, uint face, + vector float colors[4]) { - vector float tc = spu_mul(texcoord, spu.texture[unit].tex_size); - vector unsigned int itc = spu_convtu(tc, 0); /* convert to int */ - itc = spu_and(itc, spu.texture[unit].tex_size_mask); /* mask (GL_REPEAT) */ - uint texel = get_texel(unit, itc); - return spu_unpack_A8R8G8B8(texel); + const struct spu_texture_level *tlevel = &spu.texture[unit].level[level]; + vector float ss = spu_mul(s, tlevel->scale_s); + vector float tt = spu_mul(t, tlevel->scale_t); + vector signed int is = spu_convts(ss, 0); + vector signed int it = spu_convts(tt, 0); + vec_uint4 texels[4]; + + /* PIPE_TEX_WRAP_REPEAT */ + is = spu_and(is, tlevel->mask_s); + it = spu_and(it, tlevel->mask_t); + + /* PIPE_TEX_WRAP_CLAMP */ + is = spu_clamp(is, tlevel->max_s); + it = spu_clamp(it, tlevel->max_t); + + get_four_texels(tlevel, face, is, it, texels); + + /* convert four packed ARGBA pixels to float RRRR,GGGG,BBBB,AAAA */ + spu_unpack_A8R8G8B8_transpose4(texels, colors); } -vector float -sample_texture_bilinear(uint unit, vector float texcoord) +/** + * Do bilinear texture sampling for four pixels. + * \param colors returned colors in SOA format (rrrr, gggg, bbbb, aaaa). + */ +void +sample_texture_2d_bilinear(vector float s, vector float t, + uint unit, uint level, uint face, + vector float colors[4]) { - static const vec_uint4 offset_x = {0, 0, 1, 1}; - static const vec_uint4 offset_y = {0, 1, 0, 1}; + const struct spu_texture_level *tlevel = &spu.texture[unit].level[level]; + static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f}; - vector float tc = spu_mul(texcoord, spu.texture[unit].tex_size); - tc = spu_add(tc, spu_splats(-0.5f)); /* half texel bias */ + vector float ss = spu_madd(s, tlevel->scale_s, half); + vector float tt = spu_madd(t, tlevel->scale_t, half); - /* integer texcoords S,T: */ - vec_uint4 itc = spu_convtu(tc, 0); /* convert to int */ + vector signed int is0 = spu_convts(ss, 0); + vector signed int it0 = spu_convts(tt, 0); - vec_uint4 texels[4]; - - /* setup texcoords for quad: - * +-----+-----+ - * |x0,y0|x1,y1| - * +-----+-----+ - * |x2,y2|x3,y3| - * +-----+-----+ - */ - vec_uint4 x = spu_splats(spu_extract(itc, 0)); - vec_uint4 y = spu_splats(spu_extract(itc, 1)); - x = spu_add(x, offset_x); - y = spu_add(y, offset_y); + /* is + 1, it + 1 */ + vector signed int is1 = spu_add(is0, 1); + vector signed int it1 = spu_add(it0, 1); - /* GL_REPEAT wrap mode: */ - x = spu_and(x, spu.texture[unit].tex_size_x_mask); - y = spu_and(y, spu.texture[unit].tex_size_y_mask); + /* PIPE_TEX_WRAP_REPEAT */ + is0 = spu_and(is0, tlevel->mask_s); + it0 = spu_and(it0, tlevel->mask_t); + is1 = spu_and(is1, tlevel->mask_s); + it1 = spu_and(it1, tlevel->mask_t); - get_four_texels(unit, x, y, texels); + /* PIPE_TEX_WRAP_CLAMP */ + is0 = spu_clamp(is0, tlevel->max_s); + it0 = spu_clamp(it0, tlevel->max_t); + is1 = spu_clamp(is1, tlevel->max_s); + it1 = spu_clamp(it1, tlevel->max_t); - /* integer A8R8G8B8 to float texel conversion */ - vector float texel00 = spu_unpack_A8R8G8B8(spu_extract(texels[0], 0)); - vector float texel01 = spu_unpack_A8R8G8B8(spu_extract(texels[1], 0)); - vector float texel10 = spu_unpack_A8R8G8B8(spu_extract(texels[2], 0)); - vector float texel11 = spu_unpack_A8R8G8B8(spu_extract(texels[3], 0)); + /* get packed int texels */ + vector unsigned int texels[16]; + get_four_texels(tlevel, face, is0, 
it0, texels + 0); /* upper-left */ + get_four_texels(tlevel, face, is1, it0, texels + 4); /* upper-right */ + get_four_texels(tlevel, face, is0, it1, texels + 8); /* lower-left */ + get_four_texels(tlevel, face, is1, it1, texels + 12); /* lower-right */ + /* convert packed int texels to float colors */ + vector float ftexels[16]; + spu_unpack_A8R8G8B8_transpose4(texels + 0, ftexels + 0); + spu_unpack_A8R8G8B8_transpose4(texels + 4, ftexels + 4); + spu_unpack_A8R8G8B8_transpose4(texels + 8, ftexels + 8); + spu_unpack_A8R8G8B8_transpose4(texels + 12, ftexels + 12); /* Compute weighting factors in [0,1] * Multiply texcoord by 1024, AND with 1023, convert back to float. */ - vector float tc1024 = spu_mul(tc, spu_splats(1024.0f)); - vector signed int itc1024 = spu_convts(tc1024, 0); - itc1024 = spu_and(itc1024, spu_splats((1 << 10) - 1)); - vector float weight = spu_convtf(itc1024, 10); - - /* smeared frac and 1-frac */ - vector float sfrac = spu_splats(spu_extract(weight, 0)); - vector float tfrac = spu_splats(spu_extract(weight, 1)); - vector float sfrac1 = spu_sub(spu_splats(1.0f), sfrac); - vector float tfrac1 = spu_sub(spu_splats(1.0f), tfrac); - - /* multiply the samples (colors) by the S/T weights */ - texel00 = spu_mul(spu_mul(texel00, sfrac1), tfrac1); - texel10 = spu_mul(spu_mul(texel10, sfrac ), tfrac1); - texel01 = spu_mul(spu_mul(texel01, sfrac1), tfrac ); - texel11 = spu_mul(spu_mul(texel11, sfrac ), tfrac ); - - /* compute sum of weighted samples */ - vector float texel_sum = spu_add(texel00, texel01); - texel_sum = spu_add(texel_sum, texel10); - texel_sum = spu_add(texel_sum, texel11); - - return texel_sum; + vector float ss1024 = spu_mul(ss, spu_splats(1024.0f)); + vector signed int iss1024 = spu_convts(ss1024, 0); + iss1024 = spu_and(iss1024, 1023); + vector float sWeights0 = spu_convtf(iss1024, 10); + + vector float tt1024 = spu_mul(tt, spu_splats(1024.0f)); + vector signed int itt1024 = spu_convts(tt1024, 0); + itt1024 = spu_and(itt1024, 1023); + vector float tWeights0 = spu_convtf(itt1024, 10); + + /* 1 - sWeight and 1 - tWeight */ + vector float sWeights1 = spu_sub(spu_splats(1.0f), sWeights0); + vector float tWeights1 = spu_sub(spu_splats(1.0f), tWeights0); + + /* reds, for four pixels */ + ftexels[ 0] = spu_mul(ftexels[ 0], spu_mul(sWeights1, tWeights1)); /*ul*/ + ftexels[ 4] = spu_mul(ftexels[ 4], spu_mul(sWeights0, tWeights1)); /*ur*/ + ftexels[ 8] = spu_mul(ftexels[ 8], spu_mul(sWeights1, tWeights0)); /*ll*/ + ftexels[12] = spu_mul(ftexels[12], spu_mul(sWeights0, tWeights0)); /*lr*/ + colors[0] = spu_add(spu_add(ftexels[0], ftexels[4]), + spu_add(ftexels[8], ftexels[12])); + + /* greens, for four pixels */ + ftexels[ 1] = spu_mul(ftexels[ 1], spu_mul(sWeights1, tWeights1)); /*ul*/ + ftexels[ 5] = spu_mul(ftexels[ 5], spu_mul(sWeights0, tWeights1)); /*ur*/ + ftexels[ 9] = spu_mul(ftexels[ 9], spu_mul(sWeights1, tWeights0)); /*ll*/ + ftexels[13] = spu_mul(ftexels[13], spu_mul(sWeights0, tWeights0)); /*lr*/ + colors[1] = spu_add(spu_add(ftexels[1], ftexels[5]), + spu_add(ftexels[9], ftexels[13])); + + /* blues, for four pixels */ + ftexels[ 2] = spu_mul(ftexels[ 2], spu_mul(sWeights1, tWeights1)); /*ul*/ + ftexels[ 6] = spu_mul(ftexels[ 6], spu_mul(sWeights0, tWeights1)); /*ur*/ + ftexels[10] = spu_mul(ftexels[10], spu_mul(sWeights1, tWeights0)); /*ll*/ + ftexels[14] = spu_mul(ftexels[14], spu_mul(sWeights0, tWeights0)); /*lr*/ + colors[2] = spu_add(spu_add(ftexels[2], ftexels[6]), + spu_add(ftexels[10], ftexels[14])); + + /* alphas, for four pixels */ + ftexels[ 3] = 
spu_mul(ftexels[ 3], spu_mul(sWeights1, tWeights1)); /*ul*/ + ftexels[ 7] = spu_mul(ftexels[ 7], spu_mul(sWeights0, tWeights1)); /*ur*/ + ftexels[11] = spu_mul(ftexels[11], spu_mul(sWeights1, tWeights0)); /*ll*/ + ftexels[15] = spu_mul(ftexels[15], spu_mul(sWeights0, tWeights0)); /*lr*/ + colors[3] = spu_add(spu_add(ftexels[3], ftexels[7]), + spu_add(ftexels[11], ftexels[15])); +} + + + +/** + * Adapted from /opt/cell/sdk/usr/spu/include/transpose_matrix4x4.h + */ +static INLINE void +transpose(vector unsigned int *mOut0, + vector unsigned int *mOut1, + vector unsigned int *mOut2, + vector unsigned int *mOut3, + vector unsigned int *mIn) +{ + vector unsigned int abcd, efgh, ijkl, mnop; /* input vectors */ + vector unsigned int aeim, bfjn, cgko, dhlp; /* output vectors */ + vector unsigned int aibj, ckdl, emfn, gohp; /* intermediate vectors */ + + vector unsigned char shufflehi = ((vector unsigned char) { + 0x00, 0x01, 0x02, 0x03, + 0x10, 0x11, 0x12, 0x13, + 0x04, 0x05, 0x06, 0x07, + 0x14, 0x15, 0x16, 0x17}); + vector unsigned char shufflelo = ((vector unsigned char) { + 0x08, 0x09, 0x0A, 0x0B, + 0x18, 0x19, 0x1A, 0x1B, + 0x0C, 0x0D, 0x0E, 0x0F, + 0x1C, 0x1D, 0x1E, 0x1F}); + abcd = *(mIn+0); + efgh = *(mIn+1); + ijkl = *(mIn+2); + mnop = *(mIn+3); + + aibj = spu_shuffle(abcd, ijkl, shufflehi); + ckdl = spu_shuffle(abcd, ijkl, shufflelo); + emfn = spu_shuffle(efgh, mnop, shufflehi); + gohp = spu_shuffle(efgh, mnop, shufflelo); + + aeim = spu_shuffle(aibj, emfn, shufflehi); + bfjn = spu_shuffle(aibj, emfn, shufflelo); + cgko = spu_shuffle(ckdl, gohp, shufflehi); + dhlp = spu_shuffle(ckdl, gohp, shufflelo); + + *mOut0 = aeim; + *mOut1 = bfjn; + *mOut2 = cgko; + *mOut3 = dhlp; +} + + +/** + * Bilinear filtering, using int instead of float arithmetic for computing + * sample weights. 
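+ *
+ * Rough worked example (illustrative numbers, assuming scale_s is the
+ * level width): for a 256-texel level and s = 0.3, ss = 0.3*256 - 0.5 = 76.3;
+ * with 7 fraction bits is = 9766, so is0 = 9766 >> 7 = 76 and
+ * sWeights0 = 9766 & 127 = 38, i.e. a fractional weight of about 0.3.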
+ */ +void +sample_texture_2d_bilinear_int(vector float s, vector float t, + uint unit, uint level, uint face, + vector float colors[4]) +{ + const struct spu_texture_level *tlevel = &spu.texture[unit].level[level]; + static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f}; + + /* Scale texcoords by size of texture, and add half pixel bias */ + vector float ss = spu_madd(s, tlevel->scale_s, half); + vector float tt = spu_madd(t, tlevel->scale_t, half); + + /* convert float coords to fixed-pt coords with 7 fraction bits */ + vector signed int is = spu_convts(ss, 7); /* XXX really need floor() here */ + vector signed int it = spu_convts(tt, 7); /* XXX really need floor() here */ + + /* compute integer texel weights in [0, 127] */ + vector signed int sWeights0 = spu_and(is, 127); + vector signed int tWeights0 = spu_and(it, 127); + vector signed int sWeights1 = spu_sub(127, sWeights0); + vector signed int tWeights1 = spu_sub(127, tWeights0); + + /* texel coords: is0 = is / 128, it0 = is / 128 */ + vector signed int is0 = spu_rlmask(is, -7); + vector signed int it0 = spu_rlmask(it, -7); + + /* texel coords: i1 = is0 + 1, it1 = it0 + 1 */ + vector signed int is1 = spu_add(is0, 1); + vector signed int it1 = spu_add(it0, 1); + + /* PIPE_TEX_WRAP_REPEAT */ + is0 = spu_and(is0, tlevel->mask_s); + it0 = spu_and(it0, tlevel->mask_t); + is1 = spu_and(is1, tlevel->mask_s); + it1 = spu_and(it1, tlevel->mask_t); + + /* PIPE_TEX_WRAP_CLAMP */ + is0 = spu_clamp(is0, tlevel->max_s); + it0 = spu_clamp(it0, tlevel->max_t); + is1 = spu_clamp(is1, tlevel->max_s); + it1 = spu_clamp(it1, tlevel->max_t); + + /* get packed int texels */ + vector unsigned int texels[16]; + get_four_texels(tlevel, face, is0, it0, texels + 0); /* upper-left */ + get_four_texels(tlevel, face, is1, it0, texels + 4); /* upper-right */ + get_four_texels(tlevel, face, is0, it1, texels + 8); /* lower-left */ + get_four_texels(tlevel, face, is1, it1, texels + 12); /* lower-right */ + + /* twiddle packed 32-bit BGRA pixels into RGBA as four unsigned ints */ + { + static const unsigned char ZERO = 0x80; + int i; + for (i = 0; i < 16; i++) { + texels[i] = spu_shuffle(texels[i], texels[i], + ((vector unsigned char) { + ZERO, ZERO, ZERO, 1, + ZERO, ZERO, ZERO, 2, + ZERO, ZERO, ZERO, 3, + ZERO, ZERO, ZERO, 0})); + } + } + + /* convert RGBA,RGBA,RGBA,RGBA to RRRR,GGGG,BBBB,AAAA */ + vector unsigned int texel0, texel1, texel2, texel3, texel4, texel5, texel6, texel7, + texel8, texel9, texel10, texel11, texel12, texel13, texel14, texel15; + transpose(&texel0, &texel1, &texel2, &texel3, texels + 0); + transpose(&texel4, &texel5, &texel6, &texel7, texels + 4); + transpose(&texel8, &texel9, &texel10, &texel11, texels + 8); + transpose(&texel12, &texel13, &texel14, &texel15, texels + 12); + + /* computed weighted colors */ + vector unsigned int c0, c1, c2, c3, cSum; + + /* red */ + c0 = (vector unsigned int) si_mpy((qword) texel0, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/ + c1 = (vector unsigned int) si_mpy((qword) texel4, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/ + c2 = (vector unsigned int) si_mpy((qword) texel8, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/ + c3 = (vector unsigned int) si_mpy((qword) texel12, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/ + cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3)); + colors[0] = spu_convtf(cSum, 22); + + /* green */ + c0 = (vector unsigned int) si_mpy((qword) texel1, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/ + c1 = (vector unsigned int) si_mpy((qword) 
texel5, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/ + c2 = (vector unsigned int) si_mpy((qword) texel9, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/ + c3 = (vector unsigned int) si_mpy((qword) texel13, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/ + cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3)); + colors[1] = spu_convtf(cSum, 22); + + /* blue */ + c0 = (vector unsigned int) si_mpy((qword) texel2, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/ + c1 = (vector unsigned int) si_mpy((qword) texel6, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/ + c2 = (vector unsigned int) si_mpy((qword) texel10, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/ + c3 = (vector unsigned int) si_mpy((qword) texel14, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/ + cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3)); + colors[2] = spu_convtf(cSum, 22); + + /* alpha */ + c0 = (vector unsigned int) si_mpy((qword) texel3, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/ + c1 = (vector unsigned int) si_mpy((qword) texel7, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/ + c2 = (vector unsigned int) si_mpy((qword) texel11, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/ + c3 = (vector unsigned int) si_mpy((qword) texel15, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/ + cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3)); + colors[3] = spu_convtf(cSum, 22); +} + + + +/** + * Compute level of detail factor from texcoords. + */ +static INLINE float +compute_lambda_2d(uint unit, vector float s, vector float t) +{ + uint baseLevel = 0; + float width = spu.texture[unit].level[baseLevel].width; + float height = spu.texture[unit].level[baseLevel].height; + float dsdx = width * (spu_extract(s, 1) - spu_extract(s, 0)); + float dsdy = width * (spu_extract(s, 2) - spu_extract(s, 0)); + float dtdx = height * (spu_extract(t, 1) - spu_extract(t, 0)); + float dtdy = height * (spu_extract(t, 2) - spu_extract(t, 0)); +#if 0 + /* ideal value */ + float x = dsdx * dsdx + dtdx * dtdx; + float y = dsdy * dsdy + dtdy * dtdy; + float rho = x > y ? x : y; + rho = sqrtf(rho); +#else + /* approximation */ + dsdx = fabsf(dsdx); + dsdy = fabsf(dsdy); + dtdx = fabsf(dtdx); + dtdy = fabsf(dtdy); + float rho = (dsdx + dsdy + dtdx + dtdy) * 0.5f; +#endif + float lambda = logf(rho) * 1.442695f; /* compute logbase2(rho) */ + return lambda; +} + + +/** + * Blend two sets of colors according to weight. + */ +static void +blend_colors(vector float c0[4], const vector float c1[4], float weight) +{ + vector float t = spu_splats(weight); + vector float dc0 = spu_sub(c1[0], c0[0]); + vector float dc1 = spu_sub(c1[1], c0[1]); + vector float dc2 = spu_sub(c1[2], c0[2]); + vector float dc3 = spu_sub(c1[3], c0[3]); + c0[0] = spu_madd(dc0, t, c0[0]); + c0[1] = spu_madd(dc1, t, c0[1]); + c0[2] = spu_madd(dc2, t, c0[2]); + c0[3] = spu_madd(dc3, t, c0[3]); +} + + +/** + * Texture sampling with level of detail selection and possibly mipmap + * interpolation. + */ +void +sample_texture_2d_lod(vector float s, vector float t, + uint unit, uint level_ignored, uint face, + vector float colors[4]) +{ + /* + * Note that we're computing a lambda/lod here that's used for all + * four pixels in the quad.
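+ * As a rough example: if one pixel step in screen space covers about
+ * three texels of the base level, rho is ~3 and lambda = log2(3) ~ 1.6;
+ * nearest mipmap selection then uses level 2, while linear mipmap
+ * filtering samples levels 1 and 2 and blends them with weight ~0.6.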
+ */ + float lambda = compute_lambda_2d(unit, s, t); + + (void) face; + (void) level_ignored; + + /* apply lod bias */ + lambda += spu.sampler[unit].lod_bias; + + /* clamp */ + if (lambda < spu.sampler[unit].min_lod) + lambda = spu.sampler[unit].min_lod; + else if (lambda > spu.sampler[unit].max_lod) + lambda = spu.sampler[unit].max_lod; + + if (lambda <= 0.0f) { + /* magnify */ + spu.mag_sample_texture_2d[unit](s, t, unit, 0, face, colors); + } + else { + /* minify */ + if (spu.sampler[unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) { + /* sample two mipmap levels and interpolate */ + int level = (int) lambda; + if (level > (int) spu.texture[unit].max_level) + level = spu.texture[unit].max_level; + spu.min_sample_texture_2d[unit](s, t, unit, level, face, colors); + if (spu.sampler[unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) { + /* sample second mipmap level */ + float weight = lambda - (float) level; + level++; + if (level <= (int) spu.texture[unit].max_level) { + vector float colors2[4]; + spu.min_sample_texture_2d[unit](s, t, unit, level, face, colors2); + blend_colors(colors, colors2, weight); + } + } + } + else { + /* sample one mipmap level */ + int level = (int) (lambda + 0.5f); + if (level > (int) spu.texture[unit].max_level) + level = spu.texture[unit].max_level; + spu.min_sample_texture_2d[unit](s, t, unit, level, face, colors); + } + } +} + + +/** XXX need a SIMD version of this */ +static unsigned +choose_cube_face(float rx, float ry, float rz, float *newS, float *newT) +{ + /* + major axis + direction target sc tc ma + ---------- ------------------------------- --- --- --- + +rx TEXTURE_CUBE_MAP_POSITIVE_X_EXT -rz -ry rx + -rx TEXTURE_CUBE_MAP_NEGATIVE_X_EXT +rz -ry rx + +ry TEXTURE_CUBE_MAP_POSITIVE_Y_EXT +rx +rz ry + -ry TEXTURE_CUBE_MAP_NEGATIVE_Y_EXT +rx -rz ry + +rz TEXTURE_CUBE_MAP_POSITIVE_Z_EXT +rx -ry rz + -rz TEXTURE_CUBE_MAP_NEGATIVE_Z_EXT -rx -ry rz + */ + const float arx = fabsf(rx); + const float ary = fabsf(ry); + const float arz = fabsf(rz); + unsigned face; + float sc, tc, ma; + + if (arx > ary && arx > arz) { + if (rx >= 0.0F) { + face = PIPE_TEX_FACE_POS_X; + sc = -rz; + tc = -ry; + ma = arx; + } + else { + face = PIPE_TEX_FACE_NEG_X; + sc = rz; + tc = -ry; + ma = arx; + } + } + else if (ary > arx && ary > arz) { + if (ry >= 0.0F) { + face = PIPE_TEX_FACE_POS_Y; + sc = rx; + tc = rz; + ma = ary; + } + else { + face = PIPE_TEX_FACE_NEG_Y; + sc = rx; + tc = -rz; + ma = ary; + } + } + else { + if (rz > 0.0F) { + face = PIPE_TEX_FACE_POS_Z; + sc = rx; + tc = -ry; + ma = arz; + } + else { + face = PIPE_TEX_FACE_NEG_Z; + sc = -rx; + tc = -ry; + ma = arz; + } + } + + *newS = (sc / ma + 1.0F) * 0.5F; + *newT = (tc / ma + 1.0F) * 0.5F; + + return face; +} + + + +void +sample_texture_cube(vector float s, vector float t, vector float r, + uint unit, vector float colors[4]) +{ + uint p, faces[4], level = 0; + float newS[4], newT[4]; + + /* Compute cube faces referenced by the four sets of texcoords. + * XXX we should SIMD-ize this. + */ + for (p = 0; p < 4; p++) { + float rx = spu_extract(s, p); + float ry = spu_extract(t, p); + float rz = spu_extract(r, p); + faces[p] = choose_cube_face(rx, ry, rz, &newS[p], &newT[p]); + } + + if (faces[0] == faces[1] && + faces[0] == faces[2] && + faces[0] == faces[3]) { + /* GOOD! 
All four texcoords refer to the same cube face */ + s = (vector float) {newS[0], newS[1], newS[2], newS[3]}; + t = (vector float) {newT[0], newT[1], newT[2], newT[3]}; + spu.sample_texture_2d[unit](s, t, unit, level, faces[0], colors); + } + else { + /* BAD! The four texcoords refer to different faces */ + for (p = 0; p < 4; p++) { + vector float c[4]; + + spu.sample_texture_2d[unit](spu_splats(newS[p]), spu_splats(newT[p]), + unit, level, faces[p], c); + + float red = spu_extract(c[0], p); + float green = spu_extract(c[1], p); + float blue = spu_extract(c[2], p); + float alpha = spu_extract(c[3], p); + + colors[0] = spu_insert(red, colors[0], p); + colors[1] = spu_insert(green, colors[1], p); + colors[2] = spu_insert(blue, colors[2], p); + colors[3] = spu_insert(alpha, colors[3], p); + } + } } diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h index f7c9738be8..7b75b007b5 100644 --- a/src/gallium/drivers/cell/spu/spu_texture.h +++ b/src/gallium/drivers/cell/spu/spu_texture.h @@ -36,12 +36,32 @@ extern void invalidate_tex_cache(void); -extern vector float -sample_texture_nearest(uint unit, vector float texcoord); +extern void +sample_texture_2d_nearest(vector float s, vector float t, + uint unit, uint level, uint face, + vector float colors[4]); + + +extern void +sample_texture_2d_bilinear(vector float s, vector float t, + uint unit, uint level, uint face, + vector float colors[4]); + +extern void +sample_texture_2d_bilinear_int(vector float s, vector float t, + uint unit, uint level, uint face, + vector float colors[4]); + +extern void +sample_texture_2d_lod(vector float s, vector float t, + uint unit, uint level, uint face, + vector float colors[4]); -extern vector float -sample_texture_bilinear(uint unit, vector float texcoord); + +extern void +sample_texture_cube(vector float s, vector float t, vector float r, + uint unit, vector float colors[4]); #endif /* SPU_TEXTURE_H */ diff --git a/src/gallium/drivers/cell/spu/spu_tile.c b/src/gallium/drivers/cell/spu/spu_tile.c index 216a33126b..6905015a48 100644 --- a/src/gallium/drivers/cell/spu/spu_tile.c +++ b/src/gallium/drivers/cell/spu/spu_tile.c @@ -87,3 +87,40 @@ put_tile(uint tx, uint ty, const tile_t *tile, int tag, int zBuf) 0 /* rid */); } + +/** + * For tiles whose status is TILE_STATUS_CLEAR, write solid-filled + * tiles back to the main framebuffer. 
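+ * Tiles are assigned round-robin across SPUs. Illustrative numbers: a
+ * 256x128 framebuffer is 8x4 = 32 tiles of 32x32 pixels, so with six
+ * SPUs, SPU 0 writes tiles 0, 6, 12, ... and SPU 1 writes tiles
+ * 1, 7, 13, ...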
*/ +void +really_clear_tiles(uint surfaceIndex) +{ + const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles; + uint i; + + if (surfaceIndex == 0) { + clear_c_tile(&spu.ctile); + + for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) { + uint tx = i % spu.fb.width_tiles; + uint ty = i / spu.fb.width_tiles; + if (spu.ctile_status[ty][tx] == TILE_STATUS_CLEAR) { + put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0); + } + } + } + else { + clear_z_tile(&spu.ztile); + + for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) { + uint tx = i % spu.fb.width_tiles; + uint ty = i / spu.fb.width_tiles; + if (spu.ztile_status[ty][tx] == TILE_STATUS_CLEAR) + put_tile(tx, ty, &spu.ztile, TAG_SURFACE_CLEAR, 1); + } + } + +#if 0 + wait_on_mask(1 << TAG_SURFACE_CLEAR); +#endif +} diff --git a/src/gallium/drivers/cell/spu/spu_tile.h b/src/gallium/drivers/cell/spu/spu_tile.h index 1b5491112d..7bfb52be8f 100644 --- a/src/gallium/drivers/cell/spu/spu_tile.h +++ b/src/gallium/drivers/cell/spu/spu_tile.h @@ -36,12 +36,14 @@ -void +extern void get_tile(uint tx, uint ty, tile_t *tile, int tag, int zBuf); -void +extern void put_tile(uint tx, uint ty, const tile_t *tile, int tag, int zBuf); +extern void +really_clear_tiles(uint surfaceIndex); static INLINE void diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c index 8b93878192..d727268475 100644 --- a/src/gallium/drivers/cell/spu/spu_tri.c +++ b/src/gallium/drivers/cell/spu/spu_tri.c @@ -29,12 +29,12 @@ * Triangle rendering within a tile. */ -#include <transpose_matrix4x4.h> #include "pipe/p_compiler.h" #include "pipe/p_format.h" #include "util/u_math.h" #include "spu_colorpack.h" #include "spu_main.h" +#include "spu_shuffle.h" #include "spu_texture.h" #include "spu_tile.h" #include "spu_tri.h" @@ -43,11 +43,6 @@ /** Masks are uint[4] vectors with each element being 0 or 0xffffffff */ typedef vector unsigned int mask_t; -typedef union -{ - vector float v; - float f[4]; -} float4; /** @@ -61,7 +56,7 @@ struct vertex_header { /* XXX fix this */ #undef CEILF -#define CEILF(X) ((float) (int) ((X) + 0.99999)) +#define CEILF(X) ((float) (int) ((X) + 0.99999f)) #define QUAD_TOP_LEFT 0 @@ -75,14 +70,25 @@ struct vertex_header { #define MASK_ALL 0xf +#define CHAN0 0 +#define CHAN1 1 +#define CHAN2 2 +#define CHAN3 3 + + #define DEBUG_VERTS 0 /** * Triangle edge info */ struct edge { - float dx; /**< X(v1) - X(v0), used only during setup */ - float dy; /**< Y(v1) - Y(v0), used only during setup */ + union { + struct { + float dx; /**< X(v1) - X(v0), used only during setup */ + float dy; /**< Y(v1) - Y(v0), used only during setup */ + }; + vec_float4 ds; /**< vector accessor for dx and dy */ + }; float dxdy; /**< dx/dy */ float sx, sy; /**< first sample point coord */ int lines; /**< number of lines on this edge */ @@ -91,9 +97,9 @@ struct edge { struct interp_coef { - float4 a0; - float4 dadx; - float4 dady; + vector float a0; + vector float dadx; + vector float dady; }; @@ -107,34 +113,32 @@ struct setup_stage { * turn. Currently fixed at 4 floats, but should change in time. * Codegen will help cope with this.
*/ - const struct vertex_header *vmax; - const struct vertex_header *vmid; - const struct vertex_header *vmin; - const struct vertex_header *vprovoke; + union { + struct { + const struct vertex_header *vmin; + const struct vertex_header *vmid; + const struct vertex_header *vmax; + const struct vertex_header *vprovoke; + }; + qword vertex_headers; + }; struct edge ebot; struct edge etop; struct edge emaj; - float oneoverarea; + float oneOverArea; /* XXX maybe make into vector? */ - uint tx, ty; + uint facing; + + uint tx, ty; /**< position of current tile (x, y) */ int cliprect_minx, cliprect_maxx, cliprect_miny, cliprect_maxy; -#if 0 - struct tgsi_interp_coef coef[PIPE_MAX_SHADER_INPUTS]; -#else struct interp_coef coef[PIPE_MAX_SHADER_INPUTS]; -#endif - -#if 0 - struct quad_header quad; -#endif struct { - int left[2]; /**< [0] = row0, [1] = row1 */ - int right[2]; + vec_int4 quad; /**< [0] = row0, [1] = row1; {left[0],left[1],right[0],right[1]} */ int y; unsigned y_flags; unsigned mask; /**< mask of MASK_BOTTOM/TOP_LEFT/RIGHT bits */ @@ -142,118 +146,103 @@ struct setup_stage { }; - static struct setup_stage setup; +static INLINE vector float +splatx(vector float v) +{ + return spu_splats(spu_extract(v, CHAN0)); +} - -#if 0 -/** - * Basically a cast wrapper. - */ -static INLINE struct setup_stage *setup_stage( struct draw_stage *stage ) +static INLINE vector float +splaty(vector float v) { - return (struct setup_stage *)stage; + return spu_splats(spu_extract(v, CHAN1)); } -#endif -#if 0 -/** - * Clip setup.quad against the scissor/surface bounds. - */ -static INLINE void -quad_clip(struct setup_stage *setup) +static INLINE vector float +splatz(vector float v) { - const struct pipe_scissor_state *cliprect = &setup.softpipe->cliprect; - const int minx = (int) cliprect->minx; - const int maxx = (int) cliprect->maxx; - const int miny = (int) cliprect->miny; - const int maxy = (int) cliprect->maxy; - - if (setup.quad.x0 >= maxx || - setup.quad.y0 >= maxy || - setup.quad.x0 + 1 < minx || - setup.quad.y0 + 1 < miny) { - /* totally clipped */ - setup.quad.mask = 0x0; - return; - } - if (setup.quad.x0 < minx) - setup.quad.mask &= (MASK_BOTTOM_RIGHT | MASK_TOP_RIGHT); - if (setup.quad.y0 < miny) - setup.quad.mask &= (MASK_BOTTOM_LEFT | MASK_BOTTOM_RIGHT); - if (setup.quad.x0 == maxx - 1) - setup.quad.mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT); - if (setup.quad.y0 == maxy - 1) - setup.quad.mask &= (MASK_TOP_LEFT | MASK_TOP_RIGHT); + return spu_splats(spu_extract(v, CHAN2)); } -#endif -#if 0 -/** - * Emit a quad (pass to next stage) with clipping. - */ -static INLINE void -clip_emit_quad(struct setup_stage *setup) +static INLINE vector float +splatw(vector float v) { - quad_clip(setup); - if (setup.quad.mask) { - struct softpipe_context *sp = setup.softpipe; - sp->quad.first->run(sp->quad.first, &setup.quad); - } + return spu_splats(spu_extract(v, CHAN3)); } -#endif + /** - * Evaluate attribute coefficients (plane equations) to compute - * attribute values for the four fragments in a quad. - * Eg: four colors will be computed (in AoS format). + * Setup fragment shader inputs by evaluating triangle's vertex + * attribute coefficient info. + * \param x quad x pos + * \param y quad y pos + * \param fragZ returns quad Z values + * \param fragInputs returns fragment program inputs + * Note: this code could be incorporated into the fragment program + * itself to avoid the loop and switch. 
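+ * Each input channel is evaluated as a plane equation over the quad,
+ * roughly attr(x,y) = a0 + x*dadx + y*dady, at the four fragment
+ * positions (x,y), (x+1,y), (x,y+1), (x+1,y+1); perspective-interpolated
+ * attributes are then scaled by 1/w.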
*/ -static INLINE void -eval_coeff(uint slot, float x, float y, vector float result[4]) +static void +eval_inputs(float x, float y, vector float *fragZ, vector float fragInputs[]) { - switch (spu.vertex_info.interp_mode[slot]) { - case INTERP_CONSTANT: - result[QUAD_TOP_LEFT] = - result[QUAD_TOP_RIGHT] = - result[QUAD_BOTTOM_LEFT] = - result[QUAD_BOTTOM_RIGHT] = setup.coef[slot].a0.v; - break; + static const vector float deltaX = (const vector float) {0, 1, 0, 1}; + static const vector float deltaY = (const vector float) {0, 0, 1, 1}; + + const uint posSlot = 0; + const vector float pos = setup.coef[posSlot].a0; + const vector float dposdx = setup.coef[posSlot].dadx; + const vector float dposdy = setup.coef[posSlot].dady; + const vector float fragX = spu_splats(x) + deltaX; + const vector float fragY = spu_splats(y) + deltaY; + vector float fragW, wInv; + uint i; - case INTERP_LINEAR: - /* fall-through, for now */ - default: - { - register vector float dadx = setup.coef[slot].dadx.v; - register vector float dady = setup.coef[slot].dady.v; - register vector float topLeft - = spu_add(setup.coef[slot].a0.v, - spu_add(spu_mul(spu_splats(x), dadx), - spu_mul(spu_splats(y), dady))); - - result[QUAD_TOP_LEFT] = topLeft; - result[QUAD_TOP_RIGHT] = spu_add(topLeft, dadx); - result[QUAD_BOTTOM_LEFT] = spu_add(topLeft, dady); - result[QUAD_BOTTOM_RIGHT] = spu_add(spu_add(topLeft, dadx), dady); + *fragZ = splatz(pos) + fragX * splatz(dposdx) + fragY * splatz(dposdy); + fragW = splatw(pos) + fragX * splatw(dposdx) + fragY * splatw(dposdy); + wInv = spu_re(fragW); /* 1 / w */ + + /* loop over fragment program inputs */ + for (i = 0; i < spu.vertex_info.num_attribs; i++) { + uint attr = i + 1; + enum interp_mode interp = spu.vertex_info.attrib[attr].interp_mode; + + /* constant term */ + vector float a0 = setup.coef[attr].a0; + vector float r0 = splatx(a0); + vector float r1 = splaty(a0); + vector float r2 = splatz(a0); + vector float r3 = splatw(a0); + + if (interp == INTERP_LINEAR || interp == INTERP_PERSPECTIVE) { + /* linear term */ + vector float dadx = setup.coef[attr].dadx; + vector float dady = setup.coef[attr].dady; + /* Use SPU intrinsics here to get slightly better code. + * originally: r0 += fragX * splatx(dadx) + fragY * splatx(dady); + */ + r0 = spu_madd(fragX, splatx(dadx), spu_madd(fragY, splatx(dady), r0)); + r1 = spu_madd(fragX, splaty(dadx), spu_madd(fragY, splaty(dady), r1)); + r2 = spu_madd(fragX, splatz(dadx), spu_madd(fragY, splatz(dady), r2)); + r3 = spu_madd(fragX, splatw(dadx), spu_madd(fragY, splatw(dady), r3)); + if (interp == INTERP_PERSPECTIVE) { + /* perspective term */ + r0 *= wInv; + r1 *= wInv; + r2 *= wInv; + r3 *= wInv; + } } + fragInputs[CHAN0] = r0; + fragInputs[CHAN1] = r1; + fragInputs[CHAN2] = r2; + fragInputs[CHAN3] = r3; + fragInputs += 4; } } -static INLINE vector float -eval_z(float x, float y) -{ - const uint slot = 0; - const float dzdx = setup.coef[slot].dadx.f[2]; - const float dzdy = setup.coef[slot].dady.f[2]; - const float topLeft = setup.coef[slot].a0.f[2] + x * dzdx + y * dzdy; - const vector float topLeftv = spu_splats(topLeft); - const vector float derivs = (vector float) { 0.0, dzdx, dzdy, dzdx + dzdy }; - return spu_add(topLeftv, derivs); -} - - /** * Emit a quad (pass to next stage). No clipping is done. * Note: about 1/5 to 1/7 of the time, mask is zero and this function @@ -261,120 +250,51 @@ eval_z(float x, float y) * overall. 
*/ static INLINE void -emit_quad( int x, int y, mask_t mask ) +emit_quad( int x, int y, mask_t mask) { /* If any bits in mask are set... */ if (spu_extract(spu_orx(mask), 0)) { const int ix = x - setup.cliprect_minx; const int iy = y - setup.cliprect_miny; - vector float colors[4]; spu.cur_ctile_status = TILE_STATUS_DIRTY; spu.cur_ztile_status = TILE_STATUS_DIRTY; - if (spu.texture[0].start) { - /* texture mapping */ - const uint unit = 0; - vector float texcoords[4]; - eval_coeff(2, (float) x, (float) y, texcoords); - - if (spu_extract(mask, 0)) - colors[0] = spu.sample_texture[unit](unit, texcoords[0]); - if (spu_extract(mask, 1)) - colors[1] = spu.sample_texture[unit](unit, texcoords[1]); - if (spu_extract(mask, 2)) - colors[2] = spu.sample_texture[unit](unit, texcoords[2]); - if (spu_extract(mask, 3)) - colors[3] = spu.sample_texture[unit](unit, texcoords[3]); - - - if (spu.texture[1].start) { - /* multi-texture mapping */ - const uint unit = 1; - vector float colors1[4]; - - eval_coeff(2, (float) x, (float) y, texcoords); - - if (spu_extract(mask, 0)) - colors1[0] = spu.sample_texture[unit](unit, texcoords[0]); - if (spu_extract(mask, 1)) - colors1[1] = spu.sample_texture[unit](unit, texcoords[1]); - if (spu_extract(mask, 2)) - colors1[2] = spu.sample_texture[unit](unit, texcoords[2]); - if (spu_extract(mask, 3)) - colors1[3] = spu.sample_texture[unit](unit, texcoords[3]); - - /* hack: modulate first texture by second */ - colors[0] = spu_mul(colors[0], colors1[0]); - colors[1] = spu_mul(colors[1], colors1[1]); - colors[2] = spu_mul(colors[2], colors1[2]); - colors[3] = spu_mul(colors[3], colors1[3]); - } - - } - else { - /* simple shading */ -#if 0 - eval_coeff(1, (float) x, (float) y, colors); - -#else - /* XXX new fragment program code */ - - if (spu.fragment_program) { - vector float inputs[4*4], outputs[2*4]; - - /* setup inputs */ - eval_coeff(1, (float) x, (float) y, inputs); - - /* Execute the current fragment program */ - spu.fragment_program(inputs, outputs, spu.constants); - - /* Copy outputs */ - colors[0] = outputs[0*4+0]; - colors[1] = outputs[0*4+1]; - colors[2] = outputs[0*4+2]; - colors[3] = outputs[0*4+3]; - - if (0 && spu.init.id==0 && y == 48) { - printf("colors[0] = %f %f %f %f\n", - spu_extract(colors[0], 0), - spu_extract(colors[0], 1), - spu_extract(colors[0], 2), - spu_extract(colors[0], 3)); - printf("colors[1] = %f %f %f %f\n", - spu_extract(colors[1], 0), - spu_extract(colors[1], 1), - spu_extract(colors[1], 2), - spu_extract(colors[1], 3)); - } - - } -#endif - } - - { - /* Convert fragment data from AoS to SoA format. - * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA) - * This is temporary! + /* + * Run fragment shader, execute per-fragment ops, update fb/tile. */ - vector float soa_frag[4]; - _transpose_matrix4x4(soa_frag, colors); + vector float inputs[4*4], outputs[2*4]; + vector unsigned int kill_mask; + vector float fragZ; + + eval_inputs((float) x, (float) y, &fragZ, inputs); - float4 fragZ; + ASSERT(spu.fragment_program); + ASSERT(spu.fragment_ops); - fragZ.v = eval_z((float) x, (float) y); + /* Execute the current fragment program */ + kill_mask = spu.fragment_program(inputs, outputs, spu.constants); - /* Do all per-fragment/quad operations here, including: - * alpha test, z test, stencil test, blend and framebuffer writing. + mask = spu_andc(mask, kill_mask); + + /* Execute per-fragment/quad operations, including: + * alpha test, z test, stencil test, blend and framebuffer writing. 
+ * Note that there are two different fragment operations functions + * that can be called, one for front-facing fragments, and one + * for back-facing fragments. (Often the two are the same; + * but in some cases, like two-sided stenciling, they can be + * very different.) So choose the correct function depending + * on the calculated facing. */ - spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile, - fragZ.v, - soa_frag[0], soa_frag[1], - soa_frag[2], soa_frag[3], + spu.fragment_ops[setup.facing](ix, iy, &spu.ctile, &spu.ztile, + fragZ, + outputs[0*4+0], + outputs[0*4+1], + outputs[0*4+2], + outputs[0*4+3], mask); } - } } @@ -383,64 +303,49 @@ emit_quad( int x, int y, mask_t mask ) * Given an X or Y coordinate, return the block/quad coordinate that it * belongs to. */ -static INLINE int block( int x ) +static INLINE int +block(int x) { return x & ~1; } /** - * Compute mask which indicates which pixels in the 2x2 quad are actually inside - * the triangle's bounds. - * The mask is a uint4 vector and each element will be 0 or 0xffffffff. - */ -static INLINE mask_t calculate_mask( int x ) -{ - /* This is a little tricky. - * Use & instead of && to avoid branches. - * Use negation to convert true/false to ~0/0 values. - */ - mask_t mask; - mask = spu_insert(-((x >= setup.span.left[0]) & (x < setup.span.right[0])), mask, 0); - mask = spu_insert(-((x+1 >= setup.span.left[0]) & (x+1 < setup.span.right[0])), mask, 1); - mask = spu_insert(-((x >= setup.span.left[1]) & (x < setup.span.right[1])), mask, 2); - mask = spu_insert(-((x+1 >= setup.span.left[1]) & (x+1 < setup.span.right[1])), mask, 3); - return mask; -} - - -/** * Render a horizontal span of quads */ -static void flush_spans( void ) +static void +flush_spans(void) { int minleft, maxright; - int x; + + const int l0 = spu_extract(setup.span.quad, 0); + const int l1 = spu_extract(setup.span.quad, 1); + const int r0 = spu_extract(setup.span.quad, 2); + const int r1 = spu_extract(setup.span.quad, 3); switch (setup.span.y_flags) { case 0x3: /* both odd and even lines written (both quad rows) */ - minleft = MIN2(setup.span.left[0], setup.span.left[1]); - maxright = MAX2(setup.span.right[0], setup.span.right[1]); + minleft = MIN2(l0, l1); + maxright = MAX2(r0, r1); break; case 0x1: /* only even line written (quad top row) */ - minleft = setup.span.left[0]; - maxright = setup.span.right[0]; + minleft = l0; + maxright = r0; break; case 0x2: /* only odd line written (quad bottom row) */ - minleft = setup.span.left[1]; - maxright = setup.span.right[1]; + minleft = l1; + maxright = r1; break; default: return; } - /* OK, we're very likely to need the tile data now. * clear or finish waiting if needed. */ @@ -457,7 +362,7 @@ static void flush_spans( void ) } ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED); - if (spu.read_depth) { + if (spu.read_depth_stencil) { if (spu.cur_ztile_status == TILE_STATUS_GETTING) { /* wait for mfc_get() to complete */ //printf("SPU: %u: waiting for ztile\n", spu.init.id); @@ -472,93 +377,119 @@ static void flush_spans( void ) ASSERT(spu.cur_ztile_status != TILE_STATUS_DEFINED); } - /* XXX this loop could be moved into the above switch cases and - * calculate_mask() could be simplified a bit... - */ - for (x = block(minleft); x <= block(maxright); x += 2) { -#if 1 - emit_quad( x, setup.span.y, calculate_mask( x ) ); -#endif + /* XXX this loop could be moved into the above switch cases... 
*/ + + /* Setup for mask calculation */ + const vec_int4 quad_LlRr = setup.span.quad; + const vec_int4 quad_RrLl = spu_rlqwbyte(quad_LlRr, 8); + const vec_int4 quad_LLll = spu_shuffle(quad_LlRr, quad_LlRr, SHUFFLE4(A,A,B,B)); + const vec_int4 quad_RRrr = spu_shuffle(quad_RrLl, quad_RrLl, SHUFFLE4(A,A,B,B)); + + const vec_int4 twos = spu_splats(2); + + const int x = block(minleft); + vec_int4 xs = {x, x+1, x, x+1}; + + for (; spu_extract(xs, 0) <= block(maxright); xs += twos) { + /** + * Computes mask to indicate which pixels in the 2x2 quad are actually + * inside the triangle's bounds. + */ + + /* Calculate ({x,x+1,x,x+1} >= {l[0],l[0],l[1],l[1]}) */ + const mask_t gt_LLll_xs = spu_cmpgt(quad_LLll, xs); + const mask_t gte_xs_LLll = spu_nand(gt_LLll_xs, gt_LLll_xs); + + /* Calculate ({r[0],r[0],r[1],r[1]} > {x,x+1,x,x+1}) */ + const mask_t gt_RRrr_xs = spu_cmpgt(quad_RRrr, xs); + + /* Combine results to create mask */ + const mask_t mask = spu_and(gte_xs_LLll, gt_RRrr_xs); + + emit_quad(spu_extract(xs, 0), setup.span.y, mask); } setup.span.y = 0; setup.span.y_flags = 0; - setup.span.right[0] = 0; - setup.span.right[1] = 0; + /* Zero right elements */ + setup.span.quad = spu_shuffle(setup.span.quad, setup.span.quad, SHUFFLE4(A,B,0,0)); } + #if DEBUG_VERTS -static void print_vertex(const struct vertex_header *v) +static void +print_vertex(const struct vertex_header *v) { - int i; - fprintf(stderr, "Vertex: (%p)\n", v); - for (i = 0; i < setup.quad.nr_attrs; i++) { - fprintf(stderr, " %d: %f %f %f %f\n", i, - v->data[i][0], v->data[i][1], v->data[i][2], v->data[i][3]); + uint i; + fprintf(stderr, " Vertex: (%p)\n", v); + for (i = 0; i < spu.vertex_info.num_attribs; i++) { + fprintf(stderr, " %d: %f %f %f %f\n", i, + spu_extract(v->data[i], 0), + spu_extract(v->data[i], 1), + spu_extract(v->data[i], 2), + spu_extract(v->data[i], 3)); } } #endif -static boolean setup_sort_vertices(const struct vertex_header *v0, - const struct vertex_header *v1, - const struct vertex_header *v2) +/** + * Sort vertices from top to bottom. + * Compute area and determine front vs. back facing. + * Do coarse clip test against tile bounds + * \return FALSE if tri is totally outside tile, TRUE otherwise + */ +static boolean +setup_sort_vertices(const struct vertex_header *v0, + const struct vertex_header *v1, + const struct vertex_header *v2) { + float area, sign; #if DEBUG_VERTS - fprintf(stderr, "Triangle:\n"); - print_vertex(v0); - print_vertex(v1); - print_vertex(v2); + if (spu.init.id==0) { + fprintf(stderr, "SPU %u: Triangle:\n", spu.init.id); + print_vertex(v0); + print_vertex(v1); + print_vertex(v2); + } #endif - setup.vprovoke = v2; - /* determine bottom to top order of vertices */ { - float y0 = spu_extract(v0->data[0], 1); - float y1 = spu_extract(v1->data[0], 1); - float y2 = spu_extract(v2->data[0], 1); - if (y0 <= y1) { - if (y1 <= y2) { - /* y0<=y1<=y2 */ - setup.vmin = v0; - setup.vmid = v1; - setup.vmax = v2; - } - else if (y2 <= y0) { - /* y2<=y0<=y1 */ - setup.vmin = v2; - setup.vmid = v0; - setup.vmax = v1; - } - else { - /* y0<=y2<=y1 */ - setup.vmin = v0; - setup.vmid = v2; - setup.vmax = v1; - } - } - else { - if (y0 <= y2) { - /* y1<=y0<=y2 */ - setup.vmin = v1; - setup.vmid = v0; - setup.vmax = v2; - } - else if (y2 <= y1) { - /* y2<=y1<=y0 */ - setup.vmin = v2; - setup.vmid = v1; - setup.vmax = v0; - } - else { - /* y1<=y2<=y0 */ - setup.vmin = v1; - setup.vmid = v2; - setup.vmax = v0; - } - } + /* A table of shuffle patterns for putting vertex_header pointers into + correct order. 
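The vectorized coverage test above is the SIMD form of a simple per-pixel span check: a pixel of the 2x2 quad is covered when its x lies in [left, right) for its scanline row. A scalar sketch of the same computation, assuming illustrative left[]/right[] arrays standing in for the bounds packed into setup.span.quad:

/* Sketch: coverage mask for a 2x2 quad at screen column x, with per-row
 * span bounds left[r] (inclusive) and right[r] (exclusive).  Element i of
 * the mask is all-ones when the pixel is inside the span, else zero. */
#include <stdio.h>

static void quad_mask(int x, const int left[2], const int right[2],
                      unsigned mask[4])
{
   int i;
   for (i = 0; i < 4; i++) {
      const int px  = x + (i & 1);   /* pixel column: x or x+1 */
      const int row = i >> 1;        /* 0 = even (top) row, 1 = odd (bottom) row */
      mask[i] = (px >= left[row] && px < right[row]) ? ~0u : 0u;
   }
}

int main(void)
{
   const int left[2]  = { 4, 5 };
   const int right[2] = { 8, 7 };
   unsigned mask[4];
   quad_mask(4, left, right, mask);
   printf("%x %x %x %x\n", mask[0], mask[1], mask[2], mask[3]);
   return 0;
}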
Quite magical. */ + const vec_uchar16 sort_order_patterns[] = { + SHUFFLE4(A,B,C,C), + SHUFFLE4(C,A,B,C), + SHUFFLE4(A,C,B,C), + SHUFFLE4(B,C,A,C), + SHUFFLE4(B,A,C,C), + SHUFFLE4(C,B,A,C) }; + + /* The vertex_header pointers, packed for easy shuffling later */ + const vec_uint4 vs = {(unsigned)v0, (unsigned)v1, (unsigned)v2}; + + /* Collate y values into two vectors for comparison. + Using only one shuffle constant! ;) */ + const vec_float4 y_02_ = spu_shuffle(v0->data[0], v2->data[0], SHUFFLE4(0,B,b,C)); + const vec_float4 y_10_ = spu_shuffle(v1->data[0], v0->data[0], SHUFFLE4(0,B,b,C)); + const vec_float4 y_012 = spu_shuffle(y_02_, v1->data[0], SHUFFLE4(0,B,b,C)); + const vec_float4 y_120 = spu_shuffle(y_10_, v2->data[0], SHUFFLE4(0,B,b,C)); + + /* Perform comparison: {y0,y1,y2} > {y1,y2,y0} */ + const vec_uint4 compare = spu_cmpgt(y_012, y_120); + /* Compress the result of the comparison into 4 bits */ + const vec_uint4 gather = spu_gather(compare); + /* Subtract one to attain the index into the LUT. Magical. */ + const unsigned int index = spu_extract(gather, 0) - 1; + + /* Load the appropriate pattern and construct the desired vector. */ + setup.vertex_headers = (qword)spu_shuffle(vs, vs, sort_order_patterns[index]); + + /* Using the result of the comparison, set sign. + Very magical. */ + sign = ((si_to_uint(si_cntb((qword)gather)) == 2) ? 1.0f : -1.0f); } /* Check if triangle is completely outside the tile bounds */ @@ -575,41 +506,28 @@ static boolean setup_sort_vertices(const struct vertex_header *v0, spu_extract(setup.vmax->data[0], 0) > setup.cliprect_maxx) return FALSE; - setup.ebot.dx = spu_extract(setup.vmid->data[0], 0) - spu_extract(setup.vmin->data[0], 0); - setup.ebot.dy = spu_extract(setup.vmid->data[0], 1) - spu_extract(setup.vmin->data[0], 1); - setup.emaj.dx = spu_extract(setup.vmax->data[0], 0) - spu_extract(setup.vmin->data[0], 0); - setup.emaj.dy = spu_extract(setup.vmax->data[0], 1) - spu_extract(setup.vmin->data[0], 1); - setup.etop.dx = spu_extract(setup.vmax->data[0], 0) - spu_extract(setup.vmid->data[0], 0); - setup.etop.dy = spu_extract(setup.vmax->data[0], 1) - spu_extract(setup.vmid->data[0], 1); + setup.ebot.ds = spu_sub(setup.vmid->data[0], setup.vmin->data[0]); + setup.emaj.ds = spu_sub(setup.vmax->data[0], setup.vmin->data[0]); + setup.etop.ds = spu_sub(setup.vmax->data[0], setup.vmid->data[0]); /* * Compute triangle's area. Use 1/area to compute partial * derivatives of attributes later. - * - * The area will be the same as prim->det, but the sign may be - * different depending on how the vertices get sorted above. - * - * To determine whether the primitive is front or back facing we - * use the prim->det value because its sign is correct. */ - { - const float area = (setup.emaj.dx * setup.ebot.dy - - setup.ebot.dx * setup.emaj.dy); - - setup.oneoverarea = 1.0f / area; - /* - _mesa_printf("%s one-over-area %f area %f det %f\n", - __FUNCTION__, setup.oneoverarea, area, prim->det ); - */ - } + area = setup.emaj.dx * setup.ebot.dy - setup.ebot.dx * setup.emaj.dy; + + setup.oneOverArea = 1.0f / area; -#if 0 - /* We need to know if this is a front or back-facing triangle for: - * - the GLSL gl_FrontFacing fragment attribute (bool) - * - two-sided stencil test + /* The product of area * sign indicates front/back orientation (0/1). + * Just in case someone gets the bright idea of switching the front + * and back constants without noticing that we're assuming their + * values in this operation, also assert that the values are + * what we think they are. 
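The "magical" sort above encodes three pairwise y comparisons as a small integer and uses it to look up a permutation, instead of branching. A scalar sketch of the idea; the bit ordering and table below are illustrative (the SPU code uses spu_gather()'s ordering and shuffle patterns rather than index triples), and the sign is mirrored from the patch, where it is later folded into the facing test together with the signed area:

/* Sketch: branchless-style vertex sort via a comparison-bit lookup table. */
#include <stdio.h>

static void sort3(const float y[3], int order[3], float *sign)
{
   /* Three pairwise comparisons give a 3-bit code. */
   const int b0 = y[0] > y[1];
   const int b1 = y[1] > y[2];
   const int b2 = y[2] > y[0];
   const int code = (b0 << 2) | (b1 << 1) | b2;   /* 1..6 for distinct y */

   /* code -> (vmin, vmid, vmax) vertex indices; codes 0 and 7 cannot
    * occur for strictly distinct y values. */
   static const int lut[8][3] = {
      {0,1,2}, {0,1,2}, {2,0,1}, {0,2,1},
      {1,2,0}, {1,0,2}, {2,1,0}, {0,1,2},
   };
   order[0] = lut[code][0];
   order[1] = lut[code][1];
   order[2] = lut[code][2];

   /* Mirror the SPU heuristic: +1 when exactly two comparisons are true,
    * -1 otherwise; this corrects the signed area for the reordering. */
   *sign = (b0 + b1 + b2 == 2) ? 1.0f : -1.0f;
}

int main(void)
{
   const float y[3] = { 5.0f, 1.0f, 3.0f };
   int order[3];
   float sign;
   sort3(y, order, &sign);
   printf("min=%d mid=%d max=%d sign=%.0f\n", order[0], order[1], order[2], sign);
   return 0;
}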
*/ - setup.quad.facing = (prim->det > 0.0) ^ (setup.softpipe->rasterizer->front_winding == PIPE_WINDING_CW); -#endif + ASSERT(CELL_FACING_FRONT == 0); + ASSERT(CELL_FACING_BACK == 1); + setup.facing = (area * sign > 0.0f) + ^ (spu.rasterizer.front_winding == PIPE_WINDING_CW); return TRUE; } @@ -622,63 +540,11 @@ static boolean setup_sort_vertices(const struct vertex_header *v0, * \param slot which attribute slot */ static INLINE void -const_coeff(uint slot) +const_coeff4(uint slot) { - setup.coef[slot].dadx.v = (vector float) {0.0, 0.0, 0.0, 0.0}; - setup.coef[slot].dady.v = (vector float) {0.0, 0.0, 0.0, 0.0}; - setup.coef[slot].a0.v = setup.vprovoke->data[slot]; -} - - -/** - * Compute a0, dadx and dady for a linearly interpolated coefficient, - * for a triangle. - */ -static INLINE void -tri_linear_coeff(uint slot, uint firstComp, uint lastComp) -{ - uint i; - const float *vmin_d = (float *) &setup.vmin->data[slot]; - const float *vmid_d = (float *) &setup.vmid->data[slot]; - const float *vmax_d = (float *) &setup.vmax->data[slot]; - const float x = spu_extract(setup.vmin->data[0], 0) - 0.5f; - const float y = spu_extract(setup.vmin->data[0], 1) - 0.5f; - - for (i = firstComp; i < lastComp; i++) { - float botda = vmid_d[i] - vmin_d[i]; - float majda = vmax_d[i] - vmin_d[i]; - float a = setup.ebot.dy * majda - botda * setup.emaj.dy; - float b = setup.emaj.dx * botda - majda * setup.ebot.dx; - - ASSERT(slot < PIPE_MAX_SHADER_INPUTS); - - setup.coef[slot].dadx.f[i] = a * setup.oneoverarea; - setup.coef[slot].dady.f[i] = b * setup.oneoverarea; - - /* calculate a0 as the value which would be sampled for the - * fragment at (0,0), taking into account that we want to sample at - * pixel centers, in other words (0.5, 0.5). - * - * this is neat but unfortunately not a good way to do things for - * triangles with very large values of dadx or dady as it will - * result in the subtraction and re-addition from a0 of a very - * large number, which means we'll end up loosing a lot of the - * fractional bits and precision from a0. the way to fix this is - * to define a0 as the sample at a pixel center somewhere near vmin - * instead - i'll switch to this later. 
- */ - setup.coef[slot].a0.f[i] = (vmin_d[i] - - (setup.coef[slot].dadx.f[i] * x + - setup.coef[slot].dady.f[i] * y)); - } - - /* - _mesa_printf("attr[%d].%c: %f dx:%f dy:%f\n", - slot, "xyzw"[i], - setup.coef[slot].a0[i], - setup.coef[slot].dadx.f[i], - setup.coef[slot].dady.f[i]); - */ + setup.coef[slot].dadx = (vector float) {0.0, 0.0, 0.0, 0.0}; + setup.coef[slot].dady = (vector float) {0.0, 0.0, 0.0, 0.0}; + setup.coef[slot].a0 = setup.vprovoke->data[slot]; } @@ -702,18 +568,16 @@ tri_linear_coeff4(uint slot) vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda), spu_mul(majda, spu_splats(setup.ebot.dx))); - setup.coef[slot].dadx.v = spu_mul(a, spu_splats(setup.oneoverarea)); - setup.coef[slot].dady.v = spu_mul(b, spu_splats(setup.oneoverarea)); + setup.coef[slot].dadx = spu_mul(a, spu_splats(setup.oneOverArea)); + setup.coef[slot].dady = spu_mul(b, spu_splats(setup.oneOverArea)); - vector float tempx = spu_mul(setup.coef[slot].dadx.v, xxxx); - vector float tempy = spu_mul(setup.coef[slot].dady.v, yyyy); + vector float tempx = spu_mul(setup.coef[slot].dadx, xxxx); + vector float tempy = spu_mul(setup.coef[slot].dady, yyyy); - setup.coef[slot].a0.v = spu_sub(vmin_d, spu_add(tempx, tempy)); + setup.coef[slot].a0 = spu_sub(vmin_d, spu_add(tempx, tempy)); } - -#if 0 /** * Compute a0, dadx and dady for a perspective-corrected interpolant, * for a triangle. @@ -722,82 +586,76 @@ tri_linear_coeff4(uint slot) * Later, when we compute the value at a particular fragment position we'll * divide the interpolated value by the interpolated W at that fragment. */ -static void tri_persp_coeff( unsigned slot, - unsigned i ) +static void +tri_persp_coeff4(uint slot) { - /* premultiply by 1/w: - */ - float mina = setup.vmin->data[slot][i] * setup.vmin->data[0][3]; - float mida = setup.vmid->data[slot][i] * setup.vmid->data[0][3]; - float maxa = setup.vmax->data[slot][i] * setup.vmax->data[0][3]; - - float botda = mida - mina; - float majda = maxa - mina; - float a = setup.ebot.dy * majda - botda * setup.emaj.dy; - float b = setup.emaj.dx * botda - majda * setup.ebot.dx; - - /* - printf("tri persp %d,%d: %f %f %f\n", slot, i, - setup.vmin->data[slot][i], - setup.vmid->data[slot][i], - setup.vmax->data[slot][i] - ); - */ + const vector float xxxx = spu_splats(spu_extract(setup.vmin->data[0], 0) - 0.5f); + const vector float yyyy = spu_splats(spu_extract(setup.vmin->data[0], 1) - 0.5f); + + const vector float vmin_w = spu_splats(spu_extract(setup.vmin->data[0], 3)); + const vector float vmid_w = spu_splats(spu_extract(setup.vmid->data[0], 3)); + const vector float vmax_w = spu_splats(spu_extract(setup.vmax->data[0], 3)); + + vector float vmin_d = setup.vmin->data[slot]; + vector float vmid_d = setup.vmid->data[slot]; + vector float vmax_d = setup.vmax->data[slot]; + + vmin_d = spu_mul(vmin_d, vmin_w); + vmid_d = spu_mul(vmid_d, vmid_w); + vmax_d = spu_mul(vmax_d, vmax_w); + + vector float botda = vmid_d - vmin_d; + vector float majda = vmax_d - vmin_d; - assert(slot < PIPE_MAX_SHADER_INPUTS); - assert(i <= 3); + vector float a = spu_sub(spu_mul(spu_splats(setup.ebot.dy), majda), + spu_mul(botda, spu_splats(setup.emaj.dy))); + vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda), + spu_mul(majda, spu_splats(setup.ebot.dx))); - setup.coef[slot].dadx.f[i] = a * setup.oneoverarea; - setup.coef[slot].dady.f[i] = b * setup.oneoverarea; - setup.coef[slot].a0.f[i] = (mina - - (setup.coef[slot].dadx.f[i] * (setup.vmin->data[0][0] - 0.5f) + - setup.coef[slot].dady.f[i] * 
(setup.vmin->data[0][1] - 0.5f))); + setup.coef[slot].dadx = spu_mul(a, spu_splats(setup.oneOverArea)); + setup.coef[slot].dady = spu_mul(b, spu_splats(setup.oneOverArea)); + + vector float tempx = spu_mul(setup.coef[slot].dadx, xxxx); + vector float tempy = spu_mul(setup.coef[slot].dady, yyyy); + + setup.coef[slot].a0 = spu_sub(vmin_d, spu_add(tempx, tempy)); } -#endif + /** * Compute the setup.coef[] array dadx, dady, a0 values. * Must be called after setup.vmin,vmid,vmax,vprovoke are initialized. */ -static void setup_tri_coefficients(void) +static void +setup_tri_coefficients(void) { -#if 1 uint i; for (i = 0; i < spu.vertex_info.num_attribs; i++) { - switch (spu.vertex_info.interp_mode[i]) { + switch (spu.vertex_info.attrib[i].interp_mode) { case INTERP_NONE: break; - case INTERP_POS: - /*tri_linear_coeff(i, 2, 3);*/ - /* XXX interp W if PERSPECTIVE... */ - tri_linear_coeff4(i); - break; case INTERP_CONSTANT: - const_coeff(i); + const_coeff4(i); break; + case INTERP_POS: + /* fall-through */ case INTERP_LINEAR: tri_linear_coeff4(i); break; case INTERP_PERSPECTIVE: - tri_linear_coeff4(i); /* temporary */ + tri_persp_coeff4(i); break; default: ASSERT(0); } } -#else - ASSERT(spu.vertex_info.interp_mode[0] == INTERP_POS); - ASSERT(spu.vertex_info.interp_mode[1] == INTERP_LINEAR || - spu.vertex_info.interp_mode[1] == INTERP_CONSTANT); - tri_linear_coeff(0, 2, 3); /* slot 0, z */ - tri_linear_coeff(1, 0, 4); /* slot 1, color */ -#endif } -static void setup_tri_edges(void) +static void +setup_tri_edges(void) { float vmin_x = spu_extract(setup.vmin->data[0], 0) + 0.5f; float vmid_x = spu_extract(setup.vmid->data[0], 0) + 0.5f; @@ -827,9 +685,8 @@ static void setup_tri_edges(void) * Render the upper or lower half of a triangle. * Scissoring/cliprect is applied here too. */ -static void subtriangle( struct edge *eleft, - struct edge *eright, - unsigned lines ) +static void +subtriangle(struct edge *eleft, struct edge *eright, unsigned lines) { const int minx = setup.cliprect_minx; const int maxx = setup.cliprect_maxx; @@ -881,9 +738,11 @@ static void subtriangle( struct edge *eleft, setup.span.y = block(_y); } - setup.span.left[_y&1] = left; - setup.span.right[_y&1] = right; - setup.span.y_flags |= 1<<(_y&1); + int offset = _y&1; + vec_int4 quad_LlRr = {left, left, right, right}; + /* Store left and right in 0 or 1 row of quad based on offset */ + setup.span.quad = spu_sel(quad_LlRr, setup.span.quad, spu_maskw(5<<offset)); + setup.span.y_flags |= 1<<offset; } } @@ -902,7 +761,8 @@ static void subtriangle( struct edge *eleft, * The tile data should have already been fetched. 
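Per attribute channel, the coefficient setup above solves for a plane a(x,y) = a0 + dadx*x + dady*y through the three sorted vertices, anchored at vmin's pixel center. A scalar sketch of that arithmetic with illustrative names (the perspective path in the patch does the same after premultiplying each value by the vertex position's w component):

/* Sketch: plane coefficients for one linearly interpolated channel.
 * v[i] = attribute value at vertex i; (x[i], y[i]) = positions already
 * sorted as vmin, vmid, vmax. */
#include <stdio.h>

struct coeff { float a0, dadx, dady; };

static struct coeff
linear_coeff(const float x[3], const float y[3], const float v[3])
{
   const float ebot_dx = x[1] - x[0], ebot_dy = y[1] - y[0]; /* vmin -> vmid */
   const float emaj_dx = x[2] - x[0], emaj_dy = y[2] - y[0]; /* vmin -> vmax */
   const float botda = v[1] - v[0];
   const float majda = v[2] - v[0];
   const float area  = emaj_dx * ebot_dy - ebot_dx * emaj_dy;

   struct coeff c;
   c.dadx = (ebot_dy * majda - botda * emaj_dy) / area;
   c.dady = (emaj_dx * botda - majda * ebot_dx) / area;
   /* Anchor the plane at vmin's pixel center (x - 0.5, y - 0.5). */
   c.a0 = v[0] - (c.dadx * (x[0] - 0.5f) + c.dady * (y[0] - 0.5f));
   return c;
}

int main(void)
{
   const float x[3] = { 0.5f, 0.5f, 10.5f };
   const float y[3] = { 0.5f, 10.5f, 0.5f };
   const float v[3] = { 0.0f, 1.0f, 2.0f };
   struct coeff c = linear_coeff(x, y, v);
   printf("a0=%f dadx=%f dady=%f\n", c.a0, c.dadx, c.dady);   /* 0, 0.2, 0.1 */
   return 0;
}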
*/ boolean -tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty) +tri_draw(const float *v0, const float *v1, const float *v2, + uint tx, uint ty) { setup.tx = tx; setup.ty = ty; @@ -924,21 +784,16 @@ tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty) setup.span.y = 0; setup.span.y_flags = 0; - setup.span.right[0] = 0; - setup.span.right[1] = 0; - /* setup.span.z_mode = tri_z_mode( setup.ctx ); */ + /* Zero right elements */ + setup.span.quad = spu_shuffle(setup.span.quad, setup.span.quad, SHUFFLE4(A,B,0,0)); - /* init_constant_attribs( setup ); */ - - if (setup.oneoverarea < 0.0) { - /* emaj on left: - */ + if (setup.oneOverArea < 0.0) { + /* emaj on left */ subtriangle( &setup.emaj, &setup.ebot, setup.ebot.lines ); subtriangle( &setup.emaj, &setup.etop, setup.etop.lines ); } else { - /* emaj on right: - */ + /* emaj on right */ subtriangle( &setup.ebot, &setup.emaj, setup.ebot.lines ); subtriangle( &setup.etop, &setup.emaj, setup.etop.lines ); } diff --git a/src/gallium/drivers/cell/spu/spu_util.c b/src/gallium/drivers/cell/spu/spu_util.c index b8a0d4a265..af25dd3718 100644 --- a/src/gallium/drivers/cell/spu/spu_util.c +++ b/src/gallium/drivers/cell/spu/spu_util.c @@ -1,7 +1,7 @@ #include "cell/common.h" #include "pipe/p_shader_tokens.h" -#include "pipe/p_debug.h" +#include "util/u_debug.h" #include "tgsi/tgsi_parse.h" //#include "tgsi_build.h" #include "tgsi/tgsi_util.h" diff --git a/src/gallium/drivers/failover/Makefile b/src/gallium/drivers/failover/Makefile index f08b8df07a..dfb7f5dcf6 100644 --- a/src/gallium/drivers/failover/Makefile +++ b/src/gallium/drivers/failover/Makefile @@ -9,6 +9,3 @@ C_SOURCES = \ fo_context.c include ../../Makefile.template - -symlinks: - diff --git a/src/gallium/drivers/failover/fo_context.c b/src/gallium/drivers/failover/fo_context.c index 10c4ffc209..0742b27b8f 100644 --- a/src/gallium/drivers/failover/fo_context.c +++ b/src/gallium/drivers/failover/fo_context.c @@ -27,7 +27,7 @@ #include "pipe/p_defines.h" -#include "pipe/p_winsys.h" +#include "pipe/internal/p_winsys_screen.h" #include "util/u_memory.h" #include "pipe/p_context.h" diff --git a/src/gallium/drivers/failover/fo_context.h b/src/gallium/drivers/failover/fo_context.h index c6409fe1e1..9ba86ba866 100644 --- a/src/gallium/drivers/failover/fo_context.h +++ b/src/gallium/drivers/failover/fo_context.h @@ -114,5 +114,12 @@ failover_context( struct pipe_context *pipe ) return (struct failover_context *)pipe; } +/* Internal functions + */ +void +failover_set_constant_buffer(struct pipe_context *pipe, + uint shader, uint index, + const struct pipe_constant_buffer *buf); + #endif /* FO_CONTEXT_H */ diff --git a/src/gallium/drivers/i915simple/Makefile b/src/gallium/drivers/i915simple/Makefile index 41a61a0020..12821c5a76 100644 --- a/src/gallium/drivers/i915simple/Makefile +++ b/src/gallium/drivers/i915simple/Makefile @@ -26,6 +26,3 @@ C_SOURCES = \ i915_surface.c include ../../Makefile.template - -symlinks: - diff --git a/src/gallium/drivers/i915simple/i915_batch.h b/src/gallium/drivers/i915simple/i915_batch.h index 45bf4f4028..a433cf054d 100644 --- a/src/gallium/drivers/i915simple/i915_batch.h +++ b/src/gallium/drivers/i915simple/i915_batch.h @@ -108,7 +108,7 @@ i915_batchbuffer_flush( struct i915_batchbuffer *batch, #define OUT_RELOC( buf, flags, delta ) \ i915_batchbuffer_reloc( i915->batch, buf, flags, delta ) -#define FLUSH_BATCH(fence) do { \ +#define FLUSH_BATCH(fence) do { \ i915->winsys->batch_flush( i915->winsys, fence ); \ 
i915->hardware_dirty = ~0; \ } while (0) diff --git a/src/gallium/drivers/i915simple/i915_blit.c b/src/gallium/drivers/i915simple/i915_blit.c index 45fae4c999..448a4708ce 100644 --- a/src/gallium/drivers/i915simple/i915_blit.c +++ b/src/gallium/drivers/i915simple/i915_blit.c @@ -38,7 +38,7 @@ void i915_fill_blit(struct i915_context *i915, unsigned cpp, - short dst_pitch, + unsigned short dst_pitch, struct pipe_buffer *dst_buffer, unsigned dst_offset, short x, short y, @@ -47,15 +47,23 @@ i915_fill_blit(struct i915_context *i915, { unsigned BR13, CMD; + + I915_DBG(i915, + "%s dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n", + __FUNCTION__, + dst_buffer, dst_pitch, dst_offset, x, y, w, h); + switch (cpp) { case 1: case 2: case 3: - BR13 = dst_pitch | (0xF0 << 16) | (1 << 24); + BR13 = (((int) dst_pitch) & 0xffff) | + (0xF0 << 16) | (1 << 24); CMD = XY_COLOR_BLT_CMD; break; case 4: - BR13 = dst_pitch | (0xF0 << 16) | (1 << 24) | (1 << 25); + BR13 = (((int) dst_pitch) & 0xffff) | + (0xF0 << 16) | (1 << 24) | (1 << 25); CMD = (XY_COLOR_BLT_CMD | XY_COLOR_BLT_WRITE_ALPHA | XY_COLOR_BLT_WRITE_RGB); break; @@ -63,10 +71,6 @@ i915_fill_blit(struct i915_context *i915, return; } -// DBG("%s dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n", -// __FUNCTION__, dst_buffer, dst_pitch, dst_offset, x, y, w, h); - - if (!BEGIN_BATCH(6, 1)) { FLUSH_BATCH(NULL); assert(BEGIN_BATCH(6, 1)); @@ -77,6 +81,7 @@ i915_fill_blit(struct i915_context *i915, OUT_BATCH(((y + h) << 16) | (x + w)); OUT_RELOC( dst_buffer, I915_BUFFER_ACCESS_WRITE, dst_offset); OUT_BATCH(color); + FLUSH_BATCH(NULL); } @@ -84,10 +89,10 @@ void i915_copy_blit( struct i915_context *i915, unsigned do_flip, unsigned cpp, - short src_pitch, + unsigned short src_pitch, struct pipe_buffer *src_buffer, unsigned src_offset, - short dst_pitch, + unsigned short dst_pitch, struct pipe_buffer *dst_buffer, unsigned dst_offset, short src_x, short src_y, @@ -105,20 +110,16 @@ i915_copy_blit( struct i915_context *i915, src_buffer, src_pitch, src_offset, src_x, src_y, dst_buffer, dst_pitch, dst_offset, dst_x, dst_y, w, h); - src_pitch *= (short) cpp; - dst_pitch *= (short) cpp; - switch (cpp) { case 1: case 2: case 3: - BR13 = (((int) dst_pitch) & 0xffff) | + BR13 = (((int) dst_pitch) & 0xffff) | (0xCC << 16) | (1 << 24); CMD = XY_SRC_COPY_BLT_CMD; break; case 4: - BR13 = - (((int) dst_pitch) & 0xffff) | + BR13 = (((int) dst_pitch) & 0xffff) | (0xCC << 16) | (1 << 24) | (1 << 25); CMD = (XY_SRC_COPY_BLT_CMD | XY_SRC_COPY_BLT_WRITE_ALPHA | @@ -152,6 +153,7 @@ i915_copy_blit( struct i915_context *i915, OUT_BATCH((src_y << 16) | src_x); OUT_BATCH(((int) src_pitch & 0xffff)); OUT_RELOC(src_buffer, I915_BUFFER_ACCESS_READ, src_offset); + FLUSH_BATCH(NULL); } diff --git a/src/gallium/drivers/i915simple/i915_blit.h b/src/gallium/drivers/i915simple/i915_blit.h index 6e5b44e124..0bb3453861 100644 --- a/src/gallium/drivers/i915simple/i915_blit.h +++ b/src/gallium/drivers/i915simple/i915_blit.h @@ -33,10 +33,10 @@ extern void i915_copy_blit(struct i915_context *i915, unsigned do_flip, unsigned cpp, - short src_pitch, + unsigned short src_pitch, struct pipe_buffer *src_buffer, unsigned src_offset, - short dst_pitch, + unsigned short dst_pitch, struct pipe_buffer *dst_buffer, unsigned dst_offset, short srcx, short srcy, @@ -45,7 +45,7 @@ extern void i915_copy_blit(struct i915_context *i915, extern void i915_fill_blit(struct i915_context *i915, unsigned cpp, - short dst_pitch, + unsigned short dst_pitch, struct pipe_buffer *dst_buffer, unsigned dst_offset, short x, short y, diff --git 
a/src/gallium/drivers/i915simple/i915_context.c b/src/gallium/drivers/i915simple/i915_context.c index 6dd3eda85d..3e3a596884 100644 --- a/src/gallium/drivers/i915simple/i915_context.c +++ b/src/gallium/drivers/i915simple/i915_context.c @@ -34,7 +34,7 @@ #include "draw/draw_context.h" #include "pipe/p_defines.h" -#include "pipe/p_winsys.h" +#include "pipe/internal/p_winsys_screen.h" #include "pipe/p_inlines.h" #include "util/u_memory.h" #include "pipe/p_screen.h" diff --git a/src/gallium/drivers/i915simple/i915_debug.c b/src/gallium/drivers/i915simple/i915_debug.c index 5e26d1b905..e08582efab 100644 --- a/src/gallium/drivers/i915simple/i915_debug.c +++ b/src/gallium/drivers/i915simple/i915_debug.c @@ -29,8 +29,9 @@ #include "i915_context.h" #include "i915_winsys.h" #include "i915_debug.h" -#include "pipe/p_winsys.h" -#include "pipe/p_debug.h" +#include "i915_batch.h" +#include "pipe/internal/p_winsys_screen.h" +#include "util/u_debug.h" static void @@ -210,6 +211,7 @@ BITS( PRINTF(stream, ": 0x%x\n", ((dw) & himask) >> (lo)); } +#ifdef DEBUG #define MBZ( dw, hi, lo) do { \ unsigned x = (dw) >> (lo); \ unsigned lomask = (1 << (lo)) - 1; \ @@ -217,6 +219,10 @@ BITS( himask = (1UL << (hi)) - 1; \ assert ((x & himask & ~lomask) == 0); \ } while (0) +#else +#define MBZ( dw, hi, lo) do { \ +} while (0) +#endif static void FLAG( @@ -858,19 +864,17 @@ static boolean i915_debug_packet( struct debug_stream *stream ) void -i915_dump_batchbuffer( struct i915_context *i915 ) +i915_dump_batchbuffer( struct i915_batchbuffer *batch ) { struct debug_stream stream; - /* TODO fix me */ - unsigned *start = 0;/*i915->batch_start;*/ - unsigned *end = 0;/*i915->winsys->batch_start( i915->winsys, 0, 0 );*/ + unsigned *start = (unsigned*)batch->map; + unsigned *end = (unsigned*)batch->ptr; unsigned long bytes = (unsigned long) (end - start) * 4; boolean done = FALSE; stream.offset = 0; stream.ptr = (char *)start; stream.print_addresses = 0; - stream.winsys = i915->pipe.winsys; if (!start || !end) { debug_printf( "\n\nBATCH: ???\n"); diff --git a/src/gallium/drivers/i915simple/i915_debug.h b/src/gallium/drivers/i915simple/i915_debug.h index afb63edabf..16ca7277c7 100644 --- a/src/gallium/drivers/i915simple/i915_debug.h +++ b/src/gallium/drivers/i915simple/i915_debug.h @@ -41,7 +41,6 @@ struct debug_stream char *ptr; /* pointer to gtt offset zero */ char *end; /* pointer to gtt offset zero */ unsigned print_addresses; - struct pipe_winsys *winsys; }; @@ -73,7 +72,7 @@ void i915_print_ureg(const char *msg, unsigned ureg); #if defined(DEBUG) && defined(FILE_DEBUG_FLAG) -#include "pipe/p_winsys.h" +#include "pipe/internal/p_winsys_screen.h" static INLINE void I915_DBG( @@ -105,9 +104,9 @@ I915_DBG( #endif -void i915_dump_batchbuffer( struct i915_context *i915 ); - +struct i915_batchbuffer; +void i915_dump_batchbuffer( struct i915_batchbuffer *i915 ); void i915_debug_init( struct i915_context *i915 ); diff --git a/src/gallium/drivers/i915simple/i915_debug_fp.c b/src/gallium/drivers/i915simple/i915_debug_fp.c index 48be3e1472..9c5b117b6d 100644 --- a/src/gallium/drivers/i915simple/i915_debug_fp.c +++ b/src/gallium/drivers/i915simple/i915_debug_fp.c @@ -28,7 +28,7 @@ #include "i915_reg.h" #include "i915_debug.h" -#include "pipe/p_winsys.h" +#include "pipe/internal/p_winsys_screen.h" #include "util/u_memory.h" diff --git a/src/gallium/drivers/i915simple/i915_fpc_translate.c b/src/gallium/drivers/i915simple/i915_fpc_translate.c index 34b4a846c1..961c1bf213 100644 --- a/src/gallium/drivers/i915simple/i915_fpc_translate.c +++ 
b/src/gallium/drivers/i915simple/i915_fpc_translate.c @@ -144,7 +144,7 @@ src_vector(struct i915_fp_compile *p, const struct tgsi_full_src_register *source) { uint index = source->SrcRegister.Index; - uint src, sem_name, sem_ind; + uint src = 0, sem_name, sem_ind; switch (source->SrcRegister.File) { case TGSI_FILE_TEMPORARY: @@ -321,16 +321,27 @@ static uint translate_tex_src_target(struct i915_fp_compile *p, uint tex) { switch (tex) { + case TGSI_TEXTURE_SHADOW1D: + /* fall-through */ case TGSI_TEXTURE_1D: return D0_SAMPLE_TYPE_2D; + + case TGSI_TEXTURE_SHADOW2D: + /* fall-through */ case TGSI_TEXTURE_2D: return D0_SAMPLE_TYPE_2D; + + case TGSI_TEXTURE_SHADOWRECT: + /* fall-through */ case TGSI_TEXTURE_RECT: return D0_SAMPLE_TYPE_2D; + case TGSI_TEXTURE_3D: return D0_SAMPLE_TYPE_VOLUME; + case TGSI_TEXTURE_CUBE: return D0_SAMPLE_TYPE_CUBE; + default: i915_program_error(p, "TexSrc type"); return 0; @@ -964,7 +975,7 @@ i915_translate_instructions(struct i915_fp_compile *p, = &parse.FullToken.FullImmediate; const uint pos = p->num_immediates++; uint j; - for (j = 0; j < imm->Immediate.Size; j++) { + for (j = 0; j < imm->Immediate.NrTokens - 1; j++) { p->immediates[pos][j] = imm->u.ImmediateFloat32[j].Float; } } diff --git a/src/gallium/drivers/i915simple/i915_prim_vbuf.c b/src/gallium/drivers/i915simple/i915_prim_vbuf.c index 4fda1ab64f..58c41840e1 100644 --- a/src/gallium/drivers/i915simple/i915_prim_vbuf.c +++ b/src/gallium/drivers/i915simple/i915_prim_vbuf.c @@ -40,9 +40,9 @@ #include "draw/draw_context.h" #include "draw/draw_vbuf.h" -#include "pipe/p_debug.h" +#include "util/u_debug.h" #include "pipe/p_inlines.h" -#include "pipe/p_winsys.h" +#include "pipe/internal/p_winsys_screen.h" #include "util/u_math.h" #include "util/u_memory.h" @@ -62,7 +62,7 @@ struct i915_vbuf_render { struct i915_context *i915; /** Vertex size in bytes */ - unsigned vertex_size; + size_t vertex_size; /** Software primitive */ unsigned prim; @@ -79,6 +79,7 @@ struct i915_vbuf_render { size_t vbo_offset; void *vbo_ptr; size_t vbo_alloc_size; + size_t vbo_max_used; }; @@ -108,7 +109,7 @@ i915_vbuf_render_get_vertex_info( struct vbuf_render *render ) } -static void * +static boolean i915_vbuf_render_allocate_vertices( struct vbuf_render *render, ushort vertex_size, ushort nr_vertices ) @@ -124,7 +125,8 @@ i915_vbuf_render_allocate_vertices( struct vbuf_render *render, if (i915_render->vbo_size > size + i915_render->vbo_offset && !i915->vbo_flushed) { } else { i915->vbo_flushed = 0; - pipe_buffer_reference(screen, &i915_render->vbo, NULL); + if (i915_render->vbo) + pipe_buffer_reference(screen, &i915_render->vbo, NULL); } if (!i915_render->vbo) { @@ -134,19 +136,49 @@ i915_vbuf_render_allocate_vertices( struct vbuf_render *render, 64, I915_BUFFER_USAGE_LIT_VERTEX, i915_render->vbo_size); - i915_render->vbo_ptr = pipe_buffer_map(screen, - i915_render->vbo, - PIPE_BUFFER_USAGE_CPU_WRITE); - pipe_buffer_unmap(screen, i915_render->vbo); + } + i915_render->vertex_size = vertex_size; i915->vbo = i915_render->vbo; i915->vbo_offset = i915_render->vbo_offset; i915->dirty |= I915_NEW_VBO; + if (!i915_render->vbo) + return FALSE; + return TRUE; +} + + +static void * +i915_vbuf_render_map_vertices( struct vbuf_render *render ) +{ + struct i915_vbuf_render *i915_render = i915_vbuf_render(render); + struct i915_context *i915 = i915_render->i915; + struct pipe_screen *screen = i915->pipe.screen; + + if (i915->vbo_flushed) + debug_printf("%s bad vbo flush occured stalling on hw\n"); + + i915_render->vbo_ptr = 
pipe_buffer_map(screen, + i915_render->vbo, + PIPE_BUFFER_USAGE_CPU_WRITE); + return (unsigned char *)i915_render->vbo_ptr + i915->vbo_offset; } +static void +i915_vbuf_render_unmap_vertices( struct vbuf_render *render, + ushort min_index, + ushort max_index ) +{ + struct i915_vbuf_render *i915_render = i915_vbuf_render(render); + struct i915_context *i915 = i915_render->i915; + struct pipe_screen *screen = i915->pipe.screen; + + i915_render->vbo_max_used = MAX2(i915_render->vbo_max_used, i915_render->vertex_size * (max_index + 1)); + pipe_buffer_unmap(screen, i915_render->vbo); +} static boolean i915_vbuf_render_set_primitive( struct vbuf_render *render, @@ -197,9 +229,7 @@ i915_vbuf_render_set_primitive( struct vbuf_render *render, i915_render->fallback = 0; return TRUE; default: - assert((int)"Error unkown primtive type" & 0); - /* Actually, can handle a lot more just fine... Fixme. - */ + /* FIXME: Actually, can handle a lot more just fine... */ return FALSE; } } @@ -456,18 +486,15 @@ out: static void -i915_vbuf_render_release_vertices( struct vbuf_render *render, - void *vertices, - unsigned vertex_size, - unsigned vertices_used ) +i915_vbuf_render_release_vertices( struct vbuf_render *render ) { struct i915_vbuf_render *i915_render = i915_vbuf_render(render); struct i915_context *i915 = i915_render->i915; - size_t size = (size_t)vertex_size * (size_t)vertices_used; assert(i915->vbo); - i915_render->vbo_offset += size; + i915_render->vbo_offset += i915_render->vbo_max_used; + i915_render->vbo_max_used = 0; i915->vbo = NULL; i915->dirty |= I915_NEW_VBO; } @@ -501,6 +528,8 @@ i915_vbuf_render_create( struct i915_context *i915 ) i915_render->base.get_vertex_info = i915_vbuf_render_get_vertex_info; i915_render->base.allocate_vertices = i915_vbuf_render_allocate_vertices; + i915_render->base.map_vertices = i915_vbuf_render_map_vertices; + i915_render->base.unmap_vertices = i915_vbuf_render_unmap_vertices; i915_render->base.set_primitive = i915_vbuf_render_set_primitive; i915_render->base.draw = i915_vbuf_render_draw; i915_render->base.draw_arrays = i915_vbuf_render_draw_arrays; diff --git a/src/gallium/drivers/i915simple/i915_screen.c b/src/gallium/drivers/i915simple/i915_screen.c index 1c976082df..b7bd3b3b74 100644 --- a/src/gallium/drivers/i915simple/i915_screen.c +++ b/src/gallium/drivers/i915simple/i915_screen.c @@ -27,7 +27,8 @@ #include "util/u_memory.h" -#include "pipe/p_winsys.h" +#include "util/u_simple_screen.h" +#include "pipe/internal/p_winsys_screen.h" #include "pipe/p_inlines.h" #include "util/u_string.h" @@ -203,16 +204,79 @@ i915_destroy_screen( struct pipe_screen *screen ) } +static struct pipe_transfer* +i915_get_tex_transfer(struct pipe_screen *screen, + struct pipe_texture *texture, + unsigned face, unsigned level, unsigned zslice, + enum pipe_transfer_usage usage, unsigned x, unsigned y, + unsigned w, unsigned h) +{ + struct i915_texture *tex = (struct i915_texture *)texture; + struct i915_transfer *trans; + unsigned offset; /* in bytes */ + + if (texture->target == PIPE_TEXTURE_CUBE) { + offset = tex->image_offset[level][face]; + } + else if (texture->target == PIPE_TEXTURE_3D) { + offset = tex->image_offset[level][zslice]; + } + else { + offset = tex->image_offset[level][0]; + assert(face == 0); + assert(zslice == 0); + } + + trans = CALLOC_STRUCT(i915_transfer); + if (trans) { + trans->base.refcount = 1; + pipe_texture_reference(&trans->base.texture, texture); + trans->base.format = trans->base.format; + trans->base.width = w; + trans->base.height = h; + 
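The vbo_max_used bookkeeping added above is a bump allocator over one long-lived vertex buffer: unmap records how many bytes the draw actually indexed (vertex_size * (max_index + 1)), and release advances the offset by that amount. A sketch of just the accounting, with the real pipe_buffer map/unmap elided and names illustrative:

/* Sketch: sub-allocating consecutive draws from one vertex buffer. */
#include <stdio.h>
#include <stddef.h>

#define MAX2(a, b) ((a) > (b) ? (a) : (b))

struct vbuf {
   size_t size;         /* total VBO size in bytes */
   size_t offset;       /* start of the current draw's vertices */
   size_t max_used;     /* bytes referenced since the last release */
   size_t vertex_size;  /* bytes per vertex */
};

static int vbuf_allocate(struct vbuf *vb, size_t vertex_size, unsigned nr)
{
   if (vb->offset + vertex_size * nr > vb->size)
      return 0;                      /* caller would flush and retry */
   vb->vertex_size = vertex_size;
   return 1;
}

static void vbuf_unmap(struct vbuf *vb, unsigned max_index)
{
   vb->max_used = MAX2(vb->max_used, vb->vertex_size * (max_index + 1));
}

static void vbuf_release(struct vbuf *vb)
{
   vb->offset += vb->max_used;       /* next draw starts after what was used */
   vb->max_used = 0;
}

int main(void)
{
   struct vbuf vb = { 64 * 1024, 0, 0, 0 };
   if (vbuf_allocate(&vb, 32, 100)) {
      vbuf_unmap(&vb, 57);           /* only 58 vertices actually indexed */
      vbuf_release(&vb);
   }
   printf("next offset: %zu bytes\n", vb.offset);   /* 58 * 32 = 1856 */
   return 0;
}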
trans->base.block = texture->block; + trans->base.nblocksx = texture->nblocksx[level]; + trans->base.nblocksy = texture->nblocksy[level]; + trans->base.stride = tex->stride; + trans->offset = offset; + trans->base.usage = usage; + } + return &trans->base; +} + +static void +i915_tex_transfer_release(struct pipe_screen *screen, + struct pipe_transfer **transfer) +{ + struct pipe_transfer *trans = *transfer; + + if (--trans->refcount == 0) { + pipe_texture_reference(&trans->texture, NULL); + FREE(trans); + } + + *transfer = NULL; +} + static void * -i915_surface_map( struct pipe_screen *screen, - struct pipe_surface *surface, - unsigned flags ) +i915_transfer_map( struct pipe_screen *screen, + struct pipe_transfer *transfer ) { - char *map = pipe_buffer_map( screen, surface->buffer, flags ); + struct i915_texture *tex = (struct i915_texture *)transfer->texture; + char *map; + unsigned flags = 0; + + if (transfer->usage != PIPE_TRANSFER_WRITE) + flags |= PIPE_BUFFER_USAGE_CPU_READ; + + if (transfer->usage != PIPE_TRANSFER_READ) + flags |= PIPE_BUFFER_USAGE_CPU_WRITE; + + map = pipe_buffer_map( screen, tex->buffer, flags ); if (map == NULL) return NULL; - if (surface->texture && + if (transfer->texture && (flags & PIPE_BUFFER_USAGE_CPU_WRITE)) { /* Do something to notify contexts of a texture change. @@ -220,14 +284,17 @@ i915_surface_map( struct pipe_screen *screen, /* i915_screen(screen)->timestamp++; */ } - return map + surface->offset; + return map + i915_transfer(transfer)->offset + + transfer->y / transfer->block.height * transfer->stride + + transfer->x / transfer->block.width * transfer->block.size; } static void -i915_surface_unmap(struct pipe_screen *screen, - struct pipe_surface *surface) +i915_transfer_unmap(struct pipe_screen *screen, + struct pipe_transfer *transfer) { - pipe_buffer_unmap( screen, surface->buffer ); + struct i915_texture *tex = (struct i915_texture *)transfer->texture; + pipe_buffer_unmap( screen, tex->buffer ); } @@ -275,10 +342,13 @@ i915_create_screen(struct pipe_winsys *winsys, uint pci_id) i915screen->screen.get_param = i915_get_param; i915screen->screen.get_paramf = i915_get_paramf; i915screen->screen.is_format_supported = i915_is_format_supported; - i915screen->screen.surface_map = i915_surface_map; - i915screen->screen.surface_unmap = i915_surface_unmap; + i915screen->screen.get_tex_transfer = i915_get_tex_transfer; + i915screen->screen.tex_transfer_release = i915_tex_transfer_release; + i915screen->screen.transfer_map = i915_transfer_map; + i915screen->screen.transfer_unmap = i915_transfer_unmap; i915_init_screen_texture_functions(&i915screen->screen); + u_simple_screen_init(&i915screen->screen); return &i915screen->screen; } diff --git a/src/gallium/drivers/i915simple/i915_screen.h b/src/gallium/drivers/i915simple/i915_screen.h index 73b0ff05ce..a371663453 100644 --- a/src/gallium/drivers/i915simple/i915_screen.h +++ b/src/gallium/drivers/i915simple/i915_screen.h @@ -50,13 +50,30 @@ struct i915_screen }; -/** cast wrapper */ +/** + * Subclass of pipe_transfer + */ +struct i915_transfer +{ + struct pipe_transfer base; + + unsigned offset; +}; + + +/** cast wrappers */ static INLINE struct i915_screen * i915_screen(struct pipe_screen *pscreen) { return (struct i915_screen *) pscreen; } +static INLINE struct i915_transfer * +i915_transfer( struct pipe_transfer *transfer ) +{ + return (struct i915_transfer *)transfer; +} + extern struct pipe_screen * i915_create_screen(struct pipe_winsys *winsys, uint pci_id); diff --git 
a/src/gallium/drivers/i915simple/i915_state.c b/src/gallium/drivers/i915simple/i915_state.c index d2487d8277..273e74002a 100644 --- a/src/gallium/drivers/i915simple/i915_state.c +++ b/src/gallium/drivers/i915simple/i915_state.c @@ -30,7 +30,7 @@ #include "draw/draw_context.h" -#include "pipe/p_winsys.h" +#include "pipe/internal/p_winsys_screen.h" #include "pipe/p_inlines.h" #include "util/u_math.h" #include "util/u_memory.h" @@ -318,8 +318,8 @@ i915_create_depth_stencil_state(struct pipe_context *pipe, struct i915_depth_stencil_state *cso = CALLOC_STRUCT( i915_depth_stencil_state ); { - int testmask = depth_stencil->stencil[0].value_mask & 0xff; - int writemask = depth_stencil->stencil[0].write_mask & 0xff; + int testmask = depth_stencil->stencil[0].valuemask & 0xff; + int writemask = depth_stencil->stencil[0].writemask & 0xff; cso->stencil_modes4 |= (_3DSTATE_MODES_4_CMD | ENABLE_STENCIL_TEST_MASK | @@ -350,8 +350,8 @@ i915_create_depth_stencil_state(struct pipe_context *pipe, int dfop = i915_translate_stencil_op(depth_stencil->stencil[1].zfail_op); int dpop = i915_translate_stencil_op(depth_stencil->stencil[1].zpass_op); int ref = depth_stencil->stencil[1].ref_value & 0xff; - int tmask = depth_stencil->stencil[1].value_mask & 0xff; - int wmask = depth_stencil->stencil[1].write_mask & 0xff; + int tmask = depth_stencil->stencil[1].valuemask & 0xff; + int wmask = depth_stencil->stencil[1].writemask & 0xff; cso->bfo[0] = (_3DSTATE_BACKFACE_STENCIL_OPS | BFO_ENABLE_STENCIL_FUNCS | @@ -394,7 +394,7 @@ i915_create_depth_stencil_state(struct pipe_context *pipe, if (depth_stencil->alpha.enabled) { int test = i915_translate_compare_func(depth_stencil->alpha.func); - ubyte refByte = float_to_ubyte(depth_stencil->alpha.ref); + ubyte refByte = float_to_ubyte(depth_stencil->alpha.ref_value); cso->depth_LIS6 |= (S6_ALPHA_TEST_ENABLE | (test << S6_ALPHA_TEST_FUNC_SHIFT) | @@ -535,13 +535,13 @@ static void i915_set_constant_buffer(struct pipe_context *pipe, */ if (buf) { void *mapped; - if (buf->size && + if (buf->buffer && buf->buffer->size && (mapped = ws->buffer_map(ws, buf->buffer, - PIPE_BUFFER_USAGE_CPU_READ))) { - memcpy(i915->current.constants[shader], mapped, buf->size); + PIPE_BUFFER_USAGE_CPU_READ))) { + memcpy(i915->current.constants[shader], mapped, buf->buffer->size); ws->buffer_unmap(ws, buf->buffer); i915->current.num_user_constants[shader] - = buf->size / (4 * sizeof(float)); + = buf->buffer->size / (4 * sizeof(float)); } else { i915->current.num_user_constants[shader] = 0; diff --git a/src/gallium/drivers/i915simple/i915_state_emit.c b/src/gallium/drivers/i915simple/i915_state_emit.c index 9bd6f92323..26e03f5127 100644 --- a/src/gallium/drivers/i915simple/i915_state_emit.c +++ b/src/gallium/drivers/i915simple/i915_state_emit.c @@ -211,20 +211,23 @@ i915_emit_hardware_state(struct i915_context *i915 ) struct pipe_surface *depth_surface = i915->framebuffer.zsbuf; if (cbuf_surface) { - unsigned cpitch = cbuf_surface->stride; unsigned ctile = BUF_3D_USE_FENCE; - if (cbuf_surface->texture && - ((struct i915_texture*)(cbuf_surface->texture))->tiled) { + struct i915_texture *tex = (struct i915_texture *) + cbuf_surface->texture; + struct pipe_buffer *buffer = tex->buffer; + assert(tex); + + if (tex && tex->tiled) { ctile = BUF_3D_TILED_SURFACE; } OUT_BATCH(_3DSTATE_BUF_INFO_CMD); - OUT_BATCH(BUF_3D_ID_COLOR_BACK | - BUF_3D_PITCH(cpitch) | /* pitch in bytes */ + OUT_BATCH(BUF_3D_ID_COLOR_BACK | + BUF_3D_PITCH(tex->stride) | /* pitch in bytes */ ctile); - OUT_RELOC(cbuf_surface->buffer, + 
OUT_RELOC(tex->buffer, I915_BUFFER_ACCESS_WRITE, cbuf_surface->offset); } @@ -232,20 +235,23 @@ i915_emit_hardware_state(struct i915_context *i915 ) /* What happens if no zbuf?? */ if (depth_surface) { - unsigned zpitch = depth_surface->stride; unsigned ztile = BUF_3D_USE_FENCE; - if (depth_surface->texture && - ((struct i915_texture*)(depth_surface->texture))->tiled) { + struct i915_texture *tex = (struct i915_texture *) + depth_surface->texture; + struct pipe_buffer *buffer = tex->buffer; + assert(tex); + + if (tex && tex->tiled) { ztile = BUF_3D_TILED_SURFACE; } OUT_BATCH(_3DSTATE_BUF_INFO_CMD); OUT_BATCH(BUF_3D_ID_DEPTH | - BUF_3D_PITCH(zpitch) | /* pitch in bytes */ + BUF_3D_PITCH(tex->stride) | /* pitch in bytes */ ztile); - OUT_RELOC(depth_surface->buffer, + OUT_RELOC(tex->buffer, I915_BUFFER_ACCESS_WRITE, depth_surface->offset); } diff --git a/src/gallium/drivers/i915simple/i915_surface.c b/src/gallium/drivers/i915simple/i915_surface.c index 62f1926644..7eec649906 100644 --- a/src/gallium/drivers/i915simple/i915_surface.c +++ b/src/gallium/drivers/i915simple/i915_surface.c @@ -31,7 +31,7 @@ #include "pipe/p_defines.h" #include "pipe/p_inlines.h" #include "pipe/p_inlines.h" -#include "pipe/p_winsys.h" +#include "pipe/internal/p_winsys_screen.h" #include "util/u_tile.h" #include "util/u_rect.h" @@ -47,42 +47,22 @@ i915_surface_copy(struct pipe_context *pipe, struct pipe_surface *src, unsigned srcx, unsigned srcy, unsigned width, unsigned height) { - assert( dst != src ); - assert( dst->block.size == src->block.size ); - assert( dst->block.width == src->block.height ); - assert( dst->block.height == src->block.height ); + struct i915_texture *dst_tex = (struct i915_texture *)dst->texture; + struct i915_texture *src_tex = (struct i915_texture *)src->texture; - if (0) { - void *dst_map = pipe->screen->surface_map( pipe->screen, - dst, - PIPE_BUFFER_USAGE_CPU_WRITE ); - - const void *src_map = pipe->screen->surface_map( pipe->screen, - src, - PIPE_BUFFER_USAGE_CPU_READ ); - - pipe_copy_rect(dst_map, - &dst->block, - dst->stride, - dstx, dsty, - width, height, - src_map, - do_flip ? -(int) src->stride : src->stride, - srcx, do_flip ? 
height - 1 - srcy : srcy); + assert( dst != src ); + assert( dst_tex->base.block.size == src_tex->base.block.size ); + assert( dst_tex->base.block.width == src_tex->base.block.height ); + assert( dst_tex->base.block.height == src_tex->base.block.height ); + assert( dst_tex->base.block.width == 1 ); + assert( dst_tex->base.block.height == 1 ); - pipe->screen->surface_unmap(pipe->screen, src); - pipe->screen->surface_unmap(pipe->screen, dst); - } - else { - assert(dst->block.width == 1); - assert(dst->block.height == 1); - i915_copy_blit( i915_context(pipe), - do_flip, - dst->block.size, - (short) src->stride, src->buffer, src->offset, - (short) dst->stride, dst->buffer, dst->offset, - (short) srcx, (short) srcy, (short) dstx, (short) dsty, (short) width, (short) height ); - } + i915_copy_blit( i915_context(pipe), + do_flip, + dst_tex->base.block.size, + (unsigned short) src_tex->stride, src_tex->buffer, src->offset, + (unsigned short) dst_tex->stride, dst_tex->buffer, dst->offset, + (short) srcx, (short) srcy, (short) dstx, (short) dsty, (short) width, (short) height ); } @@ -92,26 +72,18 @@ i915_surface_fill(struct pipe_context *pipe, unsigned dstx, unsigned dsty, unsigned width, unsigned height, unsigned value) { - if (0) { - void *dst_map = pipe->screen->surface_map( pipe->screen, - dst, - PIPE_BUFFER_USAGE_CPU_WRITE ); + struct i915_texture *tex = (struct i915_texture *)dst->texture; - pipe_fill_rect(dst_map, &dst->block, dst->stride, dstx, dsty, width, height, value); + assert(tex->base.block.width == 1); + assert(tex->base.block.height == 1); - pipe->screen->surface_unmap(pipe->screen, dst); - } - else { - assert(dst->block.width == 1); - assert(dst->block.height == 1); - i915_fill_blit( i915_context(pipe), - dst->block.size, - (short) dst->stride, - dst->buffer, dst->offset, - (short) dstx, (short) dsty, - (short) width, (short) height, - value ); - } + i915_fill_blit( i915_context(pipe), + tex->base.block.size, + (unsigned short) tex->stride, + tex->buffer, dst->offset, + (short) dstx, (short) dsty, + (short) width, (short) height, + value ); } diff --git a/src/gallium/drivers/i915simple/i915_texture.c b/src/gallium/drivers/i915simple/i915_texture.c index bd87217063..957726523f 100644 --- a/src/gallium/drivers/i915simple/i915_texture.c +++ b/src/gallium/drivers/i915simple/i915_texture.c @@ -34,7 +34,7 @@ #include "pipe/p_context.h" #include "pipe/p_defines.h" #include "pipe/p_inlines.h" -#include "pipe/p_winsys.h" +#include "pipe/internal/p_winsys_screen.h" #include "util/u_math.h" #include "util/u_memory.h" @@ -178,7 +178,9 @@ i915_displaytarget_layout(struct i915_texture *tex) if (tex->base.width[0] >= 128) { tex->stride = power_of_two(tex->base.nblocksx[0] * pt->block.size); tex->total_nblocksy = round_up(tex->base.nblocksy[0], 8); +#if 0 /* used for tiled display targets */ tex->tiled = 1; +#endif } else { tex->stride = round_up(tex->base.nblocksx[0] * pt->block.size, 64); tex->total_nblocksy = tex->base.nblocksy[0]; @@ -206,11 +208,10 @@ i945_miptree_layout_2d( struct i915_texture *tex ) unsigned nblocksx = pt->nblocksx[0]; unsigned nblocksy = pt->nblocksy[0]; -#if 0 /* used for tiled display targets */ - if (pt->last_level == 0 && pt->block.size == 4) + /* used for tiled display targets */ + if (0) if (i915_displaytarget_layout(tex)) return; -#endif tex->stride = round_up(pt->nblocksx[0] * pt->block.size, 4); @@ -605,8 +606,8 @@ i915_texture_create(struct pipe_screen *screen, tex_size = tex->stride * tex->total_nblocksy; tex->buffer = ws->buffer_create(ws, 64, - 
PIPE_BUFFER_USAGE_PIXEL, - tex_size); + PIPE_BUFFER_USAGE_PIXEL, + tex_size); if (!tex->buffer) goto fail; @@ -663,7 +664,6 @@ i915_get_tex_surface(struct pipe_screen *screen, unsigned flags) { struct i915_texture *tex = (struct i915_texture *)pt; - struct pipe_winsys *ws = screen->winsys; struct pipe_surface *ps; unsigned offset; /* in bytes */ @@ -682,16 +682,10 @@ i915_get_tex_surface(struct pipe_screen *screen, ps = CALLOC_STRUCT(pipe_surface); if (ps) { ps->refcount = 1; - ps->winsys = ws; pipe_texture_reference(&ps->texture, pt); - pipe_buffer_reference(screen, &ps->buffer, tex->buffer); ps->format = pt->format; ps->width = pt->width[level]; ps->height = pt->height[level]; - ps->block = pt->block; - ps->nblocksx = pt->nblocksx[level]; - ps->nblocksy = pt->nblocksy[level]; - ps->stride = tex->stride; ps->offset = offset; ps->usage = flags; ps->status = PIPE_SURFACE_STATUS_DEFINED; @@ -756,7 +750,6 @@ i915_tex_surface_release(struct pipe_screen *screen, } pipe_texture_reference(&surf->texture, NULL); - pipe_buffer_reference(screen, &surf->buffer, NULL); FREE(surf); } diff --git a/src/gallium/drivers/i965simple/Makefile b/src/gallium/drivers/i965simple/Makefile index e97146e57c..19182afa75 100644 --- a/src/gallium/drivers/i965simple/Makefile +++ b/src/gallium/drivers/i965simple/Makefile @@ -50,5 +50,3 @@ C_SOURCES = \ brw_wm_surface_state.c include ../../Makefile.template - -symlinks: diff --git a/src/gallium/drivers/i965simple/brw_blit.c b/src/gallium/drivers/i965simple/brw_blit.c index 8494f70493..4d11f8d2ab 100644 --- a/src/gallium/drivers/i965simple/brw_blit.c +++ b/src/gallium/drivers/i965simple/brw_blit.c @@ -35,7 +35,7 @@ #include "brw_reg.h" #include "pipe/p_context.h" -#include "pipe/p_winsys.h" +#include "pipe/internal/p_winsys_screen.h" #define FILE_DEBUG_FLAG DEBUG_BLIT diff --git a/src/gallium/drivers/i965simple/brw_cc.c b/src/gallium/drivers/i965simple/brw_cc.c index 79d4150383..3668123e2e 100644 --- a/src/gallium/drivers/i965simple/brw_cc.c +++ b/src/gallium/drivers/i965simple/brw_cc.c @@ -166,8 +166,8 @@ static void upload_cc_unit( struct brw_context *brw ) cc.cc0.stencil_pass_depth_pass_op = brw_translate_stencil_op( brw->attribs.DepthStencil->stencil[0].zpass_op); cc.cc1.stencil_ref = brw->attribs.DepthStencil->stencil[0].ref_value; - cc.cc1.stencil_write_mask = brw->attribs.DepthStencil->stencil[0].write_mask; - cc.cc1.stencil_test_mask = brw->attribs.DepthStencil->stencil[0].value_mask; + cc.cc1.stencil_write_mask = brw->attribs.DepthStencil->stencil[0].writemask; + cc.cc1.stencil_test_mask = brw->attribs.DepthStencil->stencil[0].valuemask; if (brw->attribs.DepthStencil->stencil[1].enabled) { cc.cc0.bf_stencil_enable = brw->attribs.DepthStencil->stencil[1].enabled; @@ -180,14 +180,14 @@ static void upload_cc_unit( struct brw_context *brw ) cc.cc0.bf_stencil_pass_depth_pass_op = brw_translate_stencil_op( brw->attribs.DepthStencil->stencil[1].zpass_op); cc.cc1.bf_stencil_ref = brw->attribs.DepthStencil->stencil[1].ref_value; - cc.cc2.bf_stencil_write_mask = brw->attribs.DepthStencil->stencil[1].write_mask; - cc.cc2.bf_stencil_test_mask = brw->attribs.DepthStencil->stencil[1].value_mask; + cc.cc2.bf_stencil_write_mask = brw->attribs.DepthStencil->stencil[1].writemask; + cc.cc2.bf_stencil_test_mask = brw->attribs.DepthStencil->stencil[1].valuemask; } /* Not really sure about this: */ - if (brw->attribs.DepthStencil->stencil[0].write_mask || - brw->attribs.DepthStencil->stencil[1].write_mask) + if (brw->attribs.DepthStencil->stencil[0].writemask || + 
brw->attribs.DepthStencil->stencil[1].writemask) cc.cc0.stencil_write_enable = 1; } @@ -233,7 +233,7 @@ static void upload_cc_unit( struct brw_context *brw ) cc.cc3.alpha_test_func = brw_translate_compare_func(brw->attribs.DepthStencil->alpha.func); - cc.cc7.alpha_ref.ub[0] = float_to_ubyte(brw->attribs.DepthStencil->alpha.ref); + cc.cc7.alpha_ref.ub[0] = float_to_ubyte(brw->attribs.DepthStencil->alpha.ref_value); cc.cc3.alpha_test_format = BRW_ALPHATEST_FORMAT_UNORM8; } diff --git a/src/gallium/drivers/i965simple/brw_context.c b/src/gallium/drivers/i965simple/brw_context.c index 96920df008..c74cbf8d73 100644 --- a/src/gallium/drivers/i965simple/brw_context.c +++ b/src/gallium/drivers/i965simple/brw_context.c @@ -37,7 +37,7 @@ #include "brw_tex_layout.h" #include "brw_winsys.h" -#include "pipe/p_winsys.h" +#include "pipe/internal/p_winsys_screen.h" #include "pipe/p_context.h" #include "util/u_memory.h" #include "pipe/p_screen.h" diff --git a/src/gallium/drivers/i965simple/brw_curbe.c b/src/gallium/drivers/i965simple/brw_curbe.c index 824ee7fd6d..904cde8e30 100644 --- a/src/gallium/drivers/i965simple/brw_curbe.c +++ b/src/gallium/drivers/i965simple/brw_curbe.c @@ -38,7 +38,7 @@ #include "brw_util.h" #include "brw_wm.h" #include "pipe/p_state.h" -#include "pipe/p_winsys.h" +#include "pipe/internal/p_winsys_screen.h" #include "util/u_math.h" #include "util/u_memory.h" @@ -257,13 +257,13 @@ static void upload_constant_buffer(struct brw_context *brw) if (brw->vs.prog_data->num_consts) { /* map the vertex constant buffer and copy to curbe: */ void *data = ws->buffer_map(ws, cbuffer->buffer, 0); - /* FIXME: this is wrong. the cbuffer->size currently + /* FIXME: this is wrong. the cbuffer->buffer->size currently * represents size of consts + immediates. so if we'll * have both we'll copy over the end of the buffer * with the subsequent memcpy */ - memcpy(&buf[offset], data, cbuffer->size); + memcpy(&buf[offset], data, cbuffer->buffer->size); ws->buffer_unmap(ws, cbuffer->buffer); - offset += cbuffer->size; + offset += cbuffer->buffer->size; } /*immediates*/ if (brw->vs.prog_data->num_imm) { diff --git a/src/gallium/drivers/i965simple/brw_draw.c b/src/gallium/drivers/i965simple/brw_draw.c index 7598e3dc8a..648aaa0da5 100644 --- a/src/gallium/drivers/i965simple/brw_draw.c +++ b/src/gallium/drivers/i965simple/brw_draw.c @@ -34,7 +34,7 @@ #include "brw_state.h" #include "pipe/p_context.h" -#include "pipe/p_winsys.h" +#include "pipe/internal/p_winsys_screen.h" static unsigned hw_prim[PIPE_PRIM_POLYGON+1] = { _3DPRIM_POINTLIST, diff --git a/src/gallium/drivers/i965simple/brw_draw_upload.c b/src/gallium/drivers/i965simple/brw_draw_upload.c index 7c20ea52af..2d9ca3f2ea 100644 --- a/src/gallium/drivers/i965simple/brw_draw_upload.c +++ b/src/gallium/drivers/i965simple/brw_draw_upload.c @@ -223,7 +223,7 @@ boolean brw_upload_vertex_buffers( struct brw_context *brw ) break; } - vbp.vb[i].vb0.bits.pitch = brw->vb.vbo_array[i]->pitch; + vbp.vb[i].vb0.bits.pitch = brw->vb.vbo_array[i]->stride; vbp.vb[i].vb0.bits.pad = 0; vbp.vb[i].vb0.bits.access_type = BRW_VERTEXBUFFER_ACCESS_VERTEXDATA; vbp.vb[i].vb0.bits.vb_index = i; diff --git a/src/gallium/drivers/i965simple/brw_eu_debug.c b/src/gallium/drivers/i965simple/brw_eu_debug.c index 4a94ddefa6..4adfb0c02f 100644 --- a/src/gallium/drivers/i965simple/brw_eu_debug.c +++ b/src/gallium/drivers/i965simple/brw_eu_debug.c @@ -30,7 +30,7 @@ */ -#include "pipe/p_debug.h" +#include "util/u_debug.h" #include "brw_eu.h" diff --git 
a/src/gallium/drivers/i965simple/brw_misc_state.c b/src/gallium/drivers/i965simple/brw_misc_state.c index be812c5da9..99ff4403a5 100644 --- a/src/gallium/drivers/i965simple/brw_misc_state.c +++ b/src/gallium/drivers/i965simple/brw_misc_state.c @@ -223,7 +223,7 @@ static void upload_depthbuffer(struct brw_context *brw) OUT_BATCH(0); } else { unsigned int format; - + struct brw_texture *tex = (struct brw_texture *)depth_surface->texture; assert(depth_surface->block.width == 1); assert(depth_surface->block.height == 1); switch (depth_surface->block.size) { @@ -246,7 +246,7 @@ static void upload_depthbuffer(struct brw_context *brw) (BRW_TILEWALK_YMAJOR << 26) | // (depth_surface->region->tiled << 27) | (BRW_SURFACE_2D << 29)); - OUT_RELOC(depth_surface->buffer, + OUT_RELOC(tex->buffer, PIPE_BUFFER_USAGE_GPU_READ | PIPE_BUFFER_USAGE_GPU_WRITE, 0); OUT_BATCH((BRW_SURFACE_MIPMAPLAYOUT_BELOW << 1) | ((depth_surface->stride/depth_surface->block.size - 1) << 6) | diff --git a/src/gallium/drivers/i965simple/brw_screen.c b/src/gallium/drivers/i965simple/brw_screen.c index ab7cd624b2..b22e105f10 100644 --- a/src/gallium/drivers/i965simple/brw_screen.c +++ b/src/gallium/drivers/i965simple/brw_screen.c @@ -27,8 +27,9 @@ #include "util/u_memory.h" -#include "pipe/p_winsys.h" +#include "pipe/internal/p_winsys_screen.h" #include "util/u_string.h" +#include "util/u_simple_screen.h" #include "brw_context.h" #include "brw_screen.h" @@ -239,6 +240,7 @@ brw_create_screen(struct pipe_winsys *winsys, uint pci_id) brwscreen->screen.is_format_supported = brw_is_format_supported; brw_init_screen_texture_funcs(&brwscreen->screen); + u_simple_screen_init(&brwscreen->screen); return &brwscreen->screen; } diff --git a/src/gallium/drivers/i965simple/brw_state.c b/src/gallium/drivers/i965simple/brw_state.c index af46cb546f..b47f5373f3 100644 --- a/src/gallium/drivers/i965simple/brw_state.c +++ b/src/gallium/drivers/i965simple/brw_state.c @@ -30,7 +30,7 @@ */ -#include "pipe/p_winsys.h" +#include "pipe/internal/p_winsys_screen.h" #include "util/u_memory.h" #include "pipe/p_inlines.h" #include "pipe/p_shader_tokens.h" diff --git a/src/gallium/drivers/i965simple/brw_state_pool.c b/src/gallium/drivers/i965simple/brw_state_pool.c index 007dc8f9de..e91263cb1f 100644 --- a/src/gallium/drivers/i965simple/brw_state_pool.c +++ b/src/gallium/drivers/i965simple/brw_state_pool.c @@ -42,7 +42,7 @@ * the pool. 
*/ -#include "pipe/p_winsys.h" +#include "pipe/internal/p_winsys_screen.h" #include "util/u_math.h" #include "util/u_memory.h" #include "pipe/p_inlines.h" diff --git a/src/gallium/drivers/i965simple/brw_surface.c b/src/gallium/drivers/i965simple/brw_surface.c index b89756c47b..0a95dce194 100644 --- a/src/gallium/drivers/i965simple/brw_surface.c +++ b/src/gallium/drivers/i965simple/brw_surface.c @@ -30,7 +30,7 @@ #include "brw_state.h" #include "pipe/p_defines.h" #include "pipe/p_inlines.h" -#include "pipe/p_winsys.h" +#include "pipe/internal/p_winsys_screen.h" #include "util/u_tile.h" #include "util/u_rect.h" @@ -74,13 +74,15 @@ brw_surface_copy(struct pipe_context *pipe, pipe->screen->surface_unmap(pipe->screen, dst); } else { + struct brw_texture *dst_tex = (struct brw_texture *)dst->texture; + struct brw_texture *src_tex = (struct brw_texture *)src->texture; assert(dst->block.width == 1); assert(dst->block.height == 1); brw_copy_blit(brw_context(pipe), do_flip, dst->block.size, - (short) src->stride/src->block.size, src->buffer, src->offset, FALSE, - (short) dst->stride/dst->block.size, dst->buffer, dst->offset, FALSE, + (short) src->stride/src->block.size, src_tex->buffer, src->offset, FALSE, + (short) dst->stride/dst->block.size, dst_tex->buffer, dst->offset, FALSE, (short) srcx, (short) srcy, (short) dstx, (short) dsty, (short) width, (short) height, PIPE_LOGICOP_COPY); } @@ -103,12 +105,13 @@ brw_surface_fill(struct pipe_context *pipe, pipe->screen->surface_unmap(pipe->screen, dst); } else { + struct brw_texture *tex = (struct brw_texture *)dst->texture; assert(dst->block.width == 1); assert(dst->block.height == 1); brw_fill_blit(brw_context(pipe), dst->block.size, (short) dst->stride/dst->block.size, - dst->buffer, dst->offset, FALSE, + tex->buffer, dst->offset, FALSE, (short) dstx, (short) dsty, (short) width, (short) height, value); diff --git a/src/gallium/drivers/i965simple/brw_tex_layout.c b/src/gallium/drivers/i965simple/brw_tex_layout.c index cc0c665e02..448229ed4e 100644 --- a/src/gallium/drivers/i965simple/brw_tex_layout.c +++ b/src/gallium/drivers/i965simple/brw_tex_layout.c @@ -37,7 +37,7 @@ #include "pipe/p_context.h" #include "pipe/p_defines.h" #include "pipe/p_inlines.h" -#include "pipe/p_winsys.h" +#include "pipe/internal/p_winsys_screen.h" #include "util/u_math.h" #include "util/u_memory.h" #include "brw_context.h" @@ -296,9 +296,9 @@ brw_texture_create_screen(struct pipe_screen *screen, if (brw_miptree_layout(tex)) tex->buffer = ws->buffer_create(ws, 64, - PIPE_BUFFER_USAGE_PIXEL, - tex->stride * - tex->total_nblocksy); + PIPE_BUFFER_USAGE_PIXEL, + tex->stride * + tex->total_nblocksy); if (!tex->buffer) { FREE(tex); @@ -322,7 +322,6 @@ brw_texture_release_screen(struct pipe_screen *screen, __FUNCTION__, (void *) *pt, (*pt)->refcount - 1); */ if (--(*pt)->refcount <= 0) { - struct pipe_winsys *ws = screen->winsys; struct brw_texture *tex = (struct brw_texture *)*pt; uint i; @@ -330,7 +329,7 @@ brw_texture_release_screen(struct pipe_screen *screen, DBG("%s deleting %p\n", __FUNCTION__, (void *) tex); */ - winsys_buffer_reference(ws, &tex->buffer, NULL); + pipe_buffer_reference(screen, &tex->buffer, NULL); for (i = 0; i < PIPE_MAX_TEXTURE_LEVELS; i++) if (tex->image_offset[i]) @@ -347,7 +346,6 @@ brw_get_tex_surface_screen(struct pipe_screen *screen, struct pipe_texture *pt, unsigned face, unsigned level, unsigned zslice) { - struct pipe_winsys *ws = screen->winsys; struct brw_texture *tex = (struct brw_texture *)pt; struct pipe_surface *ps; unsigned offset; /* in 
bytes */ @@ -365,11 +363,10 @@ brw_get_tex_surface_screen(struct pipe_screen *screen, assert(zslice == 0); } - ps = ws->surface_alloc(ws); + ps = CALLOC_STRUCT(pipe_surface); if (ps) { - assert(ps->format); - assert(ps->refcount); - winsys_buffer_reference(ws, &ps->buffer, tex->buffer); + ps->refcount = 1; + pipe_texture_reference(&ps->texture, pt); ps->format = pt->format; ps->width = pt->width[level]; ps->height = pt->height[level]; @@ -378,6 +375,7 @@ brw_get_tex_surface_screen(struct pipe_screen *screen, ps->nblocksy = pt->nblocksy[level]; ps->stride = tex->stride; ps->offset = offset; + ps->status = PIPE_SURFACE_STATUS_DEFINED; } return ps; } diff --git a/src/gallium/drivers/i965simple/brw_vs_emit.c b/src/gallium/drivers/i965simple/brw_vs_emit.c index 34dbc0624d..e03d653482 100644 --- a/src/gallium/drivers/i965simple/brw_vs_emit.c +++ b/src/gallium/drivers/i965simple/brw_vs_emit.c @@ -1293,7 +1293,7 @@ void brw_vs_emit(struct brw_vs_compile *c) break; case TGSI_TOKEN_TYPE_IMMEDIATE: { struct tgsi_full_immediate *imm = &parse.FullToken.FullImmediate; - /*assert(imm->Immediate.Size == 4);*/ + assert(imm->Immediate.NrTokens == 4 + 1); c->prog_data.imm_buf[c->prog_data.num_imm][0] = imm->u.ImmediateFloat32[0].Float; c->prog_data.imm_buf[c->prog_data.num_imm][1] = imm->u.ImmediateFloat32[1].Float; c->prog_data.imm_buf[c->prog_data.num_imm][2] = imm->u.ImmediateFloat32[2].Float; diff --git a/src/gallium/drivers/i965simple/brw_wm.c b/src/gallium/drivers/i965simple/brw_wm.c index 8de565b96c..10161f2d2f 100644 --- a/src/gallium/drivers/i965simple/brw_wm.c +++ b/src/gallium/drivers/i965simple/brw_wm.c @@ -111,8 +111,8 @@ static void brw_wm_populate_key( struct brw_context *brw, if (brw->attribs.DepthStencil->stencil[0].enabled) { lookup |= IZ_STENCIL_TEST_ENABLE_BIT; - if (brw->attribs.DepthStencil->stencil[0].write_mask || - brw->attribs.DepthStencil->stencil[1].write_mask) + if (brw->attribs.DepthStencil->stencil[0].writemask || + brw->attribs.DepthStencil->stencil[1].writemask) lookup |= IZ_STENCIL_WRITE_ENABLE_BIT; } diff --git a/src/gallium/drivers/i965simple/brw_wm_surface_state.c b/src/gallium/drivers/i965simple/brw_wm_surface_state.c index 1a326f9918..1bab5bfdb3 100644 --- a/src/gallium/drivers/i965simple/brw_wm_surface_state.c +++ b/src/gallium/drivers/i965simple/brw_wm_surface_state.c @@ -193,6 +193,7 @@ static void upload_wm_surfaces(struct brw_context *brw ) /* BRW_NEW_FRAMEBUFFER */ struct pipe_surface *pipe_surface = brw->attribs.FrameBuffer.cbufs[0];/*fixme*/ + struct brw_texture *tex = (struct brw_texture *)pipe_surface->texture; memset(&surf, 0, sizeof(surf)); @@ -204,7 +205,7 @@ static void upload_wm_surfaces(struct brw_context *brw ) surf.ss0.surface_type = BRW_SURFACE_2D; - surf.ss1.base_addr = brw_buffer_offset( brw, pipe_surface->buffer ); + surf.ss1.base_addr = brw_buffer_offset( brw, tex->buffer ); surf.ss2.width = pipe_surface->width - 1; surf.ss2.height = pipe_surface->height - 1; diff --git a/src/gallium/drivers/nouveau/nouveau_gldefs.h b/src/gallium/drivers/nouveau/nouveau_gldefs.h new file mode 100644 index 0000000000..ff97aaa9af --- /dev/null +++ b/src/gallium/drivers/nouveau/nouveau_gldefs.h @@ -0,0 +1,196 @@ +#ifndef __NOUVEAU_GLDEFS_H__ +#define __NOUVEAU_GLDEFS_H__ + +static INLINE unsigned +nvgl_blend_func(unsigned factor) +{ + switch (factor) { + case PIPE_BLENDFACTOR_ZERO: + return 0x0000; + case PIPE_BLENDFACTOR_ONE: + return 0x0001; + case PIPE_BLENDFACTOR_SRC_COLOR: + return 0x0300; + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + return 0x0301; + case 
PIPE_BLENDFACTOR_SRC_ALPHA: + return 0x0302; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + return 0x0303; + case PIPE_BLENDFACTOR_DST_ALPHA: + return 0x0304; + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + return 0x0305; + case PIPE_BLENDFACTOR_DST_COLOR: + return 0x0306; + case PIPE_BLENDFACTOR_INV_DST_COLOR: + return 0x0307; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + return 0x0308; + case PIPE_BLENDFACTOR_CONST_COLOR: + return 0x8001; + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + return 0x8002; + case PIPE_BLENDFACTOR_CONST_ALPHA: + return 0x8003; + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + return 0x8004; + default: + return 0x0000; + } +} + +static INLINE unsigned +nvgl_blend_eqn(unsigned func) +{ + switch (func) { + case PIPE_BLEND_ADD: + return 0x8006; + case PIPE_BLEND_MIN: + return 0x8007; + case PIPE_BLEND_MAX: + return 0x8008; + case PIPE_BLEND_SUBTRACT: + return 0x800a; + case PIPE_BLEND_REVERSE_SUBTRACT: + return 0x800b; + default: + return 0x8006; + } +} + +static INLINE unsigned +nvgl_logicop_func(unsigned func) +{ + switch (func) { + case PIPE_LOGICOP_CLEAR: + return 0x1500; + case PIPE_LOGICOP_NOR: + return 0x1508; + case PIPE_LOGICOP_AND_INVERTED: + return 0x1504; + case PIPE_LOGICOP_COPY_INVERTED: + return 0x150c; + case PIPE_LOGICOP_AND_REVERSE: + return 0x1502; + case PIPE_LOGICOP_INVERT: + return 0x150a; + case PIPE_LOGICOP_XOR: + return 0x1506; + case PIPE_LOGICOP_NAND: + return 0x150e; + case PIPE_LOGICOP_AND: + return 0x1501; + case PIPE_LOGICOP_EQUIV: + return 0x1509; + case PIPE_LOGICOP_NOOP: + return 0x1505; + case PIPE_LOGICOP_OR_INVERTED: + return 0x150d; + case PIPE_LOGICOP_COPY: + return 0x1503; + case PIPE_LOGICOP_OR_REVERSE: + return 0x150b; + case PIPE_LOGICOP_OR: + return 0x1507; + case PIPE_LOGICOP_SET: + return 0x150f; + default: + return 0x1505; + } +} + +static INLINE unsigned +nvgl_comparison_op(unsigned op) +{ + switch (op) { + case PIPE_FUNC_NEVER: + return 0x0200; + case PIPE_FUNC_LESS: + return 0x0201; + case PIPE_FUNC_EQUAL: + return 0x0202; + case PIPE_FUNC_LEQUAL: + return 0x0203; + case PIPE_FUNC_GREATER: + return 0x0204; + case PIPE_FUNC_NOTEQUAL: + return 0x0205; + case PIPE_FUNC_GEQUAL: + return 0x0206; + case PIPE_FUNC_ALWAYS: + return 0x0207; + default: + return 0x0207; + } +} + +static INLINE unsigned +nvgl_polygon_mode(unsigned mode) +{ + switch (mode) { + case PIPE_POLYGON_MODE_POINT: + return 0x1b00; + case PIPE_POLYGON_MODE_LINE: + return 0x1b01; + case PIPE_POLYGON_MODE_FILL: + return 0x1b02; + default: + return 0x1b02; + } +} + +static INLINE unsigned +nvgl_stencil_op(unsigned op) +{ + switch (op) { + case PIPE_STENCIL_OP_ZERO: + return 0x0000; + case PIPE_STENCIL_OP_INVERT: + return 0x150a; + case PIPE_STENCIL_OP_KEEP: + return 0x1e00; + case PIPE_STENCIL_OP_REPLACE: + return 0x1e01; + case PIPE_STENCIL_OP_INCR: + return 0x1e02; + case PIPE_STENCIL_OP_DECR: + return 0x1e03; + case PIPE_STENCIL_OP_INCR_WRAP: + return 0x8507; + case PIPE_STENCIL_OP_DECR_WRAP: + return 0x8508; + default: + return 0x1e00; + } +} + +static INLINE unsigned +nvgl_primitive(unsigned prim) { + switch (prim) { + case PIPE_PRIM_POINTS: + return 0x0001; + case PIPE_PRIM_LINES: + return 0x0002; + case PIPE_PRIM_LINE_LOOP: + return 0x0003; + case PIPE_PRIM_LINE_STRIP: + return 0x0004; + case PIPE_PRIM_TRIANGLES: + return 0x0005; + case PIPE_PRIM_TRIANGLE_STRIP: + return 0x0006; + case PIPE_PRIM_TRIANGLE_FAN: + return 0x0007; + case PIPE_PRIM_QUADS: + return 0x0008; + case PIPE_PRIM_QUAD_STRIP: + return 0x0009; + case PIPE_PRIM_POLYGON: + return 0x000a; + default: + return 
0; + } +} + +#endif diff --git a/src/gallium/drivers/nouveau/nouveau_push.h b/src/gallium/drivers/nouveau/nouveau_push.h new file mode 100644 index 0000000000..54ef1c1291 --- /dev/null +++ b/src/gallium/drivers/nouveau/nouveau_push.h @@ -0,0 +1,82 @@ +#ifndef __NOUVEAU_PUSH_H__ +#define __NOUVEAU_PUSH_H__ + +#include "nouveau/nouveau_winsys.h" + +#ifndef NOUVEAU_PUSH_CONTEXT +#error undefined push context +#endif + +#define OUT_RING(data) do { \ + NOUVEAU_PUSH_CONTEXT(pc); \ + (*pc->nvws->channel->pushbuf->cur++) = (data); \ +} while(0) + +#define OUT_RINGp(src,size) do { \ + NOUVEAU_PUSH_CONTEXT(pc); \ + memcpy(pc->nvws->channel->pushbuf->cur, (src), (size) * 4); \ + pc->nvws->channel->pushbuf->cur += (size); \ +} while(0) + +#define OUT_RINGf(data) do { \ + union { float v; uint32_t u; } c; \ + c.v = (data); \ + OUT_RING(c.u); \ +} while(0) + +#define BEGIN_RING(obj,mthd,size) do { \ + NOUVEAU_PUSH_CONTEXT(pc); \ + if (pc->nvws->channel->pushbuf->remaining < ((size) + 1)) \ + pc->nvws->push_flush(pc->nvws, ((size) + 1), NULL); \ + OUT_RING((pc->obj->subc << 13) | ((size) << 18) | (mthd)); \ + pc->nvws->channel->pushbuf->remaining -= ((size) + 1); \ +} while(0) + +#define BEGIN_RING_NI(obj,mthd,size) do { \ + BEGIN_RING(obj, (mthd) | 0x40000000, (size)); \ +} while(0) + +#define FIRE_RING(fence) do { \ + NOUVEAU_PUSH_CONTEXT(pc); \ + pc->nvws->push_flush(pc->nvws, 0, fence); \ +} while(0) + +#define OUT_RELOC(bo,data,flags,vor,tor) do { \ + NOUVEAU_PUSH_CONTEXT(pc); \ + pc->nvws->push_reloc(pc->nvws, pc->nvws->channel->pushbuf->cur++, \ + (bo), (data), (flags), (vor), (tor)); \ +} while(0) + +/* Raw data + flags depending on FB/TT buffer */ +#define OUT_RELOCd(bo,data,flags,vor,tor) do { \ + OUT_RELOC((bo), (data), (flags) | NOUVEAU_BO_OR, (vor), (tor)); \ +} while(0) + +/* FB/TT object handle */ +#define OUT_RELOCo(bo,flags) do { \ + OUT_RELOC((bo), 0, (flags) | NOUVEAU_BO_OR, \ + pc->nvws->channel->vram->handle, \ + pc->nvws->channel->gart->handle); \ +} while(0) + +/* Low 32-bits of offset */ +#define OUT_RELOCl(bo,delta,flags) do { \ + OUT_RELOC((bo), (delta), (flags) | NOUVEAU_BO_LOW, 0, 0); \ +} while(0) + +/* High 32-bits of offset */ +#define OUT_RELOCh(bo,delta,flags) do { \ + OUT_RELOC((bo), (delta), (flags) | NOUVEAU_BO_HIGH, 0, 0); \ +} while(0) + +/* A reloc which'll recombine into a NV_DMA_METHOD packet header */ +#define OUT_RELOCm(bo, flags, obj, mthd, size) do { \ + NOUVEAU_PUSH_CONTEXT(pc); \ + if (pc->nvws->channel->pushbuf->remaining < ((size) + 1)) \ + pc->nvws->push_flush(pc->nvws->channel, ((size) + 1), NULL); \ + OUT_RELOCd((bo), (pc->obj->subc << 13) | ((size) << 18) | (mthd), \ + (flags), 0, 0); \ + pc->nvws->channel->pushbuf->remaining -= ((size) + 1); \ +} while(0) + +#endif diff --git a/src/gallium/drivers/nouveau/nouveau_stateobj.h b/src/gallium/drivers/nouveau/nouveau_stateobj.h new file mode 100644 index 0000000000..029b01e17d --- /dev/null +++ b/src/gallium/drivers/nouveau/nouveau_stateobj.h @@ -0,0 +1,159 @@ +#ifndef __NOUVEAU_STATEOBJ_H__ +#define __NOUVEAU_STATEOBJ_H__ + +#include "util/u_debug.h" + +struct nouveau_stateobj_reloc { + struct pipe_buffer *bo; + + unsigned offset; + unsigned packet; + + unsigned data; + unsigned flags; + unsigned vor; + unsigned tor; +}; + +struct nouveau_stateobj { + int refcount; + + unsigned *push; + struct nouveau_stateobj_reloc *reloc; + + unsigned *cur; + unsigned cur_packet; + unsigned cur_reloc; +}; + +static INLINE struct nouveau_stateobj * +so_new(unsigned push, unsigned reloc) +{ + struct nouveau_stateobj *so; + 
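The nouveau_push.h macros above require the including file to define NOUVEAU_PUSH_CONTEXT (nv04_context.h below maps it to the nv04 screen). A minimal emission sketch using them; the object and method mirror the NOP write done in nv04_init_hwctx() further down:

BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_NOP, 1); /* reserves size+1 pushbuf words, flushing first if needed */
OUT_RING (0);                                              /* the single method argument */
FIRE_RING(NULL);                                           /* submit the pushbuf; no fence requested */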
+ so = MALLOC(sizeof(struct nouveau_stateobj)); + so->refcount = 0; + so->push = MALLOC(sizeof(unsigned) * push); + so->reloc = MALLOC(sizeof(struct nouveau_stateobj_reloc) * reloc); + + so->cur = so->push; + so->cur_reloc = so->cur_packet = 0; + + return so; +} + +static INLINE void +so_ref(struct nouveau_stateobj *ref, struct nouveau_stateobj **pso) +{ + struct nouveau_stateobj *so = *pso; + + if (ref) { + ref->refcount++; + } + + if (so && --so->refcount <= 0) { + free(so->push); + free(so->reloc); + free(so); + } + + *pso = ref; +} + +static INLINE void +so_data(struct nouveau_stateobj *so, unsigned data) +{ + (*so->cur++) = (data); + so->cur_packet += 4; +} + +static INLINE void +so_datap(struct nouveau_stateobj *so, unsigned *data, unsigned size) +{ + so->cur_packet += (4 * size); + while (size--) + (*so->cur++) = (*data++); +} + +static INLINE void +so_method(struct nouveau_stateobj *so, struct nouveau_grobj *gr, + unsigned mthd, unsigned size) +{ + so->cur_packet = (gr->subc << 13) | (1 << 18) | (mthd - 4); + so_data(so, (gr->subc << 13) | (size << 18) | mthd); +} + +static INLINE void +so_reloc(struct nouveau_stateobj *so, struct pipe_buffer *bo, + unsigned data, unsigned flags, unsigned vor, unsigned tor) +{ + struct nouveau_stateobj_reloc *r = &so->reloc[so->cur_reloc++]; + + r->bo = bo; + r->offset = so->cur - so->push; + r->packet = so->cur_packet; + r->data = data; + r->flags = flags; + r->vor = vor; + r->tor = tor; + so_data(so, data); +} + +static INLINE void +so_dump(struct nouveau_stateobj *so) +{ + unsigned i, nr = so->cur - so->push; + + for (i = 0; i < nr; i++) + debug_printf("+0x%04x: 0x%08x\n", i, so->push[i]); +} + +static INLINE void +so_emit(struct nouveau_winsys *nvws, struct nouveau_stateobj *so) +{ + struct nouveau_pushbuf *pb = nvws->channel->pushbuf; + unsigned nr, i; + + nr = so->cur - so->push; + if (pb->remaining < nr) + nvws->push_flush(nvws, nr, NULL); + pb->remaining -= nr; + + memcpy(pb->cur, so->push, nr * 4); + for (i = 0; i < so->cur_reloc; i++) { + struct nouveau_stateobj_reloc *r = &so->reloc[i]; + + nvws->push_reloc(nvws, pb->cur + r->offset, r->bo, + r->data, r->flags, r->vor, r->tor); + } + pb->cur += nr; +} + +static INLINE void +so_emit_reloc_markers(struct nouveau_winsys *nvws, struct nouveau_stateobj *so) +{ + struct nouveau_pushbuf *pb = nvws->channel->pushbuf; + unsigned i; + + if (!so) + return; + + i = so->cur_reloc << 1; + if (nvws->channel->pushbuf->remaining < i) + nvws->push_flush(nvws, i, NULL); + nvws->channel->pushbuf->remaining -= i; + + for (i = 0; i < so->cur_reloc; i++) { + struct nouveau_stateobj_reloc *r = &so->reloc[i]; + + nvws->push_reloc(nvws, pb->cur++, r->bo, r->packet, + (r->flags & (NOUVEAU_BO_VRAM | + NOUVEAU_BO_GART | + NOUVEAU_BO_RDWR)) | + NOUVEAU_BO_DUMMY, 0, 0); + nvws->push_reloc(nvws, pb->cur++, r->bo, r->data, + r->flags | NOUVEAU_BO_DUMMY, r->vor, r->tor); + } +} + +#endif diff --git a/src/gallium/drivers/nouveau/nouveau_util.h b/src/gallium/drivers/nouveau/nouveau_util.h new file mode 100644 index 0000000000..a10114beab --- /dev/null +++ b/src/gallium/drivers/nouveau/nouveau_util.h @@ -0,0 +1,91 @@ +#ifndef __NOUVEAU_UTIL_H__ +#define __NOUVEAU_UTIL_H__ + +/* Determine how many vertices can be pushed into the command stream. + * Where the remaining space isn't large enough to represent all verices, + * split the buffer at primitive boundaries. + * + * Returns a count of vertices that can be rendered, and an index to + * restart drawing at after a flush. 
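A minimal usage sketch for the nouveau_stateobj.h helpers above, with a placeholder grobj, method and value (the nv04 code in this excerpt does not use state objects; callers are presumably the other nvXX drivers in this series):

struct nouveau_stateobj *so = so_new(2, 0);        /* space for 2 push words, 0 relocs */
so_method(so, grobj, 0x0100 /* placeholder */, 1); /* writes the packet header word */
so_data  (so, 0);                                  /* writes the method argument */
so_emit  (nvws, so);                               /* copies both words into the channel pushbuf */
so_ref   (NULL, &so);                              /* drop the reference, freeing the object */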
+ */ +static INLINE unsigned +nouveau_vbuf_split(unsigned remaining, unsigned overhead, unsigned vpp, + unsigned mode, unsigned start, unsigned count, + unsigned *restart) +{ + int max, adj = 0; + + max = remaining - overhead; + if (max < 0) + return 0; + + max *= vpp; + if (max >= count) + return count; + + switch (mode) { + case PIPE_PRIM_POINTS: + break; + case PIPE_PRIM_LINES: + max = max & 1; + break; + case PIPE_PRIM_TRIANGLES: + max = max - (max % 3); + break; + case PIPE_PRIM_QUADS: + max = max & 3; + break; + case PIPE_PRIM_LINE_LOOP: + case PIPE_PRIM_LINE_STRIP: + if (max < 2) + max = 0; + adj = 1; + break; + case PIPE_PRIM_POLYGON: + case PIPE_PRIM_TRIANGLE_STRIP: + case PIPE_PRIM_TRIANGLE_FAN: + if (max < 3) + max = 0; + adj = 2; + break; + case PIPE_PRIM_QUAD_STRIP: + if (max < 4) + max = 0; + adj = 3; + break; + default: + assert(0); + } + + *restart = start + max - adj; + return max; +} + +/* Integer base-2 logarithm, rounded towards zero. */ +static INLINE unsigned log2i(unsigned i) +{ + unsigned r = 0; + + if (i & 0xffff0000) { + i >>= 16; + r += 16; + } + if (i & 0x0000ff00) { + i >>= 8; + r += 8; + } + if (i & 0x000000f0) { + i >>= 4; + r += 4; + } + if (i & 0x0000000c) { + i >>= 2; + r += 2; + } + if (i & 0x00000002) { + r += 1; + } + return r; +} + +#endif diff --git a/src/gallium/drivers/nouveau/nouveau_winsys.h b/src/gallium/drivers/nouveau/nouveau_winsys.h new file mode 100644 index 0000000000..4fcadbae3f --- /dev/null +++ b/src/gallium/drivers/nouveau/nouveau_winsys.h @@ -0,0 +1,101 @@ +#ifndef NOUVEAU_WINSYS_H +#define NOUVEAU_WINSYS_H + +#include <stdint.h> +#include "pipe/internal/p_winsys_screen.h" +#include "pipe/p_defines.h" + +#include "nouveau/nouveau_bo.h" +#include "nouveau/nouveau_channel.h" +#include "nouveau/nouveau_class.h" +#include "nouveau/nouveau_device.h" +#include "nouveau/nouveau_grobj.h" +#include "nouveau/nouveau_notifier.h" +#include "nouveau/nouveau_resource.h" +#include "nouveau/nouveau_pushbuf.h" + +#define NOUVEAU_CAP_HW_VTXBUF (0xbeef0000) +#define NOUVEAU_CAP_HW_IDXBUF (0xbeef0001) + +#define NOUVEAU_TEXTURE_USAGE_LINEAR (1 << 16) + +#define NOUVEAU_BUFFER_USAGE_TEXTURE (1 << 16) +#define NOUVEAU_BUFFER_USAGE_ZETA (1 << 17) +#define NOUVEAU_BUFFER_USAGE_TRANSFER (1 << 18) + +struct nouveau_winsys { + struct nouveau_context *nv; + + struct nouveau_channel *channel; + + int (*res_init)(struct nouveau_resource **heap, unsigned start, + unsigned size); + int (*res_alloc)(struct nouveau_resource *heap, int size, void *priv, + struct nouveau_resource **); + void (*res_free)(struct nouveau_resource **); + + int (*push_reloc)(struct nouveau_winsys *, void *ptr, + struct pipe_buffer *, uint32_t data, + uint32_t flags, uint32_t vor, uint32_t tor); + int (*push_flush)(struct nouveau_winsys *, unsigned size, + struct pipe_fence_handle **fence); + + int (*grobj_alloc)(struct nouveau_winsys *, int grclass, + struct nouveau_grobj **); + void (*grobj_free)(struct nouveau_grobj **); + + int (*notifier_alloc)(struct nouveau_winsys *, int count, + struct nouveau_notifier **); + void (*notifier_free)(struct nouveau_notifier **); + void (*notifier_reset)(struct nouveau_notifier *, int id); + uint32_t (*notifier_status)(struct nouveau_notifier *, int id); + uint32_t (*notifier_retval)(struct nouveau_notifier *, int id); + int (*notifier_wait)(struct nouveau_notifier *, int id, + int status, double timeout); + + int (*surface_copy)(struct nouveau_winsys *, struct pipe_surface *, + unsigned, unsigned, struct pipe_surface *, + unsigned, unsigned, unsigned, 
unsigned); + int (*surface_fill)(struct nouveau_winsys *, struct pipe_surface *, + unsigned, unsigned, unsigned, unsigned, unsigned); + + struct nouveau_bo *(*get_bo)(struct pipe_buffer *); +}; + +extern struct pipe_screen * +nv04_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *); + +extern struct pipe_context * +nv04_create(struct pipe_screen *, unsigned pctx_id); + +extern struct pipe_screen * +nv10_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *); + +extern struct pipe_context * +nv10_create(struct pipe_screen *, unsigned pctx_id); + +extern struct pipe_screen * +nv20_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *); + +extern struct pipe_context * +nv20_create(struct pipe_screen *, unsigned pctx_id); + +extern struct pipe_screen * +nv30_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *); + +extern struct pipe_context * +nv30_create(struct pipe_screen *, unsigned pctx_id); + +extern struct pipe_screen * +nv40_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *); + +extern struct pipe_context * +nv40_create(struct pipe_screen *, unsigned pctx_id); + +extern struct pipe_screen * +nv50_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *); + +extern struct pipe_context * +nv50_create(struct pipe_screen *, unsigned pctx_id); + +#endif diff --git a/src/gallium/drivers/nv04/Makefile b/src/gallium/drivers/nv04/Makefile new file mode 100644 index 0000000000..cf9deea851 --- /dev/null +++ b/src/gallium/drivers/nv04/Makefile @@ -0,0 +1,20 @@ +TOP = ../../../.. +include $(TOP)/configs/current + +LIBNAME = nv04 + +C_SOURCES = \ + nv04_surface_2d.c \ + nv04_clear.c \ + nv04_context.c \ + nv04_fragprog.c \ + nv04_fragtex.c \ + nv04_miptree.c \ + nv04_prim_vbuf.c \ + nv04_screen.c \ + nv04_state.c \ + nv04_state_emit.c \ + nv04_surface.c \ + nv04_vbo.c + +include ../../Makefile.template diff --git a/src/gallium/drivers/nv04/nv04_clear.c b/src/gallium/drivers/nv04/nv04_clear.c new file mode 100644 index 0000000000..01cacd36fe --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_clear.c @@ -0,0 +1,12 @@ +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "nv04_context.h" + +void +nv04_clear(struct pipe_context *pipe, struct pipe_surface *ps, + unsigned clearValue) +{ + pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, clearValue); +} diff --git a/src/gallium/drivers/nv04/nv04_context.c b/src/gallium/drivers/nv04/nv04_context.c new file mode 100644 index 0000000000..d6710cd892 --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_context.c @@ -0,0 +1,107 @@ +#include "draw/draw_context.h" +#include "pipe/p_defines.h" +#include "pipe/internal/p_winsys_screen.h" + +#include "nv04_context.h" +#include "nv04_screen.h" + +static void +nv04_flush(struct pipe_context *pipe, unsigned flags, + struct pipe_fence_handle **fence) +{ + struct nv04_context *nv04 = nv04_context(pipe); + + draw_flush(nv04->draw); + + FIRE_RING(fence); +} + +static void +nv04_destroy(struct pipe_context *pipe) +{ + struct nv04_context *nv04 = nv04_context(pipe); + + if (nv04->draw) + draw_destroy(nv04->draw); + + FREE(nv04); +} + +static void +nv04_set_edgeflags(struct pipe_context *pipe, const unsigned *bitfield) +{ +} + +static boolean +nv04_init_hwctx(struct nv04_context *nv04) +{ + // requires a valid handle +// BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_NOTIFY, 1); +// OUT_RING(0); + BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_NOP, 1); + OUT_RING(0); + + BEGIN_RING(fahrenheit, 
NV04_DX5_TEXTURED_TRIANGLE_CONTROL, 1); + OUT_RING(0x40182800); +// OUT_RING(1<<20/*no cull*/); + BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_BLEND, 1); +// OUT_RING(0x24|(1<<6)|(1<<8)); + OUT_RING(0x120001a4); + BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_FORMAT, 1); + OUT_RING(0x332213a1); + BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_FILTER, 1); + OUT_RING(0x11001010); + BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_COLORKEY, 1); + OUT_RING(0x0); +// BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_OFFSET, 1); +// OUT_RING(SCREEN_OFFSET); + BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_FOGCOLOR, 1); + OUT_RING(0xff000000); + + + + FIRE_RING (NULL); + return TRUE; +} + +struct pipe_context * +nv04_create(struct pipe_screen *pscreen, unsigned pctx_id) +{ + struct nv04_screen *screen = nv04_screen(pscreen); + struct pipe_winsys *ws = pscreen->winsys; + struct nv04_context *nv04; + struct nouveau_winsys *nvws = screen->nvws; + + nv04 = CALLOC(1, sizeof(struct nv04_context)); + if (!nv04) + return NULL; + nv04->screen = screen; + nv04->pctx_id = pctx_id; + + nv04->nvws = nvws; + + nv04->pipe.winsys = ws; + nv04->pipe.screen = pscreen; + nv04->pipe.destroy = nv04_destroy; + nv04->pipe.set_edgeflags = nv04_set_edgeflags; + nv04->pipe.draw_arrays = nv04_draw_arrays; + nv04->pipe.draw_elements = nv04_draw_elements; + nv04->pipe.clear = nv04_clear; + nv04->pipe.flush = nv04_flush; + + nv04_init_surface_functions(nv04); + nv04_init_state_functions(nv04); + + nv04->draw = draw_create(); + assert(nv04->draw); + draw_wide_point_threshold(nv04->draw, 0.0); + draw_wide_line_threshold(nv04->draw, 0.0); + draw_enable_line_stipple(nv04->draw, FALSE); + draw_enable_point_sprites(nv04->draw, FALSE); + draw_set_rasterize_stage(nv04->draw, nv04_draw_vbuf_stage(nv04)); + + nv04_init_hwctx(nv04); + + return &nv04->pipe; +} + diff --git a/src/gallium/drivers/nv04/nv04_context.h b/src/gallium/drivers/nv04/nv04_context.h new file mode 100644 index 0000000000..2842b2c90d --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_context.h @@ -0,0 +1,151 @@ +#ifndef __NV04_CONTEXT_H__ +#define __NV04_CONTEXT_H__ + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_compiler.h" + +#include "util/u_memory.h" +#include "util/u_math.h" + +#include "draw/draw_vertex.h" + +#include "nouveau/nouveau_winsys.h" +#include "nouveau/nouveau_gldefs.h" + +#define NOUVEAU_PUSH_CONTEXT(ctx) \ + struct nv04_screen *ctx = nv04->screen +#include "nouveau/nouveau_push.h" + +#include "nv04_state.h" + +#define NOUVEAU_ERR(fmt, args...) \ + fprintf(stderr, "%s:%d - "fmt, __func__, __LINE__, ##args); +#define NOUVEAU_MSG(fmt, args...) 
\ + fprintf(stderr, "nouveau: "fmt, ##args); + +#include "nv04_screen.h" + +#define NV04_NEW_VERTPROG (1 << 1) +#define NV04_NEW_FRAGPROG (1 << 2) +#define NV04_NEW_BLEND (1 << 3) +#define NV04_NEW_RAST (1 << 4) +#define NV04_NEW_CONTROL (1 << 5) +#define NV04_NEW_VIEWPORT (1 << 6) +#define NV04_NEW_SAMPLER (1 << 7) +#define NV04_NEW_FRAMEBUFFER (1 << 8) +#define NV04_NEW_VTXARRAYS (1 << 9) + +struct nv04_context { + struct pipe_context pipe; + + struct nouveau_winsys *nvws; + struct nv04_screen *screen; + unsigned pctx_id; + + struct draw_context *draw; + + int chipset; + struct nouveau_notifier *sync; + + uint32_t dirty; + + struct nv04_blend_state *blend; + struct nv04_sampler_state *sampler[PIPE_MAX_SAMPLERS]; + struct nv04_fragtex_state fragtex; + struct nv04_rasterizer_state *rast; + struct nv04_depth_stencil_alpha_state *dsa; + + struct nv04_miptree *tex_miptree[PIPE_MAX_SAMPLERS]; + unsigned dirty_samplers; + unsigned fp_samplers; + unsigned vp_samplers; + + uint32_t rt_enable; + struct pipe_framebuffer_state *framebuffer; + struct pipe_surface *rt; + struct pipe_surface *zeta; + + struct { + struct pipe_buffer *buffer; + uint32_t format; + } tex[16]; + + unsigned vb_enable; + struct { + struct pipe_buffer *buffer; + unsigned delta; + } vb[16]; + + float *constbuf[PIPE_SHADER_TYPES][32][4]; + unsigned constbuf_nr[PIPE_SHADER_TYPES]; + + struct vertex_info vertex_info; + struct { + + struct nouveau_resource *exec_heap; + struct nouveau_resource *data_heap; + + struct nv04_vertex_program *active; + + struct nv04_vertex_program *current; + struct pipe_buffer *constant_buf; + } vertprog; + + struct { + struct nv04_fragment_program *active; + + struct nv04_fragment_program *current; + struct pipe_buffer *constant_buf; + } fragprog; + + struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS]; + struct pipe_vertex_element vtxelt[PIPE_MAX_ATTRIBS]; + + struct pipe_viewport_state viewport; +}; + +static INLINE struct nv04_context * +nv04_context(struct pipe_context *pipe) +{ + return (struct nv04_context *)pipe; +} + +extern void nv04_init_state_functions(struct nv04_context *nv04); +extern void nv04_init_surface_functions(struct nv04_context *nv04); +extern void nv04_screen_init_miptree_functions(struct pipe_screen *screen); + +/* nv04_clear.c */ +extern void nv04_clear(struct pipe_context *pipe, struct pipe_surface *ps, + unsigned clearValue); + +/* nv04_draw.c */ +extern struct draw_stage *nv04_draw_render_stage(struct nv04_context *nv04); + +/* nv04_fragprog.c */ +extern void nv04_fragprog_bind(struct nv04_context *, + struct nv04_fragment_program *); +extern void nv04_fragprog_destroy(struct nv04_context *, + struct nv04_fragment_program *); + +/* nv04_fragtex.c */ +extern void nv04_fragtex_bind(struct nv04_context *); + +/* nv04_prim_vbuf.c */ +struct draw_stage *nv04_draw_vbuf_stage( struct nv04_context *nv04 ); + +/* nv04_state.c and friends */ +extern void nv04_emit_hw_state(struct nv04_context *nv04); +extern void nv04_state_tex_update(struct nv04_context *nv04); + +/* nv04_vbo.c */ +extern boolean nv04_draw_arrays(struct pipe_context *, unsigned mode, + unsigned start, unsigned count); +extern boolean nv04_draw_elements( struct pipe_context *pipe, + struct pipe_buffer *indexBuffer, + unsigned indexSize, + unsigned prim, unsigned start, unsigned count); + + +#endif diff --git a/src/gallium/drivers/nv04/nv04_fragprog.c b/src/gallium/drivers/nv04/nv04_fragprog.c new file mode 100644 index 0000000000..8a2af41fe0 --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_fragprog.c @@ -0,0 +1,21 @@ 
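nv04_context.h above defines the NV04_NEW_* dirty bits that the state hooks below OR into nv04->dirty. A sketch of the bind/validate pattern; the validate side is only an assumption here, since nv04_emit_hw_state() lives in nv04_state_emit.c, which is not part of this excerpt:

/* bind hook: record the new CSO and mark it dirty (as nv04_blend_state_bind does below) */
nv04->blend = (struct nv04_blend_state *)blend;
nv04->dirty |= NV04_NEW_BLEND;

/* assumed validate step, run before each draw: emit only what changed */
if (nv04->dirty & NV04_NEW_BLEND) {
        /* rebuild and emit the TEXTURED_TRIANGLE_BLEND method word here */
        nv04->dirty &= ~NV04_NEW_BLEND;
}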
+#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" + +#include "nv04_context.h" + +void +nv04_fragprog_bind(struct nv04_context *nv04, struct nv04_fragment_program *fp) +{ +} + +void +nv04_fragprog_destroy(struct nv04_context *nv04, + struct nv04_fragment_program *fp) +{ +} + diff --git a/src/gallium/drivers/nv04/nv04_fragtex.c b/src/gallium/drivers/nv04/nv04_fragtex.c new file mode 100644 index 0000000000..21f990fd53 --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_fragtex.c @@ -0,0 +1,73 @@ +#include "nv04_context.h" +#include "nouveau/nouveau_util.h" + +#define _(m,tf) \ +{ \ + PIPE_FORMAT_##m, \ + NV04_DX5_TEXTURED_TRIANGLE_FORMAT_COLOR_##tf, \ +} + +struct nv04_texture_format { + uint pipe; + int format; +}; + +static struct nv04_texture_format +nv04_texture_formats[] = { + _(A8R8G8B8_UNORM, A8R8G8B8), + _(X8R8G8B8_UNORM, X8R8G8B8), + _(A1R5G5B5_UNORM, A1R5G5B5), + _(A4R4G4B4_UNORM, A4R4G4B4), + _(L8_UNORM, Y8 ), + _(A8_UNORM, Y8 ), +}; + +static uint32_t +nv04_fragtex_format(uint pipe_format) +{ + struct nv04_texture_format *tf = nv04_texture_formats; + int i; + + for (i=0; i< sizeof(nv04_texture_formats)/sizeof(nv04_texture_formats[0]); i++) { + if (tf->pipe == pipe_format) + return tf->format; + tf++; + } + + NOUVEAU_ERR("unknown texture format %s\n", pf_name(pipe_format)); + return 0; +} + + +static void +nv04_fragtex_build(struct nv04_context *nv04, int unit) +{ + struct nv04_miptree *nv04mt = nv04->tex_miptree[unit]; + struct pipe_texture *pt = &nv04mt->base; + + switch (pt->target) { + case PIPE_TEXTURE_2D: + break; + default: + NOUVEAU_ERR("Unknown target %d\n", pt->target); + return; + } + + nv04->fragtex.format = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ORIGIN_ZOH_CORNER + | NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ORIGIN_FOH_CORNER + | nv04_fragtex_format(pt->format) + | ( (pt->last_level + 1) << NV04_DX5_TEXTURED_TRIANGLE_FORMAT_MIPMAP_LEVELS_SHIFT ) + | ( log2i(pt->width[0]) << NV04_DX5_TEXTURED_TRIANGLE_FORMAT_BASE_SIZE_U_SHIFT ) + | ( log2i(pt->height[0]) << NV04_DX5_TEXTURED_TRIANGLE_FORMAT_BASE_SIZE_V_SHIFT ) + | NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP_TO_EDGE + | NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSV_CLAMP_TO_EDGE + ; +} + + +void +nv04_fragtex_bind(struct nv04_context *nv04) +{ + nv04_fragtex_build(nv04, 0); +} + diff --git a/src/gallium/drivers/nv04/nv04_miptree.c b/src/gallium/drivers/nv04/nv04_miptree.c new file mode 100644 index 0000000000..993c5ef5dd --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_miptree.c @@ -0,0 +1,177 @@ +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" + +#include "nv04_context.h" +#include "nv04_screen.h" + +static void +nv04_miptree_layout(struct nv04_miptree *nv04mt) +{ + struct pipe_texture *pt = &nv04mt->base; + uint width = pt->width[0], height = pt->height[0]; + uint offset = 0; + int nr_faces, l; + + nr_faces = 1; + + for (l = 0; l <= pt->last_level; l++) { + pt->width[l] = width; + pt->height[l] = height; + + pt->nblocksx[l] = pf_get_nblocksx(&pt->block, width); + pt->nblocksy[l] = pf_get_nblocksy(&pt->block, height); + + nv04mt->level[l].pitch = pt->width[0]; + nv04mt->level[l].pitch = (nv04mt->level[l].pitch + 63) & ~63; + + width = MAX2(1, width >> 1); + height = MAX2(1, height >> 1); + } + + for (l = 0; l <= pt->last_level; l++) { + + nv04mt->level[l].image_offset = offset; + offset += nv04mt->level[l].pitch * pt->height[l]; + } + + nv04mt->total_size = offset; 
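Following nv04_miptree_layout() above literally for a hypothetical 64x64 texture with last_level = 1: the pitch is taken from width[0] for every level and rounded up to the next multiple of 64 (here 64), so level 0 sits at offset 0 and spans 64 * 64 = 4096, level 1 starts at offset 4096 and spans 64 * 32 = 2048, and total_size ends up as 6144.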
+} + +static struct pipe_texture * +nv04_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *pt) +{ + struct pipe_winsys *ws = pscreen->winsys; + struct nv04_miptree *mt; + + mt = MALLOC(sizeof(struct nv04_miptree)); + if (!mt) + return NULL; + mt->base = *pt; + mt->base.refcount = 1; + mt->base.screen = pscreen; + mt->shadow_tex = NULL; + mt->shadow_surface = NULL; + + //mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR; + + nv04_miptree_layout(mt); + + mt->buffer = ws->buffer_create(ws, 256, PIPE_BUFFER_USAGE_PIXEL | + NOUVEAU_BUFFER_USAGE_TEXTURE, + mt->total_size); + if (!mt->buffer) { + printf("failed %d byte alloc\n",mt->total_size); + FREE(mt); + return NULL; + } + + return &mt->base; +} + +static struct pipe_texture * +nv04_miptree_blanket(struct pipe_screen *pscreen, const struct pipe_texture *pt, + const unsigned *stride, struct pipe_buffer *pb) +{ + struct nv04_miptree *mt; + + /* Only supports 2D, non-mipmapped textures for the moment */ + if (pt->target != PIPE_TEXTURE_2D || pt->last_level != 0 || + pt->depth[0] != 1) + return NULL; + + mt = CALLOC_STRUCT(nv04_miptree); + if (!mt) + return NULL; + + mt->base = *pt; + mt->base.refcount = 1; + mt->base.screen = pscreen; + mt->level[0].pitch = stride[0]; + mt->level[0].image_offset = CALLOC(1, sizeof(unsigned)); + + pipe_buffer_reference(pscreen, &mt->buffer, pb); + return &mt->base; +} + +static void +nv04_miptree_release(struct pipe_screen *pscreen, struct pipe_texture **ppt) +{ + struct pipe_texture *pt = *ppt; + struct nv04_miptree *mt = (struct nv04_miptree *)pt; + int l; + + *ppt = NULL; + if (--pt->refcount) + return; + + pipe_buffer_reference(pscreen, &mt->buffer, NULL); + for (l = 0; l <= pt->last_level; l++) { + if (mt->level[l].image_offset) + FREE(mt->level[l].image_offset); + } + + if (mt->shadow_tex) { + assert(mt->shadow_surface); + pscreen->tex_surface_release(pscreen, &mt->shadow_surface); + nv04_miptree_release(pscreen, &mt->shadow_tex); + } + + FREE(mt); +} + +static struct pipe_surface * +nv04_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_texture *pt, + unsigned face, unsigned level, unsigned zslice, + unsigned flags) +{ + struct nv04_miptree *nv04mt = (struct nv04_miptree *)pt; + struct pipe_surface *ps; + + ps = CALLOC_STRUCT(pipe_surface); + if (!ps) + return NULL; + pipe_texture_reference(&ps->texture, pt); + ps->format = pt->format; + ps->width = pt->width[level]; + ps->height = pt->height[level]; + ps->block = pt->block; + ps->nblocksx = pt->nblocksx[level]; + ps->nblocksy = pt->nblocksy[level]; + ps->stride = nv04mt->level[level].pitch; + ps->usage = flags; + ps->status = PIPE_SURFACE_STATUS_DEFINED; + ps->refcount = 1; + ps->face = face; + ps->level = level; + ps->zslice = zslice; + + ps->offset = nv04mt->level[level].image_offset; + + return ps; +} + +static void +nv04_miptree_surface_del(struct pipe_screen *pscreen, + struct pipe_surface **psurface) +{ + struct pipe_surface *ps = *psurface; + + *psurface = NULL; + if (--ps->refcount > 0) + return; + + pipe_texture_reference(&ps->texture, NULL); + FREE(ps); +} + +void +nv04_screen_init_miptree_functions(struct pipe_screen *pscreen) +{ + pscreen->texture_create = nv04_miptree_create; + pscreen->texture_blanket = nv04_miptree_blanket; + pscreen->texture_release = nv04_miptree_release; + pscreen->get_tex_surface = nv04_miptree_surface_new; + pscreen->tex_surface_release = nv04_miptree_surface_del; +} + diff --git a/src/gallium/drivers/nv04/nv04_prim_vbuf.c b/src/gallium/drivers/nv04/nv04_prim_vbuf.c new file mode 100644 
index 0000000000..f6458232ae --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_prim_vbuf.c @@ -0,0 +1,321 @@ + +#include "util/u_debug.h" +#include "pipe/p_inlines.h" +#include "pipe/internal/p_winsys_screen.h" +#include "pipe/p_compiler.h" + +#include "draw/draw_vbuf.h" + +#include "nv04_context.h" +#include "nv04_state.h" + +#define VERTEX_SIZE 40 +#define VERTEX_BUFFER_SIZE (4096*VERTEX_SIZE) // 4096 vertices of 40 bytes each + +/** + * Primitive renderer for nv04. + */ +struct nv04_vbuf_render { + struct vbuf_render base; + + struct nv04_context *nv04; + + /** Vertex buffer */ + unsigned char* buffer; + + /** Vertex size in bytes */ + unsigned vertex_size; + + /** Current primitive */ + unsigned prim; +}; + + +/** + * Basically a cast wrapper. + */ +static INLINE struct nv04_vbuf_render * +nv04_vbuf_render( struct vbuf_render *render ) +{ + assert(render); + return (struct nv04_vbuf_render *)render; +} + + +static const struct vertex_info * +nv04_vbuf_render_get_vertex_info( struct vbuf_render *render ) +{ + struct nv04_vbuf_render *nv04_render = nv04_vbuf_render(render); + struct nv04_context *nv04 = nv04_render->nv04; + return &nv04->vertex_info; +} + + +static boolean +nv04_vbuf_render_allocate_vertices( struct vbuf_render *render, + ushort vertex_size, + ushort nr_vertices ) +{ + struct nv04_vbuf_render *nv04_render = nv04_vbuf_render(render); + + nv04_render->buffer = (unsigned char*) MALLOC(VERTEX_BUFFER_SIZE); + assert(!nv04_render->buffer); + + return nv04_render->buffer ? TRUE : FALSE; +} + +static void * +nv04_vbuf_render_map_vertices( struct vbuf_render *render ) +{ + struct nv04_vbuf_render *nv04_render = nv04_vbuf_render(render); + return nv04_render->buffer; +} + +static void +nv04_vbuf_render_unmap_vertices( struct vbuf_render *render, + ushort min_index, + ushort max_index ) +{ +} + +static boolean +nv04_vbuf_render_set_primitive( struct vbuf_render *render, + unsigned prim ) +{ + struct nv04_vbuf_render *nv04_render = nv04_vbuf_render(render); + + if (prim <= PIPE_PRIM_LINE_STRIP) + return FALSE; + + nv04_render->prim = prim; + return TRUE; +} + +static INLINE void nv04_2triangles(struct nv04_context* nv04, unsigned char* buffer, ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5) +{ + BEGIN_RING(fahrenheit,NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SX(0xA),49); + OUT_RINGp(buffer + VERTEX_SIZE * v0,8); + OUT_RINGp(buffer + VERTEX_SIZE * v1,8); + OUT_RINGp(buffer + VERTEX_SIZE * v2,8); + OUT_RINGp(buffer + VERTEX_SIZE * v3,8); + OUT_RINGp(buffer + VERTEX_SIZE * v4,8); + OUT_RINGp(buffer + VERTEX_SIZE * v5,8); + OUT_RING(0xFEDCBA); +} + +static INLINE void nv04_1triangle(struct nv04_context* nv04, unsigned char* buffer, ushort v0, ushort v1, ushort v2) +{ + BEGIN_RING(fahrenheit,NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SX(0xD),25); + OUT_RINGp(buffer + VERTEX_SIZE * v0,8); + OUT_RINGp(buffer + VERTEX_SIZE * v1,8); + OUT_RINGp(buffer + VERTEX_SIZE * v2,8); + OUT_RING(0xFED); +} + +static INLINE void nv04_1quad(struct nv04_context* nv04, unsigned char* buffer, ushort v0, ushort v1, ushort v2, ushort v3) +{ + BEGIN_RING(fahrenheit,NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SX(0xC),33); + OUT_RINGp(buffer + VERTEX_SIZE * v0,8); + OUT_RINGp(buffer + VERTEX_SIZE * v1,8); + OUT_RINGp(buffer + VERTEX_SIZE * v2,8); + OUT_RINGp(buffer + VERTEX_SIZE * v3,8); + OUT_RING(0xFECEDC); +} + +static void nv04_vbuf_render_triangles_elts(struct nv04_vbuf_render * render, const ushort * indices, uint nr_indices) +{ + unsigned char* buffer = render->buffer; + struct nv04_context* nv04 = 
render->nv04; + int i; + + for( i=0; i< nr_indices-5; i+=6) + nv04_2triangles(nv04, + buffer, + indices[i+0], + indices[i+1], + indices[i+2], + indices[i+3], + indices[i+4], + indices[i+5] + ); + if (i != nr_indices) + { + nv04_1triangle(nv04, + buffer, + indices[i+0], + indices[i+1], + indices[i+2] + ); + i+=3; + } + if (i != nr_indices) + NOUVEAU_ERR("Houston, we have lost some vertices\n"); +} + +static void nv04_vbuf_render_tri_strip_elts(struct nv04_vbuf_render* render, const ushort* indices, uint nr_indices) +{ + const uint32_t striptbl[]={0x321210,0x543432,0x765654,0x987876,0xBA9A98,0xDCBCBA,0xFEDEDC}; + unsigned char* buffer = render->buffer; + struct nv04_context* nv04 = render->nv04; + int i,j; + + for(i = 0; i<nr_indices; i+=14) + { + int numvert = MIN2(16, nr_indices - i); + int numtri = numvert - 2; + if (numvert<3) + break; + + BEGIN_RING( fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SX(0x0), numvert*8 ); + for(j = 0; j<numvert; j++) + OUT_RINGp( buffer + VERTEX_SIZE * indices [i+j], 8 ); + + BEGIN_RING_NI( fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_DRAWPRIMITIVE(0), (numtri+1)/2 ); + for(j = 0; j<numtri/2; j++ ) + OUT_RING(striptbl[j]); + if (numtri%2) + OUT_RING(striptbl[numtri/2]&0xFFF); + } +} + +static void nv04_vbuf_render_tri_fan_elts(struct nv04_vbuf_render* render, const ushort* indices, uint nr_indices) +{ + const uint32_t fantbl[]={0x320210,0x540430,0x760650,0x980870,0xBA0A90,0xDC0CB0,0xFE0ED0}; + unsigned char* buffer = render->buffer; + struct nv04_context* nv04 = render->nv04; + int i,j; + + BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SX(0x0), 8); + OUT_RINGp(buffer + VERTEX_SIZE * indices[0], 8); + + for(i = 1; i<nr_indices; i+=14) + { + int numvert=MIN2(15, nr_indices - i); + int numtri=numvert-2; + if (numvert < 3) + break; + + BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SX(0x1), numvert*8); + + for(j=0;j<numvert;j++) + OUT_RINGp( buffer + VERTEX_SIZE * indices[ i+j ], 8 ); + + BEGIN_RING_NI(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_DRAWPRIMITIVE(0), (numtri+1)/2); + for(j = 0; j<numtri/2; j++) + OUT_RING(fantbl[j]); + if (numtri%2) + OUT_RING(fantbl[numtri/2]&0xFFF); + } +} + +static void nv04_vbuf_render_quads_elts(struct nv04_vbuf_render* render, const ushort* indices, uint nr_indices) +{ + unsigned char* buffer = render->buffer; + struct nv04_context* nv04 = render->nv04; + int i; + + for(i = 0; i < nr_indices; i += 4) + nv04_1quad(nv04, + buffer, + indices[i+0], + indices[i+1], + indices[i+2], + indices[i+3] + ); +} + + +static void +nv04_vbuf_render_draw( struct vbuf_render *render, + const ushort *indices, + uint nr_indices) +{ + struct nv04_vbuf_render *nv04_render = nv04_vbuf_render(render); + + // emit the indices + switch( nv04_render->prim ) + { + case PIPE_PRIM_TRIANGLES: + nv04_vbuf_render_triangles_elts(nv04_render, indices, nr_indices); + break; + case PIPE_PRIM_QUAD_STRIP: + case PIPE_PRIM_TRIANGLE_STRIP: + nv04_vbuf_render_tri_strip_elts(nv04_render, indices, nr_indices); + break; + case PIPE_PRIM_TRIANGLE_FAN: + case PIPE_PRIM_POLYGON: + nv04_vbuf_render_tri_fan_elts(nv04_render, indices, nr_indices); + break; + case PIPE_PRIM_QUADS: + nv04_vbuf_render_quads_elts(nv04_render, indices, nr_indices); + break; + default: + NOUVEAU_ERR("You have to implement primitive %d, young padawan\n", nv04_render->prim); + break; + } +} + + +static void +nv04_vbuf_render_release_vertices( struct vbuf_render *render ) +{ + struct nv04_vbuf_render *nv04_render = nv04_vbuf_render(render); + + 
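/* Note on the strip/fan tables above (an interpretation of the packing, not
 * stated in the source): each TLVERTEX_DRAWPRIMITIVE word appears to pack
 * vertex-slot indices as nibbles, three per triangle, lowest nibbles first --
 * the same layout as the 0xFEDCBA word emitted by nv04_2triangles(). Under
 * that reading, striptbl[0] == 0x321210 draws triangles (slot 0, 1, 2) and
 * (slot 1, 2, 3), and the & 0xFFF mask keeps just one trailing triangle when
 * the triangle count is odd. */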
free(nv04_render->buffer); + nv04_render->buffer = NULL; +} + + +static void +nv04_vbuf_render_destroy( struct vbuf_render *render ) +{ + struct nv04_vbuf_render *nv04_render = nv04_vbuf_render(render); + FREE(nv04_render); +} + + +/** + * Create a new primitive render. + */ +static struct vbuf_render * +nv04_vbuf_render_create( struct nv04_context *nv04 ) +{ + struct nv04_vbuf_render *nv04_render = CALLOC_STRUCT(nv04_vbuf_render); + + nv04_render->nv04 = nv04; + + nv04_render->base.max_vertex_buffer_bytes = VERTEX_BUFFER_SIZE; + nv04_render->base.max_indices = 65536; + nv04_render->base.get_vertex_info = nv04_vbuf_render_get_vertex_info; + nv04_render->base.allocate_vertices = nv04_vbuf_render_allocate_vertices; + nv04_render->base.map_vertices = nv04_vbuf_render_map_vertices; + nv04_render->base.unmap_vertices = nv04_vbuf_render_unmap_vertices; + nv04_render->base.set_primitive = nv04_vbuf_render_set_primitive; + nv04_render->base.draw = nv04_vbuf_render_draw; + nv04_render->base.release_vertices = nv04_vbuf_render_release_vertices; + nv04_render->base.destroy = nv04_vbuf_render_destroy; + + return &nv04_render->base; +} + + +/** + * Create a new primitive vbuf/render stage. + */ +struct draw_stage *nv04_draw_vbuf_stage( struct nv04_context *nv04 ) +{ + struct vbuf_render *render; + struct draw_stage *stage; + + render = nv04_vbuf_render_create(nv04); + if(!render) + return NULL; + + stage = draw_vbuf_stage( nv04->draw, render ); + if(!stage) { + render->destroy(render); + return NULL; + } + + return stage; +} diff --git a/src/gallium/drivers/nv04/nv04_screen.c b/src/gallium/drivers/nv04/nv04_screen.c new file mode 100644 index 0000000000..9ef38bc244 --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_screen.c @@ -0,0 +1,237 @@ +#include "pipe/p_screen.h" +#include "pipe/p_inlines.h" +#include "util/u_simple_screen.h" + +#include "nv04_context.h" +#include "nv04_screen.h" + +static const char * +nv04_screen_get_name(struct pipe_screen *screen) +{ + struct nv04_screen *nv04screen = nv04_screen(screen); + struct nouveau_device *dev = nv04screen->nvws->channel->device; + static char buffer[128]; + + snprintf(buffer, sizeof(buffer), "NV%02X", dev->chipset); + return buffer; +} + +static const char * +nv04_screen_get_vendor(struct pipe_screen *screen) +{ + return "nouveau"; +} + +static int +nv04_screen_get_param(struct pipe_screen *screen, int param) +{ + switch (param) { + case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS: + return 1; + case PIPE_CAP_NPOT_TEXTURES: + return 0; + case PIPE_CAP_TWO_SIDED_STENCIL: + return 0; + case PIPE_CAP_GLSL: + return 0; + case PIPE_CAP_S3TC: + return 0; + case PIPE_CAP_ANISOTROPIC_FILTER: + return 0; + case PIPE_CAP_POINT_SPRITE: + return 0; + case PIPE_CAP_MAX_RENDER_TARGETS: + return 1; + case PIPE_CAP_OCCLUSION_QUERY: + return 0; + case PIPE_CAP_TEXTURE_SHADOW_MAP: + return 0; + case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: + return 10; + case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: + return 0; + case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: + return 0; + case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS: + return 0; + case PIPE_CAP_TEXTURE_MIRROR_CLAMP: + return 0; + case PIPE_CAP_TEXTURE_MIRROR_REPEAT: + return 1; + case NOUVEAU_CAP_HW_VTXBUF: + case NOUVEAU_CAP_HW_IDXBUF: + return 0; + default: + NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param); + return 0; + } +} + +static float +nv04_screen_get_paramf(struct pipe_screen *screen, int param) +{ + switch (param) { + case PIPE_CAP_MAX_LINE_WIDTH: + case PIPE_CAP_MAX_LINE_WIDTH_AA: + return 0.0; + case PIPE_CAP_MAX_POINT_WIDTH: + case 
PIPE_CAP_MAX_POINT_WIDTH_AA: + return 0.0; + case PIPE_CAP_MAX_TEXTURE_ANISOTROPY: + return 0.0; + case PIPE_CAP_MAX_TEXTURE_LOD_BIAS: + return 0.0; + default: + NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param); + return 0.0; + } +} + +static boolean +nv04_screen_is_format_supported(struct pipe_screen *screen, + enum pipe_format format, + enum pipe_texture_target target, + unsigned tex_usage, unsigned geom_flags) +{ + if (tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET) { + switch (format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_R5G6B5_UNORM: + case PIPE_FORMAT_Z16_UNORM: + return TRUE; + default: + break; + } + } else { + switch (format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_X8R8G8B8_UNORM: + case PIPE_FORMAT_A1R5G5B5_UNORM: + case PIPE_FORMAT_R5G6B5_UNORM: + case PIPE_FORMAT_L8_UNORM: + case PIPE_FORMAT_A8_UNORM: + return TRUE; + default: + break; + } + } + + return FALSE; +} + +static void * +nv04_surface_map(struct pipe_screen *screen, struct pipe_surface *surface, + unsigned flags ) +{ + void *map; + struct nv04_miptree *nv04mt = (struct nv04_miptree *)surface->texture; + + map = pipe_buffer_map(screen, nv04mt->buffer, flags); + if (!map) + return NULL; + + return map + surface->offset; +} + +static void +nv04_surface_unmap(struct pipe_screen *screen, struct pipe_surface *surface) +{ + struct nv04_miptree *nv04mt = (struct nv04_miptree *)surface->texture; + + pipe_buffer_unmap(screen, nv04mt->buffer); +} + +static void +nv04_screen_destroy(struct pipe_screen *pscreen) +{ + struct nv04_screen *screen = nv04_screen(pscreen); + struct nouveau_winsys *nvws = screen->nvws; + + nvws->notifier_free(&screen->sync); + nvws->grobj_free(&screen->fahrenheit); + nv04_surface_2d_takedown(&screen->eng2d); + + FREE(pscreen); +} + +static struct pipe_buffer * +nv04_surface_buffer(struct pipe_surface *surf) +{ + struct nv04_miptree *mt = (struct nv04_miptree *)surf->texture; + + return mt->buffer; +} + +struct pipe_screen * +nv04_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws) +{ + struct nv04_screen *screen = CALLOC_STRUCT(nv04_screen); + unsigned fahrenheit_class = 0, sub3d_class = 0; + unsigned chipset = nvws->channel->device->chipset; + int ret; + + if (!screen) + return NULL; + screen->nvws = nvws; + + if (chipset>=0x20) { + fahrenheit_class = 0; + sub3d_class = 0; + } else if (chipset>=0x10) { + fahrenheit_class = NV10_DX5_TEXTURED_TRIANGLE; + sub3d_class = NV10_CONTEXT_SURFACES_3D; + } else { + fahrenheit_class=NV04_DX5_TEXTURED_TRIANGLE; + sub3d_class = NV04_CONTEXT_SURFACES_3D; + } + + if (!fahrenheit_class) { + NOUVEAU_ERR("Unknown nv04 chipset: nv%02x\n", chipset); + return NULL; + } + + /* 2D engine setup */ + screen->eng2d = nv04_surface_2d_init(nvws); + screen->eng2d->buf = nv04_surface_buffer; + + /* 3D object */ + ret = nvws->grobj_alloc(nvws, fahrenheit_class, &screen->fahrenheit); + if (ret) { + NOUVEAU_ERR("Error creating 3D object: %d\n", ret); + return NULL; + } + + /* 3D surface object */ + ret = nvws->grobj_alloc(nvws, sub3d_class, &screen->context_surfaces_3d); + if (ret) { + NOUVEAU_ERR("Error creating 3D surface object: %d\n", ret); + return NULL; + } + + /* Notifier for sync purposes */ + ret = nvws->notifier_alloc(nvws, 1, &screen->sync); + if (ret) { + NOUVEAU_ERR("Error creating notifier object: %d\n", ret); + nv04_screen_destroy(&screen->pipe); + return NULL; + } + + screen->pipe.winsys = ws; + screen->pipe.destroy = nv04_screen_destroy; + + screen->pipe.get_name = nv04_screen_get_name; + screen->pipe.get_vendor = 
nv04_screen_get_vendor; + screen->pipe.get_param = nv04_screen_get_param; + screen->pipe.get_paramf = nv04_screen_get_paramf; + + screen->pipe.is_format_supported = nv04_screen_is_format_supported; + + screen->pipe.surface_map = nv04_surface_map; + screen->pipe.surface_unmap = nv04_surface_unmap; + + nv04_screen_init_miptree_functions(&screen->pipe); + u_simple_screen_init(&screen->pipe); + + return &screen->pipe; +} + diff --git a/src/gallium/drivers/nv04/nv04_screen.h b/src/gallium/drivers/nv04/nv04_screen.h new file mode 100644 index 0000000000..540aec907b --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_screen.h @@ -0,0 +1,27 @@ +#ifndef __NV04_SCREEN_H__ +#define __NV04_SCREEN_H__ + +#include "pipe/p_screen.h" +#include "nv04_surface_2d.h" + +struct nv04_screen { + struct pipe_screen pipe; + + struct nouveau_winsys *nvws; + unsigned chipset; + + /* HW graphics objects */ + struct nv04_surface_2d *eng2d; + struct nouveau_grobj *fahrenheit; + struct nouveau_grobj *context_surfaces_3d; + struct nouveau_notifier *sync; + +}; + +static INLINE struct nv04_screen * +nv04_screen(struct pipe_screen *screen) +{ + return (struct nv04_screen *)screen; +} + +#endif diff --git a/src/gallium/drivers/nv04/nv04_state.c b/src/gallium/drivers/nv04/nv04_state.c new file mode 100644 index 0000000000..87c635f962 --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_state.c @@ -0,0 +1,458 @@ +#include "draw/draw_context.h" +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "pipe/p_shader_tokens.h" + +#include "tgsi/tgsi_parse.h" + +#include "nv04_context.h" +#include "nv04_state.h" + +static void * +nv04_blend_state_create(struct pipe_context *pipe, + const struct pipe_blend_state *cso) +{ + struct nv04_blend_state *cb; + + cb = MALLOC(sizeof(struct nv04_blend_state)); + + cb->b_enable = cso->blend_enable ? 
1 : 0; + cb->b_src = ((nvgl_blend_func(cso->alpha_src_factor)<<16) | + (nvgl_blend_func(cso->rgb_src_factor))); + cb->b_dst = ((nvgl_blend_func(cso->alpha_dst_factor)<<16) | + (nvgl_blend_func(cso->rgb_dst_factor))); + + + return (void *)cb; +} + +static void +nv04_blend_state_bind(struct pipe_context *pipe, void *blend) +{ + struct nv04_context *nv04 = nv04_context(pipe); + + nv04->blend = (struct nv04_blend_state*)blend; + + nv04->dirty |= NV04_NEW_BLEND; +} + +static void +nv04_blend_state_delete(struct pipe_context *pipe, void *hwcso) +{ + free(hwcso); +} + + +static INLINE unsigned +wrap_mode(unsigned wrap) { + unsigned ret; + + switch (wrap) { + case PIPE_TEX_WRAP_REPEAT: + ret = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_REPEAT; + break; + case PIPE_TEX_WRAP_MIRROR_REPEAT: + ret = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_MIRRORED_REPEAT; + break; + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + ret = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP_TO_EDGE; + break; + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + ret = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP_TO_BORDER; + break; + case PIPE_TEX_WRAP_CLAMP: + ret = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP; + break; + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: + case PIPE_TEX_WRAP_MIRROR_CLAMP: + default: + NOUVEAU_ERR("unknown wrap mode: %d\n", wrap); + ret = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP; + } + return ret >> NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_SHIFT; +} + +static void * +nv04_sampler_state_create(struct pipe_context *pipe, + const struct pipe_sampler_state *cso) +{ + + struct nv04_sampler_state *ss; + uint32_t filter = 0; + + ss = MALLOC(sizeof(struct nv04_sampler_state)); + + ss->format = ((wrap_mode(cso->wrap_s) << NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_SHIFT) | + (wrap_mode(cso->wrap_t) << NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSV_SHIFT)); + + if (cso->max_anisotropy > 1.0) { + filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_ANISOTROPIC_MINIFY_ENABLE | NV04_DX5_TEXTURED_TRIANGLE_FILTER_ANISOTROPIC_MAGNIFY_ENABLE; + } + + switch (cso->mag_img_filter) { + case PIPE_TEX_FILTER_LINEAR: + filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MAGNIFY_LINEAR; + break; + case PIPE_TEX_FILTER_NEAREST: + default: + filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MAGNIFY_NEAREST; + break; + } + + switch (cso->min_img_filter) { + case PIPE_TEX_FILTER_LINEAR: + switch (cso->min_mip_filter) { + case PIPE_TEX_MIPFILTER_NEAREST: + filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_LINEAR_MIPMAP_NEAREST; + break; + case PIPE_TEX_MIPFILTER_LINEAR: + filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_LINEAR_MIPMAP_LINEAR; + break; + case PIPE_TEX_MIPFILTER_NONE: + default: + filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_LINEAR; + break; + } + break; + case PIPE_TEX_FILTER_NEAREST: + default: + switch (cso->min_mip_filter) { + case PIPE_TEX_MIPFILTER_NEAREST: + filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_NEAREST_MIPMAP_NEAREST; + break; + case PIPE_TEX_MIPFILTER_LINEAR: + filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_NEAREST_MIPMAP_LINEAR; + break; + case PIPE_TEX_MIPFILTER_NONE: + default: + filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_NEAREST; + break; + } + break; + } + + ss->filter = filter; + + return (void *)ss; +} + +static void +nv04_sampler_state_bind(struct pipe_context *pipe, unsigned nr, void **sampler) +{ + struct nv04_context *nv04 = nv04_context(pipe); + unsigned unit; + + for (unit = 0; unit < nr; unit++) { + nv04->sampler[unit] = sampler[unit]; + 
nv04->dirty_samplers |= (1 << unit); + } +} + +static void +nv04_sampler_state_delete(struct pipe_context *pipe, void *hwcso) +{ + free(hwcso); +} + +static void +nv04_set_sampler_texture(struct pipe_context *pipe, unsigned nr, + struct pipe_texture **miptree) +{ + struct nv04_context *nv04 = nv04_context(pipe); + unsigned unit; + + for (unit = 0; unit < nr; unit++) { + nv04->tex_miptree[unit] = (struct nv04_miptree *)miptree[unit]; + nv04->dirty_samplers |= (1 << unit); + } +} + +static void * +nv04_rasterizer_state_create(struct pipe_context *pipe, + const struct pipe_rasterizer_state *cso) +{ + struct nv04_rasterizer_state *rs; + + /*XXX: ignored: + * scissor + * points/lines (no hw support, emulated with tris in gallium) + */ + rs = MALLOC(sizeof(struct nv04_rasterizer_state)); + + rs->blend = cso->flatshade ? NV04_DX5_TEXTURED_TRIANGLE_BLEND_SHADE_MODE_FLAT : NV04_DX5_TEXTURED_TRIANGLE_BLEND_SHADE_MODE_GOURAUD; + + return (void *)rs; +} + +static void +nv04_rasterizer_state_bind(struct pipe_context *pipe, void *rast) +{ + struct nv04_context *nv04 = nv04_context(pipe); + + nv04->rast = (struct nv04_rasterizer_state*)rast; + + draw_set_rasterizer_state(nv04->draw, (nv04->rast ? nv04->rast->templ : NULL)); + + nv04->dirty |= NV04_NEW_RAST | NV04_NEW_BLEND; +} + +static void +nv04_rasterizer_state_delete(struct pipe_context *pipe, void *hwcso) +{ + free(hwcso); +} + +static INLINE uint32_t nv04_compare_func(uint32_t f) +{ + switch ( f ) { + case PIPE_FUNC_NEVER: return 1; + case PIPE_FUNC_LESS: return 2; + case PIPE_FUNC_EQUAL: return 3; + case PIPE_FUNC_LEQUAL: return 4; + case PIPE_FUNC_GREATER: return 5; + case PIPE_FUNC_NOTEQUAL: return 6; + case PIPE_FUNC_GEQUAL: return 7; + case PIPE_FUNC_ALWAYS: return 8; + } + NOUVEAU_MSG("Unable to find the function\n"); + return 0; +} + +static void * +nv04_depth_stencil_alpha_state_create(struct pipe_context *pipe, + const struct pipe_depth_stencil_alpha_state *cso) +{ + struct nv04_depth_stencil_alpha_state *hw; + + hw = MALLOC(sizeof(struct nv04_depth_stencil_alpha_state)); + + hw->control = float_to_ubyte(cso->alpha.ref_value); + hw->control |= ( nv04_compare_func(cso->alpha.func) << NV04_DX5_TEXTURED_TRIANGLE_CONTROL_ALPHA_FUNC_SHIFT ); + hw->control |= cso->alpha.enabled ? NV04_DX5_TEXTURED_TRIANGLE_CONTROL_ALPHA_TEST_ENABLE : 0; + hw->control |= NV04_DX5_TEXTURED_TRIANGLE_CONTROL_ORIGIN; + hw->control |= cso->depth.enabled ? (1 << NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_ENABLE_SHIFT) : 0; + hw->control |= ( nv04_compare_func(cso->depth.func)<< NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_FUNC_SHIFT ); + hw->control |= 1 << NV04_DX5_TEXTURED_TRIANGLE_CONTROL_CULL_MODE_SHIFT; // no culling, handled by the draw module + hw->control |= NV04_DX5_TEXTURED_TRIANGLE_CONTROL_DITHER_ENABLE; + hw->control |= NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_PERSPECTIVE_ENABLE; + hw->control |= cso->depth.writemask ? 
(1 << NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_WRITE_ENABLE_SHIFT) : 0; + hw->control |= 1 << NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_FORMAT_SHIFT; // integer zbuffer format + + return (void *)hw; +} + +static void +nv04_depth_stencil_alpha_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nv04_context *nv04 = nv04_context(pipe); + + nv04->dsa = hwcso; + nv04->dirty |= NV04_NEW_CONTROL; +} + +static void +nv04_depth_stencil_alpha_state_delete(struct pipe_context *pipe, void *hwcso) +{ + free(hwcso); +} + +static void * +nv04_vp_state_create(struct pipe_context *pipe, + const struct pipe_shader_state *templ) +{ + struct nv04_context *nv04 = nv04_context(pipe); + + return draw_create_vertex_shader(nv04->draw, templ); +} + +static void +nv04_vp_state_bind(struct pipe_context *pipe, void *shader) +{ + struct nv04_context *nv04 = nv04_context(pipe); + + draw_bind_vertex_shader(nv04->draw, (struct draw_vertex_shader *) shader); + + nv04->dirty |= NV04_NEW_VERTPROG; +} + +static void +nv04_vp_state_delete(struct pipe_context *pipe, void *shader) +{ + struct nv04_context *nv04 = nv04_context(pipe); + + draw_delete_vertex_shader(nv04->draw, (struct draw_vertex_shader *) shader); +} + +static void * +nv04_fp_state_create(struct pipe_context *pipe, + const struct pipe_shader_state *cso) +{ + struct nv04_fragment_program *fp; + + fp = CALLOC(1, sizeof(struct nv04_fragment_program)); + fp->pipe.tokens = tgsi_dup_tokens(cso->tokens); + + return (void *)fp; +} + +static void +nv04_fp_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nv04_context *nv04 = nv04_context(pipe); + struct nv04_fragment_program *fp = hwcso; + + nv04->fragprog.current = fp; + nv04->dirty |= NV04_NEW_FRAGPROG; +} + +static void +nv04_fp_state_delete(struct pipe_context *pipe, void *hwcso) +{ + struct nv04_context *nv04 = nv04_context(pipe); + struct nv04_fragment_program *fp = hwcso; + + nv04_fragprog_destroy(nv04, fp); + free((void*)fp->pipe.tokens); + free(fp); +} + +static void +nv04_set_blend_color(struct pipe_context *pipe, + const struct pipe_blend_color *bcol) +{ +} + +static void +nv04_set_clip_state(struct pipe_context *pipe, + const struct pipe_clip_state *clip) +{ +} + +static void +nv04_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index, + const struct pipe_constant_buffer *buf ) +{ + struct nv04_context *nv04 = nv04_context(pipe); + struct pipe_winsys *ws = pipe->winsys; + + assert(shader < PIPE_SHADER_TYPES); + assert(index == 0); + + if (buf) { + void *mapped; + if (buf->buffer && buf->buffer->size && + (mapped = ws->buffer_map(ws, buf->buffer, PIPE_BUFFER_USAGE_CPU_READ))) + { + memcpy(nv04->constbuf[shader], mapped, buf->buffer->size); + nv04->constbuf_nr[shader] = + buf->buffer->size / (4 * sizeof(float)); + ws->buffer_unmap(ws, buf->buffer); + } + } +} + +static void +nv04_set_framebuffer_state(struct pipe_context *pipe, + const struct pipe_framebuffer_state *fb) +{ + struct nv04_context *nv04 = nv04_context(pipe); + + nv04->framebuffer = (struct pipe_framebuffer_state*)fb; + + nv04->dirty |= NV04_NEW_FRAMEBUFFER; +} +static void +nv04_set_polygon_stipple(struct pipe_context *pipe, + const struct pipe_poly_stipple *stipple) +{ + NOUVEAU_ERR("line stipple hahaha\n"); +} + +static void +nv04_set_scissor_state(struct pipe_context *pipe, + const struct pipe_scissor_state *s) +{ +/* struct nv04_context *nv04 = nv04_context(pipe); + + // XXX + BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_SCISSOR_HORIZ, 2); + OUT_RING (((s->maxx - s->minx) << 16) | s->minx); + OUT_RING 
(((s->maxy - s->miny) << 16) | s->miny);*/ +} + +static void +nv04_set_viewport_state(struct pipe_context *pipe, + const struct pipe_viewport_state *viewport) +{ + struct nv04_context *nv04 = nv04_context(pipe); + + nv04->viewport = *viewport; + + draw_set_viewport_state(nv04->draw, &nv04->viewport); +} + +static void +nv04_set_vertex_buffers(struct pipe_context *pipe, unsigned count, + const struct pipe_vertex_buffer *buffers) +{ + struct nv04_context *nv04 = nv04_context(pipe); + + memcpy(nv04->vtxbuf, buffers, count * sizeof(buffers[0])); + nv04->dirty |= NV04_NEW_VTXARRAYS; + + draw_set_vertex_buffers(nv04->draw, count, buffers); +} + +static void +nv04_set_vertex_elements(struct pipe_context *pipe, unsigned count, + const struct pipe_vertex_element *elements) +{ + struct nv04_context *nv04 = nv04_context(pipe); + + memcpy(nv04->vtxelt, elements, sizeof(*elements) * count); + nv04->dirty |= NV04_NEW_VTXARRAYS; + + draw_set_vertex_elements(nv04->draw, count, elements); +} + +void +nv04_init_state_functions(struct nv04_context *nv04) +{ + nv04->pipe.create_blend_state = nv04_blend_state_create; + nv04->pipe.bind_blend_state = nv04_blend_state_bind; + nv04->pipe.delete_blend_state = nv04_blend_state_delete; + + nv04->pipe.create_sampler_state = nv04_sampler_state_create; + nv04->pipe.bind_sampler_states = nv04_sampler_state_bind; + nv04->pipe.delete_sampler_state = nv04_sampler_state_delete; + nv04->pipe.set_sampler_textures = nv04_set_sampler_texture; + + nv04->pipe.create_rasterizer_state = nv04_rasterizer_state_create; + nv04->pipe.bind_rasterizer_state = nv04_rasterizer_state_bind; + nv04->pipe.delete_rasterizer_state = nv04_rasterizer_state_delete; + + nv04->pipe.create_depth_stencil_alpha_state = nv04_depth_stencil_alpha_state_create; + nv04->pipe.bind_depth_stencil_alpha_state = nv04_depth_stencil_alpha_state_bind; + nv04->pipe.delete_depth_stencil_alpha_state = nv04_depth_stencil_alpha_state_delete; + + nv04->pipe.create_vs_state = nv04_vp_state_create; + nv04->pipe.bind_vs_state = nv04_vp_state_bind; + nv04->pipe.delete_vs_state = nv04_vp_state_delete; + + nv04->pipe.create_fs_state = nv04_fp_state_create; + nv04->pipe.bind_fs_state = nv04_fp_state_bind; + nv04->pipe.delete_fs_state = nv04_fp_state_delete; + + nv04->pipe.set_blend_color = nv04_set_blend_color; + nv04->pipe.set_clip_state = nv04_set_clip_state; + nv04->pipe.set_constant_buffer = nv04_set_constant_buffer; + nv04->pipe.set_framebuffer_state = nv04_set_framebuffer_state; + nv04->pipe.set_polygon_stipple = nv04_set_polygon_stipple; + nv04->pipe.set_scissor_state = nv04_set_scissor_state; + nv04->pipe.set_viewport_state = nv04_set_viewport_state; + + nv04->pipe.set_vertex_buffers = nv04_set_vertex_buffers; + nv04->pipe.set_vertex_elements = nv04_set_vertex_elements; +} + diff --git a/src/gallium/drivers/nv04/nv04_state.h b/src/gallium/drivers/nv04/nv04_state.h new file mode 100644 index 0000000000..15d4685ec1 --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_state.h @@ -0,0 +1,74 @@ +#ifndef __NV04_STATE_H__ +#define __NV04_STATE_H__ + +#include "pipe/p_state.h" +#include "tgsi/tgsi_scan.h" + +struct nv04_blend_state { + uint32_t b_enable; + uint32_t b_src; + uint32_t b_dst; +}; + +struct nv04_fragtex_state { + uint32_t format; +}; + +struct nv04_sampler_state { + uint32_t filter; + uint32_t format; +}; + +struct nv04_depth_stencil_alpha_state { + uint32_t control; +}; + +struct nv04_rasterizer_state { + uint32_t blend; + + const struct pipe_rasterizer_state *templ; +}; + +struct nv04_miptree { + struct pipe_texture 
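/* base must come first: pipe_texture pointers are cast directly to nv04_miptree */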
base; + + struct pipe_buffer *buffer; + uint total_size; + + struct pipe_texture *shadow_tex; + struct pipe_surface *shadow_surface; + + struct { + uint pitch; + uint image_offset; + } level[PIPE_MAX_TEXTURE_LEVELS]; +}; + +struct nv04_fragment_program_data { + unsigned offset; + unsigned index; +}; + +struct nv04_fragment_program { + struct pipe_shader_state pipe; + struct tgsi_shader_info info; + + boolean translated; + boolean on_hw; + unsigned samplers; + + uint32_t *insn; + int insn_len; + + struct nv04_fragment_program_data *consts; + unsigned nr_consts; + + struct pipe_buffer *buffer; + + uint32_t fp_control; + uint32_t fp_reg_control; +}; + + + +#endif diff --git a/src/gallium/drivers/nv04/nv04_state_emit.c b/src/gallium/drivers/nv04/nv04_state_emit.c new file mode 100644 index 0000000000..bd8ef1adbf --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_state_emit.c @@ -0,0 +1,223 @@ +#include "nv04_context.h" +#include "nv04_state.h" + +static void nv04_vertex_layout(struct pipe_context* pipe) +{ + struct nv04_context *nv04 = nv04_context(pipe); + struct nv04_fragment_program *fp = nv04->fragprog.current; + uint32_t src = 0; + int i; + struct vertex_info vinfo; + + memset(&vinfo, 0, sizeof(vinfo)); + + for (i = 0; i < fp->info.num_inputs; i++) { + int isn = fp->info.input_semantic_name[i]; + int isi = fp->info.input_semantic_index[i]; + switch (isn) { + case TGSI_SEMANTIC_POSITION: + draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_LINEAR, src++); + break; + case TGSI_SEMANTIC_COLOR: + draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_LINEAR, src++); + break; + default: + case TGSI_SEMANTIC_GENERIC: + draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_PERSPECTIVE, src++); + break; + case TGSI_SEMANTIC_FOG: + draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_PERSPECTIVE, src++); + break; + } + } + + printf("%d vertex input\n",fp->info.num_inputs); + draw_compute_vertex_size(&vinfo); +} + +static uint32_t nv04_blend_func(uint32_t f) +{ + switch ( f ) { + case PIPE_BLENDFACTOR_ZERO: return 0x1; + case PIPE_BLENDFACTOR_ONE: return 0x2; + case PIPE_BLENDFACTOR_SRC_COLOR: return 0x3; + case PIPE_BLENDFACTOR_INV_SRC_COLOR: return 0x4; + case PIPE_BLENDFACTOR_SRC_ALPHA: return 0x5; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: return 0x6; + case PIPE_BLENDFACTOR_DST_ALPHA: return 0x7; + case PIPE_BLENDFACTOR_INV_DST_ALPHA: return 0x8; + case PIPE_BLENDFACTOR_DST_COLOR: return 0x9; + case PIPE_BLENDFACTOR_INV_DST_COLOR: return 0xA; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: return 0xB; + } + NOUVEAU_MSG("Unable to find the blend function 0x%x\n",f); + return 0; +} + +static void nv04_emit_control(struct nv04_context* nv04) +{ + uint32_t control = nv04->dsa->control; + + BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_CONTROL, 1); + OUT_RING(control); +} + +static void nv04_emit_blend(struct nv04_context* nv04) +{ + uint32_t blend; + + blend=0x4; // texture MODULATE_ALPHA + blend|=0x20; // alpha is MSB + blend|=(2<<6); // flat shading + blend|=(1<<8); // persp correct + blend|=(0<<16); // no fog + blend|=(nv04->blend->b_enable<<20); + blend|=(nv04_blend_func(nv04->blend->b_src)<<24); + blend|=(nv04_blend_func(nv04->blend->b_dst)<<28); + + BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_BLEND, 1); + OUT_RING(blend); +} + +static void nv04_emit_sampler(struct nv04_context *nv04, int unit) +{ + struct nv04_miptree *nv04mt = nv04->tex_miptree[unit]; + struct pipe_texture *pt = &nv04mt->base; + + BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_OFFSET, 3); + OUT_RELOCl(nv04mt->buffer, 0, NOUVEAU_BO_VRAM | 
NOUVEAU_BO_GART | NOUVEAU_BO_RD); + OUT_RELOCd(nv04mt->buffer, (nv04->fragtex.format | nv04->sampler[unit]->format), NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/); + OUT_RING(nv04->sampler[unit]->filter); +} + +static void nv04_state_emit_framebuffer(struct nv04_context* nv04) +{ + struct pipe_framebuffer_state* fb = nv04->framebuffer; + struct pipe_surface *rt, *zeta; + uint32_t rt_format, w, h; + int colour_format = 0, zeta_format = 0; + struct nv04_miptree *nv04mt = 0; + + w = fb->cbufs[0]->width; + h = fb->cbufs[0]->height; + colour_format = fb->cbufs[0]->format; + rt = fb->cbufs[0]; + + if (fb->zsbuf) { + if (colour_format) { + assert(w == fb->zsbuf->width); + assert(h == fb->zsbuf->height); + } else { + w = fb->zsbuf->width; + h = fb->zsbuf->height; + } + + zeta_format = fb->zsbuf->format; + zeta = fb->zsbuf; + } + + switch (colour_format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case 0: + rt_format = 0x108; + break; + case PIPE_FORMAT_R5G6B5_UNORM: + rt_format = 0x103; + break; + default: + assert(0); + } + + BEGIN_RING(context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_FORMAT, 1); + OUT_RING(rt_format); + + nv04mt = (struct nv04_miptree *)rt->texture; + /* FIXME pitches have to be aligned ! */ + BEGIN_RING(context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_PITCH, 2); + OUT_RING(rt->stride|(zeta->stride<<16)); + OUT_RELOCl(nv04mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + if (fb->zsbuf) { + nv04mt = (struct nv04_miptree *)zeta->texture; + BEGIN_RING(context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_OFFSET_ZETA, 1); + OUT_RELOCl(nv04mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + } +} + +void +nv04_emit_hw_state(struct nv04_context *nv04) +{ + int i; + + if (nv04->dirty & NV04_NEW_VERTPROG) { + //nv04_vertprog_bind(nv04, nv04->vertprog.current); + nv04->dirty &= ~NV04_NEW_VERTPROG; + } + + if (nv04->dirty & NV04_NEW_FRAGPROG) { + nv04_fragprog_bind(nv04, nv04->fragprog.current); + nv04->dirty &= ~NV04_NEW_FRAGPROG; + nv04->dirty_samplers |= (1<<10); + nv04->dirty_samplers = 0; + } + + if (nv04->dirty & NV04_NEW_CONTROL) { + nv04->dirty &= ~NV04_NEW_CONTROL; + + BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_CONTROL, 1); + OUT_RING(nv04->dsa->control); + } + + if (nv04->dirty & NV04_NEW_BLEND) { + nv04->dirty &= ~NV04_NEW_BLEND; + + nv04_emit_blend(nv04); + } + + if (nv04->dirty & NV04_NEW_VTXARRAYS) { + nv04->dirty &= ~NV04_NEW_VTXARRAYS; + nv04_vertex_layout(nv04); + } + + if (nv04->dirty & NV04_NEW_SAMPLER) { + nv04->dirty &= ~NV04_NEW_SAMPLER; + + nv04_emit_sampler(nv04, 0); + } + + if (nv04->dirty & NV04_NEW_VIEWPORT) { + nv04->dirty &= ~NV04_NEW_VIEWPORT; +// nv04_state_emit_viewport(nv04); + } + + if (nv04->dirty & NV04_NEW_FRAMEBUFFER) { + nv04->dirty &= ~NV04_NEW_FRAMEBUFFER; + nv04_state_emit_framebuffer(nv04); + } + + /* Emit relocs for every referenced buffer. + * This is to ensure the bufmgr has an accurate idea of how + * the buffer is used. This isn't very efficient, but we don't + * seem to take a significant performance hit. Will be improved + * at some point. 
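Only the render target, zeta and texture buffers are re-referenced below.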
Vertex arrays are emitted by nv04_vbo.c + */ + + /* Render target */ + BEGIN_RING(context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_PITCH, 2); + OUT_RING(nv04->rt->stride|(nv04->zeta->stride<<16)); + OUT_RELOCl(nv04->rt, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + if (nv04->zeta) { + BEGIN_RING(context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_OFFSET_ZETA, 1); + OUT_RELOCl(nv04->zeta, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + } + + /* Texture images */ + for (i = 0; i < 1; i++) { + if (!(nv04->fp_samplers & (1 << i))) + continue; + struct nv04_miptree *nv04mt = nv04->tex_miptree[i]; + BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_OFFSET, 2); + OUT_RELOCl(nv04mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD); + OUT_RELOCd(nv04mt->buffer, (nv04->fragtex.format | nv04->sampler[i]->format), NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/); + } +} + diff --git a/src/gallium/drivers/nv04/nv04_surface.c b/src/gallium/drivers/nv04/nv04_surface.c new file mode 100644 index 0000000000..14abf16679 --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_surface.c @@ -0,0 +1,72 @@ + +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#include "nv04_context.h" +#include "pipe/p_defines.h" +#include "pipe/internal/p_winsys_screen.h" +#include "pipe/p_inlines.h" +#include "util/u_tile.h" + +static void +nv04_surface_copy(struct pipe_context *pipe, boolean do_flip, + struct pipe_surface *dest, unsigned destx, unsigned desty, + struct pipe_surface *src, unsigned srcx, unsigned srcy, + unsigned width, unsigned height) +{ + struct nv04_context *nv04 = nv04_context(pipe); + struct nv04_surface_2d *eng2d = nv04->screen->eng2d; + + if (do_flip) { + desty += height; + while (height--) { + eng2d->copy(eng2d, dest, destx, desty--, src, + srcx, srcy++, width, 1); + } + return; + } + + eng2d->copy(eng2d, dest, destx, desty, src, srcx, srcy, width, height); +} + +static void +nv04_surface_fill(struct pipe_context *pipe, struct pipe_surface *dest, + unsigned destx, unsigned desty, unsigned width, + unsigned height, unsigned value) +{ + struct nv04_context *nv04 = nv04_context(pipe); + struct nv04_surface_2d *eng2d = nv04->screen->eng2d; + + eng2d->fill(eng2d, dest, destx, desty, width, height, value); +} + +void +nv04_init_surface_functions(struct nv04_context *nv04) +{ + nv04->pipe.surface_copy = nv04_surface_copy; + nv04->pipe.surface_fill = nv04_surface_fill; +} diff --git a/src/gallium/drivers/nv04/nv04_surface_2d.c b/src/gallium/drivers/nv04/nv04_surface_2d.c new file mode 100644 index 0000000000..230cfd17dd --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_surface_2d.c @@ -0,0 +1,448 @@ +#include "pipe/p_context.h" +#include "pipe/p_format.h" +#include "util/u_memory.h" + +#include "nouveau/nouveau_winsys.h" +#include "nouveau/nouveau_util.h" +#include "nv04_surface_2d.h" + +static INLINE int +nv04_surface_format(enum pipe_format format) +{ + switch (format) { + case PIPE_FORMAT_A8_UNORM: + return NV04_CONTEXT_SURFACES_2D_FORMAT_Y8; + case PIPE_FORMAT_R16_SNORM: + case PIPE_FORMAT_R5G6B5_UNORM: + return NV04_CONTEXT_SURFACES_2D_FORMAT_R5G6B5; + case PIPE_FORMAT_X8R8G8B8_UNORM: + case PIPE_FORMAT_A8R8G8B8_UNORM: + return NV04_CONTEXT_SURFACES_2D_FORMAT_A8R8G8B8; + case PIPE_FORMAT_Z24S8_UNORM: + return NV04_CONTEXT_SURFACES_2D_FORMAT_Y32; + default: + return -1; + } +} + +static INLINE int +nv04_rect_format(enum pipe_format format) +{ + switch (format) { + case PIPE_FORMAT_A8_UNORM: + return NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A8R8G8B8; + case PIPE_FORMAT_R5G6B5_UNORM: + return NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A16R5G6B5; + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_Z24S8_UNORM: + return NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A8R8G8B8; + default: + return -1; + } +} + +static INLINE int +nv04_scaled_image_format(enum pipe_format format) +{ + switch (format) { + case PIPE_FORMAT_A1R5G5B5_UNORM: + return NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_A1R5G5B5; + case PIPE_FORMAT_A8R8G8B8_UNORM: + return NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_A8R8G8B8; + case PIPE_FORMAT_X8R8G8B8_UNORM: + return NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_X8R8G8B8; + case PIPE_FORMAT_R5G6B5_UNORM: + case PIPE_FORMAT_R16_SNORM: + return NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_R5G6B5; + default: + return -1; + } +} + +static INLINE unsigned +nv04_swizzle_bits(unsigned x, unsigned y) +{ + unsigned u = (x & 0x001) << 0 | + (x & 0x002) << 1 | + (x & 0x004) << 2 | + (x & 0x008) << 3 | + (x & 0x010) << 4 | + (x & 0x020) << 5 | + (x & 0x040) << 6 | + (x & 0x080) << 7 | + (x & 0x100) << 8 | + (x & 0x200) << 9 | + (x & 0x400) << 10 | + (x & 0x800) << 11; + + 
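/* x goes to the even bits and y to the odd bits (Morton order); the byte offset of a swizzled texel is nv04_swizzle_bits(x, y) * block.size */ +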
unsigned v = (y & 0x001) << 1 | + (y & 0x002) << 2 | + (y & 0x004) << 3 | + (y & 0x008) << 4 | + (y & 0x010) << 5 | + (y & 0x020) << 6 | + (y & 0x040) << 7 | + (y & 0x080) << 8 | + (y & 0x100) << 9 | + (y & 0x200) << 10 | + (y & 0x400) << 11 | + (y & 0x800) << 12; + return v | u; +} + +static int +nv04_surface_copy_swizzle(struct nv04_surface_2d *ctx, + struct pipe_surface *dst, int dx, int dy, + struct pipe_surface *src, int sx, int sy, + int w, int h) +{ + struct nouveau_channel *chan = ctx->nvws->channel; + struct nouveau_grobj *swzsurf = ctx->swzsurf; + struct nouveau_grobj *sifm = ctx->sifm; + struct nouveau_bo *src_bo = ctx->nvws->get_bo(ctx->buf(src)); + struct nouveau_bo *dst_bo = ctx->nvws->get_bo(ctx->buf(dst)); + const unsigned max_w = 1024; + const unsigned max_h = 1024; + const unsigned sub_w = w > max_w ? max_w : w; + const unsigned sub_h = h > max_h ? max_h : h; + unsigned cx; + unsigned cy; + + /* POT or GTFO */ + assert(!(w & (w - 1)) && !(h & (h - 1))); + + BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_DMA_IMAGE, 1); + OUT_RELOCo(chan, dst_bo, + NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + + BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_FORMAT, 1); + OUT_RING (chan, nv04_surface_format(dst->format) | + log2i(w) << NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_U_SHIFT | + log2i(h) << NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_V_SHIFT); + + BEGIN_RING(chan, sifm, NV04_SCALED_IMAGE_FROM_MEMORY_DMA_IMAGE, 1); + OUT_RELOCo(chan, src_bo, + NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD); + BEGIN_RING(chan, sifm, NV04_SCALED_IMAGE_FROM_MEMORY_SURFACE, 1); + OUT_RING (chan, swzsurf->handle); + + for (cy = 0; cy < h; cy += sub_h) { + for (cx = 0; cx < w; cx += sub_w) { + BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_OFFSET, 1); + OUT_RELOCl(chan, dst_bo, dst->offset + nv04_swizzle_bits(cx, cy) * + dst->block.size, NOUVEAU_BO_GART | + NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + + BEGIN_RING(chan, sifm, NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION, 9); + OUT_RING (chan, NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION_TRUNCATE); + OUT_RING (chan, nv04_scaled_image_format(src->format)); + OUT_RING (chan, NV04_SCALED_IMAGE_FROM_MEMORY_OPERATION_SRCCOPY); + OUT_RING (chan, 0); + OUT_RING (chan, sub_h << 16 | sub_w); + OUT_RING (chan, 0); + OUT_RING (chan, sub_h << 16 | sub_w); + OUT_RING (chan, 1 << 20); + OUT_RING (chan, 1 << 20); + + BEGIN_RING(chan, sifm, NV04_SCALED_IMAGE_FROM_MEMORY_SIZE, 4); + OUT_RING (chan, sub_h << 16 | sub_w); + OUT_RING (chan, src->stride | + NV04_SCALED_IMAGE_FROM_MEMORY_FORMAT_ORIGIN_CENTER | + NV04_SCALED_IMAGE_FROM_MEMORY_FORMAT_FILTER_POINT_SAMPLE); + OUT_RELOCl(chan, src_bo, src->offset + cy * src->stride + + cx * src->block.size, NOUVEAU_BO_GART | + NOUVEAU_BO_VRAM | NOUVEAU_BO_RD); + OUT_RING (chan, 0); + } + } + + return 0; +} + +static int +nv04_surface_copy_m2mf(struct nv04_surface_2d *ctx, + struct pipe_surface *dst, int dx, int dy, + struct pipe_surface *src, int sx, int sy, int w, int h) +{ + struct nouveau_channel *chan = ctx->nvws->channel; + struct nouveau_grobj *m2mf = ctx->m2mf; + struct nouveau_bo *src_bo = ctx->nvws->get_bo(ctx->buf(src)); + struct nouveau_bo *dst_bo = ctx->nvws->get_bo(ctx->buf(dst)); + unsigned dst_offset, src_offset; + + dst_offset = dst->offset + (dy * dst->stride) + (dx * dst->block.size); + src_offset = src->offset + (sy * src->stride) + (sx * src->block.size); + + WAIT_RING (chan, 3 + ((h / 2047) + 1) * 9); + BEGIN_RING(chan, m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_DMA_BUFFER_IN, 2); + OUT_RELOCo(chan, src_bo, + 
NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD); + OUT_RELOCo(chan, dst_bo, + NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + + while (h) { + int count = (h > 2047) ? 2047 : h; + + BEGIN_RING(chan, m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN, 8); + OUT_RELOCl(chan, src_bo, src_offset, + NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD); + OUT_RELOCl(chan, dst_bo, dst_offset, + NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_WR); + OUT_RING (chan, src->stride); + OUT_RING (chan, dst->stride); + OUT_RING (chan, w * src->block.size); + OUT_RING (chan, count); + OUT_RING (chan, 0x0101); + OUT_RING (chan, 0); + + h -= count; + src_offset += src->stride * count; + dst_offset += dst->stride * count; + } + + return 0; +} + +static int +nv04_surface_copy_blit(struct nv04_surface_2d *ctx, struct pipe_surface *dst, + int dx, int dy, struct pipe_surface *src, int sx, int sy, + int w, int h) +{ + struct nouveau_channel *chan = ctx->nvws->channel; + struct nouveau_grobj *surf2d = ctx->surf2d; + struct nouveau_grobj *blit = ctx->blit; + struct nouveau_bo *src_bo = ctx->nvws->get_bo(ctx->buf(src)); + struct nouveau_bo *dst_bo = ctx->nvws->get_bo(ctx->buf(dst)); + int format; + + format = nv04_surface_format(dst->format); + if (format < 0) + return 1; + + WAIT_RING (chan, 12); + BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2); + OUT_RELOCo(chan, src_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD); + OUT_RELOCo(chan, dst_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_FORMAT, 4); + OUT_RING (chan, format); + OUT_RING (chan, (dst->stride << 16) | src->stride); + OUT_RELOCl(chan, src_bo, src->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD); + OUT_RELOCl(chan, dst_bo, dst->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + + BEGIN_RING(chan, blit, 0x0300, 3); + OUT_RING (chan, (sy << 16) | sx); + OUT_RING (chan, (dy << 16) | dx); + OUT_RING (chan, ( h << 16) | w); + + return 0; +} + +static void +nv04_surface_copy(struct nv04_surface_2d *ctx, struct pipe_surface *dst, + int dx, int dy, struct pipe_surface *src, int sx, int sy, + int w, int h) +{ + int src_linear = src->texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR; + int dst_linear = dst->texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR; + + assert(src->format == dst->format); + + /* Setup transfer to swizzle the texture to vram if needed */ + if (src_linear && !dst_linear && w > 1 && h > 1) { + nv04_surface_copy_swizzle(ctx, dst, dx, dy, src, sx, sy, w, h); + return; + } + + /* NV_CONTEXT_SURFACES_2D has buffer alignment restrictions, fallback + * to NV_MEMORY_TO_MEMORY_FORMAT in this case. 
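+ * The blit path needs 64-byte aligned offsets and pitches in both surfaces.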
+ */ + if ((src->offset & 63) || (dst->offset & 63) || + (src->stride & 63) || (dst->stride & 63)) { + nv04_surface_copy_m2mf(ctx, dst, dx, dy, src, sx, sy, w, h); + return; + } + + nv04_surface_copy_blit(ctx, dst, dx, dy, src, sx, sy, w, h); +} + +static void +nv04_surface_fill(struct nv04_surface_2d *ctx, struct pipe_surface *dst, + int dx, int dy, int w, int h, unsigned value) +{ + struct nouveau_channel *chan = ctx->nvws->channel; + struct nouveau_grobj *surf2d = ctx->surf2d; + struct nouveau_grobj *rect = ctx->rect; + struct nouveau_bo *dst_bo = ctx->nvws->get_bo(ctx->buf(dst)); + int cs2d_format, gdirect_format; + + cs2d_format = nv04_surface_format(dst->format); + assert(cs2d_format >= 0); + + gdirect_format = nv04_rect_format(dst->format); + assert(gdirect_format >= 0); + + WAIT_RING (chan, 16); + BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2); + OUT_RELOCo(chan, dst_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + OUT_RELOCo(chan, dst_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_FORMAT, 4); + OUT_RING (chan, cs2d_format); + OUT_RING (chan, (dst->stride << 16) | dst->stride); + OUT_RELOCl(chan, dst_bo, dst->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + OUT_RELOCl(chan, dst_bo, dst->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + + BEGIN_RING(chan, rect, NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT, 1); + OUT_RING (chan, gdirect_format); + BEGIN_RING(chan, rect, NV04_GDI_RECTANGLE_TEXT_COLOR1_A, 1); + OUT_RING (chan, value); + BEGIN_RING(chan, rect, + NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT(0), 2); + OUT_RING (chan, (dx << 16) | dy); + OUT_RING (chan, ( w << 16) | h); +} + +void +nv04_surface_2d_takedown(struct nv04_surface_2d **pctx) +{ + struct nv04_surface_2d *ctx; + + if (!pctx || !*pctx) + return; + ctx = *pctx; + *pctx = NULL; + + nouveau_notifier_free(&ctx->ntfy); + nouveau_grobj_free(&ctx->m2mf); + nouveau_grobj_free(&ctx->surf2d); + nouveau_grobj_free(&ctx->swzsurf); + nouveau_grobj_free(&ctx->rect); + nouveau_grobj_free(&ctx->blit); + nouveau_grobj_free(&ctx->sifm); + + FREE(ctx); +} + +struct nv04_surface_2d * +nv04_surface_2d_init(struct nouveau_winsys *nvws) +{ + struct nv04_surface_2d *ctx = CALLOC_STRUCT(nv04_surface_2d); + struct nouveau_channel *chan = nvws->channel; + unsigned handle = 0x88000000, class; + int ret; + + if (!ctx) + return NULL; + + ret = nouveau_notifier_alloc(chan, handle++, 1, &ctx->ntfy); + if (ret) { + nv04_surface_2d_takedown(&ctx); + return NULL; + } + + ret = nouveau_grobj_alloc(chan, handle++, 0x0039, &ctx->m2mf); + if (ret) { + nv04_surface_2d_takedown(&ctx); + return NULL; + } + + BEGIN_RING(chan, ctx->m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_DMA_NOTIFY, 1); + OUT_RING (chan, ctx->ntfy->handle); + + if (chan->device->chipset < 0x10) + class = NV04_CONTEXT_SURFACES_2D; + else + class = NV10_CONTEXT_SURFACES_2D; + + ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->surf2d); + if (ret) { + nv04_surface_2d_takedown(&ctx); + return NULL; + } + + BEGIN_RING(chan, ctx->surf2d, + NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2); + OUT_RING (chan, chan->vram->handle); + OUT_RING (chan, chan->vram->handle); + + if (chan->device->chipset < 0x10) + class = NV04_IMAGE_BLIT; + else + class = NV12_IMAGE_BLIT; + + ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->blit); + if (ret) { + nv04_surface_2d_takedown(&ctx); + return NULL; + } + + BEGIN_RING(chan, ctx->blit, NV04_IMAGE_BLIT_DMA_NOTIFY, 1); + OUT_RING (chan, ctx->ntfy->handle); + BEGIN_RING(chan, ctx->blit, NV04_IMAGE_BLIT_SURFACE, 1); + 
OUT_RING (chan, ctx->surf2d->handle); + BEGIN_RING(chan, ctx->blit, NV04_IMAGE_BLIT_OPERATION, 1); + OUT_RING (chan, NV04_IMAGE_BLIT_OPERATION_SRCCOPY); + + ret = nouveau_grobj_alloc(chan, handle++, NV04_GDI_RECTANGLE_TEXT, + &ctx->rect); + if (ret) { + nv04_surface_2d_takedown(&ctx); + return NULL; + } + + BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_DMA_NOTIFY, 1); + OUT_RING (chan, ctx->ntfy->handle); + BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_SURFACE, 1); + OUT_RING (chan, ctx->surf2d->handle); + BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_OPERATION, 1); + OUT_RING (chan, NV04_GDI_RECTANGLE_TEXT_OPERATION_SRCCOPY); + BEGIN_RING(chan, ctx->rect, + NV04_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT, 1); + OUT_RING (chan, NV04_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT_LE); + + switch (chan->device->chipset & 0xf0) { + case 0x00: + case 0x10: + class = NV04_SWIZZLED_SURFACE; + break; + case 0x20: + class = NV20_SWIZZLED_SURFACE; + break; + case 0x30: + class = NV30_SWIZZLED_SURFACE; + break; + case 0x40: + case 0x60: + class = NV40_SWIZZLED_SURFACE; + break; + default: + /* Famous last words: this really can't happen.. */ + assert(0); + break; + } + + ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->swzsurf); + if (ret) { + nv04_surface_2d_takedown(&ctx); + return NULL; + } + + if (chan->device->chipset < 0x10) { + class = NV04_SCALED_IMAGE_FROM_MEMORY; + } else + if (chan->device->chipset < 0x40) { + class = NV10_SCALED_IMAGE_FROM_MEMORY; + } else { + class = NV40_SCALED_IMAGE_FROM_MEMORY; + } + + ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->sifm); + if (ret) { + nv04_surface_2d_takedown(&ctx); + return NULL; + } + + ctx->nvws = nvws; + ctx->copy = nv04_surface_copy; + ctx->fill = nv04_surface_fill; + return ctx; +} diff --git a/src/gallium/drivers/nv04/nv04_surface_2d.h b/src/gallium/drivers/nv04/nv04_surface_2d.h new file mode 100644 index 0000000000..21b8f86960 --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_surface_2d.h @@ -0,0 +1,29 @@ +#ifndef __NV04_SURFACE_2D_H__ +#define __NV04_SURFACE_2D_H__ + +struct nv04_surface_2d { + struct nouveau_winsys *nvws; + struct nouveau_notifier *ntfy; + struct nouveau_grobj *surf2d; + struct nouveau_grobj *swzsurf; + struct nouveau_grobj *m2mf; + struct nouveau_grobj *rect; + struct nouveau_grobj *blit; + struct nouveau_grobj *sifm; + + struct pipe_buffer *(*buf)(struct pipe_surface *); + + void (*copy)(struct nv04_surface_2d *, struct pipe_surface *dst, + int dx, int dy, struct pipe_surface *src, int sx, int sy, + int w, int h); + void (*fill)(struct nv04_surface_2d *, struct pipe_surface *dst, + int dx, int dy, int w, int h, unsigned value); +}; + +struct nv04_surface_2d * +nv04_surface_2d_init(struct nouveau_winsys *nvws); + +void +nv04_surface_2d_takedown(struct nv04_surface_2d **); + +#endif diff --git a/src/gallium/drivers/nv04/nv04_vbo.c b/src/gallium/drivers/nv04/nv04_vbo.c new file mode 100644 index 0000000000..d21a0e34f7 --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_vbo.c @@ -0,0 +1,78 @@ +#include "draw/draw_context.h" +#include "pipe/p_context.h" +#include "pipe/p_state.h" + +#include "nv04_context.h" +#include "nv04_state.h" + +#include "nouveau/nouveau_channel.h" +#include "nouveau/nouveau_pushbuf.h" + +boolean nv04_draw_elements( struct pipe_context *pipe, + struct pipe_buffer *indexBuffer, + unsigned indexSize, + unsigned prim, unsigned start, unsigned count) +{ + struct nv04_context *nv04 = nv04_context( pipe ); + struct draw_context *draw = nv04->draw; + unsigned i; + + nv04_emit_hw_state(nv04); + 
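/* the whole draw runs through the draw module in software, so CPU-map every buffer it will read */ +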
+ /* + * Map vertex buffers + */ + for (i = 0; i < PIPE_MAX_ATTRIBS; i++) { + if (nv04->vtxbuf[i].buffer) { + void *buf + = pipe->winsys->buffer_map(pipe->winsys, + nv04->vtxbuf[i].buffer, + PIPE_BUFFER_USAGE_CPU_READ); + draw_set_mapped_vertex_buffer(draw, i, buf); + } + } + /* Map index buffer, if present */ + if (indexBuffer) { + void *mapped_indexes + = pipe->winsys->buffer_map(pipe->winsys, indexBuffer, + PIPE_BUFFER_USAGE_CPU_READ); + draw_set_mapped_element_buffer(draw, indexSize, mapped_indexes); + } + else { + /* no index/element buffer */ + draw_set_mapped_element_buffer(draw, 0, NULL); + } + + draw_set_mapped_constant_buffer(draw, + nv04->constbuf[PIPE_SHADER_VERTEX], + nv04->constbuf_nr[PIPE_SHADER_VERTEX]); + + /* draw! */ + draw_arrays(nv04->draw, prim, start, count); + + /* + * unmap vertex/index buffers + */ + for (i = 0; i < PIPE_MAX_ATTRIBS; i++) { + if (nv04->vtxbuf[i].buffer) { + pipe->winsys->buffer_unmap(pipe->winsys, nv04->vtxbuf[i].buffer); + draw_set_mapped_vertex_buffer(draw, i, NULL); + } + } + if (indexBuffer) { + pipe->winsys->buffer_unmap(pipe->winsys, indexBuffer); + draw_set_mapped_element_buffer(draw, 0, NULL); + } + + return TRUE; +} + +boolean nv04_draw_arrays( struct pipe_context *pipe, + unsigned prim, unsigned start, unsigned count) +{ + printf("coucou in draw arrays\n"); + return nv04_draw_elements(pipe, NULL, 0, prim, start, count); +} + + + diff --git a/src/gallium/drivers/nv10/Makefile b/src/gallium/drivers/nv10/Makefile new file mode 100644 index 0000000000..2b5fbd4f5a --- /dev/null +++ b/src/gallium/drivers/nv10/Makefile @@ -0,0 +1,19 @@ +TOP = ../../../.. +include $(TOP)/configs/current + +LIBNAME = nv10 + +C_SOURCES = \ + nv10_clear.c \ + nv10_context.c \ + nv10_fragprog.c \ + nv10_fragtex.c \ + nv10_miptree.c \ + nv10_prim_vbuf.c \ + nv10_screen.c \ + nv10_state.c \ + nv10_state_emit.c \ + nv10_surface.c \ + nv10_vbo.c + +include ../../Makefile.template diff --git a/src/gallium/drivers/nv10/nv10_clear.c b/src/gallium/drivers/nv10/nv10_clear.c new file mode 100644 index 0000000000..be7e09cf4b --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_clear.c @@ -0,0 +1,12 @@ +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "nv10_context.h" + +void +nv10_clear(struct pipe_context *pipe, struct pipe_surface *ps, + unsigned clearValue) +{ + pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, clearValue); +} diff --git a/src/gallium/drivers/nv10/nv10_context.c b/src/gallium/drivers/nv10/nv10_context.c new file mode 100644 index 0000000000..ef2c0c5d9f --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_context.c @@ -0,0 +1,296 @@ +#include "draw/draw_context.h" +#include "pipe/p_defines.h" +#include "pipe/internal/p_winsys_screen.h" + +#include "nv10_context.h" +#include "nv10_screen.h" + +static void +nv10_flush(struct pipe_context *pipe, unsigned flags, + struct pipe_fence_handle **fence) +{ + struct nv10_context *nv10 = nv10_context(pipe); + + draw_flush(nv10->draw); + + FIRE_RING(fence); +} + +static void +nv10_destroy(struct pipe_context *pipe) +{ + struct nv10_context *nv10 = nv10_context(pipe); + + if (nv10->draw) + draw_destroy(nv10->draw); + + FREE(nv10); +} + +static void nv10_init_hwctx(struct nv10_context *nv10) +{ + struct nv10_screen *screen = nv10->screen; + struct nouveau_winsys *nvws = screen->nvws; + int i; + float projectionmatrix[16]; + + BEGIN_RING(celsius, NV10TCL_DMA_NOTIFY, 1); + OUT_RING (screen->sync->handle); + BEGIN_RING(celsius, NV10TCL_DMA_IN_MEMORY0, 2); + OUT_RING 
(nvws->channel->vram->handle); + OUT_RING (nvws->channel->gart->handle); + BEGIN_RING(celsius, NV10TCL_DMA_IN_MEMORY2, 2); + OUT_RING (nvws->channel->vram->handle); + OUT_RING (nvws->channel->vram->handle); + + BEGIN_RING(celsius, NV10TCL_NOP, 1); + OUT_RING (0); + + BEGIN_RING(celsius, NV10TCL_RT_HORIZ, 2); + OUT_RING (0); + OUT_RING (0); + + BEGIN_RING(celsius, NV10TCL_VIEWPORT_CLIP_HORIZ(0), 1); + OUT_RING ((0x7ff<<16)|0x800); + BEGIN_RING(celsius, NV10TCL_VIEWPORT_CLIP_VERT(0), 1); + OUT_RING ((0x7ff<<16)|0x800); + + for (i=1;i<8;i++) { + BEGIN_RING(celsius, NV10TCL_VIEWPORT_CLIP_HORIZ(i), 1); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_VIEWPORT_CLIP_VERT(i), 1); + OUT_RING (0); + } + + BEGIN_RING(celsius, 0x290, 1); + OUT_RING ((0x10<<16)|1); + BEGIN_RING(celsius, 0x3f4, 1); + OUT_RING (0); + + BEGIN_RING(celsius, NV10TCL_NOP, 1); + OUT_RING (0); + + if (nv10->screen->celsius->grclass != NV10TCL) { + /* For nv11, nv17 */ + BEGIN_RING(celsius, 0x120, 3); + OUT_RING (0); + OUT_RING (1); + OUT_RING (2); + + BEGIN_RING(celsius, NV10TCL_NOP, 1); + OUT_RING (0); + } + + BEGIN_RING(celsius, NV10TCL_NOP, 1); + OUT_RING (0); + + /* Set state */ + BEGIN_RING(celsius, NV10TCL_FOG_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_ALPHA_FUNC_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_ALPHA_FUNC_FUNC, 2); + OUT_RING (0x207); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_TX_ENABLE(0), 2); + OUT_RING (0); + OUT_RING (0); + + BEGIN_RING(celsius, NV10TCL_RC_IN_ALPHA(0), 12); + OUT_RING (0x30141010); + OUT_RING (0); + OUT_RING (0x20040000); + OUT_RING (0); + OUT_RING (0); + OUT_RING (0); + OUT_RING (0x00000c00); + OUT_RING (0); + OUT_RING (0x00000c00); + OUT_RING (0x18000000); + OUT_RING (0x300e0300); + OUT_RING (0x0c091c80); + + BEGIN_RING(celsius, NV10TCL_BLEND_FUNC_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_DITHER_ENABLE, 2); + OUT_RING (1); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_LINE_SMOOTH_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_VERTEX_WEIGHT_ENABLE, 2); + OUT_RING (0); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_BLEND_FUNC_SRC, 4); + OUT_RING (1); + OUT_RING (0); + OUT_RING (0); + OUT_RING (0x8006); + BEGIN_RING(celsius, NV10TCL_STENCIL_MASK, 8); + OUT_RING (0xff); + OUT_RING (0x207); + OUT_RING (0); + OUT_RING (0xff); + OUT_RING (0x1e00); + OUT_RING (0x1e00); + OUT_RING (0x1e00); + OUT_RING (0x1d01); + BEGIN_RING(celsius, NV10TCL_NORMALIZE_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_FOG_ENABLE, 2); + OUT_RING (0); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_LIGHT_MODEL, 1); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_COLOR_CONTROL, 1); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_ENABLED_LIGHTS, 1); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_POLYGON_OFFSET_POINT_ENABLE, 3); + OUT_RING (0); + OUT_RING (0); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_DEPTH_FUNC, 1); + OUT_RING (0x201); + BEGIN_RING(celsius, NV10TCL_DEPTH_WRITE_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_DEPTH_TEST_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_POLYGON_OFFSET_FACTOR, 2); + OUT_RING (0); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_POINT_SIZE, 1); + OUT_RING (8); + BEGIN_RING(celsius, NV10TCL_POINT_PARAMETERS_ENABLE, 2); + OUT_RING (0); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_LINE_WIDTH, 1); + OUT_RING (8); + BEGIN_RING(celsius, NV10TCL_LINE_SMOOTH_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_POLYGON_MODE_FRONT, 2); + OUT_RING (0x1b02); + OUT_RING (0x1b02); + BEGIN_RING(celsius, 
NV10TCL_CULL_FACE, 2); + OUT_RING (0x405); + OUT_RING (0x901); + BEGIN_RING(celsius, NV10TCL_POLYGON_SMOOTH_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_CULL_FACE_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_TX_GEN_S(0), 8); + for (i=0;i<8;i++) { + OUT_RING (0); + } + BEGIN_RING(celsius, NV10TCL_FOG_EQUATION_CONSTANT, 3); + OUT_RING (0x3fc00000); /* -1.50 */ + OUT_RING (0xbdb8aa0a); /* -0.09 */ + OUT_RING (0); /* 0.00 */ + + BEGIN_RING(celsius, NV10TCL_NOP, 1); + OUT_RING (0); + + BEGIN_RING(celsius, NV10TCL_FOG_MODE, 2); + OUT_RING (0x802); + OUT_RING (2); + /* for some reason VIEW_MATRIX_ENABLE need to be 6 instead of 4 when + * using texturing, except when using the texture matrix + */ + BEGIN_RING(celsius, NV10TCL_VIEW_MATRIX_ENABLE, 1); + OUT_RING (6); + BEGIN_RING(celsius, NV10TCL_COLOR_MASK, 1); + OUT_RING (0x01010101); + + /* Set vertex component */ + BEGIN_RING(celsius, NV10TCL_VERTEX_COL_4F_R, 4); + OUT_RINGf (1.0); + OUT_RINGf (1.0); + OUT_RINGf (1.0); + OUT_RINGf (1.0); + BEGIN_RING(celsius, NV10TCL_VERTEX_COL2_3F_R, 3); + OUT_RING (0); + OUT_RING (0); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_VERTEX_NOR_3F_X, 3); + OUT_RING (0); + OUT_RING (0); + OUT_RINGf (1.0); + BEGIN_RING(celsius, NV10TCL_VERTEX_TX0_4F_S, 4); + OUT_RINGf (0.0); + OUT_RINGf (0.0); + OUT_RINGf (0.0); + OUT_RINGf (1.0); + BEGIN_RING(celsius, NV10TCL_VERTEX_TX1_4F_S, 4); + OUT_RINGf (0.0); + OUT_RINGf (0.0); + OUT_RINGf (0.0); + OUT_RINGf (1.0); + BEGIN_RING(celsius, NV10TCL_VERTEX_FOG_1F, 1); + OUT_RINGf (0.0); + BEGIN_RING(celsius, NV10TCL_EDGEFLAG_ENABLE, 1); + OUT_RING (1); + + memset(projectionmatrix, 0, sizeof(projectionmatrix)); + BEGIN_RING(celsius, NV10TCL_PROJECTION_MATRIX(0), 16); + projectionmatrix[0*4+0] = 1.0; + projectionmatrix[1*4+1] = 1.0; + projectionmatrix[2*4+2] = 1.0; + projectionmatrix[3*4+3] = 1.0; + for (i=0;i<16;i++) { + OUT_RINGf (projectionmatrix[i]); + } + + BEGIN_RING(celsius, NV10TCL_DEPTH_RANGE_NEAR, 2); + OUT_RING (0.0); + OUT_RINGf (16777216.0); + + BEGIN_RING(celsius, NV10TCL_VIEWPORT_SCALE_X, 4); + OUT_RINGf (-2048.0); + OUT_RINGf (-2048.0); + OUT_RINGf (16777215.0 * 0.5); + OUT_RING (0); + + FIRE_RING (NULL); +} + +static void +nv10_set_edgeflags(struct pipe_context *pipe, const unsigned *bitfield) +{ +} + +struct pipe_context * +nv10_create(struct pipe_screen *pscreen, unsigned pctx_id) +{ + struct nv10_screen *screen = nv10_screen(pscreen); + struct pipe_winsys *ws = pscreen->winsys; + struct nv10_context *nv10; + struct nouveau_winsys *nvws = screen->nvws; + + nv10 = CALLOC(1, sizeof(struct nv10_context)); + if (!nv10) + return NULL; + nv10->screen = screen; + nv10->pctx_id = pctx_id; + + nv10->nvws = nvws; + + nv10->pipe.winsys = ws; + nv10->pipe.screen = pscreen; + nv10->pipe.destroy = nv10_destroy; + nv10->pipe.set_edgeflags = nv10_set_edgeflags; + nv10->pipe.draw_arrays = nv10_draw_arrays; + nv10->pipe.draw_elements = nv10_draw_elements; + nv10->pipe.clear = nv10_clear; + nv10->pipe.flush = nv10_flush; + + nv10_init_surface_functions(nv10); + nv10_init_state_functions(nv10); + + nv10->draw = draw_create(); + assert(nv10->draw); + draw_set_rasterize_stage(nv10->draw, nv10_draw_vbuf_stage(nv10)); + + nv10_init_hwctx(nv10); + + return &nv10->pipe; +} + diff --git a/src/gallium/drivers/nv10/nv10_context.h b/src/gallium/drivers/nv10/nv10_context.h new file mode 100644 index 0000000000..f3b56de25a --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_context.h @@ -0,0 +1,153 @@ +#ifndef __NV10_CONTEXT_H__ +#define __NV10_CONTEXT_H__ + +#include 
"pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_compiler.h" + +#include "util/u_memory.h" +#include "util/u_math.h" + +#include "draw/draw_vertex.h" + +#include "nouveau/nouveau_winsys.h" +#include "nouveau/nouveau_gldefs.h" + +#define NOUVEAU_PUSH_CONTEXT(ctx) \ + struct nv10_screen *ctx = nv10->screen +#include "nouveau/nouveau_push.h" + +#include "nv10_state.h" + +#define NOUVEAU_ERR(fmt, args...) \ + fprintf(stderr, "%s:%d - "fmt, __func__, __LINE__, ##args); +#define NOUVEAU_MSG(fmt, args...) \ + fprintf(stderr, "nouveau: "fmt, ##args); + +#define NV10_NEW_VERTPROG (1 << 0) +#define NV10_NEW_FRAGPROG (1 << 1) +#define NV10_NEW_VTXARRAYS (1 << 2) +#define NV10_NEW_BLEND (1 << 3) +#define NV10_NEW_BLENDCOL (1 << 4) +#define NV10_NEW_RAST (1 << 5) +#define NV10_NEW_DSA (1 << 6) +#define NV10_NEW_VIEWPORT (1 << 7) +#define NV10_NEW_SCISSOR (1 << 8) +#define NV10_NEW_FRAMEBUFFER (1 << 9) + +#include "nv10_screen.h" + +struct nv10_context { + struct pipe_context pipe; + + struct nouveau_winsys *nvws; + struct nv10_screen *screen; + unsigned pctx_id; + + struct draw_context *draw; + + uint32_t dirty; + + struct nv10_sampler_state *tex_sampler[PIPE_MAX_SAMPLERS]; + struct nv10_miptree *tex_miptree[PIPE_MAX_SAMPLERS]; + unsigned dirty_samplers; + unsigned fp_samplers; + unsigned vp_samplers; + + uint32_t rt_enable; + struct pipe_buffer *rt[4]; + struct pipe_buffer *zeta; + uint32_t lma_offset; + + struct nv10_blend_state *blend; + struct pipe_blend_color *blend_color; + struct nv10_rasterizer_state *rast; + struct nv10_depth_stencil_alpha_state *dsa; + struct pipe_viewport_state *viewport; + struct pipe_scissor_state *scissor; + struct pipe_framebuffer_state *framebuffer; + + //struct pipe_buffer *constbuf[PIPE_SHADER_TYPES]; + float *constbuf[PIPE_SHADER_TYPES][32][4]; + unsigned constbuf_nr[PIPE_SHADER_TYPES]; + + struct vertex_info vertex_info; + + struct { + struct pipe_buffer *buffer; + uint32_t format; + } tex[2]; + + unsigned vb_enable; + struct { + struct pipe_buffer *buffer; + unsigned delta; + } vb[16]; + +/* struct { + + struct nouveau_resource *exec_heap; + struct nouveau_resource *data_heap; + + struct nv10_vertex_program *active; + + struct nv10_vertex_program *current; + } vertprog; +*/ + struct { + struct nv10_fragment_program *active; + + struct nv10_fragment_program *current; + struct pipe_buffer *constant_buf; + } fragprog; + + struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS]; + struct pipe_vertex_element vtxelt[PIPE_MAX_ATTRIBS]; +}; + +static INLINE struct nv10_context * +nv10_context(struct pipe_context *pipe) +{ + return (struct nv10_context *)pipe; +} + +extern void nv10_init_state_functions(struct nv10_context *nv10); +extern void nv10_init_surface_functions(struct nv10_context *nv10); + +extern void nv10_screen_init_miptree_functions(struct pipe_screen *pscreen); + +/* nv10_clear.c */ +extern void nv10_clear(struct pipe_context *pipe, struct pipe_surface *ps, + unsigned clearValue); + +/* nv10_draw.c */ +extern struct draw_stage *nv10_draw_render_stage(struct nv10_context *nv10); + +/* nv10_fragprog.c */ +extern void nv10_fragprog_bind(struct nv10_context *, + struct nv10_fragment_program *); +extern void nv10_fragprog_destroy(struct nv10_context *, + struct nv10_fragment_program *); + +/* nv10_fragtex.c */ +extern void nv10_fragtex_bind(struct nv10_context *); + +/* nv10_prim_vbuf.c */ +struct draw_stage *nv10_draw_vbuf_stage( struct nv10_context *nv10 ); +extern void nv10_vtxbuf_bind(struct nv10_context* nv10); + +/* 
nv10_state.c and friends */ +extern void nv10_emit_hw_state(struct nv10_context *nv10); +extern void nv10_state_tex_update(struct nv10_context *nv10); + +/* nv10_vbo.c */ +extern boolean nv10_draw_arrays(struct pipe_context *, unsigned mode, + unsigned start, unsigned count); +extern boolean nv10_draw_elements( struct pipe_context *pipe, + struct pipe_buffer *indexBuffer, + unsigned indexSize, + unsigned prim, unsigned start, unsigned count); + + +#endif diff --git a/src/gallium/drivers/nv10/nv10_fragprog.c b/src/gallium/drivers/nv10/nv10_fragprog.c new file mode 100644 index 0000000000..698db5a16a --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_fragprog.c @@ -0,0 +1,21 @@ +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" + +#include "nv10_context.h" + +void +nv10_fragprog_bind(struct nv10_context *nv10, struct nv10_fragment_program *fp) +{ +} + +void +nv10_fragprog_destroy(struct nv10_context *nv10, + struct nv10_fragment_program *fp) +{ +} + diff --git a/src/gallium/drivers/nv10/nv10_fragtex.c b/src/gallium/drivers/nv10/nv10_fragtex.c new file mode 100644 index 0000000000..27f2f87584 --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_fragtex.c @@ -0,0 +1,124 @@ +#include "nv10_context.h" +#include "nouveau/nouveau_util.h" + +#define _(m,tf) \ +{ \ + TRUE, \ + PIPE_FORMAT_##m, \ + NV10TCL_TX_FORMAT_FORMAT_##tf, \ +} + +struct nv10_texture_format { + boolean defined; + uint pipe; + int format; +}; + +static struct nv10_texture_format +nv10_texture_formats[] = { + _(A8R8G8B8_UNORM, A8R8G8B8), + _(A1R5G5B5_UNORM, A1R5G5B5), + _(A4R4G4B4_UNORM, A4R4G4B4), + _(L8_UNORM , L8 ), + _(A8_UNORM , A8 ), + _(A8L8_UNORM , A8L8 ), +// _(RGB_DXT1 , DXT1, ), +// _(RGBA_DXT1 , DXT1, ), +// _(RGBA_DXT3 , DXT3, ), +// _(RGBA_DXT5 , DXT5, ), + {}, +}; + +static struct nv10_texture_format * +nv10_fragtex_format(uint pipe_format) +{ + struct nv10_texture_format *tf = nv10_texture_formats; + + while (tf->defined) { + if (tf->pipe == pipe_format) + return tf; + tf++; + } + + return NULL; +} + + +static void +nv10_fragtex_build(struct nv10_context *nv10, int unit) +{ +#if 0 + struct nv10_sampler_state *ps = nv10->tex_sampler[unit]; + struct nv10_miptree *nv10mt = nv10->tex_miptree[unit]; + struct pipe_texture *pt = &nv10mt->base; + struct nv10_texture_format *tf; + uint32_t txf, txs, txp; + + tf = nv10_fragtex_format(pt->format); + if (!tf || !tf->defined) { + NOUVEAU_ERR("Unsupported texture format: 0x%x\n", pt->format); + return; + } + + txf = tf->format << 8; + txf |= (pt->last_level + 1) << 16; + txf |= log2i(pt->width[0]) << 20; + txf |= log2i(pt->height[0]) << 24; + txf |= log2i(pt->depth[0]) << 28; + txf |= 8; + + switch (pt->target) { + case PIPE_TEXTURE_CUBE: + txf |= NV10TCL_TX_FORMAT_CUBE_MAP; + /* fall-through */ + case PIPE_TEXTURE_2D: + txf |= (2<<4); + break; + case PIPE_TEXTURE_1D: + txf |= (1<<4); + break; + default: + NOUVEAU_ERR("Unknown target %d\n", pt->target); + return; + } + + BEGIN_RING(celsius, NV10TCL_TX_OFFSET(unit), 8); + OUT_RELOCl(nv10mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD); + OUT_RELOCd(nv10mt->buffer,txf,NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/); + OUT_RING (ps->wrap); + OUT_RING (0x40000000); /* enable */ + OUT_RING (txs); + OUT_RING (ps->filt | 0x2000 /* magic */); + OUT_RING ((pt->width[0] << 16) | pt->height[0]); + OUT_RING (ps->bcol); +#endif +} + +void 
+nv10_fragtex_bind(struct nv10_context *nv10) +{ +#if 0 + struct nv10_fragment_program *fp = nv10->fragprog.active; + unsigned samplers, unit; + + samplers = nv10->fp_samplers & ~fp->samplers; + while (samplers) { + unit = ffs(samplers) - 1; + samplers &= ~(1 << unit); + + BEGIN_RING(celsius, NV10TCL_TX_ENABLE(unit), 1); + OUT_RING (0); + } + + samplers = nv10->dirty_samplers & fp->samplers; + while (samplers) { + unit = ffs(samplers) - 1; + samplers &= ~(1 << unit); + + nv10_fragtex_build(nv10, unit); + } + + nv10->fp_samplers = fp->samplers; +#endif +} + diff --git a/src/gallium/drivers/nv10/nv10_miptree.c b/src/gallium/drivers/nv10/nv10_miptree.c new file mode 100644 index 0000000000..9616135461 --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_miptree.c @@ -0,0 +1,174 @@ +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" + +#include "nv10_context.h" +#include "nv10_screen.h" + +static void +nv10_miptree_layout(struct nv10_miptree *nv10mt) +{ + struct pipe_texture *pt = &nv10mt->base; + boolean swizzled = FALSE; + uint width = pt->width[0], height = pt->height[0]; + uint offset = 0; + int nr_faces, l, f; + + if (pt->target == PIPE_TEXTURE_CUBE) { + nr_faces = 6; + } else { + nr_faces = 1; + } + + for (l = 0; l <= pt->last_level; l++) { + pt->width[l] = width; + pt->height[l] = height; + pt->nblocksx[l] = pf_get_nblocksx(&pt->block, width); + pt->nblocksy[l] = pf_get_nblocksy(&pt->block, height); + + if (swizzled) + nv10mt->level[l].pitch = pt->nblocksx[l] * pt->block.size; + else + nv10mt->level[l].pitch = pt->nblocksx[0] * pt->block.size; + nv10mt->level[l].pitch = (nv10mt->level[l].pitch + 63) & ~63; + + nv10mt->level[l].image_offset = + CALLOC(nr_faces, sizeof(unsigned)); + + width = MAX2(1, width >> 1); + height = MAX2(1, height >> 1); + + } + + for (f = 0; f < nr_faces; f++) { + for (l = 0; l <= pt->last_level; l++) { + nv10mt->level[l].image_offset[f] = offset; + offset += nv10mt->level[l].pitch * pt->height[l]; + } + } + + nv10mt->total_size = offset; +} + +static struct pipe_texture * +nv10_miptree_blanket(struct pipe_screen *pscreen, const struct pipe_texture *pt, + const unsigned *stride, struct pipe_buffer *pb) +{ + struct nv10_miptree *mt; + + /* Only supports 2D, non-mipmapped textures for the moment */ + if (pt->target != PIPE_TEXTURE_2D || pt->last_level != 0 || + pt->depth[0] != 1) + return NULL; + + mt = CALLOC_STRUCT(nv10_miptree); + if (!mt) + return NULL; + + mt->base = *pt; + mt->base.refcount = 1; + mt->base.screen = pscreen; + mt->level[0].pitch = stride[0]; + mt->level[0].image_offset = CALLOC(1, sizeof(unsigned)); + + pipe_buffer_reference(pscreen, &mt->buffer, pb); + return &mt->base; +} + +static struct pipe_texture * +nv10_miptree_create(struct pipe_screen *screen, const struct pipe_texture *pt) +{ + struct pipe_winsys *ws = screen->winsys; + struct nv10_miptree *mt; + + mt = MALLOC(sizeof(struct nv10_miptree)); + if (!mt) + return NULL; + mt->base = *pt; + mt->base.refcount = 1; + mt->base.screen = screen; + + nv10_miptree_layout(mt); + + mt->buffer = ws->buffer_create(ws, 256, PIPE_BUFFER_USAGE_PIXEL, + mt->total_size); + if (!mt->buffer) { + FREE(mt); + return NULL; + } + + return &mt->base; +} + +static void +nv10_miptree_release(struct pipe_screen *screen, struct pipe_texture **pt) +{ + struct pipe_texture *mt = *pt; + + *pt = NULL; + if (--mt->refcount <= 0) { + struct nv10_miptree *nv10mt = (struct nv10_miptree *)mt; + int l; + + pipe_buffer_reference(screen, &nv10mt->buffer, NULL); + for (l = 0; l <= mt->last_level; 
l++) { + if (nv10mt->level[l].image_offset) + FREE(nv10mt->level[l].image_offset); + } + FREE(nv10mt); + } +} + +static void +nv10_miptree_update(struct pipe_context *pipe, struct pipe_texture *mt, + uint face, uint levels) +{ +} + + +static struct pipe_surface * +nv10_miptree_surface_get(struct pipe_screen *screen, struct pipe_texture *pt, + unsigned face, unsigned level, unsigned zslice, + unsigned flags) +{ + struct pipe_winsys *ws = screen->winsys; + struct nv10_miptree *nv10mt = (struct nv10_miptree *)pt; + struct pipe_surface *ps; + + ps = CALLOC_STRUCT(pipe_surface); + if (!ps) + return NULL; + pipe_texture_reference(&ps->texture, pt); + ps->format = pt->format; + ps->width = pt->width[level]; + ps->height = pt->height[level]; + ps->block = pt->block; + ps->nblocksx = pt->nblocksx[level]; + ps->nblocksy = pt->nblocksy[level]; + ps->stride = nv10mt->level[level].pitch; + ps->refcount = 1; + + if (pt->target == PIPE_TEXTURE_CUBE) { + ps->offset = nv10mt->level[level].image_offset[face]; + } else { + ps->offset = nv10mt->level[level].image_offset[0]; + } + + return ps; +} + +static void +nv10_miptree_surface_release(struct pipe_screen *screen, + struct pipe_surface **surface) +{ +} + +void nv10_screen_init_miptree_functions(struct pipe_screen *pscreen) +{ + pscreen->texture_create = nv10_miptree_create; + pscreen->texture_blanket = nv10_miptree_blanket; + pscreen->texture_release = nv10_miptree_release; + pscreen->get_tex_surface = nv10_miptree_surface_get; + pscreen->tex_surface_release = nv10_miptree_surface_release; +} + diff --git a/src/gallium/drivers/nv10/nv10_prim_vbuf.c b/src/gallium/drivers/nv10/nv10_prim_vbuf.c new file mode 100644 index 0000000000..491a881806 --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_prim_vbuf.c @@ -0,0 +1,265 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * \file + * Build post-transformation, post-clipping vertex buffers and element + * lists by hooking into the end of the primitive pipeline and + * manipulating the vertex_id field in the vertex headers. 
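+ * Vertices land in a winsys vertex buffer; indices are streamed inline through NV10TCL_VB_ELEMENT_U16/U32.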
+ * + * XXX: work in progress + * + * \author José Fonseca <jrfonseca@tungstengraphics.com> + * \author Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "util/u_debug.h" +#include "pipe/p_inlines.h" +#include "pipe/internal/p_winsys_screen.h" + +#include "nv10_context.h" +#include "nv10_state.h" + +#include "draw/draw_vbuf.h" + +/** + * Primitive renderer for nv10. + */ +struct nv10_vbuf_render { + struct vbuf_render base; + + struct nv10_context *nv10; + + /** Vertex buffer */ + struct pipe_buffer* buffer; + + /** Vertex size in bytes */ + unsigned vertex_size; + + /** Hardware primitive */ + unsigned hwprim; +}; + + +void nv10_vtxbuf_bind( struct nv10_context* nv10 ) +{ + int i; + for(i = 0; i < 8; i++) { + BEGIN_RING(celsius, NV10TCL_VERTEX_ARRAY_ATTRIB_OFFSET(i), 1); + OUT_RING(0/*nv10->vtxbuf*/); + BEGIN_RING(celsius, NV10TCL_VERTEX_ARRAY_ATTRIB_FORMAT(i) ,1); + OUT_RING(0/*XXX*/); + } +} + +/** + * Basically a cast wrapper. + */ +static INLINE struct nv10_vbuf_render * +nv10_vbuf_render( struct vbuf_render *render ) +{ + assert(render); + return (struct nv10_vbuf_render *)render; +} + + +static const struct vertex_info * +nv10_vbuf_render_get_vertex_info( struct vbuf_render *render ) +{ + struct nv10_vbuf_render *nv10_render = nv10_vbuf_render(render); + struct nv10_context *nv10 = nv10_render->nv10; + + nv10_emit_hw_state(nv10); + + return &nv10->vertex_info; +} + +static boolean +nv10_vbuf_render_allocate_vertices( struct vbuf_render *render, + ushort vertex_size, + ushort nr_vertices ) +{ + struct nv10_vbuf_render *nv10_render = nv10_vbuf_render(render); + struct nv10_context *nv10 = nv10_render->nv10; + struct pipe_winsys *winsys = nv10->pipe.winsys; + size_t size = (size_t)vertex_size * (size_t)nr_vertices; + + assert(!nv10_render->buffer); + nv10_render->buffer = winsys->buffer_create(winsys, 64, PIPE_BUFFER_USAGE_VERTEX, size); + + nv10->dirty |= NV10_NEW_VTXARRAYS; + + if (nv10_render->buffer) + return FALSE; + return TRUE; +} + +static void * +nv10_vbuf_render_map_vertices( struct vbuf_render *render ) +{ + struct nv10_vbuf_render *nv10_render = nv10_vbuf_render(render); + struct nv10_context *nv10 = nv10_render->nv10; + struct pipe_winsys *winsys = nv10->pipe.winsys; + + return winsys->buffer_map(winsys, + nv10_render->buffer, + PIPE_BUFFER_USAGE_CPU_WRITE); +} + +static void +nv10_vbuf_render_unmap_vertices( struct vbuf_render *render, + ushort min_index, + ushort max_index ) +{ + struct nv10_vbuf_render *nv10_render = nv10_vbuf_render(render); + struct nv10_context *nv10 = nv10_render->nv10; + struct pipe_winsys *winsys = nv10->pipe.winsys; + + assert(!nv10_render->buffer); + winsys->buffer_unmap(winsys, nv10_render->buffer); +} + +static boolean +nv10_vbuf_render_set_primitive( struct vbuf_render *render, + unsigned prim ) +{ + struct nv10_vbuf_render *nv10_render = nv10_vbuf_render(render); + unsigned hwp = nvgl_primitive(prim); + if (hwp == 0) + return FALSE; + + nv10_render->hwprim = hwp; + return TRUE; +} + + +static void +nv10_vbuf_render_draw( struct vbuf_render *render, + const ushort *indices, + uint nr_indices) +{ + struct nv10_vbuf_render *nv10_render = nv10_vbuf_render(render); + struct nv10_context *nv10 = nv10_render->nv10; + int push, i; + + nv10_emit_hw_state(nv10); + + BEGIN_RING(celsius, NV10TCL_VERTEX_ARRAY_OFFSET_POS, 1); + OUT_RELOCl(nv10_render->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD); + + BEGIN_RING(celsius, NV10TCL_VERTEX_BUFFER_BEGIN_END, 1); + OUT_RING(nv10_render->hwprim); + + if (nr_indices & 1) { + 
BEGIN_RING(celsius, NV10TCL_VB_ELEMENT_U32, 1); + OUT_RING (indices[0]); + indices++; nr_indices--; + } + + while (nr_indices) { + // XXX too big/small ? check the size + push = MIN2(nr_indices, 1200 * 2); + + BEGIN_RING_NI(celsius, NV10TCL_VB_ELEMENT_U16, push >> 1); + for (i = 0; i < push; i+=2) + OUT_RING((indices[i+1] << 16) | indices[i]); + + nr_indices -= push; + indices += push; + } + + BEGIN_RING(celsius, NV10TCL_VERTEX_BUFFER_BEGIN_END, 1); + OUT_RING (0); +} + + +static void +nv10_vbuf_render_release_vertices( struct vbuf_render *render ) +{ + struct nv10_vbuf_render *nv10_render = nv10_vbuf_render(render); + struct nv10_context *nv10 = nv10_render->nv10; + struct pipe_screen *pscreen = &nv10->screen->pipe; + + assert(nv10_render->buffer); + pipe_buffer_reference(pscreen, &nv10_render->buffer, NULL); +} + + +static void +nv10_vbuf_render_destroy( struct vbuf_render *render ) +{ + struct nv10_vbuf_render *nv10_render = nv10_vbuf_render(render); + FREE(nv10_render); +} + + +/** + * Create a new primitive render. + */ +static struct vbuf_render * +nv10_vbuf_render_create( struct nv10_context *nv10 ) +{ + struct nv10_vbuf_render *nv10_render = CALLOC_STRUCT(nv10_vbuf_render); + + nv10_render->nv10 = nv10; + + nv10_render->base.max_vertex_buffer_bytes = 16*1024; + nv10_render->base.max_indices = 1024; + nv10_render->base.get_vertex_info = nv10_vbuf_render_get_vertex_info; + nv10_render->base.allocate_vertices = nv10_vbuf_render_allocate_vertices; + nv10_render->base.map_vertices = nv10_vbuf_render_map_vertices; + nv10_render->base.unmap_vertices = nv10_vbuf_render_unmap_vertices; + nv10_render->base.set_primitive = nv10_vbuf_render_set_primitive; + nv10_render->base.draw = nv10_vbuf_render_draw; + nv10_render->base.release_vertices = nv10_vbuf_render_release_vertices; + nv10_render->base.destroy = nv10_vbuf_render_destroy; + + return &nv10_render->base; +} + + +/** + * Create a new primitive vbuf/render stage. 
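+ * This wraps the nv10_vbuf_render created above in a generic
+ * draw_vbuf_stage() so the draw module can hand its output straight to the
+ * renderer; if stage creation fails, the render object is destroyed again.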
+ */ +struct draw_stage *nv10_draw_vbuf_stage( struct nv10_context *nv10 ) +{ + struct vbuf_render *render; + struct draw_stage *stage; + + render = nv10_vbuf_render_create(nv10); + if(!render) + return NULL; + + stage = draw_vbuf_stage( nv10->draw, render ); + if(!stage) { + render->destroy(render); + return NULL; + } + + return stage; +} diff --git a/src/gallium/drivers/nv10/nv10_screen.c b/src/gallium/drivers/nv10/nv10_screen.c new file mode 100644 index 0000000000..f417b06c94 --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_screen.c @@ -0,0 +1,226 @@ +#include "pipe/p_screen.h" +#include "util/u_simple_screen.h" + +#include "nv10_context.h" +#include "nv10_screen.h" + +static const char * +nv10_screen_get_name(struct pipe_screen *screen) +{ + struct nv10_screen *nv10screen = nv10_screen(screen); + struct nouveau_device *dev = nv10screen->nvws->channel->device; + static char buffer[128]; + + snprintf(buffer, sizeof(buffer), "NV%02X", dev->chipset); + return buffer; +} + +static const char * +nv10_screen_get_vendor(struct pipe_screen *screen) +{ + return "nouveau"; +} + +static int +nv10_screen_get_param(struct pipe_screen *screen, int param) +{ + switch (param) { + case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS: + return 2; + case PIPE_CAP_NPOT_TEXTURES: + return 0; + case PIPE_CAP_TWO_SIDED_STENCIL: + return 0; + case PIPE_CAP_GLSL: + return 0; + case PIPE_CAP_S3TC: + return 0; + case PIPE_CAP_ANISOTROPIC_FILTER: + return 1; + case PIPE_CAP_POINT_SPRITE: + return 0; + case PIPE_CAP_MAX_RENDER_TARGETS: + return 1; + case PIPE_CAP_OCCLUSION_QUERY: + return 0; + case PIPE_CAP_TEXTURE_SHADOW_MAP: + return 0; + case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: + return 12; + case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: + return 0; + case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: + return 12; + case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS: + return 0; + case NOUVEAU_CAP_HW_VTXBUF: + case NOUVEAU_CAP_HW_IDXBUF: + return 0; + default: + NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param); + return 0; + } +} + +static float +nv10_screen_get_paramf(struct pipe_screen *screen, int param) +{ + switch (param) { + case PIPE_CAP_MAX_LINE_WIDTH: + case PIPE_CAP_MAX_LINE_WIDTH_AA: + return 10.0; + case PIPE_CAP_MAX_POINT_WIDTH: + case PIPE_CAP_MAX_POINT_WIDTH_AA: + return 64.0; + case PIPE_CAP_MAX_TEXTURE_ANISOTROPY: + return 2.0; + case PIPE_CAP_MAX_TEXTURE_LOD_BIAS: + return 4.0; + default: + NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param); + return 0.0; + } +} + +static boolean +nv10_screen_is_format_supported(struct pipe_screen *screen, + enum pipe_format format, + enum pipe_texture_target target, + unsigned tex_usage, unsigned geom_flags) +{ + if (tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET) { + switch (format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_R5G6B5_UNORM: + case PIPE_FORMAT_Z24S8_UNORM: + case PIPE_FORMAT_Z16_UNORM: + return TRUE; + default: + break; + } + } else { + switch (format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_A1R5G5B5_UNORM: + case PIPE_FORMAT_A4R4G4B4_UNORM: + case PIPE_FORMAT_R5G6B5_UNORM: + case PIPE_FORMAT_L8_UNORM: + case PIPE_FORMAT_A8_UNORM: + case PIPE_FORMAT_I8_UNORM: + return TRUE; + default: + break; + } + } + + return FALSE; +} + +static void * +nv10_surface_map(struct pipe_screen *screen, struct pipe_surface *surface, + unsigned flags ) +{ + struct pipe_winsys *ws = screen->winsys; + void *map; + struct nv10_miptree *nv10mt = (struct nv10_miptree *)surface->texture; + + map = ws->buffer_map(ws, nv10mt->buffer, flags); + if (!map) + return NULL; + + return map + surface->offset; +} + 
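+/* Counterpart to nv10_surface_map(): releases the winsys mapping of the
+ * miptree buffer that backs the surface.
+ */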
+static void +nv10_surface_unmap(struct pipe_screen *screen, struct pipe_surface *surface) +{ + struct pipe_winsys *ws = screen->winsys; + struct nv10_miptree *nv10mt = (struct nv10_miptree *)surface->texture; + + ws->buffer_unmap(ws, nv10mt->buffer); +} + +static void +nv10_screen_destroy(struct pipe_screen *pscreen) +{ + struct nv10_screen *screen = nv10_screen(pscreen); + struct nouveau_winsys *nvws = screen->nvws; + + nvws->notifier_free(&screen->sync); + nvws->grobj_free(&screen->celsius); + + FREE(pscreen); +} + +static struct pipe_buffer * +nv10_surface_buffer(struct pipe_surface *surf) +{ + struct nv10_miptree *mt = (struct nv10_miptree *)surf->texture; + + return mt->buffer; +} + +struct pipe_screen * +nv10_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws) +{ + struct nv10_screen *screen = CALLOC_STRUCT(nv10_screen); + unsigned celsius_class; + unsigned chipset = nvws->channel->device->chipset; + int ret; + + if (!screen) + return NULL; + screen->nvws = nvws; + + /* 2D engine setup */ + screen->eng2d = nv04_surface_2d_init(nvws); + screen->eng2d->buf = nv10_surface_buffer; + + /* 3D object */ + if (chipset>=0x20) + celsius_class=NV11TCL; + else if (chipset>=0x17) + celsius_class=NV17TCL; + else if (chipset>=0x11) + celsius_class=NV11TCL; + else + celsius_class=NV10TCL; + + if (!celsius_class) { + NOUVEAU_ERR("Unknown nv1x chipset: nv%02x\n", chipset); + return NULL; + } + + ret = nvws->grobj_alloc(nvws, celsius_class, &screen->celsius); + if (ret) { + NOUVEAU_ERR("Error creating 3D object: %d\n", ret); + return FALSE; + } + + /* Notifier for sync purposes */ + ret = nvws->notifier_alloc(nvws, 1, &screen->sync); + if (ret) { + NOUVEAU_ERR("Error creating notifier object: %d\n", ret); + nv10_screen_destroy(&screen->pipe); + return NULL; + } + + screen->pipe.winsys = ws; + screen->pipe.destroy = nv10_screen_destroy; + + screen->pipe.get_name = nv10_screen_get_name; + screen->pipe.get_vendor = nv10_screen_get_vendor; + screen->pipe.get_param = nv10_screen_get_param; + screen->pipe.get_paramf = nv10_screen_get_paramf; + + screen->pipe.is_format_supported = nv10_screen_is_format_supported; + + screen->pipe.surface_map = nv10_surface_map; + screen->pipe.surface_unmap = nv10_surface_unmap; + + nv10_screen_init_miptree_functions(&screen->pipe); + u_simple_screen_init(&screen->pipe); + + return &screen->pipe; +} + diff --git a/src/gallium/drivers/nv10/nv10_screen.h b/src/gallium/drivers/nv10/nv10_screen.h new file mode 100644 index 0000000000..60102a369a --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_screen.h @@ -0,0 +1,24 @@ +#ifndef __NV10_SCREEN_H__ +#define __NV10_SCREEN_H__ + +#include "pipe/p_screen.h" +#include "nv04/nv04_surface_2d.h" + +struct nv10_screen { + struct pipe_screen pipe; + + struct nouveau_winsys *nvws; + + /* HW graphics objects */ + struct nv04_surface_2d *eng2d; + struct nouveau_grobj *celsius; + struct nouveau_notifier *sync; +}; + +static INLINE struct nv10_screen * +nv10_screen(struct pipe_screen *screen) +{ + return (struct nv10_screen *)screen; +} + +#endif diff --git a/src/gallium/drivers/nv10/nv10_state.c b/src/gallium/drivers/nv10/nv10_state.c new file mode 100644 index 0000000000..119af66dfd --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_state.c @@ -0,0 +1,589 @@ +#include "draw/draw_context.h" +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "pipe/p_shader_tokens.h" + +#include "tgsi/tgsi_parse.h" + +#include "nv10_context.h" +#include "nv10_state.h" + +static void * +nv10_blend_state_create(struct pipe_context 
*pipe, + const struct pipe_blend_state *cso) +{ + struct nv10_blend_state *cb; + + cb = MALLOC(sizeof(struct nv10_blend_state)); + + cb->b_enable = cso->blend_enable ? 1 : 0; + cb->b_srcfunc = ((nvgl_blend_func(cso->alpha_src_factor)<<16) | + (nvgl_blend_func(cso->rgb_src_factor))); + cb->b_dstfunc = ((nvgl_blend_func(cso->alpha_dst_factor)<<16) | + (nvgl_blend_func(cso->rgb_dst_factor))); + + cb->c_mask = (((cso->colormask & PIPE_MASK_A) ? (0x01<<24) : 0) | + ((cso->colormask & PIPE_MASK_R) ? (0x01<<16) : 0) | + ((cso->colormask & PIPE_MASK_G) ? (0x01<< 8) : 0) | + ((cso->colormask & PIPE_MASK_B) ? (0x01<< 0) : 0)); + + cb->d_enable = cso->dither ? 1 : 0; + + return (void *)cb; +} + +static void +nv10_blend_state_bind(struct pipe_context *pipe, void *blend) +{ + struct nv10_context *nv10 = nv10_context(pipe); + + nv10->blend = (struct nv10_blend_state*)blend; + + nv10->dirty |= NV10_NEW_BLEND; +} + +static void +nv10_blend_state_delete(struct pipe_context *pipe, void *hwcso) +{ + FREE(hwcso); +} + + +static INLINE unsigned +wrap_mode(unsigned wrap) { + unsigned ret; + + switch (wrap) { + case PIPE_TEX_WRAP_REPEAT: + ret = NV10TCL_TX_FORMAT_WRAP_S_REPEAT; + break; + case PIPE_TEX_WRAP_MIRROR_REPEAT: + ret = NV10TCL_TX_FORMAT_WRAP_S_MIRRORED_REPEAT; + break; + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + ret = NV10TCL_TX_FORMAT_WRAP_S_CLAMP_TO_EDGE; + break; + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + ret = NV10TCL_TX_FORMAT_WRAP_S_CLAMP_TO_BORDER; + break; + case PIPE_TEX_WRAP_CLAMP: + ret = NV10TCL_TX_FORMAT_WRAP_S_CLAMP; + break; + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: + case PIPE_TEX_WRAP_MIRROR_CLAMP: + default: + NOUVEAU_ERR("unknown wrap mode: %d\n", wrap); + ret = NV10TCL_TX_FORMAT_WRAP_S_REPEAT; + break; + } + + return ret >> NV10TCL_TX_FORMAT_WRAP_S_SHIFT; +} + +static void * +nv10_sampler_state_create(struct pipe_context *pipe, + const struct pipe_sampler_state *cso) +{ + struct nv10_sampler_state *ps; + uint32_t filter = 0; + + ps = MALLOC(sizeof(struct nv10_sampler_state)); + + ps->wrap = ((wrap_mode(cso->wrap_s) << NV10TCL_TX_FORMAT_WRAP_S_SHIFT) | + (wrap_mode(cso->wrap_t) << NV10TCL_TX_FORMAT_WRAP_T_SHIFT)); + + ps->en = 0; + if (cso->max_anisotropy > 1.0) { + /* no idea, binary driver sets it, works without it.. meh.. 
*/ + ps->wrap |= (1 << 5); + +/* if (cso->max_anisotropy >= 16.0) { + ps->en |= NV10TCL_TX_ENABLE_ANISO_16X; + } else + if (cso->max_anisotropy >= 12.0) { + ps->en |= NV10TCL_TX_ENABLE_ANISO_12X; + } else + if (cso->max_anisotropy >= 10.0) { + ps->en |= NV10TCL_TX_ENABLE_ANISO_10X; + } else + if (cso->max_anisotropy >= 8.0) { + ps->en |= NV10TCL_TX_ENABLE_ANISO_8X; + } else + if (cso->max_anisotropy >= 6.0) { + ps->en |= NV10TCL_TX_ENABLE_ANISO_6X; + } else + if (cso->max_anisotropy >= 4.0) { + ps->en |= NV10TCL_TX_ENABLE_ANISO_4X; + } else { + ps->en |= NV10TCL_TX_ENABLE_ANISO_2X; + }*/ + } + + switch (cso->mag_img_filter) { + case PIPE_TEX_FILTER_LINEAR: + filter |= NV10TCL_TX_FILTER_MAGNIFY_LINEAR; + break; + case PIPE_TEX_FILTER_NEAREST: + default: + filter |= NV10TCL_TX_FILTER_MAGNIFY_NEAREST; + break; + } + + switch (cso->min_img_filter) { + case PIPE_TEX_FILTER_LINEAR: + switch (cso->min_mip_filter) { + case PIPE_TEX_MIPFILTER_NEAREST: + filter |= NV10TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_NEAREST; + break; + case PIPE_TEX_MIPFILTER_LINEAR: + filter |= NV10TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_LINEAR; + break; + case PIPE_TEX_MIPFILTER_NONE: + default: + filter |= NV10TCL_TX_FILTER_MINIFY_LINEAR; + break; + } + break; + case PIPE_TEX_FILTER_NEAREST: + default: + switch (cso->min_mip_filter) { + case PIPE_TEX_MIPFILTER_NEAREST: + filter |= NV10TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_NEAREST; + break; + case PIPE_TEX_MIPFILTER_LINEAR: + filter |= NV10TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_LINEAR; + break; + case PIPE_TEX_MIPFILTER_NONE: + default: + filter |= NV10TCL_TX_FILTER_MINIFY_NEAREST; + break; + } + break; + } + + ps->filt = filter; + +/* if (cso->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) { + switch (cso->compare_func) { + case PIPE_FUNC_NEVER: + ps->wrap |= NV10TCL_TX_WRAP_RCOMP_NEVER; + break; + case PIPE_FUNC_GREATER: + ps->wrap |= NV10TCL_TX_WRAP_RCOMP_GREATER; + break; + case PIPE_FUNC_EQUAL: + ps->wrap |= NV10TCL_TX_WRAP_RCOMP_EQUAL; + break; + case PIPE_FUNC_GEQUAL: + ps->wrap |= NV10TCL_TX_WRAP_RCOMP_GEQUAL; + break; + case PIPE_FUNC_LESS: + ps->wrap |= NV10TCL_TX_WRAP_RCOMP_LESS; + break; + case PIPE_FUNC_NOTEQUAL: + ps->wrap |= NV10TCL_TX_WRAP_RCOMP_NOTEQUAL; + break; + case PIPE_FUNC_LEQUAL: + ps->wrap |= NV10TCL_TX_WRAP_RCOMP_LEQUAL; + break; + case PIPE_FUNC_ALWAYS: + ps->wrap |= NV10TCL_TX_WRAP_RCOMP_ALWAYS; + break; + default: + break; + } + }*/ + + ps->bcol = ((float_to_ubyte(cso->border_color[3]) << 24) | + (float_to_ubyte(cso->border_color[0]) << 16) | + (float_to_ubyte(cso->border_color[1]) << 8) | + (float_to_ubyte(cso->border_color[2]) << 0)); + + return (void *)ps; +} + +static void +nv10_sampler_state_bind(struct pipe_context *pipe, unsigned nr, void **sampler) +{ + struct nv10_context *nv10 = nv10_context(pipe); + unsigned unit; + + for (unit = 0; unit < nr; unit++) { + nv10->tex_sampler[unit] = sampler[unit]; + nv10->dirty_samplers |= (1 << unit); + } +} + +static void +nv10_sampler_state_delete(struct pipe_context *pipe, void *hwcso) +{ + FREE(hwcso); +} + +static void +nv10_set_sampler_texture(struct pipe_context *pipe, unsigned nr, + struct pipe_texture **miptree) +{ + struct nv10_context *nv10 = nv10_context(pipe); + unsigned unit; + + for (unit = 0; unit < nr; unit++) { + nv10->tex_miptree[unit] = (struct nv10_miptree *)miptree[unit]; + nv10->dirty_samplers |= (1 << unit); + } +} + +static void * +nv10_rasterizer_state_create(struct pipe_context *pipe, + const struct pipe_rasterizer_state *cso) +{ + struct nv10_rasterizer_state *rs; + int i; + + /*XXX: 
ignored: + * light_twoside + * offset_cw/ccw -nohw + * scissor + * point_smooth -nohw + * multisample + * offset_units / offset_scale + */ + rs = MALLOC(sizeof(struct nv10_rasterizer_state)); + + rs->templ = cso; + + rs->shade_model = cso->flatshade ? 0x1d00 : 0x1d01; + + rs->line_width = (unsigned char)(cso->line_width * 8.0) & 0xff; + rs->line_smooth_en = cso->line_smooth ? 1 : 0; + + rs->point_size = *(uint32_t*)&cso->point_size; + + rs->poly_smooth_en = cso->poly_smooth ? 1 : 0; + + if (cso->front_winding == PIPE_WINDING_CCW) { + rs->front_face = NV10TCL_FRONT_FACE_CCW; + rs->poly_mode_front = nvgl_polygon_mode(cso->fill_ccw); + rs->poly_mode_back = nvgl_polygon_mode(cso->fill_cw); + } else { + rs->front_face = NV10TCL_FRONT_FACE_CW; + rs->poly_mode_front = nvgl_polygon_mode(cso->fill_cw); + rs->poly_mode_back = nvgl_polygon_mode(cso->fill_ccw); + } + + switch (cso->cull_mode) { + case PIPE_WINDING_CCW: + rs->cull_face_en = 1; + if (cso->front_winding == PIPE_WINDING_CCW) + rs->cull_face = NV10TCL_CULL_FACE_FRONT; + else + rs->cull_face = NV10TCL_CULL_FACE_BACK; + break; + case PIPE_WINDING_CW: + rs->cull_face_en = 1; + if (cso->front_winding == PIPE_WINDING_CW) + rs->cull_face = NV10TCL_CULL_FACE_FRONT; + else + rs->cull_face = NV10TCL_CULL_FACE_BACK; + break; + case PIPE_WINDING_BOTH: + rs->cull_face_en = 1; + rs->cull_face = NV10TCL_CULL_FACE_FRONT_AND_BACK; + break; + case PIPE_WINDING_NONE: + default: + rs->cull_face_en = 0; + rs->cull_face = 0; + break; + } + + if (cso->point_sprite) { + rs->point_sprite = (1 << 0); + for (i = 0; i < 8; i++) { + if (cso->sprite_coord_mode[i] != PIPE_SPRITE_COORD_NONE) + rs->point_sprite |= (1 << (8 + i)); + } + } else { + rs->point_sprite = 0; + } + + return (void *)rs; +} + +static void +nv10_rasterizer_state_bind(struct pipe_context *pipe, void *rast) +{ + struct nv10_context *nv10 = nv10_context(pipe); + + nv10->rast = (struct nv10_rasterizer_state*)rast; + + draw_set_rasterizer_state(nv10->draw, (nv10->rast ? nv10->rast->templ : NULL)); + + nv10->dirty |= NV10_NEW_RAST; +} + +static void +nv10_rasterizer_state_delete(struct pipe_context *pipe, void *hwcso) +{ + FREE(hwcso); +} + +static void * +nv10_depth_stencil_alpha_state_create(struct pipe_context *pipe, + const struct pipe_depth_stencil_alpha_state *cso) +{ + struct nv10_depth_stencil_alpha_state *hw; + + hw = MALLOC(sizeof(struct nv10_depth_stencil_alpha_state)); + + hw->depth.func = nvgl_comparison_op(cso->depth.func); + hw->depth.write_enable = cso->depth.writemask ? 1 : 0; + hw->depth.test_enable = cso->depth.enabled ? 1 : 0; + + hw->stencil.enable = cso->stencil[0].enabled ? 1 : 0; + hw->stencil.wmask = cso->stencil[0].writemask; + hw->stencil.func = nvgl_comparison_op(cso->stencil[0].func); + hw->stencil.ref = cso->stencil[0].ref_value; + hw->stencil.vmask = cso->stencil[0].valuemask; + hw->stencil.fail = nvgl_stencil_op(cso->stencil[0].fail_op); + hw->stencil.zfail = nvgl_stencil_op(cso->stencil[0].zfail_op); + hw->stencil.zpass = nvgl_stencil_op(cso->stencil[0].zpass_op); + + hw->alpha.enabled = cso->alpha.enabled ? 
1 : 0; + hw->alpha.func = nvgl_comparison_op(cso->alpha.func); + hw->alpha.ref = float_to_ubyte(cso->alpha.ref_value); + + return (void *)hw; +} + +static void +nv10_depth_stencil_alpha_state_bind(struct pipe_context *pipe, void *dsa) +{ + struct nv10_context *nv10 = nv10_context(pipe); + + nv10->dsa = (struct nv10_depth_stencil_alpha_state*)dsa; + + nv10->dirty |= NV10_NEW_DSA; +} + +static void +nv10_depth_stencil_alpha_state_delete(struct pipe_context *pipe, void *hwcso) +{ + FREE(hwcso); +} + +static void * +nv10_vp_state_create(struct pipe_context *pipe, + const struct pipe_shader_state *templ) +{ + struct nv10_context *nv10 = nv10_context(pipe); + + return draw_create_vertex_shader(nv10->draw, templ); +} + +static void +nv10_vp_state_bind(struct pipe_context *pipe, void *shader) +{ + struct nv10_context *nv10 = nv10_context(pipe); + + draw_bind_vertex_shader(nv10->draw, (struct draw_vertex_shader *) shader); + + nv10->dirty |= NV10_NEW_VERTPROG; +} + +static void +nv10_vp_state_delete(struct pipe_context *pipe, void *shader) +{ + struct nv10_context *nv10 = nv10_context(pipe); + + draw_delete_vertex_shader(nv10->draw, (struct draw_vertex_shader *) shader); +} + +static void * +nv10_fp_state_create(struct pipe_context *pipe, + const struct pipe_shader_state *cso) +{ + struct nv10_fragment_program *fp; + + fp = CALLOC(1, sizeof(struct nv10_fragment_program)); + fp->pipe.tokens = tgsi_dup_tokens(cso->tokens); + + tgsi_scan_shader(cso->tokens, &fp->info); + + return (void *)fp; +} + +static void +nv10_fp_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nv10_context *nv10 = nv10_context(pipe); + struct nv10_fragment_program *fp = hwcso; + + nv10->fragprog.current = fp; + nv10->dirty |= NV10_NEW_FRAGPROG; +} + +static void +nv10_fp_state_delete(struct pipe_context *pipe, void *hwcso) +{ + struct nv10_context *nv10 = nv10_context(pipe); + struct nv10_fragment_program *fp = hwcso; + + nv10_fragprog_destroy(nv10, fp); + FREE((void*)fp->pipe.tokens); + FREE(fp); +} + +static void +nv10_set_blend_color(struct pipe_context *pipe, + const struct pipe_blend_color *bcol) +{ + struct nv10_context *nv10 = nv10_context(pipe); + + nv10->blend_color = (struct pipe_blend_color*)bcol; + + nv10->dirty |= NV10_NEW_BLENDCOL; +} + +static void +nv10_set_clip_state(struct pipe_context *pipe, + const struct pipe_clip_state *clip) +{ + struct nv10_context *nv10 = nv10_context(pipe); + + draw_set_clip_state(nv10->draw, clip); +} + +static void +nv10_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index, + const struct pipe_constant_buffer *buf ) +{ + struct nv10_context *nv10 = nv10_context(pipe); + struct pipe_winsys *ws = pipe->winsys; + + assert(shader < PIPE_SHADER_TYPES); + assert(index == 0); + + if (buf) { + void *mapped; + if (buf->buffer && buf->buffer->size && + (mapped = ws->buffer_map(ws, buf->buffer, PIPE_BUFFER_USAGE_CPU_READ))) + { + memcpy(nv10->constbuf[shader], mapped, buf->buffer->size); + nv10->constbuf_nr[shader] = + buf->buffer->size / (4 * sizeof(float)); + ws->buffer_unmap(ws, buf->buffer); + } + } +} + +static void +nv10_set_framebuffer_state(struct pipe_context *pipe, + const struct pipe_framebuffer_state *fb) +{ + struct nv10_context *nv10 = nv10_context(pipe); + + nv10->framebuffer = (struct pipe_framebuffer_state*)fb; + + nv10->dirty |= NV10_NEW_FRAMEBUFFER; +} + +static void +nv10_set_polygon_stipple(struct pipe_context *pipe, + const struct pipe_poly_stipple *stipple) +{ + NOUVEAU_ERR("line stipple hahaha\n"); +} + +static void 
+nv10_set_scissor_state(struct pipe_context *pipe, + const struct pipe_scissor_state *s) +{ + struct nv10_context *nv10 = nv10_context(pipe); + + nv10->scissor = (struct pipe_scissor_state*)s; + + nv10->dirty |= NV10_NEW_SCISSOR; +} + +static void +nv10_set_viewport_state(struct pipe_context *pipe, + const struct pipe_viewport_state *vpt) +{ + struct nv10_context *nv10 = nv10_context(pipe); + + nv10->viewport = (struct pipe_viewport_state*)vpt; + + draw_set_viewport_state(nv10->draw, nv10->viewport); + + nv10->dirty |= NV10_NEW_VIEWPORT; +} + +static void +nv10_set_vertex_buffers(struct pipe_context *pipe, unsigned count, + const struct pipe_vertex_buffer *vb) +{ + struct nv10_context *nv10 = nv10_context(pipe); + + memcpy(nv10->vtxbuf, vb, sizeof(*vb) * count); + nv10->dirty |= NV10_NEW_VTXARRAYS; + + draw_set_vertex_buffers(nv10->draw, count, vb); +} + +static void +nv10_set_vertex_elements(struct pipe_context *pipe, unsigned count, + const struct pipe_vertex_element *ve) +{ + struct nv10_context *nv10 = nv10_context(pipe); + + memcpy(nv10->vtxelt, ve, sizeof(*ve) * count); + nv10->dirty |= NV10_NEW_VTXARRAYS; + + draw_set_vertex_elements(nv10->draw, count, ve); +} + +void +nv10_init_state_functions(struct nv10_context *nv10) +{ + nv10->pipe.create_blend_state = nv10_blend_state_create; + nv10->pipe.bind_blend_state = nv10_blend_state_bind; + nv10->pipe.delete_blend_state = nv10_blend_state_delete; + + nv10->pipe.create_sampler_state = nv10_sampler_state_create; + nv10->pipe.bind_sampler_states = nv10_sampler_state_bind; + nv10->pipe.delete_sampler_state = nv10_sampler_state_delete; + nv10->pipe.set_sampler_textures = nv10_set_sampler_texture; + + nv10->pipe.create_rasterizer_state = nv10_rasterizer_state_create; + nv10->pipe.bind_rasterizer_state = nv10_rasterizer_state_bind; + nv10->pipe.delete_rasterizer_state = nv10_rasterizer_state_delete; + + nv10->pipe.create_depth_stencil_alpha_state = + nv10_depth_stencil_alpha_state_create; + nv10->pipe.bind_depth_stencil_alpha_state = + nv10_depth_stencil_alpha_state_bind; + nv10->pipe.delete_depth_stencil_alpha_state = + nv10_depth_stencil_alpha_state_delete; + + nv10->pipe.create_vs_state = nv10_vp_state_create; + nv10->pipe.bind_vs_state = nv10_vp_state_bind; + nv10->pipe.delete_vs_state = nv10_vp_state_delete; + + nv10->pipe.create_fs_state = nv10_fp_state_create; + nv10->pipe.bind_fs_state = nv10_fp_state_bind; + nv10->pipe.delete_fs_state = nv10_fp_state_delete; + + nv10->pipe.set_blend_color = nv10_set_blend_color; + nv10->pipe.set_clip_state = nv10_set_clip_state; + nv10->pipe.set_constant_buffer = nv10_set_constant_buffer; + nv10->pipe.set_framebuffer_state = nv10_set_framebuffer_state; + nv10->pipe.set_polygon_stipple = nv10_set_polygon_stipple; + nv10->pipe.set_scissor_state = nv10_set_scissor_state; + nv10->pipe.set_viewport_state = nv10_set_viewport_state; + + nv10->pipe.set_vertex_buffers = nv10_set_vertex_buffers; + nv10->pipe.set_vertex_elements = nv10_set_vertex_elements; +} + diff --git a/src/gallium/drivers/nv10/nv10_state.h b/src/gallium/drivers/nv10/nv10_state.h new file mode 100644 index 0000000000..3a3fd0d4f4 --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_state.h @@ -0,0 +1,139 @@ +#ifndef __NV10_STATE_H__ +#define __NV10_STATE_H__ + +#include "pipe/p_state.h" +#include "tgsi/tgsi_scan.h" + +struct nv10_blend_state { + uint32_t b_enable; + uint32_t b_srcfunc; + uint32_t b_dstfunc; + + uint32_t c_mask; + + uint32_t d_enable; +}; + +struct nv10_sampler_state { + uint32_t wrap; + uint32_t en; + uint32_t filt; + uint32_t 
bcol; +}; + +struct nv10_rasterizer_state { + uint32_t shade_model; + + uint32_t line_width; + uint32_t line_smooth_en; + + uint32_t point_size; + + uint32_t poly_smooth_en; + + uint32_t poly_mode_front; + uint32_t poly_mode_back; + + uint32_t front_face; + uint32_t cull_face; + uint32_t cull_face_en; + + uint32_t point_sprite; + + const struct pipe_rasterizer_state *templ; +}; + +struct nv10_vertex_program_exec { + uint32_t data[4]; + boolean has_branch_offset; + int const_index; +}; + +struct nv10_vertex_program_data { + int index; /* immediates == -1 */ + float value[4]; +}; + +struct nv10_vertex_program { + const struct pipe_shader_state *pipe; + + boolean translated; + struct nv10_vertex_program_exec *insns; + unsigned nr_insns; + struct nv10_vertex_program_data *consts; + unsigned nr_consts; + + struct nouveau_resource *exec; + unsigned exec_start; + struct nouveau_resource *data; + unsigned data_start; + unsigned data_start_min; + + uint32_t ir; + uint32_t or; +}; + +struct nv10_fragment_program_data { + unsigned offset; + unsigned index; +}; + +struct nv10_fragment_program { + struct pipe_shader_state pipe; + struct tgsi_shader_info info; + + boolean translated; + boolean on_hw; + unsigned samplers; + + uint32_t *insn; + int insn_len; + + struct nv10_fragment_program_data *consts; + unsigned nr_consts; + + struct pipe_buffer *buffer; + + uint32_t fp_control; + uint32_t fp_reg_control; +}; + + +struct nv10_depth_stencil_alpha_state { + struct { + uint32_t func; + uint32_t write_enable; + uint32_t test_enable; + } depth; + + struct { + uint32_t enable; + uint32_t wmask; + uint32_t func; + uint32_t ref; + uint32_t vmask; + uint32_t fail; + uint32_t zfail; + uint32_t zpass; + } stencil; + + struct { + uint32_t enabled; + uint32_t func; + uint32_t ref; + } alpha; +}; + +struct nv10_miptree { + struct pipe_texture base; + + struct pipe_buffer *buffer; + uint total_size; + + struct { + uint pitch; + uint *image_offset; + } level[PIPE_MAX_TEXTURE_LEVELS]; +}; + +#endif diff --git a/src/gallium/drivers/nv10/nv10_state_emit.c b/src/gallium/drivers/nv10/nv10_state_emit.c new file mode 100644 index 0000000000..5dec618b93 --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_state_emit.c @@ -0,0 +1,306 @@ +#include "nv10_context.h" +#include "nv10_state.h" + +static void nv10_state_emit_blend(struct nv10_context* nv10) +{ + struct nv10_blend_state *b = nv10->blend; + + BEGIN_RING(celsius, NV10TCL_DITHER_ENABLE, 1); + OUT_RING (b->d_enable); + + BEGIN_RING(celsius, NV10TCL_BLEND_FUNC_ENABLE, 3); + OUT_RING (b->b_enable); + OUT_RING (b->b_srcfunc); + OUT_RING (b->b_dstfunc); + + BEGIN_RING(celsius, NV10TCL_COLOR_MASK, 1); + OUT_RING (b->c_mask); +} + +static void nv10_state_emit_blend_color(struct nv10_context* nv10) +{ + struct pipe_blend_color *c = nv10->blend_color; + + BEGIN_RING(celsius, NV10TCL_BLEND_COLOR, 1); + OUT_RING ((float_to_ubyte(c->color[3]) << 24)| + (float_to_ubyte(c->color[0]) << 16)| + (float_to_ubyte(c->color[1]) << 8) | + (float_to_ubyte(c->color[2]) << 0)); +} + +static void nv10_state_emit_rast(struct nv10_context* nv10) +{ + struct nv10_rasterizer_state *r = nv10->rast; + + BEGIN_RING(celsius, NV10TCL_SHADE_MODEL, 2); + OUT_RING (r->shade_model); + OUT_RING (r->line_width); + + + BEGIN_RING(celsius, NV10TCL_POINT_SIZE, 1); + OUT_RING (r->point_size); + + BEGIN_RING(celsius, NV10TCL_POLYGON_MODE_FRONT, 2); + OUT_RING (r->poly_mode_front); + OUT_RING (r->poly_mode_back); + + + BEGIN_RING(celsius, NV10TCL_CULL_FACE, 2); + OUT_RING (r->cull_face); + OUT_RING (r->front_face); + + 
BEGIN_RING(celsius, NV10TCL_LINE_SMOOTH_ENABLE, 2); + OUT_RING (r->line_smooth_en); + OUT_RING (r->poly_smooth_en); + + BEGIN_RING(celsius, NV10TCL_CULL_FACE_ENABLE, 1); + OUT_RING (r->cull_face_en); +} + +static void nv10_state_emit_dsa(struct nv10_context* nv10) +{ + struct nv10_depth_stencil_alpha_state *d = nv10->dsa; + + BEGIN_RING(celsius, NV10TCL_DEPTH_FUNC, 1); + OUT_RING (d->depth.func); + + BEGIN_RING(celsius, NV10TCL_DEPTH_WRITE_ENABLE, 1); + OUT_RING (d->depth.write_enable); + + BEGIN_RING(celsius, NV10TCL_DEPTH_TEST_ENABLE, 1); + OUT_RING (d->depth.test_enable); + +#if 0 + BEGIN_RING(celsius, NV10TCL_STENCIL_ENABLE, 1); + OUT_RING (d->stencil.enable); + BEGIN_RING(celsius, NV10TCL_STENCIL_MASK, 7); + OUT_RINGp ((uint32_t *)&(d->stencil.wmask), 7); +#endif + + BEGIN_RING(celsius, NV10TCL_ALPHA_FUNC_ENABLE, 1); + OUT_RING (d->alpha.enabled); + + BEGIN_RING(celsius, NV10TCL_ALPHA_FUNC_FUNC, 1); + OUT_RING (d->alpha.func); + + BEGIN_RING(celsius, NV10TCL_ALPHA_FUNC_REF, 1); + OUT_RING (d->alpha.ref); +} + +static void nv10_state_emit_viewport(struct nv10_context* nv10) +{ +} + +static void nv10_state_emit_scissor(struct nv10_context* nv10) +{ + // XXX this is so not working +/* struct pipe_scissor_state *s = nv10->scissor; + BEGIN_RING(celsius, NV10TCL_SCISSOR_HORIZ, 2); + OUT_RING (((s->maxx - s->minx) << 16) | s->minx); + OUT_RING (((s->maxy - s->miny) << 16) | s->miny);*/ +} + +static void nv10_state_emit_framebuffer(struct nv10_context* nv10) +{ + struct pipe_framebuffer_state* fb = nv10->framebuffer; + struct pipe_surface *rt, *zeta = NULL; + uint32_t rt_format, w, h; + int colour_format = 0, zeta_format = 0; + struct nv10_miptree *nv10mt = 0; + + w = fb->cbufs[0]->width; + h = fb->cbufs[0]->height; + colour_format = fb->cbufs[0]->format; + rt = fb->cbufs[0]; + + if (fb->zsbuf) { + if (colour_format) { + assert(w == fb->zsbuf->width); + assert(h == fb->zsbuf->height); + } else { + w = fb->zsbuf->width; + h = fb->zsbuf->height; + } + + zeta_format = fb->zsbuf->format; + zeta = fb->zsbuf; + } + + rt_format = NV10TCL_RT_FORMAT_TYPE_LINEAR; + + switch (colour_format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case 0: + rt_format |= NV10TCL_RT_FORMAT_COLOR_A8R8G8B8; + break; + case PIPE_FORMAT_R5G6B5_UNORM: + rt_format |= NV10TCL_RT_FORMAT_COLOR_R5G6B5; + break; + default: + assert(0); + } + + if (zeta) { + BEGIN_RING(celsius, NV10TCL_RT_PITCH, 1); + OUT_RING (rt->stride | (zeta->stride << 16)); + } else { + BEGIN_RING(celsius, NV10TCL_RT_PITCH, 1); + OUT_RING (rt->stride | (rt->stride << 16)); + } + + nv10mt = (struct nv10_miptree *)rt->texture; + nv10->rt[0] = nv10mt->buffer; + + if (zeta_format) + { + nv10mt = (struct nv10_miptree *)zeta->texture; + nv10->zeta = nv10mt->buffer; + } + + BEGIN_RING(celsius, NV10TCL_RT_HORIZ, 3); + OUT_RING ((w << 16) | 0); + OUT_RING ((h << 16) | 0); + OUT_RING (rt_format); + BEGIN_RING(celsius, NV10TCL_VIEWPORT_CLIP_HORIZ(0), 2); + OUT_RING (((w - 1) << 16) | 0 | 0x08000800); + OUT_RING (((h - 1) << 16) | 0 | 0x08000800); +} + +static void nv10_vertex_layout(struct nv10_context *nv10) +{ + struct nv10_fragment_program *fp = nv10->fragprog.current; + uint32_t src = 0; + int i; + struct vertex_info vinfo; + + memset(&vinfo, 0, sizeof(vinfo)); + + for (i = 0; i < fp->info.num_inputs; i++) { + switch (fp->info.input_semantic_name[i]) { + case TGSI_SEMANTIC_POSITION: + draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_LINEAR, src++); + break; + case TGSI_SEMANTIC_COLOR: + draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_LINEAR, src++); + break; + default: + 
case TGSI_SEMANTIC_GENERIC: + draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_PERSPECTIVE, src++); + break; + case TGSI_SEMANTIC_FOG: + draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_PERSPECTIVE, src++); + break; + } + } + draw_compute_vertex_size(&vinfo); +} + +void +nv10_emit_hw_state(struct nv10_context *nv10) +{ + int i; + + if (nv10->dirty & NV10_NEW_VERTPROG) { + //nv10_vertprog_bind(nv10, nv10->vertprog.current); + nv10->dirty &= ~NV10_NEW_VERTPROG; + } + + if (nv10->dirty & NV10_NEW_FRAGPROG) { + nv10_fragprog_bind(nv10, nv10->fragprog.current); + /*XXX: clear NV10_NEW_FRAGPROG if no new program uploaded */ + nv10->dirty_samplers |= (1<<10); + nv10->dirty_samplers = 0; + } + + if (nv10->dirty_samplers || (nv10->dirty & NV10_NEW_FRAGPROG)) { + nv10_fragtex_bind(nv10); + nv10->dirty &= ~NV10_NEW_FRAGPROG; + } + + if (nv10->dirty & NV10_NEW_VTXARRAYS) { + nv10->dirty &= ~NV10_NEW_VTXARRAYS; + nv10_vertex_layout(nv10); + nv10_vtxbuf_bind(nv10); + } + + if (nv10->dirty & NV10_NEW_BLEND) { + nv10->dirty &= ~NV10_NEW_BLEND; + nv10_state_emit_blend(nv10); + } + + if (nv10->dirty & NV10_NEW_BLENDCOL) { + nv10->dirty &= ~NV10_NEW_BLENDCOL; + nv10_state_emit_blend_color(nv10); + } + + if (nv10->dirty & NV10_NEW_RAST) { + nv10->dirty &= ~NV10_NEW_RAST; + nv10_state_emit_rast(nv10); + } + + if (nv10->dirty & NV10_NEW_DSA) { + nv10->dirty &= ~NV10_NEW_DSA; + nv10_state_emit_dsa(nv10); + } + + if (nv10->dirty & NV10_NEW_VIEWPORT) { + nv10->dirty &= ~NV10_NEW_VIEWPORT; + nv10_state_emit_viewport(nv10); + } + + if (nv10->dirty & NV10_NEW_SCISSOR) { + nv10->dirty &= ~NV10_NEW_SCISSOR; + nv10_state_emit_scissor(nv10); + } + + if (nv10->dirty & NV10_NEW_FRAMEBUFFER) { + nv10->dirty &= ~NV10_NEW_FRAMEBUFFER; + nv10_state_emit_framebuffer(nv10); + } + + /* Emit relocs for every referenced buffer. + * This is to ensure the bufmgr has an accurate idea of how + * the buffer is used. This isn't very efficient, but we don't + * seem to take a significant performance hit. Will be improved + * at some point. 
Vertex arrays are emitted by nv10_vbo.c + */ + + /* Render target */ +// XXX figre out who's who for NV10TCL_DMA_* and fill accordingly +// BEGIN_RING(celsius, NV10TCL_DMA_COLOR0, 1); +// OUT_RELOCo(nv10->rt[0], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + BEGIN_RING(celsius, NV10TCL_COLOR_OFFSET, 1); + OUT_RELOCl(nv10->rt[0], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + + if (nv10->zeta) { +// XXX +// BEGIN_RING(celsius, NV10TCL_DMA_ZETA, 1); +// OUT_RELOCo(nv10->zeta, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + BEGIN_RING(celsius, NV10TCL_ZETA_OFFSET, 1); + OUT_RELOCl(nv10->zeta, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + /* XXX for when we allocate LMA on nv17 */ +/* BEGIN_RING(celsius, NV10TCL_LMA_DEPTH_BUFFER_OFFSET, 1); + OUT_RELOCl(nv10->zeta + lma_offset);*/ + } + + /* Vertex buffer */ + BEGIN_RING(celsius, NV10TCL_DMA_VTXBUF0, 1); + OUT_RELOCo(nv10->rt[0], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + BEGIN_RING(celsius, NV10TCL_COLOR_OFFSET, 1); + OUT_RELOCl(nv10->rt[0], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + + /* Texture images */ + for (i = 0; i < 2; i++) { + if (!(nv10->fp_samplers & (1 << i))) + continue; + BEGIN_RING(celsius, NV10TCL_TX_OFFSET(i), 1); + OUT_RELOCl(nv10->tex[i].buffer, 0, NOUVEAU_BO_VRAM | + NOUVEAU_BO_GART | NOUVEAU_BO_RD); + BEGIN_RING(celsius, NV10TCL_TX_FORMAT(i), 1); + OUT_RELOCd(nv10->tex[i].buffer, nv10->tex[i].format, + NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD | + NOUVEAU_BO_OR, NV10TCL_TX_FORMAT_DMA0, + NV10TCL_TX_FORMAT_DMA1); + } +} + diff --git a/src/gallium/drivers/nv10/nv10_surface.c b/src/gallium/drivers/nv10/nv10_surface.c new file mode 100644 index 0000000000..2538151063 --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_surface.c @@ -0,0 +1,72 @@ + +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#include "nv10_context.h" +#include "pipe/p_defines.h" +#include "pipe/internal/p_winsys_screen.h" +#include "pipe/p_inlines.h" +#include "util/u_tile.h" + +static void +nv10_surface_copy(struct pipe_context *pipe, boolean do_flip, + struct pipe_surface *dest, unsigned destx, unsigned desty, + struct pipe_surface *src, unsigned srcx, unsigned srcy, + unsigned width, unsigned height) +{ + struct nv10_context *nv10 = nv10_context(pipe); + struct nv04_surface_2d *eng2d = nv10->screen->eng2d; + + if (do_flip) { + desty += height; + while (height--) { + eng2d->copy(eng2d, dest, destx, desty--, src, + srcx, srcy++, width, 1); + } + return; + } + + eng2d->copy(eng2d, dest, destx, desty, src, srcx, srcy, width, height); +} + +static void +nv10_surface_fill(struct pipe_context *pipe, struct pipe_surface *dest, + unsigned destx, unsigned desty, unsigned width, + unsigned height, unsigned value) +{ + struct nv10_context *nv10 = nv10_context(pipe); + struct nv04_surface_2d *eng2d = nv10->screen->eng2d; + + eng2d->fill(eng2d, dest, destx, desty, width, height, value); +} + +void +nv10_init_surface_functions(struct nv10_context *nv10) +{ + nv10->pipe.surface_copy = nv10_surface_copy; + nv10->pipe.surface_fill = nv10_surface_fill; +} diff --git a/src/gallium/drivers/nv10/nv10_vbo.c b/src/gallium/drivers/nv10/nv10_vbo.c new file mode 100644 index 0000000000..d0e788ac03 --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_vbo.c @@ -0,0 +1,77 @@ +#include "draw/draw_context.h" +#include "pipe/p_context.h" +#include "pipe/p_state.h" + +#include "nv10_context.h" +#include "nv10_state.h" + +#include "nouveau/nouveau_channel.h" +#include "nouveau/nouveau_pushbuf.h" + +boolean nv10_draw_elements( struct pipe_context *pipe, + struct pipe_buffer *indexBuffer, + unsigned indexSize, + unsigned prim, unsigned start, unsigned count) +{ + struct nv10_context *nv10 = nv10_context( pipe ); + struct draw_context *draw = nv10->draw; + unsigned i; + + nv10_emit_hw_state(nv10); + + /* + * Map vertex buffers + */ + for (i = 0; i < PIPE_MAX_ATTRIBS; i++) { + if (nv10->vtxbuf[i].buffer) { + void *buf + = pipe->winsys->buffer_map(pipe->winsys, + nv10->vtxbuf[i].buffer, + PIPE_BUFFER_USAGE_CPU_READ); + draw_set_mapped_vertex_buffer(draw, i, buf); + } + } + /* Map index buffer, if present */ + if (indexBuffer) { + void *mapped_indexes + = pipe->winsys->buffer_map(pipe->winsys, indexBuffer, + PIPE_BUFFER_USAGE_CPU_READ); + draw_set_mapped_element_buffer(draw, indexSize, mapped_indexes); + } + else { + /* no index/element buffer */ + draw_set_mapped_element_buffer(draw, 0, NULL); + } + + draw_set_mapped_constant_buffer(draw, + nv10->constbuf[PIPE_SHADER_VERTEX], + nv10->constbuf_nr[PIPE_SHADER_VERTEX]); + + /* draw! 
*/ + draw_arrays(nv10->draw, prim, start, count); + + /* + * unmap vertex/index buffers + */ + for (i = 0; i < PIPE_MAX_ATTRIBS; i++) { + if (nv10->vtxbuf[i].buffer) { + pipe->winsys->buffer_unmap(pipe->winsys, nv10->vtxbuf[i].buffer); + draw_set_mapped_vertex_buffer(draw, i, NULL); + } + } + if (indexBuffer) { + pipe->winsys->buffer_unmap(pipe->winsys, indexBuffer); + draw_set_mapped_element_buffer(draw, 0, NULL); + } + + return TRUE; +} + +boolean nv10_draw_arrays( struct pipe_context *pipe, + unsigned prim, unsigned start, unsigned count) +{ + return nv10_draw_elements(pipe, NULL, 0, prim, start, count); +} + + + diff --git a/src/gallium/drivers/nv20/Makefile b/src/gallium/drivers/nv20/Makefile new file mode 100644 index 0000000000..93e34f8e92 --- /dev/null +++ b/src/gallium/drivers/nv20/Makefile @@ -0,0 +1,20 @@ +TOP = ../../../.. +include $(TOP)/configs/current + +LIBNAME = nv20 + +C_SOURCES = \ + nv20_clear.c \ + nv20_context.c \ + nv20_fragprog.c \ + nv20_fragtex.c \ + nv20_miptree.c \ + nv20_prim_vbuf.c \ + nv20_screen.c \ + nv20_state.c \ + nv20_state_emit.c \ + nv20_surface.c \ + nv20_vbo.c +# nv20_vertprog.c + +include ../../Makefile.template diff --git a/src/gallium/drivers/nv20/nv20_clear.c b/src/gallium/drivers/nv20/nv20_clear.c new file mode 100644 index 0000000000..29f4afd87c --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_clear.c @@ -0,0 +1,13 @@ +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "nv20_context.h" + +void +nv20_clear(struct pipe_context *pipe, struct pipe_surface *ps, + unsigned clearValue) +{ + pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, clearValue); + ps->status = PIPE_SURFACE_STATUS_CLEAR; +} diff --git a/src/gallium/drivers/nv20/nv20_context.c b/src/gallium/drivers/nv20/nv20_context.c new file mode 100644 index 0000000000..1659aec8fa --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_context.c @@ -0,0 +1,419 @@ +#include "draw/draw_context.h" +#include "pipe/p_defines.h" +#include "pipe/internal/p_winsys_screen.h" + +#include "nv20_context.h" +#include "nv20_screen.h" + +static void +nv20_flush(struct pipe_context *pipe, unsigned flags, + struct pipe_fence_handle **fence) +{ + struct nv20_context *nv20 = nv20_context(pipe); + + draw_flush(nv20->draw); + + FIRE_RING(fence); +} + +static void +nv20_destroy(struct pipe_context *pipe) +{ + struct nv20_context *nv20 = nv20_context(pipe); + + if (nv20->draw) + draw_destroy(nv20->draw); + + FREE(nv20); +} + +static void nv20_init_hwctx(struct nv20_context *nv20) +{ + struct nv20_screen *screen = nv20->screen; + struct nouveau_winsys *nvws = screen->nvws; + int i; + float projectionmatrix[16]; + const boolean is_nv25tcl = (nv20->screen->kelvin->grclass == NV25TCL); + + BEGIN_RING(kelvin, NV20TCL_DMA_NOTIFY, 1); + OUT_RING (screen->sync->handle); + BEGIN_RING(kelvin, NV20TCL_DMA_TEXTURE0, 2); + OUT_RING (nvws->channel->vram->handle); + OUT_RING (nvws->channel->gart->handle); /* TEXTURE1 */ + BEGIN_RING(kelvin, NV20TCL_DMA_COLOR, 2); + OUT_RING (nvws->channel->vram->handle); + OUT_RING (nvws->channel->vram->handle); /* ZETA */ + + BEGIN_RING(kelvin, NV20TCL_DMA_QUERY, 1); + OUT_RING (0); /* renouveau: beef0351, unique */ + + BEGIN_RING(kelvin, NV20TCL_RT_HORIZ, 2); + OUT_RING (0); + OUT_RING (0); + + BEGIN_RING(kelvin, NV20TCL_VIEWPORT_CLIP_HORIZ(0), 1); + OUT_RING ((0xfff << 16) | 0x0); + BEGIN_RING(kelvin, NV20TCL_VIEWPORT_CLIP_VERT(0), 1); + OUT_RING ((0xfff << 16) | 0x0); + + for (i = 1; i < NV20TCL_VIEWPORT_CLIP_HORIZ__SIZE; i++) { + 
BEGIN_RING(kelvin, NV20TCL_VIEWPORT_CLIP_HORIZ(i), 1); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_VIEWPORT_CLIP_VERT(i), 1); + OUT_RING (0); + } + + BEGIN_RING(kelvin, NV20TCL_VIEWPORT_CLIP_MODE, 1); + OUT_RING (0); + + BEGIN_RING(kelvin, 0x17e0, 3); + OUT_RINGf (0.0); + OUT_RINGf (0.0); + OUT_RINGf (1.0); + + if (is_nv25tcl) { + BEGIN_RING(kelvin, NV20TCL_TX_RCOMP, 1); + OUT_RING (NV20TCL_TX_RCOMP_LEQUAL | 0xdb0); + } else { + BEGIN_RING(kelvin, 0x1e68, 1); + OUT_RING (0x4b800000); /* 16777216.000000 */ + BEGIN_RING(kelvin, NV20TCL_TX_RCOMP, 1); + OUT_RING (NV20TCL_TX_RCOMP_LEQUAL); + } + + BEGIN_RING(kelvin, 0x290, 1); + OUT_RING ((0x10 << 16) | 1); + BEGIN_RING(kelvin, 0x9fc, 1); + OUT_RING (0); + BEGIN_RING(kelvin, 0x1d80, 1); + OUT_RING (1); + BEGIN_RING(kelvin, 0x9f8, 1); + OUT_RING (4); + BEGIN_RING(kelvin, 0x17ec, 3); + OUT_RINGf (0.0); + OUT_RINGf (1.0); + OUT_RINGf (0.0); + + if (is_nv25tcl) { + BEGIN_RING(kelvin, 0x1d88, 1); + OUT_RING (3); + + BEGIN_RING(kelvin, NV25TCL_DMA_IN_MEMORY9, 1); + OUT_RING (nvws->channel->vram->handle); + BEGIN_RING(kelvin, NV25TCL_DMA_IN_MEMORY8, 1); + OUT_RING (nvws->channel->vram->handle); + } + BEGIN_RING(kelvin, NV20TCL_DMA_FENCE, 1); + OUT_RING (0); /* renouveau: beef1e10 */ + + BEGIN_RING(kelvin, 0x1e98, 1); + OUT_RING (0); +#if 0 + if (is_nv25tcl) { + BEGIN_RING(NvSub3D, NV25TCL_DMA_IN_MEMORY4, 2); + OUT_RING (NvDmaTT); /* renouveau: beef0202 */ + OUT_RING (NvDmaFB); /* renouveau: beef0201 */ + + BEGIN_RING(NvSub3D, NV20TCL_DMA_TEXTURE1, 1); + OUT_RING (NvDmaTT); /* renouveau: beef0202 */ + } +#endif + BEGIN_RING(kelvin, NV20TCL_NOTIFY, 1); + OUT_RING (0); + + BEGIN_RING(kelvin, 0x120, 3); + OUT_RING (0); + OUT_RING (1); + OUT_RING (2); + +/* error: ILLEGAL_MTHD, PROTECTION_FAULT + BEGIN_RING(kelvin, NV20TCL_VIEWPORT_TRANSLATE_X, 4); + OUT_RINGf (0.0); + OUT_RINGf (512.0); + OUT_RINGf (0.0); + OUT_RINGf (0.0); +*/ + + if (is_nv25tcl) { + BEGIN_RING(kelvin, 0x022c, 2); + OUT_RING (0x280); + OUT_RING (0x07d28000); + } + +/* * illegal method, protection fault + BEGIN_RING(NvSub3D, 0x1c2c, 1); + OUT_RING (0); */ + + if (is_nv25tcl) { + BEGIN_RING(kelvin, 0x1da4, 1); + OUT_RING (0); + } + +/* * crashes with illegal method, protection fault + BEGIN_RING(NvSub3D, 0x1c18, 1); + OUT_RING (0x200); */ + + BEGIN_RING(kelvin, NV20TCL_RT_HORIZ, 2); + OUT_RING ((0 << 16) | 0); + OUT_RING ((0 << 16) | 0); + + /* *** Set state *** */ + + BEGIN_RING(kelvin, NV20TCL_ALPHA_FUNC_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_ALPHA_FUNC_FUNC, 2); + OUT_RING (NV20TCL_ALPHA_FUNC_FUNC_ALWAYS); + OUT_RING (0); /* NV20TCL_ALPHA_FUNC_REF */ + + for (i = 0; i < NV20TCL_TX_ENABLE__SIZE; ++i) { + BEGIN_RING(kelvin, NV20TCL_TX_ENABLE(i), 1); + OUT_RING (0); + } + BEGIN_RING(kelvin, NV20TCL_TX_SHADER_OP, 1); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_TX_SHADER_CULL_MODE, 1); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_RC_IN_ALPHA(0), 4); + OUT_RING (0x30d410d0); + OUT_RING (0); + OUT_RING (0); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_RC_OUT_RGB(0), 4); + OUT_RING (0x00000c00); + OUT_RING (0); + OUT_RING (0); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_RC_ENABLE, 1); + OUT_RING (0x00011101); + BEGIN_RING(kelvin, NV20TCL_RC_FINAL0, 2); + OUT_RING (0x130e0300); + OUT_RING (0x0c091c80); + BEGIN_RING(kelvin, NV20TCL_RC_OUT_ALPHA(0), 4); + OUT_RING (0x00000c00); + OUT_RING (0); + OUT_RING (0); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_RC_IN_RGB(0), 4); + OUT_RING (0x20c400c0); + OUT_RING (0); + OUT_RING (0); + OUT_RING (0); + BEGIN_RING(kelvin, 
NV20TCL_RC_COLOR0, 2); + OUT_RING (0); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_RC_CONSTANT_COLOR0(0), 4); + OUT_RING (0x035125a0); + OUT_RING (0); + OUT_RING (0x40002000); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_MULTISAMPLE_CONTROL, 1); + OUT_RING (0xffff0000); + + BEGIN_RING(kelvin, NV20TCL_BLEND_FUNC_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_DITHER_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_STENCIL_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_BLEND_FUNC_SRC, 4); + OUT_RING (NV20TCL_BLEND_FUNC_SRC_ONE); + OUT_RING (NV20TCL_BLEND_FUNC_DST_ZERO); + OUT_RING (0); /* NV20TCL_BLEND_COLOR */ + OUT_RING (NV20TCL_BLEND_EQUATION_FUNC_ADD); + BEGIN_RING(kelvin, NV20TCL_STENCIL_MASK, 7); + OUT_RING (0xff); + OUT_RING (NV20TCL_STENCIL_FUNC_FUNC_ALWAYS); + OUT_RING (0); /* NV20TCL_STENCIL_FUNC_REF */ + OUT_RING (0xff); /* NV20TCL_STENCIL_FUNC_MASK */ + OUT_RING (NV20TCL_STENCIL_OP_FAIL_KEEP); + OUT_RING (NV20TCL_STENCIL_OP_ZFAIL_KEEP); + OUT_RING (NV20TCL_STENCIL_OP_ZPASS_KEEP); + + BEGIN_RING(kelvin, NV20TCL_COLOR_LOGIC_OP_ENABLE, 2); + OUT_RING (0); + OUT_RING (NV20TCL_COLOR_LOGIC_OP_OP_COPY); + BEGIN_RING(kelvin, 0x17cc, 1); + OUT_RING (0); + if (is_nv25tcl) { + BEGIN_RING(kelvin, 0x1d84, 1); + OUT_RING (1); + } + BEGIN_RING(kelvin, NV20TCL_LIGHTING_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_LIGHT_CONTROL, 1); + OUT_RING (0x00020000); + BEGIN_RING(kelvin, NV20TCL_SEPARATE_SPECULAR_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_LIGHT_MODEL_TWO_SIDE_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_ENABLED_LIGHTS, 1); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_NORMALIZE_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_POLYGON_STIPPLE_PATTERN(0), + NV20TCL_POLYGON_STIPPLE_PATTERN__SIZE); + for (i = 0; i < NV20TCL_POLYGON_STIPPLE_PATTERN__SIZE; ++i) { + OUT_RING(0xffffffff); + } + + BEGIN_RING(kelvin, NV20TCL_POLYGON_OFFSET_POINT_ENABLE, 3); + OUT_RING (0); + OUT_RING (0); /* NV20TCL.POLYGON_OFFSET_LINE_ENABLE */ + OUT_RING (0); /* NV20TCL.POLYGON_OFFSET_FILL_ENABLE */ + BEGIN_RING(kelvin, NV20TCL_DEPTH_FUNC, 1); + OUT_RING (NV20TCL_DEPTH_FUNC_LESS); + BEGIN_RING(kelvin, NV20TCL_DEPTH_WRITE_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_DEPTH_TEST_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_POLYGON_OFFSET_FACTOR, 2); + OUT_RINGf (0.0); + OUT_RINGf (0.0); /* NV20TCL.POLYGON_OFFSET_UNITS */ + BEGIN_RING(kelvin, NV20TCL_DEPTH_UNK17D8, 1); + OUT_RING (1); + if (!is_nv25tcl) { + BEGIN_RING(kelvin, 0x1d84, 1); + OUT_RING (3); + } + BEGIN_RING(kelvin, NV20TCL_POINT_SIZE, 1); + if (!is_nv25tcl) { + OUT_RING (8); + } else { + OUT_RINGf (1.0); + } + if (!is_nv25tcl) { + BEGIN_RING(kelvin, NV20TCL_POINT_PARAMETERS_ENABLE, 2); + OUT_RING (0); + OUT_RING (0); /* NV20TCL.POINT_SMOOTH_ENABLE */ + } else { + BEGIN_RING(kelvin, NV20TCL_POINT_PARAMETERS_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(kelvin, 0x0a1c, 1); + OUT_RING (0x800); + } + BEGIN_RING(kelvin, NV20TCL_LINE_WIDTH, 1); + OUT_RING (8); + BEGIN_RING(kelvin, NV20TCL_LINE_SMOOTH_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_POLYGON_MODE_FRONT, 2); + OUT_RING (NV20TCL_POLYGON_MODE_FRONT_FILL); + OUT_RING (NV20TCL_POLYGON_MODE_BACK_FILL); + BEGIN_RING(kelvin, NV20TCL_CULL_FACE, 2); + OUT_RING (NV20TCL_CULL_FACE_BACK); + OUT_RING (NV20TCL_FRONT_FACE_CCW); + BEGIN_RING(kelvin, NV20TCL_POLYGON_SMOOTH_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_CULL_FACE_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_SHADE_MODEL, 1); + 
OUT_RING (NV20TCL_SHADE_MODEL_SMOOTH); + BEGIN_RING(kelvin, NV20TCL_POLYGON_STIPPLE_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_TX_GEN_S(0), 4 * NV20TCL_TX_GEN_S__SIZE); + for (i=0; i < 4 * NV20TCL_TX_GEN_S__SIZE; ++i) { + OUT_RING(0); + } + BEGIN_RING(kelvin, NV20TCL_FOG_EQUATION_CONSTANT, 3); + OUT_RINGf (1.5); + OUT_RINGf (-0.090168); /* NV20TCL.FOG_EQUATION_LINEAR */ + OUT_RINGf (0.0); /* NV20TCL.FOG_EQUATION_QUADRATIC */ + BEGIN_RING(kelvin, NV20TCL_FOG_MODE, 2); + OUT_RING (NV20TCL_FOG_MODE_EXP_2); + OUT_RING (NV20TCL_FOG_COORD_DIST_COORD_FOG); + BEGIN_RING(kelvin, NV20TCL_FOG_ENABLE, 2); + OUT_RING (0); + OUT_RING (0); /* NV20TCL.FOG_COLOR */ + BEGIN_RING(kelvin, NV20TCL_ENGINE, 1); + OUT_RING (NV20TCL_ENGINE_FIXED); + + for (i = 0; i < NV20TCL_TX_MATRIX_ENABLE__SIZE; ++i) { + BEGIN_RING(kelvin, NV20TCL_TX_MATRIX_ENABLE(i), 1); + OUT_RING (0); + } + + BEGIN_RING(kelvin, NV20TCL_VTX_ATTR_4F_X(1), 4 * 15); + OUT_RINGf(1.0); OUT_RINGf(0.0); OUT_RINGf(0.0); OUT_RINGf(1.0); + OUT_RINGf(0.0); OUT_RINGf(0.0); OUT_RINGf(1.0); OUT_RINGf(1.0); + OUT_RINGf(1.0); OUT_RINGf(1.0); OUT_RINGf(1.0); OUT_RINGf(1.0); + for (i = 4; i < 16; ++i) { + OUT_RINGf(0.0); OUT_RINGf(0.0); OUT_RINGf(0.0); OUT_RINGf(1.0); + } + + BEGIN_RING(kelvin, NV20TCL_EDGEFLAG_ENABLE, 1); + OUT_RING (1); + BEGIN_RING(kelvin, NV20TCL_COLOR_MASK, 1); + OUT_RING (0x00010101); + BEGIN_RING(kelvin, NV20TCL_CLEAR_VALUE, 1); + OUT_RING (0); + + memset(projectionmatrix, 0, sizeof(projectionmatrix)); + projectionmatrix[0*4+0] = 1.0; + projectionmatrix[1*4+1] = 1.0; + projectionmatrix[2*4+2] = 16777215.0; + projectionmatrix[3*4+3] = 1.0; + BEGIN_RING(kelvin, NV20TCL_PROJECTION_MATRIX(0), 16); + for (i = 0; i < 16; i++) { + OUT_RINGf (projectionmatrix[i]); + } + + BEGIN_RING(kelvin, NV20TCL_DEPTH_RANGE_NEAR, 2); + OUT_RINGf (0.0); + OUT_RINGf (16777216.0); /* [0, 1] scaled approx to [0, 2^24] */ + + BEGIN_RING(kelvin, NV20TCL_VIEWPORT_SCALE0_X, 4); + OUT_RINGf (0.0); /* x-offset, w/2 + 1.031250 */ + OUT_RINGf (0.0); /* y-offset, h/2 + 0.030762 */ + OUT_RINGf (0.0); + OUT_RINGf (16777215.0); + + BEGIN_RING(kelvin, NV20TCL_VIEWPORT_SCALE1_X, 4); + OUT_RINGf (0.0); /* no effect?, w/2 */ + OUT_RINGf (0.0); /* no effect?, h/2 */ + OUT_RINGf (16777215.0 * 0.5); + OUT_RINGf (65535.0); + + FIRE_RING (NULL); +} + +static void +nv20_set_edgeflags(struct pipe_context *pipe, const unsigned *bitfield) +{ +} + +struct pipe_context * +nv20_create(struct pipe_screen *pscreen, unsigned pctx_id) +{ + struct nv20_screen *screen = nv20_screen(pscreen); + struct pipe_winsys *ws = pscreen->winsys; + struct nv20_context *nv20; + struct nouveau_winsys *nvws = screen->nvws; + + nv20 = CALLOC(1, sizeof(struct nv20_context)); + if (!nv20) + return NULL; + nv20->screen = screen; + nv20->pctx_id = pctx_id; + + nv20->nvws = nvws; + + nv20->pipe.winsys = ws; + nv20->pipe.screen = pscreen; + nv20->pipe.destroy = nv20_destroy; + nv20->pipe.set_edgeflags = nv20_set_edgeflags; + nv20->pipe.draw_arrays = nv20_draw_arrays; + nv20->pipe.draw_elements = nv20_draw_elements; + nv20->pipe.clear = nv20_clear; + nv20->pipe.flush = nv20_flush; + + nv20_init_surface_functions(nv20); + nv20_init_state_functions(nv20); + + nv20->draw = draw_create(); + assert(nv20->draw); + draw_set_rasterize_stage(nv20->draw, nv20_draw_vbuf_stage(nv20)); + + nv20_init_hwctx(nv20); + + return &nv20->pipe; +} + diff --git a/src/gallium/drivers/nv20/nv20_context.h b/src/gallium/drivers/nv20/nv20_context.h new file mode 100644 index 0000000000..8ad926db20 --- /dev/null +++ 
b/src/gallium/drivers/nv20/nv20_context.h @@ -0,0 +1,153 @@ +#ifndef __NV20_CONTEXT_H__ +#define __NV20_CONTEXT_H__ + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_compiler.h" + +#include "util/u_memory.h" +#include "util/u_math.h" + +#include "draw/draw_vertex.h" + +#include "nouveau/nouveau_winsys.h" +#include "nouveau/nouveau_gldefs.h" + +#define NOUVEAU_PUSH_CONTEXT(ctx) \ + struct nv20_screen *ctx = nv20->screen +#include "nouveau/nouveau_push.h" + +#include "nv20_state.h" + +#define NOUVEAU_ERR(fmt, args...) \ + fprintf(stderr, "%s:%d - "fmt, __func__, __LINE__, ##args); +#define NOUVEAU_MSG(fmt, args...) \ + fprintf(stderr, "nouveau: "fmt, ##args); + +#define NV20_NEW_VERTPROG (1 << 0) +#define NV20_NEW_FRAGPROG (1 << 1) +#define NV20_NEW_VTXARRAYS (1 << 2) +#define NV20_NEW_BLEND (1 << 3) +#define NV20_NEW_BLENDCOL (1 << 4) +#define NV20_NEW_RAST (1 << 5) +#define NV20_NEW_DSA (1 << 6) +#define NV20_NEW_VIEWPORT (1 << 7) +#define NV20_NEW_SCISSOR (1 << 8) +#define NV20_NEW_FRAMEBUFFER (1 << 9) + +#include "nv20_screen.h" + +struct nv20_context { + struct pipe_context pipe; + + struct nouveau_winsys *nvws; + struct nv20_screen *screen; + unsigned pctx_id; + + struct draw_context *draw; + + uint32_t dirty; + + struct nv20_sampler_state *tex_sampler[PIPE_MAX_SAMPLERS]; + struct nv20_miptree *tex_miptree[PIPE_MAX_SAMPLERS]; + unsigned dirty_samplers; + unsigned fp_samplers; + unsigned vp_samplers; + + uint32_t rt_enable; + struct pipe_buffer *rt[4]; + struct pipe_buffer *zeta; + uint32_t lma_offset; + + struct nv20_blend_state *blend; + struct pipe_blend_color *blend_color; + struct nv20_rasterizer_state *rast; + struct nv20_depth_stencil_alpha_state *dsa; + struct pipe_viewport_state *viewport; + struct pipe_scissor_state *scissor; + struct pipe_framebuffer_state *framebuffer; + + //struct pipe_buffer *constbuf[PIPE_SHADER_TYPES]; + float *constbuf[PIPE_SHADER_TYPES][32][4]; + unsigned constbuf_nr[PIPE_SHADER_TYPES]; + + struct vertex_info vertex_info; + + struct { + struct pipe_buffer *buffer; + uint32_t format; + } tex[2]; + + unsigned vb_enable; + struct { + struct pipe_buffer *buffer; + unsigned delta; + } vb[16]; + +/* struct { + + struct nouveau_resource *exec_heap; + struct nouveau_resource *data_heap; + + struct nv20_vertex_program *active; + + struct nv20_vertex_program *current; + } vertprog; +*/ + struct { + struct nv20_fragment_program *active; + + struct nv20_fragment_program *current; + struct pipe_buffer *constant_buf; + } fragprog; + + struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS]; + struct pipe_vertex_element vtxelt[PIPE_MAX_ATTRIBS]; +}; + +static INLINE struct nv20_context * +nv20_context(struct pipe_context *pipe) +{ + return (struct nv20_context *)pipe; +} + +extern void nv20_init_state_functions(struct nv20_context *nv20); +extern void nv20_init_surface_functions(struct nv20_context *nv20); + +extern void nv20_screen_init_miptree_functions(struct pipe_screen *pscreen); + +/* nv20_clear.c */ +extern void nv20_clear(struct pipe_context *pipe, struct pipe_surface *ps, + unsigned clearValue); + +/* nv20_draw.c */ +extern struct draw_stage *nv20_draw_render_stage(struct nv20_context *nv20); + +/* nv20_fragprog.c */ +extern void nv20_fragprog_bind(struct nv20_context *, + struct nv20_fragment_program *); +extern void nv20_fragprog_destroy(struct nv20_context *, + struct nv20_fragment_program *); + +/* nv20_fragtex.c */ +extern void nv20_fragtex_bind(struct nv20_context *); + +/* nv20_prim_vbuf.c */ +struct 
draw_stage *nv20_draw_vbuf_stage( struct nv20_context *nv20 ); +extern void nv20_vtxbuf_bind(struct nv20_context* nv20); + +/* nv20_state.c and friends */ +extern void nv20_emit_hw_state(struct nv20_context *nv20); +extern void nv20_state_tex_update(struct nv20_context *nv20); + +/* nv20_vbo.c */ +extern boolean nv20_draw_arrays(struct pipe_context *, unsigned mode, + unsigned start, unsigned count); +extern boolean nv20_draw_elements( struct pipe_context *pipe, + struct pipe_buffer *indexBuffer, + unsigned indexSize, + unsigned prim, unsigned start, unsigned count); + + +#endif diff --git a/src/gallium/drivers/nv20/nv20_fragprog.c b/src/gallium/drivers/nv20/nv20_fragprog.c new file mode 100644 index 0000000000..4f496369dd --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_fragprog.c @@ -0,0 +1,21 @@ +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" + +#include "nv20_context.h" + +void +nv20_fragprog_bind(struct nv20_context *nv20, struct nv20_fragment_program *fp) +{ +} + +void +nv20_fragprog_destroy(struct nv20_context *nv20, + struct nv20_fragment_program *fp) +{ +} + diff --git a/src/gallium/drivers/nv20/nv20_fragtex.c b/src/gallium/drivers/nv20/nv20_fragtex.c new file mode 100644 index 0000000000..495a7be912 --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_fragtex.c @@ -0,0 +1,124 @@ +#include "nv20_context.h" +#include "nouveau/nouveau_util.h" + +#define _(m,tf) \ +{ \ + TRUE, \ + PIPE_FORMAT_##m, \ + NV20TCL_TX_FORMAT_FORMAT_##tf, \ +} + +struct nv20_texture_format { + boolean defined; + uint pipe; + int format; +}; + +static struct nv20_texture_format +nv20_texture_formats[] = { + _(A8R8G8B8_UNORM, A8R8G8B8), + _(A1R5G5B5_UNORM, A1R5G5B5), + _(A4R4G4B4_UNORM, A4R4G4B4), + _(L8_UNORM , L8 ), + _(A8_UNORM , A8 ), + _(A8L8_UNORM , A8L8 ), +/* _(RGB_DXT1 , DXT1, ), */ +/* _(RGBA_DXT1 , DXT1, ), */ +/* _(RGBA_DXT3 , DXT3, ), */ +/* _(RGBA_DXT5 , DXT5, ), */ + {}, +}; + +static struct nv20_texture_format * +nv20_fragtex_format(uint pipe_format) +{ + struct nv20_texture_format *tf = nv20_texture_formats; + + while (tf->defined) { + if (tf->pipe == pipe_format) + return tf; + tf++; + } + + return NULL; +} + + +static void +nv20_fragtex_build(struct nv20_context *nv20, int unit) +{ +#if 0 + struct nv20_sampler_state *ps = nv20->tex_sampler[unit]; + struct nv20_miptree *nv20mt = nv20->tex_miptree[unit]; + struct pipe_texture *pt = &nv20mt->base; + struct nv20_texture_format *tf; + uint32_t txf, txs, txp; + + tf = nv20_fragtex_format(pt->format); + if (!tf || !tf->defined) { + NOUVEAU_ERR("Unsupported texture format: 0x%x\n", pt->format); + return; + } + + txf = tf->format << 8; + txf |= (pt->last_level + 1) << 16; + txf |= log2i(pt->width[0]) << 20; + txf |= log2i(pt->height[0]) << 24; + txf |= log2i(pt->depth[0]) << 28; + txf |= 8; + + switch (pt->target) { + case PIPE_TEXTURE_CUBE: + txf |= NV10TCL_TX_FORMAT_CUBE_MAP; + /* fall-through */ + case PIPE_TEXTURE_2D: + txf |= (2<<4); + break; + case PIPE_TEXTURE_1D: + txf |= (1<<4); + break; + default: + NOUVEAU_ERR("Unknown target %d\n", pt->target); + return; + } + + BEGIN_RING(kelvin, NV10TCL_TX_OFFSET(unit), 8); + OUT_RELOCl(nv20mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD); + OUT_RELOCd(nv20mt->buffer,txf,NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/); + OUT_RING (ps->wrap); + OUT_RING (0x40000000); /* enable */ + OUT_RING (txs); + OUT_RING (ps->filt | 
0x2000 /* magic */); + OUT_RING ((pt->width[0] << 16) | pt->height[0]); + OUT_RING (ps->bcol); +#endif +} + +void +nv20_fragtex_bind(struct nv20_context *nv20) +{ +#if 0 + struct nv20_fragment_program *fp = nv20->fragprog.active; + unsigned samplers, unit; + + samplers = nv20->fp_samplers & ~fp->samplers; + while (samplers) { + unit = ffs(samplers) - 1; + samplers &= ~(1 << unit); + + BEGIN_RING(kelvin, NV10TCL_TX_ENABLE(unit), 1); + OUT_RING (0); + } + + samplers = nv20->dirty_samplers & fp->samplers; + while (samplers) { + unit = ffs(samplers) - 1; + samplers &= ~(1 << unit); + + nv20_fragtex_build(nv20, unit); + } + + nv20->fp_samplers = fp->samplers; +#endif +} + diff --git a/src/gallium/drivers/nv20/nv20_miptree.c b/src/gallium/drivers/nv20/nv20_miptree.c new file mode 100644 index 0000000000..ef7e9c5428 --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_miptree.c @@ -0,0 +1,206 @@ +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" + +#include "nv20_context.h" +#include "nv20_screen.h" + +static void +nv20_miptree_layout(struct nv20_miptree *nv20mt) +{ + struct pipe_texture *pt = &nv20mt->base; + boolean swizzled = FALSE; + uint width = pt->width[0], height = pt->height[0]; + uint offset = 0; + int nr_faces, l, f; + + if (pt->target == PIPE_TEXTURE_CUBE) { + nr_faces = 6; + } else { + nr_faces = 1; + } + + for (l = 0; l <= pt->last_level; l++) { + pt->width[l] = width; + pt->height[l] = height; + pt->nblocksx[l] = pf_get_nblocksx(&pt->block, width); + pt->nblocksy[l] = pf_get_nblocksy(&pt->block, height); + + if (swizzled) + nv20mt->level[l].pitch = pt->nblocksx[l] * pt->block.size; + else + nv20mt->level[l].pitch = pt->nblocksx[0] * pt->block.size; + nv20mt->level[l].pitch = (nv20mt->level[l].pitch + 63) & ~63; + + nv20mt->level[l].image_offset = + CALLOC(nr_faces, sizeof(unsigned)); + + width = MAX2(1, width >> 1); + height = MAX2(1, height >> 1); + + } + + for (f = 0; f < nr_faces; f++) { + for (l = 0; l <= pt->last_level; l++) { + nv20mt->level[l].image_offset[f] = offset; + offset += nv20mt->level[l].pitch * pt->height[l]; + } + } + + nv20mt->total_size = offset; +} + +static struct pipe_texture * +nv20_miptree_blanket(struct pipe_screen *pscreen, const struct pipe_texture *pt, + const unsigned *stride, struct pipe_buffer *pb) +{ + struct nv20_miptree *mt; + + /* Only supports 2D, non-mipmapped textures for the moment */ + if (pt->target != PIPE_TEXTURE_2D || pt->last_level != 0 || + pt->depth[0] != 1) + return NULL; + + mt = CALLOC_STRUCT(nv20_miptree); + if (!mt) + return NULL; + + mt->base = *pt; + mt->base.refcount = 1; + mt->base.screen = pscreen; + mt->level[0].pitch = stride[0]; + mt->level[0].image_offset = CALLOC(1, sizeof(unsigned)); + + pipe_buffer_reference(pscreen, &mt->buffer, pb); + return &mt->base; +} + +static struct pipe_texture * +nv20_miptree_create(struct pipe_screen *screen, const struct pipe_texture *pt) +{ + struct pipe_winsys *ws = screen->winsys; + struct nv20_miptree *mt; + unsigned buf_usage = PIPE_BUFFER_USAGE_PIXEL | + NOUVEAU_BUFFER_USAGE_TEXTURE; + + mt = MALLOC(sizeof(struct nv20_miptree)); + if (!mt) + return NULL; + mt->base = *pt; + mt->base.refcount = 1; + mt->base.screen = screen; + + /* Swizzled textures must be POT */ + if (pt->width[0] & (pt->width[0] - 1) || + pt->height[0] & (pt->height[0] - 1)) + mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR; + else + if (pt->tex_usage & (PIPE_TEXTURE_USAGE_PRIMARY | + PIPE_TEXTURE_USAGE_DISPLAY_TARGET)) + mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR; + else 
+ if (pt->tex_usage & PIPE_TEXTURE_USAGE_DYNAMIC) + mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR; + else { + switch (pt->format) { + /* TODO: Figure out which formats can be swizzled */ + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_X8R8G8B8_UNORM: + case PIPE_FORMAT_R16_SNORM: + break; + default: + mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR; + } + } + + if (pt->tex_usage & PIPE_TEXTURE_USAGE_DYNAMIC) + buf_usage |= PIPE_BUFFER_USAGE_CPU_READ_WRITE; + + nv20_miptree_layout(mt); + + mt->buffer = ws->buffer_create(ws, 256, buf_usage, mt->total_size); + if (!mt->buffer) { + FREE(mt); + return NULL; + } + + return &mt->base; +} + +static void +nv20_miptree_release(struct pipe_screen *screen, struct pipe_texture **pt) +{ + struct pipe_texture *mt = *pt; + + *pt = NULL; + if (--mt->refcount <= 0) { + struct nv20_miptree *nv20mt = (struct nv20_miptree *)mt; + int l; + + pipe_buffer_reference(screen, &nv20mt->buffer, NULL); + for (l = 0; l <= mt->last_level; l++) { + if (nv20mt->level[l].image_offset) + FREE(nv20mt->level[l].image_offset); + } + FREE(nv20mt); + } +} + +static struct pipe_surface * +nv20_miptree_surface_get(struct pipe_screen *screen, struct pipe_texture *pt, + unsigned face, unsigned level, unsigned zslice, + unsigned flags) +{ + struct nv20_miptree *nv20mt = (struct nv20_miptree *)pt; + struct pipe_surface *ps; + + ps = CALLOC_STRUCT(pipe_surface); + if (!ps) + return NULL; + pipe_texture_reference(&ps->texture, pt); + ps->format = pt->format; + ps->width = pt->width[level]; + ps->height = pt->height[level]; + ps->block = pt->block; + ps->nblocksx = pt->nblocksx[level]; + ps->nblocksy = pt->nblocksy[level]; + ps->stride = nv20mt->level[level].pitch; + ps->usage = flags; + ps->status = PIPE_SURFACE_STATUS_DEFINED; + ps->refcount = 1; + + if (pt->target == PIPE_TEXTURE_CUBE) { + ps->offset = nv20mt->level[level].image_offset[face]; + } else + if (pt->target == PIPE_TEXTURE_3D) { + ps->offset = nv20mt->level[level].image_offset[zslice]; + } else { + ps->offset = nv20mt->level[level].image_offset[0]; + } + + return ps; +} + +static void +nv20_miptree_surface_release(struct pipe_screen *pscreen, + struct pipe_surface **psurface) +{ + struct pipe_surface *ps = *psurface; + + *psurface = NULL; + if (--ps->refcount > 0) + return; + + pipe_texture_reference(&ps->texture, NULL); + FREE(ps); +} + +void nv20_screen_init_miptree_functions(struct pipe_screen *pscreen) +{ + pscreen->texture_create = nv20_miptree_create; + pscreen->texture_blanket = nv20_miptree_blanket; + pscreen->texture_release = nv20_miptree_release; + pscreen->get_tex_surface = nv20_miptree_surface_get; + pscreen->tex_surface_release = nv20_miptree_surface_release; +} + diff --git a/src/gallium/drivers/nv20/nv20_prim_vbuf.c b/src/gallium/drivers/nv20/nv20_prim_vbuf.c new file mode 100644 index 0000000000..319e1f6557 --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_prim_vbuf.c @@ -0,0 +1,430 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * \file + * Build post-transformation, post-clipping vertex buffers and element + * lists by hooking into the end of the primitive pipeline and + * manipulating the vertex_id field in the vertex headers. + * + * XXX: work in progress + * + * \author José Fonseca <jrfonseca@tungstengraphics.com> + * \author Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "util/u_debug.h" +#include "pipe/p_inlines.h" +#include "pipe/internal/p_winsys_screen.h" + +#include "nv20_context.h" +#include "nv20_state.h" + +#include "draw/draw_vbuf.h" + +/** + * Primitive renderer for nv20. + */ +struct nv20_vbuf_render { + struct vbuf_render base; + + struct nv20_context *nv20; + + /** Vertex buffer in VRAM */ + struct pipe_buffer *pbuffer; + + /** Vertex buffer in normal memory */ + void *mbuffer; + + /** Vertex size in bytes */ + /*unsigned vertex_size;*/ + + /** Hardware primitive */ + unsigned hwprim; +}; + +/** + * Basically a cast wrapper. 
+ */ +static INLINE struct nv20_vbuf_render * +nv20_vbuf_render(struct vbuf_render *render) +{ + assert(render); + return (struct nv20_vbuf_render *)render; +} + +void nv20_vtxbuf_bind( struct nv20_context* nv20 ) +{ +#if 0 + int i; + for(i = 0; i < NV20TCL_VTXBUF_ADDRESS__SIZE; i++) { + BEGIN_RING(kelvin, NV20TCL_VTXBUF_ADDRESS(i), 1); + OUT_RING(0/*nv20->vtxbuf*/); + BEGIN_RING(kelvin, NV20TCL_VTXFMT(i) ,1); + OUT_RING(0/*XXX*/); + } +#endif +} + +static const struct vertex_info * +nv20_vbuf_render_get_vertex_info( struct vbuf_render *render ) +{ + struct nv20_vbuf_render *nv20_render = nv20_vbuf_render(render); + struct nv20_context *nv20 = nv20_render->nv20; + + nv20_emit_hw_state(nv20); + + return &nv20->vertex_info; +} + +static void * +nv20__allocate_mbuffer(struct nv20_vbuf_render *nv20_render, size_t size) +{ + nv20_render->mbuffer = MALLOC(size); + return nv20_render->mbuffer; +} + +static void +nv20__allocate_pbuffer(struct nv20_vbuf_render *nv20_render, size_t size) +{ + struct pipe_winsys *winsys = nv20_render->nv20->pipe.winsys; + nv20_render->pbuffer = winsys->buffer_create(winsys, 64, + PIPE_BUFFER_USAGE_VERTEX, size); +} + +static boolean +nv20_vbuf_render_allocate_vertices( struct vbuf_render *render, + ushort vertex_size, + ushort nr_vertices ) +{ + struct nv20_vbuf_render *nv20_render = nv20_vbuf_render(render); + size_t size = (size_t)vertex_size * (size_t)nr_vertices; + void *buf; + + assert(!nv20_render->pbuffer); + assert(!nv20_render->mbuffer); + + /* + * For small amount of vertices, don't bother with pipe vertex + * buffer, the data will be passed directly via the fifo. + */ + /* XXX: Pipe vertex buffers don't work. */ + if (0 && size > 16 * 1024) { + nv20__allocate_pbuffer(nv20_render, size); + /* umm yeah so this is ugly */ + buf = nv20_render->pbuffer; + } else { + buf = nv20__allocate_mbuffer(nv20_render, size); + } + + if (buf) + nv20_render->nv20->dirty |= NV20_NEW_VTXARRAYS; + + return buf ? 
TRUE : FALSE; +} + +static void * +nv20_vbuf_render_map_vertices( struct vbuf_render *render ) +{ + struct nv20_vbuf_render *nv20_render = nv20_vbuf_render(render); + struct pipe_winsys *winsys = nv20_render->nv20->pipe.winsys; + + if (nv20_render->pbuffer) { + return winsys->buffer_map(winsys, + nv20_render->pbuffer, + PIPE_BUFFER_USAGE_CPU_WRITE); + } else if (nv20_render->mbuffer) { + return nv20_render->mbuffer; + } else + assert(0); + + /* warnings be gone */ + return NULL; +} + +static void +nv20_vbuf_render_unmap_vertices( struct vbuf_render *render, + ushort min_index, + ushort max_index ) +{ + struct nv20_vbuf_render *nv20_render = nv20_vbuf_render(render); + struct pipe_winsys *winsys = nv20_render->nv20->pipe.winsys; + + if (nv20_render->pbuffer) + winsys->buffer_unmap(winsys, nv20_render->pbuffer); +} + +static boolean +nv20_vbuf_render_set_primitive( struct vbuf_render *render, + unsigned prim ) +{ + struct nv20_vbuf_render *nv20_render = nv20_vbuf_render(render); + unsigned hwp = nvgl_primitive(prim); + if (hwp == 0) + return FALSE; + + nv20_render->hwprim = hwp; + return TRUE; +} + +static uint32_t +nv20__vtxhwformat(unsigned stride, unsigned fields, unsigned type) +{ + return (stride << NV20TCL_VTXFMT_STRIDE_SHIFT) | + (fields << NV20TCL_VTXFMT_SIZE_SHIFT) | + (type << NV20TCL_VTXFMT_TYPE_SHIFT); +} + +static unsigned +nv20__emit_format(struct nv20_context *nv20, enum attrib_emit type, int hwattr) +{ + uint32_t hwfmt = 0; + unsigned fields; + + switch (type) { + case EMIT_OMIT: + hwfmt = nv20__vtxhwformat(0, 0, 2); + fields = 0; + break; + case EMIT_1F: + hwfmt = nv20__vtxhwformat(4, 1, 2); + fields = 1; + break; + case EMIT_2F: + hwfmt = nv20__vtxhwformat(8, 2, 2); + fields = 2; + break; + case EMIT_3F: + hwfmt = nv20__vtxhwformat(12, 3, 2); + fields = 3; + break; + case EMIT_4F: + hwfmt = nv20__vtxhwformat(16, 4, 2); + fields = 4; + break; + default: + NOUVEAU_ERR("unhandled attrib_emit %d\n", type); + return 0; + } + + BEGIN_RING(kelvin, NV20TCL_VTXFMT(hwattr), 1); + OUT_RING(hwfmt); + return fields; +} + +static unsigned +nv20__emit_vertex_array_format(struct nv20_context *nv20) +{ + struct vertex_info *vinfo = &nv20->vertex_info; + int hwattr = NV20TCL_VTXFMT__SIZE; + int attr = 0; + unsigned nr_fields = 0; + + while (hwattr-- > 0) { + if (vinfo->hwfmt[0] & (1 << hwattr)) { + nr_fields += nv20__emit_format(nv20, + vinfo->attrib[attr].emit, hwattr); + attr++; + } else + nv20__emit_format(nv20, EMIT_OMIT, hwattr); + } + + return nr_fields; +} + +static void +nv20__draw_mbuffer(struct nv20_vbuf_render *nv20_render, + const ushort *indices, + uint nr_indices) +{ + struct nv20_context *nv20 = nv20_render->nv20; + struct vertex_info *vinfo = &nv20->vertex_info; + unsigned nr_fields; + int max_push; + ubyte *data = nv20_render->mbuffer; + int vsz = 4 * vinfo->size; + + nr_fields = nv20__emit_vertex_array_format(nv20); + + BEGIN_RING(kelvin, NV20TCL_VERTEX_BEGIN_END, 1); + OUT_RING(nv20_render->hwprim); + + max_push = 1200 / nr_fields; + while (nr_indices) { + int i; + int push = MIN2(nr_indices, max_push); + + BEGIN_RING_NI(kelvin, NV20TCL_VERTEX_DATA, push * nr_fields); + for (i = 0; i < push; i++) { + /* XXX: fixme to handle other than floats? 
*/ + int f = nr_fields; + float *attrv = (float*)&data[indices[i] * vsz]; + while (f-- > 0) + OUT_RINGf(*attrv++); + } + + nr_indices -= push; + indices += push; + } + + BEGIN_RING(kelvin, NV20TCL_VERTEX_BEGIN_END, 1); + OUT_RING(NV20TCL_VERTEX_BEGIN_END_STOP); +} + +static void +nv20__draw_pbuffer(struct nv20_vbuf_render *nv20_render, + const ushort *indices, + uint nr_indices) +{ + struct nv20_context *nv20 = nv20_render->nv20; + int push, i; + + NOUVEAU_ERR("nv20__draw_pbuffer: this path is broken.\n"); + + BEGIN_RING(kelvin, NV10TCL_VERTEX_ARRAY_OFFSET_POS, 1); + OUT_RELOCl(nv20_render->pbuffer, 0, + NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD); + + BEGIN_RING(kelvin, NV10TCL_VERTEX_BUFFER_BEGIN_END, 1); + OUT_RING(nv20_render->hwprim); + + if (nr_indices & 1) { + BEGIN_RING(kelvin, NV10TCL_VB_ELEMENT_U32, 1); + OUT_RING (indices[0]); + indices++; nr_indices--; + } + + while (nr_indices) { + // XXX too big/small ? check the size + push = MIN2(nr_indices, 1200 * 2); + + BEGIN_RING_NI(kelvin, NV10TCL_VB_ELEMENT_U16, push >> 1); + for (i = 0; i < push; i+=2) + OUT_RING((indices[i+1] << 16) | indices[i]); + + nr_indices -= push; + indices += push; + } + + BEGIN_RING(kelvin, NV10TCL_VERTEX_BUFFER_BEGIN_END, 1); + OUT_RING (0); +} + +static void +nv20_vbuf_render_draw( struct vbuf_render *render, + const ushort *indices, + uint nr_indices) +{ + struct nv20_vbuf_render *nv20_render = nv20_vbuf_render(render); + + nv20_emit_hw_state(nv20_render->nv20); + + if (nv20_render->pbuffer) + nv20__draw_pbuffer(nv20_render, indices, nr_indices); + else if (nv20_render->mbuffer) + nv20__draw_mbuffer(nv20_render, indices, nr_indices); + else + assert(0); +} + + +static void +nv20_vbuf_render_release_vertices( struct vbuf_render *render ) +{ + struct nv20_vbuf_render *nv20_render = nv20_vbuf_render(render); + struct nv20_context *nv20 = nv20_render->nv20; + struct pipe_screen *pscreen = &nv20->screen->pipe; + + if (nv20_render->pbuffer) { + pipe_buffer_reference(pscreen, &nv20_render->pbuffer, NULL); + } else if (nv20_render->mbuffer) { + FREE(nv20_render->mbuffer); + nv20_render->mbuffer = NULL; + } else + assert(0); +} + + +static void +nv20_vbuf_render_destroy( struct vbuf_render *render ) +{ + struct nv20_vbuf_render *nv20_render = nv20_vbuf_render(render); + + assert(!nv20_render->pbuffer); + assert(!nv20_render->mbuffer); + + FREE(nv20_render); +} + + +/** + * Create a new primitive render. + */ +static struct vbuf_render * +nv20_vbuf_render_create( struct nv20_context *nv20 ) +{ + struct nv20_vbuf_render *nv20_render = CALLOC_STRUCT(nv20_vbuf_render); + + nv20_render->nv20 = nv20; + + nv20_render->base.max_vertex_buffer_bytes = 16*1024; + nv20_render->base.max_indices = 1024; + nv20_render->base.get_vertex_info = nv20_vbuf_render_get_vertex_info; + nv20_render->base.allocate_vertices = + nv20_vbuf_render_allocate_vertices; + nv20_render->base.map_vertices = nv20_vbuf_render_map_vertices; + nv20_render->base.unmap_vertices = nv20_vbuf_render_unmap_vertices; + nv20_render->base.set_primitive = nv20_vbuf_render_set_primitive; + nv20_render->base.draw = nv20_vbuf_render_draw; + nv20_render->base.release_vertices = nv20_vbuf_render_release_vertices; + nv20_render->base.destroy = nv20_vbuf_render_destroy; + + return &nv20_render->base; +} + + +/** + * Create a new primitive vbuf/render stage. 
+ */ +struct draw_stage *nv20_draw_vbuf_stage( struct nv20_context *nv20 ) +{ + struct vbuf_render *render; + struct draw_stage *stage; + + render = nv20_vbuf_render_create(nv20); + if(!render) + return NULL; + + stage = draw_vbuf_stage( nv20->draw, render ); + if(!stage) { + render->destroy(render); + return NULL; + } + + return stage; +} diff --git a/src/gallium/drivers/nv20/nv20_screen.c b/src/gallium/drivers/nv20/nv20_screen.c new file mode 100644 index 0000000000..5f2b7b4f71 --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_screen.c @@ -0,0 +1,222 @@ +#include "pipe/p_screen.h" +#include "util/u_simple_screen.h" + +#include "nv20_context.h" +#include "nv20_screen.h" + +static const char * +nv20_screen_get_name(struct pipe_screen *screen) +{ + struct nv20_screen *nv20screen = nv20_screen(screen); + struct nouveau_device *dev = nv20screen->nvws->channel->device; + static char buffer[128]; + + snprintf(buffer, sizeof(buffer), "NV%02X", dev->chipset); + return buffer; +} + +static const char * +nv20_screen_get_vendor(struct pipe_screen *screen) +{ + return "nouveau"; +} + +static int +nv20_screen_get_param(struct pipe_screen *screen, int param) +{ + switch (param) { + case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS: + return 2; + case PIPE_CAP_NPOT_TEXTURES: + return 0; + case PIPE_CAP_TWO_SIDED_STENCIL: + return 0; + case PIPE_CAP_GLSL: + return 0; + case PIPE_CAP_S3TC: + return 0; + case PIPE_CAP_ANISOTROPIC_FILTER: + return 1; + case PIPE_CAP_POINT_SPRITE: + return 0; + case PIPE_CAP_MAX_RENDER_TARGETS: + return 1; + case PIPE_CAP_OCCLUSION_QUERY: + return 0; + case PIPE_CAP_TEXTURE_SHADOW_MAP: + return 0; + case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: + return 12; + case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: + return 0; + case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: + return 12; + case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS: + return 0; + case NOUVEAU_CAP_HW_VTXBUF: + case NOUVEAU_CAP_HW_IDXBUF: + return 0; + default: + NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param); + return 0; + } +} + +static float +nv20_screen_get_paramf(struct pipe_screen *screen, int param) +{ + switch (param) { + case PIPE_CAP_MAX_LINE_WIDTH: + case PIPE_CAP_MAX_LINE_WIDTH_AA: + return 10.0; + case PIPE_CAP_MAX_POINT_WIDTH: + case PIPE_CAP_MAX_POINT_WIDTH_AA: + return 64.0; + case PIPE_CAP_MAX_TEXTURE_ANISOTROPY: + return 2.0; + case PIPE_CAP_MAX_TEXTURE_LOD_BIAS: + return 4.0; + default: + NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param); + return 0.0; + } +} + +static boolean +nv20_screen_is_format_supported(struct pipe_screen *screen, + enum pipe_format format, + enum pipe_texture_target target, + unsigned tex_usage, unsigned geom_flags) +{ + if (tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET) { + switch (format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_R5G6B5_UNORM: + case PIPE_FORMAT_Z24S8_UNORM: + case PIPE_FORMAT_Z16_UNORM: + return TRUE; + default: + break; + } + } else { + switch (format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_A1R5G5B5_UNORM: + case PIPE_FORMAT_A4R4G4B4_UNORM: + case PIPE_FORMAT_R5G6B5_UNORM: + case PIPE_FORMAT_L8_UNORM: + case PIPE_FORMAT_A8_UNORM: + case PIPE_FORMAT_I8_UNORM: + return TRUE; + default: + break; + } + } + + return FALSE; +} + +static void * +nv20_surface_map(struct pipe_screen *screen, struct pipe_surface *surface, + unsigned flags ) +{ + struct pipe_winsys *ws = screen->winsys; + void *map; + struct nv20_miptree *nv20mt = (struct nv20_miptree *)surface->texture; + + map = ws->buffer_map(ws, nv20mt->buffer, flags); + if (!map) + return NULL; + + return map + surface->offset; +} + 
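(Illustrative sketch, not part of the patch: how a state tracker might drive the surface_map()/surface_unmap() hooks this screen installs further down. The helper name write_test_pixel() and the hard-coded A8R8G8B8 store are assumptions made only for this example.)

#include "pipe/p_screen.h"
#include "pipe/p_state.h"
#include "pipe/p_defines.h"

static void
write_test_pixel(struct pipe_screen *screen, struct pipe_surface *ps,
                 unsigned x, unsigned y, uint32_t argb)
{
	/* nv20_surface_map() returns buffer_map(mt->buffer) + ps->offset,
	 * so 'map' already points at the selected mip level/face. */
	uint8_t *map = screen->surface_map(screen, ps,
	                                   PIPE_BUFFER_USAGE_CPU_WRITE);
	if (!map)
		return;

	/* ps->stride is the level pitch in bytes (rounded up to 64 bytes by
	 * nv20_miptree_layout()); A8R8G8B8 is 4 bytes per pixel. */
	*(uint32_t *)(map + y * ps->stride + x * 4) = argb;

	screen->surface_unmap(screen, ps);
}

Because the map hook folds ps->offset into the returned pointer, a caller only needs ps->stride to address pixels within the level; it never touches the underlying nv20_miptree buffer directly.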
+static void +nv20_surface_unmap(struct pipe_screen *screen, struct pipe_surface *surface) +{ + struct pipe_winsys *ws = screen->winsys; + struct nv20_miptree *nv20mt = (struct nv20_miptree *)surface->texture; + + ws->buffer_unmap(ws, nv20mt->buffer); +} + +static void +nv20_screen_destroy(struct pipe_screen *pscreen) +{ + struct nv20_screen *screen = nv20_screen(pscreen); + struct nouveau_winsys *nvws = screen->nvws; + + nvws->notifier_free(&screen->sync); + nvws->grobj_free(&screen->kelvin); + + FREE(pscreen); +} + +static struct pipe_buffer * +nv20_surface_buffer(struct pipe_surface *surf) +{ + struct nv20_miptree *mt = (struct nv20_miptree *)surf->texture; + + return mt->buffer; +} + +struct pipe_screen * +nv20_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws) +{ + struct nv20_screen *screen = CALLOC_STRUCT(nv20_screen); + unsigned kelvin_class = 0; + unsigned chipset = nvws->channel->device->chipset; + int ret; + + if (!screen) + return NULL; + screen->nvws = nvws; + + /* 2D engine setup */ + screen->eng2d = nv04_surface_2d_init(nvws); + screen->eng2d->buf = nv20_surface_buffer; + + /* 3D object */ + if (chipset >= 0x25) + kelvin_class = NV25TCL; + else if (chipset >= 0x20) + kelvin_class = NV20TCL; + + if (!kelvin_class || chipset >= 0x30) { + NOUVEAU_ERR("Unknown nv2x chipset: nv%02x\n", chipset); + return NULL; + } + + ret = nvws->grobj_alloc(nvws, kelvin_class, &screen->kelvin); + if (ret) { + NOUVEAU_ERR("Error creating 3D object: %d\n", ret); + return FALSE; + } + + /* Notifier for sync purposes */ + ret = nvws->notifier_alloc(nvws, 1, &screen->sync); + if (ret) { + NOUVEAU_ERR("Error creating notifier object: %d\n", ret); + nv20_screen_destroy(&screen->pipe); + return NULL; + } + + screen->pipe.winsys = ws; + screen->pipe.destroy = nv20_screen_destroy; + + screen->pipe.get_name = nv20_screen_get_name; + screen->pipe.get_vendor = nv20_screen_get_vendor; + screen->pipe.get_param = nv20_screen_get_param; + screen->pipe.get_paramf = nv20_screen_get_paramf; + + screen->pipe.is_format_supported = nv20_screen_is_format_supported; + + screen->pipe.surface_map = nv20_surface_map; + screen->pipe.surface_unmap = nv20_surface_unmap; + + nv20_screen_init_miptree_functions(&screen->pipe); + u_simple_screen_init(&screen->pipe); + + return &screen->pipe; +} + diff --git a/src/gallium/drivers/nv20/nv20_screen.h b/src/gallium/drivers/nv20/nv20_screen.h new file mode 100644 index 0000000000..bf2f2c0d9f --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_screen.h @@ -0,0 +1,24 @@ +#ifndef __NV20_SCREEN_H__ +#define __NV20_SCREEN_H__ + +#include "pipe/p_screen.h" +#include "nv04/nv04_surface_2d.h" + +struct nv20_screen { + struct pipe_screen pipe; + + struct nouveau_winsys *nvws; + + /* HW graphics objects */ + struct nv04_surface_2d *eng2d; + struct nouveau_grobj *kelvin; + struct nouveau_notifier *sync; +}; + +static INLINE struct nv20_screen * +nv20_screen(struct pipe_screen *screen) +{ + return (struct nv20_screen *)screen; +} + +#endif diff --git a/src/gallium/drivers/nv20/nv20_state.c b/src/gallium/drivers/nv20/nv20_state.c new file mode 100644 index 0000000000..ecec4f49a0 --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_state.c @@ -0,0 +1,582 @@ +#include "draw/draw_context.h" +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "pipe/p_shader_tokens.h" + +#include "tgsi/tgsi_parse.h" + +#include "nv20_context.h" +#include "nv20_state.h" + +static void * +nv20_blend_state_create(struct pipe_context *pipe, + const struct pipe_blend_state *cso) +{ + struct 
nv20_blend_state *cb; + + cb = MALLOC(sizeof(struct nv20_blend_state)); + + cb->b_enable = cso->blend_enable ? 1 : 0; + cb->b_srcfunc = ((nvgl_blend_func(cso->alpha_src_factor)<<16) | + (nvgl_blend_func(cso->rgb_src_factor))); + cb->b_dstfunc = ((nvgl_blend_func(cso->alpha_dst_factor)<<16) | + (nvgl_blend_func(cso->rgb_dst_factor))); + + cb->c_mask = (((cso->colormask & PIPE_MASK_A) ? (0x01<<24) : 0) | + ((cso->colormask & PIPE_MASK_R) ? (0x01<<16) : 0) | + ((cso->colormask & PIPE_MASK_G) ? (0x01<< 8) : 0) | + ((cso->colormask & PIPE_MASK_B) ? (0x01<< 0) : 0)); + + cb->d_enable = cso->dither ? 1 : 0; + + return (void *)cb; +} + +static void +nv20_blend_state_bind(struct pipe_context *pipe, void *blend) +{ + struct nv20_context *nv20 = nv20_context(pipe); + + nv20->blend = (struct nv20_blend_state*)blend; + + nv20->dirty |= NV20_NEW_BLEND; +} + +static void +nv20_blend_state_delete(struct pipe_context *pipe, void *hwcso) +{ + FREE(hwcso); +} + + +static INLINE unsigned +wrap_mode(unsigned wrap) { + unsigned ret; + + switch (wrap) { + case PIPE_TEX_WRAP_REPEAT: + ret = NV20TCL_TX_WRAP_S_REPEAT; + break; + case PIPE_TEX_WRAP_MIRROR_REPEAT: + ret = NV20TCL_TX_WRAP_S_MIRRORED_REPEAT; + break; + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + ret = NV20TCL_TX_WRAP_S_CLAMP_TO_EDGE; + break; + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + ret = NV20TCL_TX_WRAP_S_CLAMP_TO_BORDER; + break; + case PIPE_TEX_WRAP_CLAMP: + ret = NV20TCL_TX_WRAP_S_CLAMP; + break; + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: + case PIPE_TEX_WRAP_MIRROR_CLAMP: + default: + NOUVEAU_ERR("unknown wrap mode: %d\n", wrap); + ret = NV20TCL_TX_WRAP_S_REPEAT; + break; + } + + return (ret >> NV20TCL_TX_WRAP_S_SHIFT); +} + +static void * +nv20_sampler_state_create(struct pipe_context *pipe, + const struct pipe_sampler_state *cso) +{ + struct nv20_sampler_state *ps; + uint32_t filter = 0; + + ps = MALLOC(sizeof(struct nv20_sampler_state)); + + ps->wrap = ((wrap_mode(cso->wrap_s) << NV20TCL_TX_WRAP_S_SHIFT) | + (wrap_mode(cso->wrap_t) << NV20TCL_TX_WRAP_T_SHIFT)); + + ps->en = 0; + if (cso->max_anisotropy > 1.0) { + /* no idea, binary driver sets it, works without it.. meh.. 
*/ + ps->wrap |= (1 << 5); + +/* if (cso->max_anisotropy >= 8.0) { + ps->en |= NV20TCL_TX_ENABLE_ANISO_8X; + } else + if (cso->max_anisotropy >= 4.0) { + ps->en |= NV20TCL_TX_ENABLE_ANISO_4X; + } else { + ps->en |= NV20TCL_TX_ENABLE_ANISO_2X; + }*/ + } + + switch (cso->mag_img_filter) { + case PIPE_TEX_FILTER_LINEAR: + filter |= NV20TCL_TX_FILTER_MAGNIFY_LINEAR; + break; + case PIPE_TEX_FILTER_NEAREST: + default: + filter |= NV20TCL_TX_FILTER_MAGNIFY_NEAREST; + break; + } + + switch (cso->min_img_filter) { + case PIPE_TEX_FILTER_LINEAR: + switch (cso->min_mip_filter) { + case PIPE_TEX_MIPFILTER_NEAREST: + filter |= + NV20TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_NEAREST; + break; + case PIPE_TEX_MIPFILTER_LINEAR: + filter |= NV20TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_LINEAR; + break; + case PIPE_TEX_MIPFILTER_NONE: + default: + filter |= NV20TCL_TX_FILTER_MINIFY_LINEAR; + break; + } + break; + case PIPE_TEX_FILTER_NEAREST: + default: + switch (cso->min_mip_filter) { + case PIPE_TEX_MIPFILTER_NEAREST: + filter |= + NV20TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_NEAREST; + break; + case PIPE_TEX_MIPFILTER_LINEAR: + filter |= + NV20TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_LINEAR; + break; + case PIPE_TEX_MIPFILTER_NONE: + default: + filter |= NV20TCL_TX_FILTER_MINIFY_NEAREST; + break; + } + break; + } + + ps->filt = filter; + +/* if (cso->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) { + switch (cso->compare_func) { + case PIPE_FUNC_NEVER: + ps->wrap |= NV10TCL_TX_WRAP_RCOMP_NEVER; + break; + case PIPE_FUNC_GREATER: + ps->wrap |= NV10TCL_TX_WRAP_RCOMP_GREATER; + break; + case PIPE_FUNC_EQUAL: + ps->wrap |= NV10TCL_TX_WRAP_RCOMP_EQUAL; + break; + case PIPE_FUNC_GEQUAL: + ps->wrap |= NV10TCL_TX_WRAP_RCOMP_GEQUAL; + break; + case PIPE_FUNC_LESS: + ps->wrap |= NV10TCL_TX_WRAP_RCOMP_LESS; + break; + case PIPE_FUNC_NOTEQUAL: + ps->wrap |= NV10TCL_TX_WRAP_RCOMP_NOTEQUAL; + break; + case PIPE_FUNC_LEQUAL: + ps->wrap |= NV10TCL_TX_WRAP_RCOMP_LEQUAL; + break; + case PIPE_FUNC_ALWAYS: + ps->wrap |= NV10TCL_TX_WRAP_RCOMP_ALWAYS; + break; + default: + break; + } + }*/ + + ps->bcol = ((float_to_ubyte(cso->border_color[3]) << 24) | + (float_to_ubyte(cso->border_color[0]) << 16) | + (float_to_ubyte(cso->border_color[1]) << 8) | + (float_to_ubyte(cso->border_color[2]) << 0)); + + return (void *)ps; +} + +static void +nv20_sampler_state_bind(struct pipe_context *pipe, unsigned nr, void **sampler) +{ + struct nv20_context *nv20 = nv20_context(pipe); + unsigned unit; + + for (unit = 0; unit < nr; unit++) { + nv20->tex_sampler[unit] = sampler[unit]; + nv20->dirty_samplers |= (1 << unit); + } +} + +static void +nv20_sampler_state_delete(struct pipe_context *pipe, void *hwcso) +{ + FREE(hwcso); +} + +static void +nv20_set_sampler_texture(struct pipe_context *pipe, unsigned nr, + struct pipe_texture **miptree) +{ + struct nv20_context *nv20 = nv20_context(pipe); + unsigned unit; + + for (unit = 0; unit < nr; unit++) { + nv20->tex_miptree[unit] = (struct nv20_miptree *)miptree[unit]; + nv20->dirty_samplers |= (1 << unit); + } +} + +static void * +nv20_rasterizer_state_create(struct pipe_context *pipe, + const struct pipe_rasterizer_state *cso) +{ + struct nv20_rasterizer_state *rs; + int i; + + /*XXX: ignored: + * light_twoside + * offset_cw/ccw -nohw + * scissor + * point_smooth -nohw + * multisample + * offset_units / offset_scale + */ + rs = MALLOC(sizeof(struct nv20_rasterizer_state)); + + rs->templ = cso; + + rs->shade_model = cso->flatshade ? 
NV20TCL_SHADE_MODEL_FLAT : + NV20TCL_SHADE_MODEL_SMOOTH; + + rs->line_width = (unsigned char)(cso->line_width * 8.0) & 0xff; + rs->line_smooth_en = cso->line_smooth ? 1 : 0; + + /* XXX: nv20 and nv25 different! */ + rs->point_size = *(uint32_t*)&cso->point_size; + + rs->poly_smooth_en = cso->poly_smooth ? 1 : 0; + + if (cso->front_winding == PIPE_WINDING_CCW) { + rs->front_face = NV20TCL_FRONT_FACE_CCW; + rs->poly_mode_front = nvgl_polygon_mode(cso->fill_ccw); + rs->poly_mode_back = nvgl_polygon_mode(cso->fill_cw); + } else { + rs->front_face = NV20TCL_FRONT_FACE_CW; + rs->poly_mode_front = nvgl_polygon_mode(cso->fill_cw); + rs->poly_mode_back = nvgl_polygon_mode(cso->fill_ccw); + } + + switch (cso->cull_mode) { + case PIPE_WINDING_CCW: + rs->cull_face_en = 1; + if (cso->front_winding == PIPE_WINDING_CCW) + rs->cull_face = NV20TCL_CULL_FACE_FRONT; + else + rs->cull_face = NV20TCL_CULL_FACE_BACK; + break; + case PIPE_WINDING_CW: + rs->cull_face_en = 1; + if (cso->front_winding == PIPE_WINDING_CW) + rs->cull_face = NV20TCL_CULL_FACE_FRONT; + else + rs->cull_face = NV20TCL_CULL_FACE_BACK; + break; + case PIPE_WINDING_BOTH: + rs->cull_face_en = 1; + rs->cull_face = NV20TCL_CULL_FACE_FRONT_AND_BACK; + break; + case PIPE_WINDING_NONE: + default: + rs->cull_face_en = 0; + rs->cull_face = 0; + break; + } + + if (cso->point_sprite) { + rs->point_sprite = (1 << 0); + for (i = 0; i < 8; i++) { + if (cso->sprite_coord_mode[i] != PIPE_SPRITE_COORD_NONE) + rs->point_sprite |= (1 << (8 + i)); + } + } else { + rs->point_sprite = 0; + } + + return (void *)rs; +} + +static void +nv20_rasterizer_state_bind(struct pipe_context *pipe, void *rast) +{ + struct nv20_context *nv20 = nv20_context(pipe); + + nv20->rast = (struct nv20_rasterizer_state*)rast; + + draw_set_rasterizer_state(nv20->draw, (nv20->rast ? nv20->rast->templ : NULL)); + + nv20->dirty |= NV20_NEW_RAST; +} + +static void +nv20_rasterizer_state_delete(struct pipe_context *pipe, void *hwcso) +{ + FREE(hwcso); +} + +static void * +nv20_depth_stencil_alpha_state_create(struct pipe_context *pipe, + const struct pipe_depth_stencil_alpha_state *cso) +{ + struct nv20_depth_stencil_alpha_state *hw; + + hw = MALLOC(sizeof(struct nv20_depth_stencil_alpha_state)); + + hw->depth.func = nvgl_comparison_op(cso->depth.func); + hw->depth.write_enable = cso->depth.writemask ? 1 : 0; + hw->depth.test_enable = cso->depth.enabled ? 1 : 0; + + hw->stencil.enable = cso->stencil[0].enabled ? 1 : 0; + hw->stencil.wmask = cso->stencil[0].writemask; + hw->stencil.func = nvgl_comparison_op(cso->stencil[0].func); + hw->stencil.ref = cso->stencil[0].ref_value; + hw->stencil.vmask = cso->stencil[0].valuemask; + hw->stencil.fail = nvgl_stencil_op(cso->stencil[0].fail_op); + hw->stencil.zfail = nvgl_stencil_op(cso->stencil[0].zfail_op); + hw->stencil.zpass = nvgl_stencil_op(cso->stencil[0].zpass_op); + + hw->alpha.enabled = cso->alpha.enabled ? 
1 : 0; + hw->alpha.func = nvgl_comparison_op(cso->alpha.func); + hw->alpha.ref = float_to_ubyte(cso->alpha.ref_value); + + return (void *)hw; +} + +static void +nv20_depth_stencil_alpha_state_bind(struct pipe_context *pipe, void *dsa) +{ + struct nv20_context *nv20 = nv20_context(pipe); + + nv20->dsa = (struct nv20_depth_stencil_alpha_state*)dsa; + + nv20->dirty |= NV20_NEW_DSA; +} + +static void +nv20_depth_stencil_alpha_state_delete(struct pipe_context *pipe, void *hwcso) +{ + FREE(hwcso); +} + +static void * +nv20_vp_state_create(struct pipe_context *pipe, + const struct pipe_shader_state *templ) +{ + struct nv20_context *nv20 = nv20_context(pipe); + + return draw_create_vertex_shader(nv20->draw, templ); +} + +static void +nv20_vp_state_bind(struct pipe_context *pipe, void *shader) +{ + struct nv20_context *nv20 = nv20_context(pipe); + + draw_bind_vertex_shader(nv20->draw, (struct draw_vertex_shader *) shader); + + nv20->dirty |= NV20_NEW_VERTPROG; +} + +static void +nv20_vp_state_delete(struct pipe_context *pipe, void *shader) +{ + struct nv20_context *nv20 = nv20_context(pipe); + + draw_delete_vertex_shader(nv20->draw, (struct draw_vertex_shader *) shader); +} + +static void * +nv20_fp_state_create(struct pipe_context *pipe, + const struct pipe_shader_state *cso) +{ + struct nv20_fragment_program *fp; + + fp = CALLOC(1, sizeof(struct nv20_fragment_program)); + fp->pipe.tokens = tgsi_dup_tokens(cso->tokens); + + tgsi_scan_shader(cso->tokens, &fp->info); + + return (void *)fp; +} + +static void +nv20_fp_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nv20_context *nv20 = nv20_context(pipe); + struct nv20_fragment_program *fp = hwcso; + + nv20->fragprog.current = fp; + nv20->dirty |= NV20_NEW_FRAGPROG; +} + +static void +nv20_fp_state_delete(struct pipe_context *pipe, void *hwcso) +{ + struct nv20_context *nv20 = nv20_context(pipe); + struct nv20_fragment_program *fp = hwcso; + + nv20_fragprog_destroy(nv20, fp); + FREE((void*)fp->pipe.tokens); + FREE(fp); +} + +static void +nv20_set_blend_color(struct pipe_context *pipe, + const struct pipe_blend_color *bcol) +{ + struct nv20_context *nv20 = nv20_context(pipe); + + nv20->blend_color = (struct pipe_blend_color*)bcol; + + nv20->dirty |= NV20_NEW_BLENDCOL; +} + +static void +nv20_set_clip_state(struct pipe_context *pipe, + const struct pipe_clip_state *clip) +{ + struct nv20_context *nv20 = nv20_context(pipe); + + draw_set_clip_state(nv20->draw, clip); +} + +static void +nv20_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index, + const struct pipe_constant_buffer *buf ) +{ + struct nv20_context *nv20 = nv20_context(pipe); + struct pipe_winsys *ws = pipe->winsys; + + assert(shader < PIPE_SHADER_TYPES); + assert(index == 0); + + if (buf) { + void *mapped; + if (buf->buffer && buf->buffer->size && + (mapped = ws->buffer_map(ws, buf->buffer, PIPE_BUFFER_USAGE_CPU_READ))) + { + memcpy(nv20->constbuf[shader], mapped, buf->buffer->size); + nv20->constbuf_nr[shader] = + buf->buffer->size / (4 * sizeof(float)); + ws->buffer_unmap(ws, buf->buffer); + } + } +} + +static void +nv20_set_framebuffer_state(struct pipe_context *pipe, + const struct pipe_framebuffer_state *fb) +{ + struct nv20_context *nv20 = nv20_context(pipe); + + nv20->framebuffer = (struct pipe_framebuffer_state*)fb; + + nv20->dirty |= NV20_NEW_FRAMEBUFFER; +} + +static void +nv20_set_polygon_stipple(struct pipe_context *pipe, + const struct pipe_poly_stipple *stipple) +{ + NOUVEAU_ERR("line stipple hahaha\n"); +} + +static void 
+nv20_set_scissor_state(struct pipe_context *pipe, + const struct pipe_scissor_state *s) +{ + struct nv20_context *nv20 = nv20_context(pipe); + + nv20->scissor = (struct pipe_scissor_state*)s; + + nv20->dirty |= NV20_NEW_SCISSOR; +} + +static void +nv20_set_viewport_state(struct pipe_context *pipe, + const struct pipe_viewport_state *vpt) +{ + struct nv20_context *nv20 = nv20_context(pipe); + + nv20->viewport = (struct pipe_viewport_state*)vpt; + + draw_set_viewport_state(nv20->draw, nv20->viewport); + + nv20->dirty |= NV20_NEW_VIEWPORT; +} + +static void +nv20_set_vertex_buffers(struct pipe_context *pipe, unsigned count, + const struct pipe_vertex_buffer *vb) +{ + struct nv20_context *nv20 = nv20_context(pipe); + + memcpy(nv20->vtxbuf, vb, sizeof(*vb) * count); + nv20->dirty |= NV20_NEW_VTXARRAYS; + + draw_set_vertex_buffers(nv20->draw, count, vb); +} + +static void +nv20_set_vertex_elements(struct pipe_context *pipe, unsigned count, + const struct pipe_vertex_element *ve) +{ + struct nv20_context *nv20 = nv20_context(pipe); + + memcpy(nv20->vtxelt, ve, sizeof(*ve) * count); + nv20->dirty |= NV20_NEW_VTXARRAYS; + + draw_set_vertex_elements(nv20->draw, count, ve); +} + +void +nv20_init_state_functions(struct nv20_context *nv20) +{ + nv20->pipe.create_blend_state = nv20_blend_state_create; + nv20->pipe.bind_blend_state = nv20_blend_state_bind; + nv20->pipe.delete_blend_state = nv20_blend_state_delete; + + nv20->pipe.create_sampler_state = nv20_sampler_state_create; + nv20->pipe.bind_sampler_states = nv20_sampler_state_bind; + nv20->pipe.delete_sampler_state = nv20_sampler_state_delete; + nv20->pipe.set_sampler_textures = nv20_set_sampler_texture; + + nv20->pipe.create_rasterizer_state = nv20_rasterizer_state_create; + nv20->pipe.bind_rasterizer_state = nv20_rasterizer_state_bind; + nv20->pipe.delete_rasterizer_state = nv20_rasterizer_state_delete; + + nv20->pipe.create_depth_stencil_alpha_state = + nv20_depth_stencil_alpha_state_create; + nv20->pipe.bind_depth_stencil_alpha_state = + nv20_depth_stencil_alpha_state_bind; + nv20->pipe.delete_depth_stencil_alpha_state = + nv20_depth_stencil_alpha_state_delete; + + nv20->pipe.create_vs_state = nv20_vp_state_create; + nv20->pipe.bind_vs_state = nv20_vp_state_bind; + nv20->pipe.delete_vs_state = nv20_vp_state_delete; + + nv20->pipe.create_fs_state = nv20_fp_state_create; + nv20->pipe.bind_fs_state = nv20_fp_state_bind; + nv20->pipe.delete_fs_state = nv20_fp_state_delete; + + nv20->pipe.set_blend_color = nv20_set_blend_color; + nv20->pipe.set_clip_state = nv20_set_clip_state; + nv20->pipe.set_constant_buffer = nv20_set_constant_buffer; + nv20->pipe.set_framebuffer_state = nv20_set_framebuffer_state; + nv20->pipe.set_polygon_stipple = nv20_set_polygon_stipple; + nv20->pipe.set_scissor_state = nv20_set_scissor_state; + nv20->pipe.set_viewport_state = nv20_set_viewport_state; + + nv20->pipe.set_vertex_buffers = nv20_set_vertex_buffers; + nv20->pipe.set_vertex_elements = nv20_set_vertex_elements; +} + diff --git a/src/gallium/drivers/nv20/nv20_state.h b/src/gallium/drivers/nv20/nv20_state.h new file mode 100644 index 0000000000..34f402fdcb --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_state.h @@ -0,0 +1,139 @@ +#ifndef __NV20_STATE_H__ +#define __NV20_STATE_H__ + +#include "pipe/p_state.h" +#include "tgsi/tgsi_scan.h" + +struct nv20_blend_state { + uint32_t b_enable; + uint32_t b_srcfunc; + uint32_t b_dstfunc; + + uint32_t c_mask; + + uint32_t d_enable; +}; + +struct nv20_sampler_state { + uint32_t wrap; + uint32_t en; + uint32_t filt; + uint32_t 
bcol; +}; + +struct nv20_rasterizer_state { + uint32_t shade_model; + + uint32_t line_width; + uint32_t line_smooth_en; + + uint32_t point_size; + + uint32_t poly_smooth_en; + + uint32_t poly_mode_front; + uint32_t poly_mode_back; + + uint32_t front_face; + uint32_t cull_face; + uint32_t cull_face_en; + + uint32_t point_sprite; + + const struct pipe_rasterizer_state *templ; +}; + +struct nv20_vertex_program_exec { + uint32_t data[4]; + boolean has_branch_offset; + int const_index; +}; + +struct nv20_vertex_program_data { + int index; /* immediates == -1 */ + float value[4]; +}; + +struct nv20_vertex_program { + const struct pipe_shader_state *pipe; + + boolean translated; + struct nv20_vertex_program_exec *insns; + unsigned nr_insns; + struct nv20_vertex_program_data *consts; + unsigned nr_consts; + + struct nouveau_resource *exec; + unsigned exec_start; + struct nouveau_resource *data; + unsigned data_start; + unsigned data_start_min; + + uint32_t ir; + uint32_t or; +}; + +struct nv20_fragment_program_data { + unsigned offset; + unsigned index; +}; + +struct nv20_fragment_program { + struct pipe_shader_state pipe; + struct tgsi_shader_info info; + + boolean translated; + boolean on_hw; + unsigned samplers; + + uint32_t *insn; + int insn_len; + + struct nv20_fragment_program_data *consts; + unsigned nr_consts; + + struct pipe_buffer *buffer; + + uint32_t fp_control; + uint32_t fp_reg_control; +}; + + +struct nv20_depth_stencil_alpha_state { + struct { + uint32_t func; + uint32_t write_enable; + uint32_t test_enable; + } depth; + + struct { + uint32_t enable; + uint32_t wmask; + uint32_t func; + uint32_t ref; + uint32_t vmask; + uint32_t fail; + uint32_t zfail; + uint32_t zpass; + } stencil; + + struct { + uint32_t enabled; + uint32_t func; + uint32_t ref; + } alpha; +}; + +struct nv20_miptree { + struct pipe_texture base; + + struct pipe_buffer *buffer; + uint total_size; + + struct { + uint pitch; + uint *image_offset; + } level[PIPE_MAX_TEXTURE_LEVELS]; +}; + +#endif diff --git a/src/gallium/drivers/nv20/nv20_state_emit.c b/src/gallium/drivers/nv20/nv20_state_emit.c new file mode 100644 index 0000000000..0f4df9ca31 --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_state_emit.c @@ -0,0 +1,396 @@ +#include "nv20_context.h" +#include "nv20_state.h" +#include "draw/draw_context.h" + +static void nv20_state_emit_blend(struct nv20_context* nv20) +{ + struct nv20_blend_state *b = nv20->blend; + + BEGIN_RING(kelvin, NV20TCL_DITHER_ENABLE, 1); + OUT_RING (b->d_enable); + + BEGIN_RING(kelvin, NV20TCL_BLEND_FUNC_ENABLE, 1); + OUT_RING (b->b_enable); + + BEGIN_RING(kelvin, NV20TCL_BLEND_FUNC_SRC, 2); + OUT_RING (b->b_srcfunc); + OUT_RING (b->b_dstfunc); + + BEGIN_RING(kelvin, NV20TCL_COLOR_MASK, 1); + OUT_RING (b->c_mask); +} + +static void nv20_state_emit_blend_color(struct nv20_context* nv20) +{ + struct pipe_blend_color *c = nv20->blend_color; + + BEGIN_RING(kelvin, NV20TCL_BLEND_COLOR, 1); + OUT_RING ((float_to_ubyte(c->color[3]) << 24)| + (float_to_ubyte(c->color[0]) << 16)| + (float_to_ubyte(c->color[1]) << 8) | + (float_to_ubyte(c->color[2]) << 0)); +} + +static void nv20_state_emit_rast(struct nv20_context* nv20) +{ + struct nv20_rasterizer_state *r = nv20->rast; + + BEGIN_RING(kelvin, NV20TCL_SHADE_MODEL, 2); + OUT_RING (r->shade_model); + OUT_RING (r->line_width); + + + BEGIN_RING(kelvin, NV20TCL_POINT_SIZE, 1); + OUT_RING (r->point_size); + + BEGIN_RING(kelvin, NV20TCL_POLYGON_MODE_FRONT, 2); + OUT_RING (r->poly_mode_front); + OUT_RING (r->poly_mode_back); + + + BEGIN_RING(kelvin, 
NV20TCL_CULL_FACE, 2); + OUT_RING (r->cull_face); + OUT_RING (r->front_face); + + BEGIN_RING(kelvin, NV20TCL_LINE_SMOOTH_ENABLE, 2); + OUT_RING (r->line_smooth_en); + OUT_RING (r->poly_smooth_en); + + BEGIN_RING(kelvin, NV20TCL_CULL_FACE_ENABLE, 1); + OUT_RING (r->cull_face_en); +} + +static void nv20_state_emit_dsa(struct nv20_context* nv20) +{ + struct nv20_depth_stencil_alpha_state *d = nv20->dsa; + + BEGIN_RING(kelvin, NV20TCL_DEPTH_FUNC, 1); + OUT_RING (d->depth.func); + + BEGIN_RING(kelvin, NV20TCL_DEPTH_WRITE_ENABLE, 1); + OUT_RING (d->depth.write_enable); + + BEGIN_RING(kelvin, NV20TCL_DEPTH_TEST_ENABLE, 1); + OUT_RING (d->depth.test_enable); + + BEGIN_RING(kelvin, NV20TCL_DEPTH_UNK17D8, 1); + OUT_RING (1); + +#if 0 + BEGIN_RING(kelvin, NV20TCL_STENCIL_ENABLE, 1); + OUT_RING (d->stencil.enable); + BEGIN_RING(kelvin, NV20TCL_STENCIL_MASK, 7); + OUT_RINGp ((uint32_t *)&(d->stencil.wmask), 7); +#endif + + BEGIN_RING(kelvin, NV20TCL_ALPHA_FUNC_ENABLE, 1); + OUT_RING (d->alpha.enabled); + + BEGIN_RING(kelvin, NV20TCL_ALPHA_FUNC_FUNC, 1); + OUT_RING (d->alpha.func); + + BEGIN_RING(kelvin, NV20TCL_ALPHA_FUNC_REF, 1); + OUT_RING (d->alpha.ref); +} + +static void nv20_state_emit_viewport(struct nv20_context* nv20) +{ +} + +static void nv20_state_emit_scissor(struct nv20_context* nv20) +{ + /* NV20TCL_SCISSOR_* is probably a software method */ +/* struct pipe_scissor_state *s = nv20->scissor; + BEGIN_RING(kelvin, NV20TCL_SCISSOR_HORIZ, 2); + OUT_RING (((s->maxx - s->minx) << 16) | s->minx); + OUT_RING (((s->maxy - s->miny) << 16) | s->miny);*/ +} + +static void nv20_state_emit_framebuffer(struct nv20_context* nv20) +{ + struct pipe_framebuffer_state* fb = nv20->framebuffer; + struct pipe_surface *rt, *zeta = NULL; + uint32_t rt_format, w, h; + int colour_format = 0, zeta_format = 0; + struct nv20_miptree *nv20mt = 0; + + w = fb->cbufs[0]->width; + h = fb->cbufs[0]->height; + colour_format = fb->cbufs[0]->format; + rt = fb->cbufs[0]; + + if (fb->zsbuf) { + if (colour_format) { + assert(w == fb->zsbuf->width); + assert(h == fb->zsbuf->height); + } else { + w = fb->zsbuf->width; + h = fb->zsbuf->height; + } + + zeta_format = fb->zsbuf->format; + zeta = fb->zsbuf; + } + + rt_format = NV20TCL_RT_FORMAT_TYPE_LINEAR | 0x20; + + switch (colour_format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case 0: + rt_format |= NV20TCL_RT_FORMAT_COLOR_A8R8G8B8; + break; + case PIPE_FORMAT_R5G6B5_UNORM: + rt_format |= NV20TCL_RT_FORMAT_COLOR_R5G6B5; + break; + default: + assert(0); + } + + if (zeta) { + BEGIN_RING(kelvin, NV20TCL_RT_PITCH, 1); + OUT_RING (rt->stride | (zeta->stride << 16)); + } else { + BEGIN_RING(kelvin, NV20TCL_RT_PITCH, 1); + OUT_RING (rt->stride | (rt->stride << 16)); + } + + nv20mt = (struct nv20_miptree *)rt->texture; + nv20->rt[0] = nv20mt->buffer; + + if (zeta_format) + { + nv20mt = (struct nv20_miptree *)zeta->texture; + nv20->zeta = nv20mt->buffer; + } + + BEGIN_RING(kelvin, NV20TCL_RT_HORIZ, 3); + OUT_RING ((w << 16) | 0); + OUT_RING ((h << 16) | 0); /*NV20TCL_RT_VERT */ + OUT_RING (rt_format); /* NV20TCL_RT_FORMAT */ + BEGIN_RING(kelvin, NV20TCL_VIEWPORT_CLIP_HORIZ(0), 2); + OUT_RING (((w - 1) << 16) | 0); + OUT_RING (((h - 1) << 16) | 0); +} + +static void nv20_vertex_layout(struct nv20_context *nv20) +{ + struct nv20_fragment_program *fp = nv20->fragprog.current; + struct draw_context *dc = nv20->draw; + int src; + int i; + struct vertex_info *vinfo = &nv20->vertex_info; + const enum interp_mode colorInterp = INTERP_LINEAR; + boolean colors[2] = { FALSE }; + boolean generics[12] = { 
FALSE }; + boolean fog = FALSE; + + memset(vinfo, 0, sizeof(*vinfo)); + + /* + * Assumed NV20 hardware vertex attribute order: + * 0 position, 1 ?, 2 ?, 3 col0, + * 4 col1?, 5 ?, 6 ?, 7 ?, + * 8 ?, 9 tex0, 10 tex1, 11 tex2, + * 12 tex3, 13 ?, 14 ?, 15 ? + * unaccounted: wgh, nor, fog + * There are total 16 attrs. + * vinfo->hwfmt[0] has a used-bit corresponding to each of these. + * relation to TGSI_SEMANTIC_*: + * - POSITION: position (always used) + * - COLOR: col1, col0 + * - GENERIC: tex3, tex2, tex1, tex0, normal, weight + * - FOG: fog + */ + + for (i = 0; i < fp->info.num_inputs; i++) { + int isn = fp->info.input_semantic_name[i]; + int isi = fp->info.input_semantic_index[i]; + switch (isn) { + case TGSI_SEMANTIC_POSITION: + break; + case TGSI_SEMANTIC_COLOR: + assert(isi < 2); + colors[isi] = TRUE; + break; + case TGSI_SEMANTIC_GENERIC: + assert(isi < 12); + generics[isi] = TRUE; + break; + case TGSI_SEMANTIC_FOG: + fog = TRUE; + break; + default: + assert(0 && "unknown input_semantic_name"); + } + } + + /* always do position */ { + src = draw_find_vs_output(dc, TGSI_SEMANTIC_POSITION, 0); + draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_LINEAR, src); + vinfo->hwfmt[0] |= (1 << 0); + } + + /* two unnamed generics */ + for (i = 4; i < 6; i++) { + if (!generics[i]) + continue; + src = draw_find_vs_output(dc, TGSI_SEMANTIC_GENERIC, i); + draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src); + vinfo->hwfmt[0] |= (1 << (i - 3)); + } + + if (colors[0]) { + src = draw_find_vs_output(dc, TGSI_SEMANTIC_COLOR, 0); + draw_emit_vertex_attr(vinfo, EMIT_4F, colorInterp, src); + vinfo->hwfmt[0] |= (1 << 3); + } + + if (colors[1]) { + src = draw_find_vs_output(dc, TGSI_SEMANTIC_COLOR, 1); + draw_emit_vertex_attr(vinfo, EMIT_4F, colorInterp, src); + vinfo->hwfmt[0] |= (1 << 4); + } + + /* four unnamed generics */ + for (i = 6; i < 10; i++) { + if (!generics[i]) + continue; + src = draw_find_vs_output(dc, TGSI_SEMANTIC_GENERIC, i); + draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src); + vinfo->hwfmt[0] |= (1 << (i - 1)); + } + + /* tex0, tex1, tex2, tex3 */ + for (i = 0; i < 4; i++) { + if (!generics[i]) + continue; + src = draw_find_vs_output(dc, TGSI_SEMANTIC_GENERIC, i); + draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src); + vinfo->hwfmt[0] |= (1 << (i + 9)); + } + + /* two unnamed generics */ + for (i = 10; i < 12; i++) { + if (!generics[i]) + continue; + src = draw_find_vs_output(dc, TGSI_SEMANTIC_GENERIC, i); + draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src); + vinfo->hwfmt[0] |= (1 << (i + 3)); + } + + if (fog) { + src = draw_find_vs_output(dc, TGSI_SEMANTIC_FOG, 0); + draw_emit_vertex_attr(vinfo, EMIT_1F, INTERP_PERSPECTIVE, src); + vinfo->hwfmt[0] |= (1 << 15); + } + + draw_compute_vertex_size(vinfo); +} + +void +nv20_emit_hw_state(struct nv20_context *nv20) +{ + int i; + + if (nv20->dirty & NV20_NEW_VERTPROG) { + //nv20_vertprog_bind(nv20, nv20->vertprog.current); + nv20->dirty &= ~NV20_NEW_VERTPROG; + } + + if (nv20->dirty & NV20_NEW_FRAGPROG) { + nv20_fragprog_bind(nv20, nv20->fragprog.current); + /*XXX: clear NV20_NEW_FRAGPROG if no new program uploaded */ + nv20->dirty_samplers |= (1<<10); + nv20->dirty_samplers = 0; + } + + if (nv20->dirty_samplers || (nv20->dirty & NV20_NEW_FRAGPROG)) { + nv20_fragtex_bind(nv20); + nv20->dirty &= ~NV20_NEW_FRAGPROG; + } + + if (nv20->dirty & NV20_NEW_VTXARRAYS) { + nv20->dirty &= ~NV20_NEW_VTXARRAYS; + nv20_vertex_layout(nv20); + nv20_vtxbuf_bind(nv20); + } + + if (nv20->dirty & NV20_NEW_BLEND) { + 
nv20->dirty &= ~NV20_NEW_BLEND; + nv20_state_emit_blend(nv20); + } + + if (nv20->dirty & NV20_NEW_BLENDCOL) { + nv20->dirty &= ~NV20_NEW_BLENDCOL; + nv20_state_emit_blend_color(nv20); + } + + if (nv20->dirty & NV20_NEW_RAST) { + nv20->dirty &= ~NV20_NEW_RAST; + nv20_state_emit_rast(nv20); + } + + if (nv20->dirty & NV20_NEW_DSA) { + nv20->dirty &= ~NV20_NEW_DSA; + nv20_state_emit_dsa(nv20); + } + + if (nv20->dirty & NV20_NEW_VIEWPORT) { + nv20->dirty &= ~NV20_NEW_VIEWPORT; + nv20_state_emit_viewport(nv20); + } + + if (nv20->dirty & NV20_NEW_SCISSOR) { + nv20->dirty &= ~NV20_NEW_SCISSOR; + nv20_state_emit_scissor(nv20); + } + + if (nv20->dirty & NV20_NEW_FRAMEBUFFER) { + nv20->dirty &= ~NV20_NEW_FRAMEBUFFER; + nv20_state_emit_framebuffer(nv20); + } + + /* Emit relocs for every referenced buffer. + * This is to ensure the bufmgr has an accurate idea of how + * the buffer is used. This isn't very efficient, but we don't + * seem to take a significant performance hit. Will be improved + * at some point. Vertex arrays are emitted by nv20_vbo.c + */ + + /* Render target */ + BEGIN_RING(kelvin, NV20TCL_DMA_COLOR, 1); + OUT_RELOCo(nv20->rt[0], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + BEGIN_RING(kelvin, NV20TCL_COLOR_OFFSET, 1); + OUT_RELOCl(nv20->rt[0], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + + if (nv20->zeta) { + BEGIN_RING(kelvin, NV20TCL_DMA_ZETA, 1); + OUT_RELOCo(nv20->zeta, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + BEGIN_RING(kelvin, NV20TCL_ZETA_OFFSET, 1); + OUT_RELOCl(nv20->zeta, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + /* XXX for when we allocate LMA on nv17 */ +/* BEGIN_RING(kelvin, NV10TCL_LMA_DEPTH_BUFFER_OFFSET, 1); + OUT_RELOCl(nv20->zeta + lma_offset);*/ + } + + /* Vertex buffer */ + BEGIN_RING(kelvin, NV20TCL_DMA_VTXBUF0, 1); + OUT_RELOCo(nv20->rt[0], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + BEGIN_RING(kelvin, NV20TCL_COLOR_OFFSET, 1); + OUT_RELOCl(nv20->rt[0], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + + /* Texture images */ + for (i = 0; i < 2; i++) { + if (!(nv20->fp_samplers & (1 << i))) + continue; + BEGIN_RING(kelvin, NV20TCL_TX_OFFSET(i), 1); + OUT_RELOCl(nv20->tex[i].buffer, 0, NOUVEAU_BO_VRAM | + NOUVEAU_BO_GART | NOUVEAU_BO_RD); + BEGIN_RING(kelvin, NV20TCL_TX_FORMAT(i), 1); + OUT_RELOCd(nv20->tex[i].buffer, nv20->tex[i].format, + NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD | + NOUVEAU_BO_OR, NV20TCL_TX_FORMAT_DMA0, + NV20TCL_TX_FORMAT_DMA1); + } +} + diff --git a/src/gallium/drivers/nv20/nv20_surface.c b/src/gallium/drivers/nv20/nv20_surface.c new file mode 100644 index 0000000000..6cd607583c --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_surface.c @@ -0,0 +1,72 @@ + +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "nv20_context.h" +#include "pipe/p_defines.h" +#include "pipe/internal/p_winsys_screen.h" +#include "pipe/p_inlines.h" +#include "util/u_tile.h" + +static void +nv20_surface_copy(struct pipe_context *pipe, boolean do_flip, + struct pipe_surface *dest, unsigned destx, unsigned desty, + struct pipe_surface *src, unsigned srcx, unsigned srcy, + unsigned width, unsigned height) +{ + struct nv20_context *nv20 = nv20_context(pipe); + struct nv04_surface_2d *eng2d = nv20->screen->eng2d; + + if (do_flip) { + desty += height; + while (height--) { + eng2d->copy(eng2d, dest, destx, desty--, src, + srcx, srcy++, width, 1); + } + return; + } + + eng2d->copy(eng2d, dest, destx, desty, src, srcx, srcy, width, height); +} + +static void +nv20_surface_fill(struct pipe_context *pipe, struct pipe_surface *dest, + unsigned destx, unsigned desty, unsigned width, + unsigned height, unsigned value) +{ + struct nv20_context *nv20 = nv20_context(pipe); + struct nv04_surface_2d *eng2d = nv20->screen->eng2d; + + eng2d->fill(eng2d, dest, destx, desty, width, height, value); +} + +void +nv20_init_surface_functions(struct nv20_context *nv20) +{ + nv20->pipe.surface_copy = nv20_surface_copy; + nv20->pipe.surface_fill = nv20_surface_fill; +} diff --git a/src/gallium/drivers/nv20/nv20_vbo.c b/src/gallium/drivers/nv20/nv20_vbo.c new file mode 100644 index 0000000000..24d8f4bef0 --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_vbo.c @@ -0,0 +1,78 @@ +#include "draw/draw_context.h" +#include "pipe/p_context.h" +#include "pipe/p_state.h" + +#include "nv20_context.h" +#include "nv20_state.h" + +#include "nouveau/nouveau_channel.h" +#include "nouveau/nouveau_pushbuf.h" + +boolean nv20_draw_elements( struct pipe_context *pipe, + struct pipe_buffer *indexBuffer, + unsigned indexSize, + unsigned prim, unsigned start, unsigned count) +{ + struct nv20_context *nv20 = nv20_context( pipe ); + struct draw_context *draw = nv20->draw; + unsigned i; + + nv20_emit_hw_state(nv20); + + /* + * Map vertex buffers + */ + for (i = 0; i < PIPE_MAX_ATTRIBS; i++) { + if (nv20->vtxbuf[i].buffer) { + void *buf + = pipe->winsys->buffer_map(pipe->winsys, + nv20->vtxbuf[i].buffer, + PIPE_BUFFER_USAGE_CPU_READ); + draw_set_mapped_vertex_buffer(draw, i, buf); + } + } + /* Map index buffer, if present */ + if (indexBuffer) { + void *mapped_indexes + = pipe->winsys->buffer_map(pipe->winsys, indexBuffer, + PIPE_BUFFER_USAGE_CPU_READ); + draw_set_mapped_element_buffer(draw, indexSize, mapped_indexes); + } + else { + /* no index/element buffer */ + draw_set_mapped_element_buffer(draw, 0, NULL); + } + + draw_set_mapped_constant_buffer(draw, + nv20->constbuf[PIPE_SHADER_VERTEX], + nv20->constbuf_nr[PIPE_SHADER_VERTEX]); + + /* draw! 
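+	 * hand the mapped vertex/index buffers to the draw module and
+	 * let it run the vertex pipeline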
*/ + draw_arrays(nv20->draw, prim, start, count); + + /* + * unmap vertex/index buffers + */ + for (i = 0; i < PIPE_MAX_ATTRIBS; i++) { + if (nv20->vtxbuf[i].buffer) { + pipe->winsys->buffer_unmap(pipe->winsys, nv20->vtxbuf[i].buffer); + draw_set_mapped_vertex_buffer(draw, i, NULL); + } + } + if (indexBuffer) { + pipe->winsys->buffer_unmap(pipe->winsys, indexBuffer); + draw_set_mapped_element_buffer(draw, 0, NULL); + } + + draw_flush(nv20->draw); + return TRUE; +} + +boolean nv20_draw_arrays( struct pipe_context *pipe, + unsigned prim, unsigned start, unsigned count) +{ + return nv20_draw_elements(pipe, NULL, 0, prim, start, count); +} + + + diff --git a/src/gallium/drivers/nv20/nv20_vertprog.c b/src/gallium/drivers/nv20/nv20_vertprog.c new file mode 100644 index 0000000000..5db0e807ff --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_vertprog.c @@ -0,0 +1,838 @@ +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_dump.h" + +#include "nv20_context.h" +#include "nv20_state.h" + +/* TODO (at least...): + * 1. Indexed consts + ARL + * 2. Arb. swz/negation + * 3. NV_vp11, NV_vp2, NV_vp3 features + * - extra arith opcodes + * - branching + * - texture sampling + * - indexed attribs + * - indexed results + * 4. bugs + */ + +#define SWZ_X 0 +#define SWZ_Y 1 +#define SWZ_Z 2 +#define SWZ_W 3 +#define MASK_X 8 +#define MASK_Y 4 +#define MASK_Z 2 +#define MASK_W 1 +#define MASK_ALL (MASK_X|MASK_Y|MASK_Z|MASK_W) +#define DEF_SCALE 0 +#define DEF_CTEST 0 +#include "nv20_shader.h" + +#define swz(s,x,y,z,w) nv20_sr_swz((s), SWZ_##x, SWZ_##y, SWZ_##z, SWZ_##w) +#define neg(s) nv20_sr_neg((s)) +#define abs(s) nv20_sr_abs((s)) + +struct nv20_vpc { + struct nv20_vertex_program *vp; + + struct nv20_vertex_program_exec *vpi; + + unsigned output_map[PIPE_MAX_SHADER_OUTPUTS]; + + int high_temp; + int temp_temp_count; + + struct nv20_sreg *imm; + unsigned nr_imm; +}; + +static struct nv20_sreg +temp(struct nv20_vpc *vpc) +{ + int idx; + + idx = vpc->temp_temp_count++; + idx += vpc->high_temp + 1; + return nv20_sr(NV30SR_TEMP, idx); +} + +static struct nv20_sreg +constant(struct nv20_vpc *vpc, int pipe, float x, float y, float z, float w) +{ + struct nv20_vertex_program *vp = vpc->vp; + struct nv20_vertex_program_data *vpd; + int idx; + + if (pipe >= 0) { + for (idx = 0; idx < vp->nr_consts; idx++) { + if (vp->consts[idx].index == pipe) + return nv20_sr(NV30SR_CONST, idx); + } + } + + idx = vp->nr_consts++; + vp->consts = realloc(vp->consts, sizeof(*vpd) * vp->nr_consts); + vpd = &vp->consts[idx]; + + vpd->index = pipe; + vpd->value[0] = x; + vpd->value[1] = y; + vpd->value[2] = z; + vpd->value[3] = w; + return nv20_sr(NV30SR_CONST, idx); +} + +#define arith(cc,s,o,d,m,s0,s1,s2) \ + nv20_vp_arith((cc), (s), NV30_VP_INST_##o, (d), (m), (s0), (s1), (s2)) + +static void +emit_src(struct nv20_vpc *vpc, uint32_t *hw, int pos, struct nv20_sreg src) +{ + struct nv20_vertex_program *vp = vpc->vp; + uint32_t sr = 0; + + switch (src.type) { + case NV30SR_TEMP: + sr |= (NV30_VP_SRC_REG_TYPE_TEMP << NV30_VP_SRC_REG_TYPE_SHIFT); + sr |= (src.index << NV30_VP_SRC_TEMP_SRC_SHIFT); + break; + case NV30SR_INPUT: + sr |= (NV30_VP_SRC_REG_TYPE_INPUT << + NV30_VP_SRC_REG_TYPE_SHIFT); + vp->ir |= (1 << src.index); + hw[1] |= (src.index << NV30_VP_INST_INPUT_SRC_SHIFT); + break; + case NV30SR_CONST: + sr |= (NV30_VP_SRC_REG_TYPE_CONST << + NV30_VP_SRC_REG_TYPE_SHIFT); + assert(vpc->vpi->const_index == -1 || + 
vpc->vpi->const_index == src.index); + vpc->vpi->const_index = src.index; + break; + case NV30SR_NONE: + sr |= (NV30_VP_SRC_REG_TYPE_INPUT << + NV30_VP_SRC_REG_TYPE_SHIFT); + break; + default: + assert(0); + } + + if (src.negate) + sr |= NV30_VP_SRC_NEGATE; + + if (src.abs) + hw[0] |= (1 << (21 + pos)); + + sr |= ((src.swz[0] << NV30_VP_SRC_SWZ_X_SHIFT) | + (src.swz[1] << NV30_VP_SRC_SWZ_Y_SHIFT) | + (src.swz[2] << NV30_VP_SRC_SWZ_Z_SHIFT) | + (src.swz[3] << NV30_VP_SRC_SWZ_W_SHIFT)); + +/* + * |VVV| + * d�.�b + * \u/ + * + */ + + switch (pos) { + case 0: + hw[1] |= ((sr & NV30_VP_SRC0_HIGH_MASK) >> + NV30_VP_SRC0_HIGH_SHIFT) << NV30_VP_INST_SRC0H_SHIFT; + hw[2] |= (sr & NV30_VP_SRC0_LOW_MASK) << + NV30_VP_INST_SRC0L_SHIFT; + break; + case 1: + hw[2] |= sr << NV30_VP_INST_SRC1_SHIFT; + break; + case 2: + hw[2] |= ((sr & NV30_VP_SRC2_HIGH_MASK) >> + NV30_VP_SRC2_HIGH_SHIFT) << NV30_VP_INST_SRC2H_SHIFT; + hw[3] |= (sr & NV30_VP_SRC2_LOW_MASK) << + NV30_VP_INST_SRC2L_SHIFT; + break; + default: + assert(0); + } +} + +static void +emit_dst(struct nv20_vpc *vpc, uint32_t *hw, int slot, struct nv20_sreg dst) +{ + struct nv20_vertex_program *vp = vpc->vp; + + switch (dst.type) { + case NV30SR_TEMP: + hw[0] |= (dst.index << NV30_VP_INST_DEST_TEMP_ID_SHIFT); + break; + case NV30SR_OUTPUT: + switch (dst.index) { + case NV30_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break; + case NV30_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break; + case NV30_VP_INST_DEST_BFC0 : vp->or |= (1 << 2); break; + case NV30_VP_INST_DEST_BFC1 : vp->or |= (1 << 3); break; + case NV30_VP_INST_DEST_FOGC : vp->or |= (1 << 4); break; + case NV30_VP_INST_DEST_PSZ : vp->or |= (1 << 5); break; + case NV30_VP_INST_DEST_TC(0): vp->or |= (1 << 14); break; + case NV30_VP_INST_DEST_TC(1): vp->or |= (1 << 15); break; + case NV30_VP_INST_DEST_TC(2): vp->or |= (1 << 16); break; + case NV30_VP_INST_DEST_TC(3): vp->or |= (1 << 17); break; + case NV30_VP_INST_DEST_TC(4): vp->or |= (1 << 18); break; + case NV30_VP_INST_DEST_TC(5): vp->or |= (1 << 19); break; + case NV30_VP_INST_DEST_TC(6): vp->or |= (1 << 20); break; + case NV30_VP_INST_DEST_TC(7): vp->or |= (1 << 21); break; + default: + break; + } + + hw[3] |= (dst.index << NV30_VP_INST_DEST_SHIFT); + hw[0] |= NV30_VP_INST_VEC_DEST_TEMP_MASK | (1<<20); + + /*XXX: no way this is entirely correct, someone needs to + * figure out what exactly it is. 
+ */ + hw[3] |= 0x800; + break; + default: + assert(0); + } +} + +static void +nv20_vp_arith(struct nv20_vpc *vpc, int slot, int op, + struct nv20_sreg dst, int mask, + struct nv20_sreg s0, struct nv20_sreg s1, + struct nv20_sreg s2) +{ + struct nv20_vertex_program *vp = vpc->vp; + uint32_t *hw; + + vp->insns = realloc(vp->insns, ++vp->nr_insns * sizeof(*vpc->vpi)); + vpc->vpi = &vp->insns[vp->nr_insns - 1]; + memset(vpc->vpi, 0, sizeof(*vpc->vpi)); + vpc->vpi->const_index = -1; + + hw = vpc->vpi->data; + + hw[0] |= (NV30_VP_INST_COND_TR << NV30_VP_INST_COND_SHIFT); + hw[0] |= ((0 << NV30_VP_INST_COND_SWZ_X_SHIFT) | + (1 << NV30_VP_INST_COND_SWZ_Y_SHIFT) | + (2 << NV30_VP_INST_COND_SWZ_Z_SHIFT) | + (3 << NV30_VP_INST_COND_SWZ_W_SHIFT)); + + hw[1] |= (op << NV30_VP_INST_VEC_OPCODE_SHIFT); +// hw[3] |= NV30_VP_INST_SCA_DEST_TEMP_MASK; +// hw[3] |= (mask << NV30_VP_INST_VEC_WRITEMASK_SHIFT); + + if (dst.type == NV30SR_OUTPUT) { + if (slot) + hw[3] |= (mask << NV30_VP_INST_SDEST_WRITEMASK_SHIFT); + else + hw[3] |= (mask << NV30_VP_INST_VDEST_WRITEMASK_SHIFT); + } else { + if (slot) + hw[3] |= (mask << NV30_VP_INST_STEMP_WRITEMASK_SHIFT); + else + hw[3] |= (mask << NV30_VP_INST_VTEMP_WRITEMASK_SHIFT); + } + + emit_dst(vpc, hw, slot, dst); + emit_src(vpc, hw, 0, s0); + emit_src(vpc, hw, 1, s1); + emit_src(vpc, hw, 2, s2); +} + +static INLINE struct nv20_sreg +tgsi_src(struct nv20_vpc *vpc, const struct tgsi_full_src_register *fsrc) { + struct nv20_sreg src; + + switch (fsrc->SrcRegister.File) { + case TGSI_FILE_INPUT: + src = nv20_sr(NV30SR_INPUT, fsrc->SrcRegister.Index); + break; + case TGSI_FILE_CONSTANT: + src = constant(vpc, fsrc->SrcRegister.Index, 0, 0, 0, 0); + break; + case TGSI_FILE_IMMEDIATE: + src = vpc->imm[fsrc->SrcRegister.Index]; + break; + case TGSI_FILE_TEMPORARY: + if (vpc->high_temp < fsrc->SrcRegister.Index) + vpc->high_temp = fsrc->SrcRegister.Index; + src = nv20_sr(NV30SR_TEMP, fsrc->SrcRegister.Index); + break; + default: + NOUVEAU_ERR("bad src file\n"); + break; + } + + src.abs = fsrc->SrcRegisterExtMod.Absolute; + src.negate = fsrc->SrcRegister.Negate; + src.swz[0] = fsrc->SrcRegister.SwizzleX; + src.swz[1] = fsrc->SrcRegister.SwizzleY; + src.swz[2] = fsrc->SrcRegister.SwizzleZ; + src.swz[3] = fsrc->SrcRegister.SwizzleW; + return src; +} + +static INLINE struct nv20_sreg +tgsi_dst(struct nv20_vpc *vpc, const struct tgsi_full_dst_register *fdst) { + struct nv20_sreg dst; + + switch (fdst->DstRegister.File) { + case TGSI_FILE_OUTPUT: + dst = nv20_sr(NV30SR_OUTPUT, + vpc->output_map[fdst->DstRegister.Index]); + + break; + case TGSI_FILE_TEMPORARY: + dst = nv20_sr(NV30SR_TEMP, fdst->DstRegister.Index); + if (vpc->high_temp < dst.index) + vpc->high_temp = dst.index; + break; + default: + NOUVEAU_ERR("bad dst file\n"); + break; + } + + return dst; +} + +static INLINE int +tgsi_mask(uint tgsi) +{ + int mask = 0; + + if (tgsi & TGSI_WRITEMASK_X) mask |= MASK_X; + if (tgsi & TGSI_WRITEMASK_Y) mask |= MASK_Y; + if (tgsi & TGSI_WRITEMASK_Z) mask |= MASK_Z; + if (tgsi & TGSI_WRITEMASK_W) mask |= MASK_W; + return mask; +} + +static boolean +nv20_vertprog_parse_instruction(struct nv20_vpc *vpc, + const struct tgsi_full_instruction *finst) +{ + struct nv20_sreg src[3], dst, tmp; + struct nv20_sreg none = nv20_sr(NV30SR_NONE, 0); + int mask; + int ai = -1, ci = -1; + int i; + + if (finst->Instruction.Opcode == TGSI_OPCODE_END) + return TRUE; + + vpc->temp_temp_count = 0; + for (i = 0; i < finst->Instruction.NumSrcRegs; i++) { + const struct tgsi_full_src_register *fsrc; + + fsrc = 
&finst->FullSrcRegisters[i]; + if (fsrc->SrcRegister.File == TGSI_FILE_TEMPORARY) { + src[i] = tgsi_src(vpc, fsrc); + } + } + + for (i = 0; i < finst->Instruction.NumSrcRegs; i++) { + const struct tgsi_full_src_register *fsrc; + + fsrc = &finst->FullSrcRegisters[i]; + switch (fsrc->SrcRegister.File) { + case TGSI_FILE_INPUT: + if (ai == -1 || ai == fsrc->SrcRegister.Index) { + ai = fsrc->SrcRegister.Index; + src[i] = tgsi_src(vpc, fsrc); + } else { + src[i] = temp(vpc); + arith(vpc, 0, OP_MOV, src[i], MASK_ALL, + tgsi_src(vpc, fsrc), none, none); + } + break; + /*XXX: index comparison is broken now that consts come from + * two different register files. + */ + case TGSI_FILE_CONSTANT: + case TGSI_FILE_IMMEDIATE: + if (ci == -1 || ci == fsrc->SrcRegister.Index) { + ci = fsrc->SrcRegister.Index; + src[i] = tgsi_src(vpc, fsrc); + } else { + src[i] = temp(vpc); + arith(vpc, 0, OP_MOV, src[i], MASK_ALL, + tgsi_src(vpc, fsrc), none, none); + } + break; + case TGSI_FILE_TEMPORARY: + /* handled above */ + break; + default: + NOUVEAU_ERR("bad src file\n"); + return FALSE; + } + } + + dst = tgsi_dst(vpc, &finst->FullDstRegisters[0]); + mask = tgsi_mask(finst->FullDstRegisters[0].DstRegister.WriteMask); + + switch (finst->Instruction.Opcode) { + case TGSI_OPCODE_ABS: + arith(vpc, 0, OP_MOV, dst, mask, abs(src[0]), none, none); + break; + case TGSI_OPCODE_ADD: + arith(vpc, 0, OP_ADD, dst, mask, src[0], none, src[1]); + break; + case TGSI_OPCODE_ARL: + arith(vpc, 0, OP_ARL, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_DP3: + arith(vpc, 0, OP_DP3, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_DP4: + arith(vpc, 0, OP_DP4, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_DPH: + arith(vpc, 0, OP_DPH, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_DST: + arith(vpc, 0, OP_DST, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_EX2: + arith(vpc, 1, OP_EX2, dst, mask, none, none, src[0]); + break; + case TGSI_OPCODE_EXP: + arith(vpc, 1, OP_EXP, dst, mask, none, none, src[0]); + break; + case TGSI_OPCODE_FLR: + arith(vpc, 0, OP_FLR, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_FRC: + arith(vpc, 0, OP_FRC, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_LG2: + arith(vpc, 1, OP_LG2, dst, mask, none, none, src[0]); + break; + case TGSI_OPCODE_LIT: + arith(vpc, 1, OP_LIT, dst, mask, none, none, src[0]); + break; + case TGSI_OPCODE_LOG: + arith(vpc, 1, OP_LOG, dst, mask, none, none, src[0]); + break; + case TGSI_OPCODE_MAD: + arith(vpc, 0, OP_MAD, dst, mask, src[0], src[1], src[2]); + break; + case TGSI_OPCODE_MAX: + arith(vpc, 0, OP_MAX, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_MIN: + arith(vpc, 0, OP_MIN, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_MOV: + arith(vpc, 0, OP_MOV, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_MUL: + arith(vpc, 0, OP_MUL, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_POW: + tmp = temp(vpc); + arith(vpc, 1, OP_LG2, tmp, MASK_X, none, none, + swz(src[0], X, X, X, X)); + arith(vpc, 0, OP_MUL, tmp, MASK_X, swz(tmp, X, X, X, X), + swz(src[1], X, X, X, X), none); + arith(vpc, 1, OP_EX2, dst, mask, none, none, + swz(tmp, X, X, X, X)); + break; + case TGSI_OPCODE_RCP: + arith(vpc, 1, OP_RCP, dst, mask, none, none, src[0]); + break; + case TGSI_OPCODE_RET: + break; + case TGSI_OPCODE_RSQ: + arith(vpc, 1, OP_RSQ, dst, mask, none, none, src[0]); + break; + case TGSI_OPCODE_SGE: + arith(vpc, 0, OP_SGE, dst, mask, src[0], src[1], 
none); + break; + case TGSI_OPCODE_SGT: + arith(vpc, 0, OP_SGT, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_SLT: + arith(vpc, 0, OP_SLT, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_SUB: + arith(vpc, 0, OP_ADD, dst, mask, src[0], none, neg(src[1])); + break; + case TGSI_OPCODE_XPD: + tmp = temp(vpc); + arith(vpc, 0, OP_MUL, tmp, mask, + swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none); + arith(vpc, 0, OP_MAD, dst, (mask & ~MASK_W), + swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y), + neg(tmp)); + break; + default: + NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode); + return FALSE; + } + + return TRUE; +} + +static boolean +nv20_vertprog_parse_decl_output(struct nv20_vpc *vpc, + const struct tgsi_full_declaration *fdec) +{ + int hw; + + switch (fdec->Semantic.SemanticName) { + case TGSI_SEMANTIC_POSITION: + hw = NV30_VP_INST_DEST_POS; + break; + case TGSI_SEMANTIC_COLOR: + if (fdec->Semantic.SemanticIndex == 0) { + hw = NV30_VP_INST_DEST_COL0; + } else + if (fdec->Semantic.SemanticIndex == 1) { + hw = NV30_VP_INST_DEST_COL1; + } else { + NOUVEAU_ERR("bad colour semantic index\n"); + return FALSE; + } + break; + case TGSI_SEMANTIC_BCOLOR: + if (fdec->Semantic.SemanticIndex == 0) { + hw = NV30_VP_INST_DEST_BFC0; + } else + if (fdec->Semantic.SemanticIndex == 1) { + hw = NV30_VP_INST_DEST_BFC1; + } else { + NOUVEAU_ERR("bad bcolour semantic index\n"); + return FALSE; + } + break; + case TGSI_SEMANTIC_FOG: + hw = NV30_VP_INST_DEST_FOGC; + break; + case TGSI_SEMANTIC_PSIZE: + hw = NV30_VP_INST_DEST_PSZ; + break; + case TGSI_SEMANTIC_GENERIC: + if (fdec->Semantic.SemanticIndex <= 7) { + hw = NV30_VP_INST_DEST_TC(fdec->Semantic.SemanticIndex); + } else { + NOUVEAU_ERR("bad generic semantic index\n"); + return FALSE; + } + break; + default: + NOUVEAU_ERR("bad output semantic\n"); + return FALSE; + } + + vpc->output_map[fdec->DeclarationRange.First] = hw; + return TRUE; +} + +static boolean +nv20_vertprog_prepare(struct nv20_vpc *vpc) +{ + struct tgsi_parse_context p; + int nr_imm = 0; + + tgsi_parse_init(&p, vpc->vp->pipe.tokens); + while (!tgsi_parse_end_of_tokens(&p)) { + const union tgsi_full_token *tok = &p.FullToken; + + tgsi_parse_token(&p); + switch(tok->Token.Type) { + case TGSI_TOKEN_TYPE_IMMEDIATE: + nr_imm++; + break; + default: + break; + } + } + tgsi_parse_free(&p); + + if (nr_imm) { + vpc->imm = CALLOC(nr_imm, sizeof(struct nv20_sreg)); + assert(vpc->imm); + } + + return TRUE; +} + +static void +nv20_vertprog_translate(struct nv20_context *nv20, + struct nv20_vertex_program *vp) +{ + struct tgsi_parse_context parse; + struct nv20_vpc *vpc = NULL; + + tgsi_dump(vp->pipe.tokens,0); + + vpc = CALLOC(1, sizeof(struct nv20_vpc)); + if (!vpc) + return; + vpc->vp = vp; + vpc->high_temp = -1; + + if (!nv20_vertprog_prepare(vpc)) { + FREE(vpc); + return; + } + + tgsi_parse_init(&parse, vp->pipe.tokens); + + while (!tgsi_parse_end_of_tokens(&parse)) { + tgsi_parse_token(&parse); + + switch (parse.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_DECLARATION: + { + const struct tgsi_full_declaration *fdec; + fdec = &parse.FullToken.FullDeclaration; + switch (fdec->Declaration.File) { + case TGSI_FILE_OUTPUT: + if (!nv20_vertprog_parse_decl_output(vpc, fdec)) + goto out_err; + break; + default: + break; + } + } + break; + case TGSI_TOKEN_TYPE_IMMEDIATE: + { + const struct tgsi_full_immediate *imm; + + imm = &parse.FullToken.FullImmediate; + assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32); + assert(imm->Immediate.NrTokens == 4 + 1); + 
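+			/* Immediates become driver constants with a pipe
+			 * index of -1; unlike real constants they are never
+			 * refreshed from the application's constant buffer.
+			 */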
vpc->imm[vpc->nr_imm++] = + constant(vpc, -1, + imm->u.ImmediateFloat32[0].Float, + imm->u.ImmediateFloat32[1].Float, + imm->u.ImmediateFloat32[2].Float, + imm->u.ImmediateFloat32[3].Float); + } + break; + case TGSI_TOKEN_TYPE_INSTRUCTION: + { + const struct tgsi_full_instruction *finst; + finst = &parse.FullToken.FullInstruction; + if (!nv20_vertprog_parse_instruction(vpc, finst)) + goto out_err; + } + break; + default: + break; + } + } + + vp->insns[vp->nr_insns - 1].data[3] |= NV30_VP_INST_LAST; + vp->translated = TRUE; +out_err: + tgsi_parse_free(&parse); + FREE(vpc); +} + +static boolean +nv20_vertprog_validate(struct nv20_context *nv20) +{ + struct nouveau_winsys *nvws = nv20->nvws; + struct pipe_winsys *ws = nv20->pipe.winsys; + struct nouveau_grobj *rankine = nv20->screen->rankine; + struct nv20_vertex_program *vp; + struct pipe_buffer *constbuf; + boolean upload_code = FALSE, upload_data = FALSE; + int i; + + vp = nv20->vertprog; + constbuf = nv20->constbuf[PIPE_SHADER_VERTEX]; + + /* Translate TGSI shader into hw bytecode */ + if (!vp->translated) { + nv20_vertprog_translate(nv20, vp); + if (!vp->translated) + return FALSE; + } + + /* Allocate hw vtxprog exec slots */ + if (!vp->exec) { + struct nouveau_resource *heap = nv20->screen->vp_exec_heap; + struct nouveau_stateobj *so; + uint vplen = vp->nr_insns; + + if (nvws->res_alloc(heap, vplen, vp, &vp->exec)) { + while (heap->next && heap->size < vplen) { + struct nv20_vertex_program *evict; + + evict = heap->next->priv; + nvws->res_free(&evict->exec); + } + + if (nvws->res_alloc(heap, vplen, vp, &vp->exec)) + assert(0); + } + + so = so_new(2, 0); + so_method(so, rankine, NV34TCL_VP_START_FROM_ID, 1); + so_data (so, vp->exec->start); + so_ref(so, &vp->so); + + upload_code = TRUE; + } + + /* Allocate hw vtxprog const slots */ + if (vp->nr_consts && !vp->data) { + struct nouveau_resource *heap = nv20->screen->vp_data_heap; + + if (nvws->res_alloc(heap, vp->nr_consts, vp, &vp->data)) { + while (heap->next && heap->size < vp->nr_consts) { + struct nv20_vertex_program *evict; + + evict = heap->next->priv; + nvws->res_free(&evict->data); + } + + if (nvws->res_alloc(heap, vp->nr_consts, vp, &vp->data)) + assert(0); + } + + /*XXX: handle this some day */ + assert(vp->data->start >= vp->data_start_min); + + upload_data = TRUE; + if (vp->data_start != vp->data->start) + upload_code = TRUE; + } + + /* If exec or data segments moved we need to patch the program to + * fixup offsets and register IDs. 
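+	 * Branch offsets are not handled yet (hence the assert below);
+	 * constant sources are rebased against the new data segment start.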
+ */ + if (vp->exec_start != vp->exec->start) { + for (i = 0; i < vp->nr_insns; i++) { + struct nv20_vertex_program_exec *vpi = &vp->insns[i]; + + if (vpi->has_branch_offset) { + assert(0); + } + } + + vp->exec_start = vp->exec->start; + } + + if (vp->nr_consts && vp->data_start != vp->data->start) { + for (i = 0; i < vp->nr_insns; i++) { + struct nv20_vertex_program_exec *vpi = &vp->insns[i]; + + if (vpi->const_index >= 0) { + vpi->data[1] &= ~NV30_VP_INST_CONST_SRC_MASK; + vpi->data[1] |= + (vpi->const_index + vp->data->start) << + NV30_VP_INST_CONST_SRC_SHIFT; + + } + } + + vp->data_start = vp->data->start; + } + + /* Update + Upload constant values */ + if (vp->nr_consts) { + float *map = NULL; + + if (constbuf) { + map = ws->buffer_map(ws, constbuf, + PIPE_BUFFER_USAGE_CPU_READ); + } + + for (i = 0; i < vp->nr_consts; i++) { + struct nv20_vertex_program_data *vpd = &vp->consts[i]; + + if (vpd->index >= 0) { + if (!upload_data && + !memcmp(vpd->value, &map[vpd->index * 4], + 4 * sizeof(float))) + continue; + memcpy(vpd->value, &map[vpd->index * 4], + 4 * sizeof(float)); + } + + BEGIN_RING(rankine, NV34TCL_VP_UPLOAD_CONST_ID, 5); + OUT_RING (i + vp->data->start); + OUT_RINGp ((uint32_t *)vpd->value, 4); + } + + if (constbuf) { + ws->buffer_unmap(ws, constbuf); + } + } + + /* Upload vtxprog */ + if (upload_code) { +#if 0 + for (i = 0; i < vp->nr_insns; i++) { + NOUVEAU_MSG("VP inst %d: 0x%08x 0x%08x 0x%08x 0x%08x\n", + i, vp->insns[i].data[0], vp->insns[i].data[1], + vp->insns[i].data[2], vp->insns[i].data[3]); + } +#endif + BEGIN_RING(rankine, NV34TCL_VP_UPLOAD_FROM_ID, 1); + OUT_RING (vp->exec->start); + for (i = 0; i < vp->nr_insns; i++) { + BEGIN_RING(rankine, NV34TCL_VP_UPLOAD_INST(0), 4); + OUT_RINGp (vp->insns[i].data, 4); + } + } + + if (vp->so != nv20->state.hw[NV30_STATE_VERTPROG]) { + so_ref(vp->so, &nv20->state.hw[NV30_STATE_VERTPROG]); + return TRUE; + } + + return FALSE; +} + +void +nv20_vertprog_destroy(struct nv20_context *nv20, struct nv20_vertex_program *vp) +{ + struct nouveau_winsys *nvws = nv20->screen->nvws; + + vp->translated = FALSE; + + if (vp->nr_insns) { + FREE(vp->insns); + vp->insns = NULL; + vp->nr_insns = 0; + } + + if (vp->nr_consts) { + FREE(vp->consts); + vp->consts = NULL; + vp->nr_consts = 0; + } + + nvws->res_free(&vp->exec); + vp->exec_start = 0; + nvws->res_free(&vp->data); + vp->data_start = 0; + vp->data_start_min = 0; + + vp->ir = vp->or = 0; + so_ref(NULL, &vp->so); +} + +struct nv20_state_entry nv20_state_vertprog = { + .validate = nv20_vertprog_validate, + .dirty = { + .pipe = NV30_NEW_VERTPROG /*| NV30_NEW_UCP*/, + .hw = NV30_STATE_VERTPROG, + } +}; diff --git a/src/gallium/drivers/nv30/Makefile b/src/gallium/drivers/nv30/Makefile new file mode 100644 index 0000000000..4c29e2eab3 --- /dev/null +++ b/src/gallium/drivers/nv30/Makefile @@ -0,0 +1,28 @@ +TOP = ../../../.. 
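+# Gallium driver for NV30-family (GeForce FX) hardware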
+include $(TOP)/configs/current + +LIBNAME = nv30 + +C_SOURCES = \ + nv30_clear.c \ + nv30_context.c \ + nv30_draw.c \ + nv30_fragprog.c \ + nv30_fragtex.c \ + nv30_miptree.c \ + nv30_query.c \ + nv30_screen.c \ + nv30_state.c \ + nv30_state_blend.c \ + nv30_state_emit.c \ + nv30_state_fb.c \ + nv30_state_rasterizer.c \ + nv30_state_scissor.c \ + nv30_state_stipple.c \ + nv30_state_viewport.c \ + nv30_state_zsa.c \ + nv30_surface.c \ + nv30_vbo.c \ + nv30_vertprog.c + +include ../../Makefile.template diff --git a/src/gallium/drivers/nv30/nv30_clear.c b/src/gallium/drivers/nv30/nv30_clear.c new file mode 100644 index 0000000000..8c3ca204d5 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_clear.c @@ -0,0 +1,13 @@ +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "nv30_context.h" + +void +nv30_clear(struct pipe_context *pipe, struct pipe_surface *ps, + unsigned clearValue) +{ + pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, clearValue); + ps->status = PIPE_SURFACE_STATUS_CLEAR; +} diff --git a/src/gallium/drivers/nv30/nv30_context.c b/src/gallium/drivers/nv30/nv30_context.c new file mode 100644 index 0000000000..61654f8756 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_context.c @@ -0,0 +1,72 @@ +#include "draw/draw_context.h" +#include "pipe/p_defines.h" +#include "pipe/internal/p_winsys_screen.h" + +#include "nv30_context.h" +#include "nv30_screen.h" + +static void +nv30_flush(struct pipe_context *pipe, unsigned flags, + struct pipe_fence_handle **fence) +{ + struct nv30_context *nv30 = nv30_context(pipe); + + if (flags & PIPE_FLUSH_TEXTURE_CACHE) { + BEGIN_RING(rankine, 0x1fd8, 1); + OUT_RING (2); + BEGIN_RING(rankine, 0x1fd8, 1); + OUT_RING (1); + } + + FIRE_RING(fence); +} + +static void +nv30_destroy(struct pipe_context *pipe) +{ + struct nv30_context *nv30 = nv30_context(pipe); + + if (nv30->draw) + draw_destroy(nv30->draw); + FREE(nv30); +} + +struct pipe_context * +nv30_create(struct pipe_screen *pscreen, unsigned pctx_id) +{ + struct nv30_screen *screen = nv30_screen(pscreen); + struct pipe_winsys *ws = pscreen->winsys; + struct nv30_context *nv30; + struct nouveau_winsys *nvws = screen->nvws; + + nv30 = CALLOC(1, sizeof(struct nv30_context)); + if (!nv30) + return NULL; + nv30->screen = screen; + nv30->pctx_id = pctx_id; + + nv30->nvws = nvws; + + nv30->pipe.winsys = ws; + nv30->pipe.screen = pscreen; + nv30->pipe.destroy = nv30_destroy; + nv30->pipe.draw_arrays = nv30_draw_arrays; + nv30->pipe.draw_elements = nv30_draw_elements; + nv30->pipe.clear = nv30_clear; + nv30->pipe.flush = nv30_flush; + + nv30_init_query_functions(nv30); + nv30_init_surface_functions(nv30); + nv30_init_state_functions(nv30); + + /* Create, configure, and install fallback swtnl path */ + nv30->draw = draw_create(); + draw_wide_point_threshold(nv30->draw, 9999999.0); + draw_wide_line_threshold(nv30->draw, 9999999.0); + draw_enable_line_stipple(nv30->draw, FALSE); + draw_enable_point_sprites(nv30->draw, FALSE); + draw_set_rasterize_stage(nv30->draw, nv30_draw_render_stage(nv30)); + + return &nv30->pipe; +} + diff --git a/src/gallium/drivers/nv30/nv30_context.h b/src/gallium/drivers/nv30/nv30_context.h new file mode 100644 index 0000000000..b933769700 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_context.h @@ -0,0 +1,212 @@ +#ifndef __NV30_CONTEXT_H__ +#define __NV30_CONTEXT_H__ + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_compiler.h" + +#include "util/u_memory.h" +#include 
"util/u_math.h" + +#include "draw/draw_vertex.h" + +#include "nouveau/nouveau_winsys.h" +#include "nouveau/nouveau_gldefs.h" + +#define NOUVEAU_PUSH_CONTEXT(ctx) \ + struct nv30_screen *ctx = nv30->screen +#include "nouveau/nouveau_push.h" +#include "nouveau/nouveau_stateobj.h" + +#include "nv30_state.h" + +#define NOUVEAU_ERR(fmt, args...) \ + fprintf(stderr, "%s:%d - "fmt, __func__, __LINE__, ##args); +#define NOUVEAU_MSG(fmt, args...) \ + fprintf(stderr, "nouveau: "fmt, ##args); + +enum nv30_state_index { + NV30_STATE_FB = 0, + NV30_STATE_VIEWPORT = 1, + NV30_STATE_BLEND = 2, + NV30_STATE_RAST = 3, + NV30_STATE_ZSA = 4, + NV30_STATE_BCOL = 5, + NV30_STATE_CLIP = 6, + NV30_STATE_SCISSOR = 7, + NV30_STATE_STIPPLE = 8, + NV30_STATE_FRAGPROG = 9, + NV30_STATE_VERTPROG = 10, + NV30_STATE_FRAGTEX0 = 11, + NV30_STATE_FRAGTEX1 = 12, + NV30_STATE_FRAGTEX2 = 13, + NV30_STATE_FRAGTEX3 = 14, + NV30_STATE_FRAGTEX4 = 15, + NV30_STATE_FRAGTEX5 = 16, + NV30_STATE_FRAGTEX6 = 17, + NV30_STATE_FRAGTEX7 = 18, + NV30_STATE_FRAGTEX8 = 19, + NV30_STATE_FRAGTEX9 = 20, + NV30_STATE_FRAGTEX10 = 21, + NV30_STATE_FRAGTEX11 = 22, + NV30_STATE_FRAGTEX12 = 23, + NV30_STATE_FRAGTEX13 = 24, + NV30_STATE_FRAGTEX14 = 25, + NV30_STATE_FRAGTEX15 = 26, + NV30_STATE_VERTTEX0 = 27, + NV30_STATE_VERTTEX1 = 28, + NV30_STATE_VERTTEX2 = 29, + NV30_STATE_VERTTEX3 = 30, + NV30_STATE_VTXBUF = 31, + NV30_STATE_VTXFMT = 32, + NV30_STATE_VTXATTR = 33, + NV30_STATE_MAX = 34 +}; + +#include "nv30_screen.h" + +#define NV30_NEW_BLEND (1 << 0) +#define NV30_NEW_RAST (1 << 1) +#define NV30_NEW_ZSA (1 << 2) +#define NV30_NEW_SAMPLER (1 << 3) +#define NV30_NEW_FB (1 << 4) +#define NV30_NEW_STIPPLE (1 << 5) +#define NV30_NEW_SCISSOR (1 << 6) +#define NV30_NEW_VIEWPORT (1 << 7) +#define NV30_NEW_BCOL (1 << 8) +#define NV30_NEW_VERTPROG (1 << 9) +#define NV30_NEW_FRAGPROG (1 << 10) +#define NV30_NEW_ARRAYS (1 << 11) +#define NV30_NEW_UCP (1 << 12) + +struct nv30_rasterizer_state { + struct pipe_rasterizer_state pipe; + struct nouveau_stateobj *so; +}; + +struct nv30_zsa_state { + struct pipe_depth_stencil_alpha_state pipe; + struct nouveau_stateobj *so; +}; + +struct nv30_blend_state { + struct pipe_blend_state pipe; + struct nouveau_stateobj *so; +}; + + +struct nv30_state { + unsigned scissor_enabled; + unsigned stipple_enabled; + unsigned viewport_bypass; + unsigned fp_samplers; + + uint64_t dirty; + struct nouveau_stateobj *hw[NV30_STATE_MAX]; +}; + +struct nv30_context { + struct pipe_context pipe; + + struct nouveau_winsys *nvws; + struct nv30_screen *screen; + unsigned pctx_id; + + struct draw_context *draw; + + /* HW state derived from pipe states */ + struct nv30_state state; + + /* Context state */ + unsigned dirty; + struct pipe_scissor_state scissor; + unsigned stipple[32]; + struct nv30_vertex_program *vertprog; + struct nv30_fragment_program *fragprog; + struct pipe_buffer *constbuf[PIPE_SHADER_TYPES]; + unsigned constbuf_nr[PIPE_SHADER_TYPES]; + struct nv30_rasterizer_state *rasterizer; + struct nv30_zsa_state *zsa; + struct nv30_blend_state *blend; + struct pipe_blend_color blend_colour; + struct pipe_viewport_state viewport; + struct pipe_framebuffer_state framebuffer; + struct pipe_buffer *idxbuf; + unsigned idxbuf_format; + struct nv30_sampler_state *tex_sampler[PIPE_MAX_SAMPLERS]; + struct nv30_miptree *tex_miptree[PIPE_MAX_SAMPLERS]; + unsigned nr_samplers; + unsigned nr_textures; + unsigned dirty_samplers; + struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS]; + unsigned vtxbuf_nr; + struct pipe_vertex_element 
vtxelt[PIPE_MAX_ATTRIBS]; + unsigned vtxelt_nr; + const unsigned *edgeflags; +}; + +static INLINE struct nv30_context * +nv30_context(struct pipe_context *pipe) +{ + return (struct nv30_context *)pipe; +} + +struct nv30_state_entry { + boolean (*validate)(struct nv30_context *nv30); + struct { + unsigned pipe; + unsigned hw; + } dirty; +}; + +extern void nv30_init_state_functions(struct nv30_context *nv30); +extern void nv30_init_surface_functions(struct nv30_context *nv30); +extern void nv30_init_query_functions(struct nv30_context *nv30); + +extern void nv30_screen_init_miptree_functions(struct pipe_screen *pscreen); + +/* nv30_draw.c */ +extern struct draw_stage *nv30_draw_render_stage(struct nv30_context *nv30); + +/* nv30_vertprog.c */ +extern void nv30_vertprog_destroy(struct nv30_context *, + struct nv30_vertex_program *); + +/* nv30_fragprog.c */ +extern void nv30_fragprog_destroy(struct nv30_context *, + struct nv30_fragment_program *); + +/* nv30_fragtex.c */ +extern void nv30_fragtex_bind(struct nv30_context *); + +/* nv30_state.c and friends */ +extern boolean nv30_state_validate(struct nv30_context *nv30); +extern void nv30_state_emit(struct nv30_context *nv30); +extern struct nv30_state_entry nv30_state_rasterizer; +extern struct nv30_state_entry nv30_state_scissor; +extern struct nv30_state_entry nv30_state_stipple; +extern struct nv30_state_entry nv30_state_fragprog; +extern struct nv30_state_entry nv30_state_vertprog; +extern struct nv30_state_entry nv30_state_blend; +extern struct nv30_state_entry nv30_state_blend_colour; +extern struct nv30_state_entry nv30_state_zsa; +extern struct nv30_state_entry nv30_state_viewport; +extern struct nv30_state_entry nv30_state_framebuffer; +extern struct nv30_state_entry nv30_state_fragtex; +extern struct nv30_state_entry nv30_state_vbo; + +/* nv30_vbo.c */ +extern boolean nv30_draw_arrays(struct pipe_context *, unsigned mode, + unsigned start, unsigned count); +extern boolean nv30_draw_elements(struct pipe_context *pipe, + struct pipe_buffer *indexBuffer, + unsigned indexSize, + unsigned mode, unsigned start, + unsigned count); + +/* nv30_clear.c */ +extern void nv30_clear(struct pipe_context *pipe, struct pipe_surface *ps, + unsigned clearValue); + +#endif diff --git a/src/gallium/drivers/nv30/nv30_draw.c b/src/gallium/drivers/nv30/nv30_draw.c new file mode 100644 index 0000000000..74fc138c05 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_draw.c @@ -0,0 +1,61 @@ +#include "draw/draw_pipe.h" + +#include "nv30_context.h" + +struct nv30_draw_stage { + struct draw_stage draw; + struct nv30_context *nv30; +}; + +static void +nv30_draw_point(struct draw_stage *draw, struct prim_header *prim) +{ + NOUVEAU_ERR("\n"); +} + +static void +nv30_draw_line(struct draw_stage *draw, struct prim_header *prim) +{ + NOUVEAU_ERR("\n"); +} + +static void +nv30_draw_tri(struct draw_stage *draw, struct prim_header *prim) +{ + NOUVEAU_ERR("\n"); +} + +static void +nv30_draw_flush(struct draw_stage *draw, unsigned flags) +{ +} + +static void +nv30_draw_reset_stipple_counter(struct draw_stage *draw) +{ + NOUVEAU_ERR("\n"); +} + +static void +nv30_draw_destroy(struct draw_stage *draw) +{ + FREE(draw); +} + +struct draw_stage * +nv30_draw_render_stage(struct nv30_context *nv30) +{ + struct nv30_draw_stage *nv30draw = CALLOC_STRUCT(nv30_draw_stage); + + nv30draw->nv30 = nv30; + nv30draw->draw.draw = nv30->draw; + nv30draw->draw.point = nv30_draw_point; + nv30draw->draw.line = nv30_draw_line; + nv30draw->draw.tri = nv30_draw_tri; + nv30draw->draw.flush = 
nv30_draw_flush; + nv30draw->draw.reset_stipple_counter = nv30_draw_reset_stipple_counter; + nv30draw->draw.destroy = nv30_draw_destroy; + + return &nv30draw->draw; +} + diff --git a/src/gallium/drivers/nv30/nv30_fragprog.c b/src/gallium/drivers/nv30/nv30_fragprog.c new file mode 100644 index 0000000000..320ba3f4bf --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_fragprog.c @@ -0,0 +1,911 @@ +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" + +#include "nv30_context.h" + +#define SWZ_X 0 +#define SWZ_Y 1 +#define SWZ_Z 2 +#define SWZ_W 3 +#define MASK_X 1 +#define MASK_Y 2 +#define MASK_Z 4 +#define MASK_W 8 +#define MASK_ALL (MASK_X|MASK_Y|MASK_Z|MASK_W) +#define DEF_SCALE NV30_FP_OP_DST_SCALE_1X +#define DEF_CTEST NV30_FP_OP_COND_TR +#include "nv30_shader.h" + +#define swz(s,x,y,z,w) nv30_sr_swz((s), SWZ_##x, SWZ_##y, SWZ_##z, SWZ_##w) +#define neg(s) nv30_sr_neg((s)) +#define abs(s) nv30_sr_abs((s)) +#define scale(s,v) nv30_sr_scale((s), NV30_FP_OP_DST_SCALE_##v) + +#define MAX_CONSTS 128 +#define MAX_IMM 32 +struct nv30_fpc { + struct nv30_fragment_program *fp; + + uint attrib_map[PIPE_MAX_SHADER_INPUTS]; + + int high_temp; + int temp_temp_count; + int num_regs; + + uint depth_id; + uint colour_id; + + unsigned inst_offset; + + struct { + int pipe; + float vals[4]; + } consts[MAX_CONSTS]; + int nr_consts; + + struct nv30_sreg imm[MAX_IMM]; + unsigned nr_imm; +}; + +static INLINE struct nv30_sreg +temp(struct nv30_fpc *fpc) +{ + int idx; + + idx = fpc->temp_temp_count++; + idx += fpc->high_temp + 1; + return nv30_sr(NV30SR_TEMP, idx); +} + +static INLINE struct nv30_sreg +constant(struct nv30_fpc *fpc, int pipe, float vals[4]) +{ + int idx; + + if (fpc->nr_consts == MAX_CONSTS) + assert(0); + idx = fpc->nr_consts++; + + fpc->consts[idx].pipe = pipe; + if (pipe == -1) + memcpy(fpc->consts[idx].vals, vals, 4 * sizeof(float)); + return nv30_sr(NV30SR_CONST, idx); +} + +#define arith(cc,s,o,d,m,s0,s1,s2) \ + nv30_fp_arith((cc), (s), NV30_FP_OP_OPCODE_##o, \ + (d), (m), (s0), (s1), (s2)) +#define tex(cc,s,o,u,d,m,s0,s1,s2) \ + nv30_fp_tex((cc), (s), NV30_FP_OP_OPCODE_##o, (u), \ + (d), (m), (s0), none, none) + +static void +grow_insns(struct nv30_fpc *fpc, int size) +{ + struct nv30_fragment_program *fp = fpc->fp; + + fp->insn_len += size; + fp->insn = realloc(fp->insn, sizeof(uint32_t) * fp->insn_len); +} + +static void +emit_src(struct nv30_fpc *fpc, int pos, struct nv30_sreg src) +{ + struct nv30_fragment_program *fp = fpc->fp; + uint32_t *hw = &fp->insn[fpc->inst_offset]; + uint32_t sr = 0; + + switch (src.type) { + case NV30SR_INPUT: + sr |= (NV30_FP_REG_TYPE_INPUT << NV30_FP_REG_TYPE_SHIFT); + hw[0] |= (src.index << NV30_FP_OP_INPUT_SRC_SHIFT); + break; + case NV30SR_OUTPUT: + sr |= NV30_FP_REG_SRC_HALF; + /* fall-through */ + case NV30SR_TEMP: + sr |= (NV30_FP_REG_TYPE_TEMP << NV30_FP_REG_TYPE_SHIFT); + sr |= (src.index << NV30_FP_REG_SRC_SHIFT); + break; + case NV30SR_CONST: + grow_insns(fpc, 4); + hw = &fp->insn[fpc->inst_offset]; + if (fpc->consts[src.index].pipe >= 0) { + struct nv30_fragment_program_data *fpd; + + fp->consts = realloc(fp->consts, ++fp->nr_consts * + sizeof(*fpd)); + fpd = &fp->consts[fp->nr_consts - 1]; + fpd->offset = fpc->inst_offset + 4; + fpd->index = fpc->consts[src.index].pipe; + memset(&fp->insn[fpd->offset], 0, sizeof(uint32_t) * 4); + } else { + memcpy(&fp->insn[fpc->inst_offset + 4], + fpc->consts[src.index].vals, + 
sizeof(uint32_t) * 4); + } + + sr |= (NV30_FP_REG_TYPE_CONST << NV30_FP_REG_TYPE_SHIFT); + break; + case NV30SR_NONE: + sr |= (NV30_FP_REG_TYPE_INPUT << NV30_FP_REG_TYPE_SHIFT); + break; + default: + assert(0); + } + + if (src.negate) + sr |= NV30_FP_REG_NEGATE; + + if (src.abs) + hw[1] |= (1 << (29 + pos)); + + sr |= ((src.swz[0] << NV30_FP_REG_SWZ_X_SHIFT) | + (src.swz[1] << NV30_FP_REG_SWZ_Y_SHIFT) | + (src.swz[2] << NV30_FP_REG_SWZ_Z_SHIFT) | + (src.swz[3] << NV30_FP_REG_SWZ_W_SHIFT)); + + hw[pos + 1] |= sr; +} + +static void +emit_dst(struct nv30_fpc *fpc, struct nv30_sreg dst) +{ + struct nv30_fragment_program *fp = fpc->fp; + uint32_t *hw = &fp->insn[fpc->inst_offset]; + + switch (dst.type) { + case NV30SR_TEMP: + if (fpc->num_regs < (dst.index + 1)) + fpc->num_regs = dst.index + 1; + break; + case NV30SR_OUTPUT: + if (dst.index == 1) { + fp->fp_control |= 0xe; + } else { + hw[0] |= NV30_FP_OP_OUT_REG_HALF; + } + break; + case NV30SR_NONE: + hw[0] |= (1 << 30); + break; + default: + assert(0); + } + + hw[0] |= (dst.index << NV30_FP_OP_OUT_REG_SHIFT); +} + +static void +nv30_fp_arith(struct nv30_fpc *fpc, int sat, int op, + struct nv30_sreg dst, int mask, + struct nv30_sreg s0, struct nv30_sreg s1, struct nv30_sreg s2) +{ + struct nv30_fragment_program *fp = fpc->fp; + uint32_t *hw; + + fpc->inst_offset = fp->insn_len; + grow_insns(fpc, 4); + hw = &fp->insn[fpc->inst_offset]; + memset(hw, 0, sizeof(uint32_t) * 4); + + if (op == NV30_FP_OP_OPCODE_KIL) + fp->fp_control |= NV34TCL_FP_CONTROL_USES_KIL; + hw[0] |= (op << NV30_FP_OP_OPCODE_SHIFT); + hw[0] |= (mask << NV30_FP_OP_OUTMASK_SHIFT); + hw[2] |= (dst.dst_scale << NV30_FP_OP_DST_SCALE_SHIFT); + + if (sat) + hw[0] |= NV30_FP_OP_OUT_SAT; + + if (dst.cc_update) + hw[0] |= NV30_FP_OP_COND_WRITE_ENABLE; + hw[1] |= (dst.cc_test << NV30_FP_OP_COND_SHIFT); + hw[1] |= ((dst.cc_swz[0] << NV30_FP_OP_COND_SWZ_X_SHIFT) | + (dst.cc_swz[1] << NV30_FP_OP_COND_SWZ_Y_SHIFT) | + (dst.cc_swz[2] << NV30_FP_OP_COND_SWZ_Z_SHIFT) | + (dst.cc_swz[3] << NV30_FP_OP_COND_SWZ_W_SHIFT)); + + emit_dst(fpc, dst); + emit_src(fpc, 0, s0); + emit_src(fpc, 1, s1); + emit_src(fpc, 2, s2); +} + +static void +nv30_fp_tex(struct nv30_fpc *fpc, int sat, int op, int unit, + struct nv30_sreg dst, int mask, + struct nv30_sreg s0, struct nv30_sreg s1, struct nv30_sreg s2) +{ + struct nv30_fragment_program *fp = fpc->fp; + + nv30_fp_arith(fpc, sat, op, dst, mask, s0, s1, s2); + + fp->insn[fpc->inst_offset] |= (unit << NV30_FP_OP_TEX_UNIT_SHIFT); + fp->samplers |= (1 << unit); +} + +static INLINE struct nv30_sreg +tgsi_src(struct nv30_fpc *fpc, const struct tgsi_full_src_register *fsrc) +{ + struct nv30_sreg src; + + switch (fsrc->SrcRegister.File) { + case TGSI_FILE_INPUT: + src = nv30_sr(NV30SR_INPUT, + fpc->attrib_map[fsrc->SrcRegister.Index]); + break; + case TGSI_FILE_CONSTANT: + src = constant(fpc, fsrc->SrcRegister.Index, NULL); + break; + case TGSI_FILE_IMMEDIATE: + assert(fsrc->SrcRegister.Index < fpc->nr_imm); + src = fpc->imm[fsrc->SrcRegister.Index]; + break; + case TGSI_FILE_TEMPORARY: + src = nv30_sr(NV30SR_TEMP, fsrc->SrcRegister.Index + 1); + if (fpc->high_temp < src.index) + fpc->high_temp = src.index; + break; + /* This is clearly insane, but gallium hands us shaders like this. + * Luckily fragprog results are just temp regs.. 
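+	 * Reads from an OUTPUT register are simply redirected to the
+	 * colour/depth result registers handled below.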
+ */ + case TGSI_FILE_OUTPUT: + if (fsrc->SrcRegister.Index == fpc->colour_id) + return nv30_sr(NV30SR_OUTPUT, 0); + else + return nv30_sr(NV30SR_OUTPUT, 1); + break; + default: + NOUVEAU_ERR("bad src file\n"); + break; + } + + src.abs = fsrc->SrcRegisterExtMod.Absolute; + src.negate = fsrc->SrcRegister.Negate; + src.swz[0] = fsrc->SrcRegister.SwizzleX; + src.swz[1] = fsrc->SrcRegister.SwizzleY; + src.swz[2] = fsrc->SrcRegister.SwizzleZ; + src.swz[3] = fsrc->SrcRegister.SwizzleW; + return src; +} + +static INLINE struct nv30_sreg +tgsi_dst(struct nv30_fpc *fpc, const struct tgsi_full_dst_register *fdst) { + int idx; + + switch (fdst->DstRegister.File) { + case TGSI_FILE_OUTPUT: + if (fdst->DstRegister.Index == fpc->colour_id) + return nv30_sr(NV30SR_OUTPUT, 0); + else + return nv30_sr(NV30SR_OUTPUT, 1); + break; + case TGSI_FILE_TEMPORARY: + idx = fdst->DstRegister.Index + 1; + if (fpc->high_temp < idx) + fpc->high_temp = idx; + return nv30_sr(NV30SR_TEMP, idx); + case TGSI_FILE_NULL: + return nv30_sr(NV30SR_NONE, 0); + default: + NOUVEAU_ERR("bad dst file %d\n", fdst->DstRegister.File); + return nv30_sr(NV30SR_NONE, 0); + } +} + +static INLINE int +tgsi_mask(uint tgsi) +{ + int mask = 0; + + if (tgsi & TGSI_WRITEMASK_X) mask |= MASK_X; + if (tgsi & TGSI_WRITEMASK_Y) mask |= MASK_Y; + if (tgsi & TGSI_WRITEMASK_Z) mask |= MASK_Z; + if (tgsi & TGSI_WRITEMASK_W) mask |= MASK_W; + return mask; +} + +static boolean +src_native_swz(struct nv30_fpc *fpc, const struct tgsi_full_src_register *fsrc, + struct nv30_sreg *src) +{ + const struct nv30_sreg none = nv30_sr(NV30SR_NONE, 0); + struct nv30_sreg tgsi = tgsi_src(fpc, fsrc); + uint mask = 0, zero_mask = 0, one_mask = 0, neg_mask = 0; + uint neg[4] = { fsrc->SrcRegisterExtSwz.NegateX, + fsrc->SrcRegisterExtSwz.NegateY, + fsrc->SrcRegisterExtSwz.NegateZ, + fsrc->SrcRegisterExtSwz.NegateW }; + uint c; + + for (c = 0; c < 4; c++) { + switch (tgsi_util_get_full_src_register_extswizzle(fsrc, c)) { + case TGSI_EXTSWIZZLE_X: + case TGSI_EXTSWIZZLE_Y: + case TGSI_EXTSWIZZLE_Z: + case TGSI_EXTSWIZZLE_W: + mask |= (1 << c); + break; + case TGSI_EXTSWIZZLE_ZERO: + zero_mask |= (1 << c); + tgsi.swz[c] = SWZ_X; + break; + case TGSI_EXTSWIZZLE_ONE: + one_mask |= (1 << c); + tgsi.swz[c] = SWZ_X; + break; + default: + assert(0); + } + + if (!tgsi.negate && neg[c]) + neg_mask |= (1 << c); + } + + if (mask == MASK_ALL && !neg_mask) + return TRUE; + + *src = temp(fpc); + + if (mask) + arith(fpc, 0, MOV, *src, mask, tgsi, none, none); + + if (zero_mask) + arith(fpc, 0, SFL, *src, zero_mask, *src, none, none); + + if (one_mask) + arith(fpc, 0, STR, *src, one_mask, *src, none, none); + + if (neg_mask) { + struct nv30_sreg one = temp(fpc); + arith(fpc, 0, STR, one, neg_mask, one, none, none); + arith(fpc, 0, MUL, *src, neg_mask, *src, neg(one), none); + } + + return FALSE; +} + +static boolean +nv30_fragprog_parse_instruction(struct nv30_fpc *fpc, + const struct tgsi_full_instruction *finst) +{ + const struct nv30_sreg none = nv30_sr(NV30SR_NONE, 0); + struct nv30_sreg src[3], dst, tmp; + int mask, sat, unit = 0; + int ai = -1, ci = -1; + int i; + + if (finst->Instruction.Opcode == TGSI_OPCODE_END) + return TRUE; + + fpc->temp_temp_count = 0; + for (i = 0; i < finst->Instruction.NumSrcRegs; i++) { + const struct tgsi_full_src_register *fsrc; + + fsrc = &finst->FullSrcRegisters[i]; + if (fsrc->SrcRegister.File == TGSI_FILE_TEMPORARY) { + src[i] = tgsi_src(fpc, fsrc); + } + } + + for (i = 0; i < finst->Instruction.NumSrcRegs; i++) { + const struct tgsi_full_src_register 
*fsrc; + + fsrc = &finst->FullSrcRegisters[i]; + + switch (fsrc->SrcRegister.File) { + case TGSI_FILE_INPUT: + case TGSI_FILE_CONSTANT: + case TGSI_FILE_TEMPORARY: + if (!src_native_swz(fpc, fsrc, &src[i])) + continue; + break; + default: + break; + } + + switch (fsrc->SrcRegister.File) { + case TGSI_FILE_INPUT: + if (ai == -1 || ai == fsrc->SrcRegister.Index) { + ai = fsrc->SrcRegister.Index; + src[i] = tgsi_src(fpc, fsrc); + } else { + NOUVEAU_MSG("extra src attr %d\n", + fsrc->SrcRegister.Index); + src[i] = temp(fpc); + arith(fpc, 0, MOV, src[i], MASK_ALL, + tgsi_src(fpc, fsrc), none, none); + } + break; + case TGSI_FILE_CONSTANT: + case TGSI_FILE_IMMEDIATE: + if (ci == -1 || ci == fsrc->SrcRegister.Index) { + ci = fsrc->SrcRegister.Index; + src[i] = tgsi_src(fpc, fsrc); + } else { + src[i] = temp(fpc); + arith(fpc, 0, MOV, src[i], MASK_ALL, + tgsi_src(fpc, fsrc), none, none); + } + break; + case TGSI_FILE_TEMPORARY: + /* handled above */ + break; + case TGSI_FILE_SAMPLER: + unit = fsrc->SrcRegister.Index; + break; + case TGSI_FILE_OUTPUT: + break; + default: + NOUVEAU_ERR("bad src file\n"); + return FALSE; + } + } + + dst = tgsi_dst(fpc, &finst->FullDstRegisters[0]); + mask = tgsi_mask(finst->FullDstRegisters[0].DstRegister.WriteMask); + sat = (finst->Instruction.Saturate == TGSI_SAT_ZERO_ONE); + + switch (finst->Instruction.Opcode) { + case TGSI_OPCODE_ABS: + arith(fpc, sat, MOV, dst, mask, abs(src[0]), none, none); + break; + case TGSI_OPCODE_ADD: + arith(fpc, sat, ADD, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_CMP: + tmp = temp(fpc); + arith(fpc, sat, MOV, dst, mask, src[2], none, none); + tmp.cc_update = 1; + arith(fpc, 0, MOV, tmp, 0xf, src[0], none, none); + dst.cc_test = NV30_VP_INST_COND_LT; + arith(fpc, sat, MOV, dst, mask, src[1], none, none); + break; + case TGSI_OPCODE_COS: + arith(fpc, sat, COS, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_DP3: + arith(fpc, sat, DP3, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_DP4: + arith(fpc, sat, DP4, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_DPH: + tmp = temp(fpc); + arith(fpc, 0, DP3, tmp, MASK_X, src[0], src[1], none); + arith(fpc, sat, ADD, dst, mask, swz(tmp, X, X, X, X), + swz(src[1], W, W, W, W), none); + break; + case TGSI_OPCODE_DST: + arith(fpc, sat, DST, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_EX2: + arith(fpc, sat, EX2, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_FLR: + arith(fpc, sat, FLR, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_FRC: + arith(fpc, sat, FRC, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_KILP: + arith(fpc, 0, KIL, none, 0, none, none, none); + break; + case TGSI_OPCODE_KIL: + dst = nv30_sr(NV30SR_NONE, 0); + dst.cc_update = 1; + arith(fpc, 0, MOV, dst, MASK_ALL, src[0], none, none); + dst.cc_update = 0; dst.cc_test = NV30_FP_OP_COND_LT; + arith(fpc, 0, KIL, dst, 0, none, none, none); + break; + case TGSI_OPCODE_LG2: + arith(fpc, sat, LG2, dst, mask, src[0], none, none); + break; +// case TGSI_OPCODE_LIT: + case TGSI_OPCODE_LRP: + arith(fpc, sat, LRP, dst, mask, src[0], src[1], src[2]); + break; + case TGSI_OPCODE_MAD: + arith(fpc, sat, MAD, dst, mask, src[0], src[1], src[2]); + break; + case TGSI_OPCODE_MAX: + arith(fpc, sat, MAX, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_MIN: + arith(fpc, sat, MIN, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_MOV: + arith(fpc, sat, MOV, dst, mask, src[0], none, none); + break; + case 
TGSI_OPCODE_MUL: + arith(fpc, sat, MUL, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_NOISE1: + case TGSI_OPCODE_NOISE2: + case TGSI_OPCODE_NOISE3: + case TGSI_OPCODE_NOISE4: + arith(fpc, sat, SFL, dst, mask, none, none, none); + break; + case TGSI_OPCODE_POW: + arith(fpc, sat, POW, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_RCP: + arith(fpc, sat, RCP, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_RET: + assert(0); + break; + case TGSI_OPCODE_RFL: + arith(fpc, 0, RFL, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_RSQ: + arith(fpc, sat, RSQ, dst, mask, abs(swz(src[0], X, X, X, X)), none, none); + break; + case TGSI_OPCODE_SCS: + if (mask & MASK_X) { + arith(fpc, sat, COS, dst, MASK_X, + swz(src[0], X, X, X, X), none, none); + } + if (mask & MASK_Y) { + arith(fpc, sat, SIN, dst, MASK_Y, + swz(src[0], X, X, X, X), none, none); + } + break; + case TGSI_OPCODE_SIN: + arith(fpc, sat, SIN, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_SGE: + arith(fpc, sat, SGE, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_SGT: + arith(fpc, sat, SGT, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_SLT: + arith(fpc, sat, SLT, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_SUB: + arith(fpc, sat, ADD, dst, mask, src[0], neg(src[1]), none); + break; + case TGSI_OPCODE_TEX: + tex(fpc, sat, TEX, unit, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_TXB: + tex(fpc, sat, TXB, unit, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_TXP: + tex(fpc, sat, TXP, unit, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_XPD: + tmp = temp(fpc); + arith(fpc, 0, MUL, tmp, mask, + swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none); + arith(fpc, sat, MAD, dst, (mask & ~MASK_W), + swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y), + neg(tmp)); + break; + default: + NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode); + return FALSE; + } + + return TRUE; +} + +static boolean +nv30_fragprog_parse_decl_attrib(struct nv30_fpc *fpc, + const struct tgsi_full_declaration *fdec) +{ + int hw; + + switch (fdec->Semantic.SemanticName) { + case TGSI_SEMANTIC_POSITION: + hw = NV30_FP_OP_INPUT_SRC_POSITION; + break; + case TGSI_SEMANTIC_COLOR: + if (fdec->Semantic.SemanticIndex == 0) { + hw = NV30_FP_OP_INPUT_SRC_COL0; + } else + if (fdec->Semantic.SemanticIndex == 1) { + hw = NV30_FP_OP_INPUT_SRC_COL1; + } else { + NOUVEAU_ERR("bad colour semantic index\n"); + return FALSE; + } + break; + case TGSI_SEMANTIC_FOG: + hw = NV30_FP_OP_INPUT_SRC_FOGC; + break; + case TGSI_SEMANTIC_GENERIC: + if (fdec->Semantic.SemanticIndex <= 7) { + hw = NV30_FP_OP_INPUT_SRC_TC(fdec->Semantic. 
+ SemanticIndex); + } else { + NOUVEAU_ERR("bad generic semantic index\n"); + return FALSE; + } + break; + default: + NOUVEAU_ERR("bad input semantic\n"); + return FALSE; + } + + fpc->attrib_map[fdec->DeclarationRange.First] = hw; + return TRUE; +} + +static boolean +nv30_fragprog_parse_decl_output(struct nv30_fpc *fpc, + const struct tgsi_full_declaration *fdec) +{ + switch (fdec->Semantic.SemanticName) { + case TGSI_SEMANTIC_POSITION: + fpc->depth_id = fdec->DeclarationRange.First; + break; + case TGSI_SEMANTIC_COLOR: + fpc->colour_id = fdec->DeclarationRange.First; + break; + default: + NOUVEAU_ERR("bad output semantic\n"); + return FALSE; + } + + return TRUE; +} + +static boolean +nv30_fragprog_prepare(struct nv30_fpc *fpc) +{ + struct tgsi_parse_context p; + /*int high_temp = -1, i;*/ + + tgsi_parse_init(&p, fpc->fp->pipe.tokens); + while (!tgsi_parse_end_of_tokens(&p)) { + const union tgsi_full_token *tok = &p.FullToken; + + tgsi_parse_token(&p); + switch(tok->Token.Type) { + case TGSI_TOKEN_TYPE_DECLARATION: + { + const struct tgsi_full_declaration *fdec; + fdec = &p.FullToken.FullDeclaration; + switch (fdec->Declaration.File) { + case TGSI_FILE_INPUT: + if (!nv30_fragprog_parse_decl_attrib(fpc, fdec)) + goto out_err; + break; + case TGSI_FILE_OUTPUT: + if (!nv30_fragprog_parse_decl_output(fpc, fdec)) + goto out_err; + break; + /*case TGSI_FILE_TEMPORARY: + if (fdec->DeclarationRange.Last > high_temp) { + high_temp = + fdec->DeclarationRange.Last; + } + break;*/ + default: + break; + } + } + break; + case TGSI_TOKEN_TYPE_IMMEDIATE: + { + struct tgsi_full_immediate *imm; + float vals[4]; + + imm = &p.FullToken.FullImmediate; + assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32); + assert(fpc->nr_imm < MAX_IMM); + + vals[0] = imm->u.ImmediateFloat32[0].Float; + vals[1] = imm->u.ImmediateFloat32[1].Float; + vals[2] = imm->u.ImmediateFloat32[2].Float; + vals[3] = imm->u.ImmediateFloat32[3].Float; + fpc->imm[fpc->nr_imm++] = constant(fpc, -1, vals); + } + break; + default: + break; + } + } + tgsi_parse_free(&p); + + /*if (++high_temp) { + fpc->r_temp = CALLOC(high_temp, sizeof(struct nv30_sreg)); + for (i = 0; i < high_temp; i++) + fpc->r_temp[i] = temp(fpc); + fpc->r_temps_discard = 0; + }*/ + + return TRUE; + +out_err: + /*if (fpc->r_temp) + FREE(fpc->r_temp);*/ + tgsi_parse_free(&p); + return FALSE; +} + +static void +nv30_fragprog_translate(struct nv30_context *nv30, + struct nv30_fragment_program *fp) +{ + struct tgsi_parse_context parse; + struct nv30_fpc *fpc = NULL; + + tgsi_dump(fp->pipe.tokens,0); + + fpc = CALLOC(1, sizeof(struct nv30_fpc)); + if (!fpc) + return; + fpc->fp = fp; + fpc->high_temp = -1; + fpc->num_regs = 2; + + if (!nv30_fragprog_prepare(fpc)) { + FREE(fpc); + return; + } + + tgsi_parse_init(&parse, fp->pipe.tokens); + + while (!tgsi_parse_end_of_tokens(&parse)) { + tgsi_parse_token(&parse); + + switch (parse.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_INSTRUCTION: + { + const struct tgsi_full_instruction *finst; + + finst = &parse.FullToken.FullInstruction; + if (!nv30_fragprog_parse_instruction(fpc, finst)) + goto out_err; + } + break; + default: + break; + } + } + + fp->fp_control |= (fpc->num_regs-1)/2; + fp->fp_reg_control = (1<<16)|0x4; + + /* Terminate final instruction */ + fp->insn[fpc->inst_offset] |= 0x00000001; + + /* Append NOP + END instruction, may or may not be necessary. 
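+	 * The low bit of the first dword is the same "last instruction"
+	 * marker that was just set on the final real instruction.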
*/ + fpc->inst_offset = fp->insn_len; + grow_insns(fpc, 4); + fp->insn[fpc->inst_offset + 0] = 0x00000001; + fp->insn[fpc->inst_offset + 1] = 0x00000000; + fp->insn[fpc->inst_offset + 2] = 0x00000000; + fp->insn[fpc->inst_offset + 3] = 0x00000000; + + fp->translated = TRUE; + fp->on_hw = FALSE; +out_err: + tgsi_parse_free(&parse); + FREE(fpc); +} + +static void +nv30_fragprog_upload(struct nv30_context *nv30, + struct nv30_fragment_program *fp) +{ + struct pipe_winsys *ws = nv30->pipe.winsys; + const uint32_t le = 1; + uint32_t *map; + int i; + + map = ws->buffer_map(ws, fp->buffer, PIPE_BUFFER_USAGE_CPU_WRITE); + +#if 0 + for (i = 0; i < fp->insn_len; i++) { + fflush(stdout); fflush(stderr); + NOUVEAU_ERR("%d 0x%08x\n", i, fp->insn[i]); + fflush(stdout); fflush(stderr); + } +#endif + + if ((*(const uint8_t *)&le)) { + for (i = 0; i < fp->insn_len; i++) { + map[i] = fp->insn[i]; + } + } else { + /* Weird swapping for big-endian chips */ + for (i = 0; i < fp->insn_len; i++) { + map[i] = ((fp->insn[i] & 0xffff) << 16) | + ((fp->insn[i] >> 16) & 0xffff); + } + } + + ws->buffer_unmap(ws, fp->buffer); +} + +static boolean +nv30_fragprog_validate(struct nv30_context *nv30) +{ + struct nv30_fragment_program *fp = nv30->fragprog; + struct pipe_buffer *constbuf = + nv30->constbuf[PIPE_SHADER_FRAGMENT]; + struct pipe_winsys *ws = nv30->pipe.winsys; + struct nouveau_stateobj *so; + boolean new_consts = FALSE; + int i; + + if (fp->translated) + goto update_constants; + + /*nv30->fallback_swrast &= ~NV30_NEW_FRAGPROG;*/ + nv30_fragprog_translate(nv30, fp); + if (!fp->translated) { + /*nv30->fallback_swrast |= NV30_NEW_FRAGPROG;*/ + return FALSE; + } + + fp->buffer = ws->buffer_create(ws, 0x100, 0, fp->insn_len * 4); + nv30_fragprog_upload(nv30, fp); + + so = so_new(8, 1); + so_method(so, nv30->screen->rankine, NV34TCL_FP_ACTIVE_PROGRAM, 1); + so_reloc (so, fp->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | + NOUVEAU_BO_RD | NOUVEAU_BO_LOW | NOUVEAU_BO_OR, + NV34TCL_FP_ACTIVE_PROGRAM_DMA0, NV34TCL_FP_ACTIVE_PROGRAM_DMA1); + so_method(so, nv30->screen->rankine, NV34TCL_FP_CONTROL, 1); + so_data (so, fp->fp_control); + so_method(so, nv30->screen->rankine, NV34TCL_FP_REG_CONTROL, 1); + so_data (so, fp->fp_reg_control); + so_method(so, nv30->screen->rankine, NV34TCL_TX_UNITS_ENABLE, 1); + so_data (so, fp->samplers); + so_ref(so, &fp->so); + +update_constants: + if (fp->nr_consts) { + float *map; + + map = ws->buffer_map(ws, constbuf, PIPE_BUFFER_USAGE_CPU_READ); + for (i = 0; i < fp->nr_consts; i++) { + struct nv30_fragment_program_data *fpd = &fp->consts[i]; + uint32_t *p = &fp->insn[fpd->offset]; + uint32_t *cb = (uint32_t *)&map[fpd->index * 4]; + + if (!memcmp(p, cb, 4 * sizeof(float))) + continue; + memcpy(p, cb, 4 * sizeof(float)); + new_consts = TRUE; + } + ws->buffer_unmap(ws, constbuf); + + if (new_consts) + nv30_fragprog_upload(nv30, fp); + } + + if (new_consts || fp->so != nv30->state.hw[NV30_STATE_FRAGPROG]) { + so_ref(fp->so, &nv30->state.hw[NV30_STATE_FRAGPROG]); + return TRUE; + } + + return FALSE; +} + +void +nv30_fragprog_destroy(struct nv30_context *nv30, + struct nv30_fragment_program *fp) +{ + if (fp->insn_len) + FREE(fp->insn); +} + +struct nv30_state_entry nv30_state_fragprog = { + .validate = nv30_fragprog_validate, + .dirty = { + .pipe = NV30_NEW_FRAGPROG, + .hw = NV30_STATE_FRAGPROG + } +}; diff --git a/src/gallium/drivers/nv30/nv30_fragtex.c b/src/gallium/drivers/nv30/nv30_fragtex.c new file mode 100644 index 0000000000..b1d2663af3 --- /dev/null +++ 
b/src/gallium/drivers/nv30/nv30_fragtex.c @@ -0,0 +1,163 @@ +#include "nv30_context.h" +#include "nouveau/nouveau_util.h" + +#define _(m,tf,ts0x,ts0y,ts0z,ts0w,ts1x,ts1y,ts1z,ts1w) \ +{ \ + TRUE, \ + PIPE_FORMAT_##m, \ + NV34TCL_TX_FORMAT_FORMAT_##tf, \ + (NV34TCL_TX_SWIZZLE_S0_X_##ts0x | NV34TCL_TX_SWIZZLE_S0_Y_##ts0y | \ + NV34TCL_TX_SWIZZLE_S0_Z_##ts0z | NV34TCL_TX_SWIZZLE_S0_W_##ts0w | \ + NV34TCL_TX_SWIZZLE_S1_X_##ts1x | NV34TCL_TX_SWIZZLE_S1_Y_##ts1y | \ + NV34TCL_TX_SWIZZLE_S1_Z_##ts1z | NV34TCL_TX_SWIZZLE_S1_W_##ts1w) \ +} + +struct nv30_texture_format { + boolean defined; + uint pipe; + int format; + int swizzle; +}; + +static struct nv30_texture_format +nv30_texture_formats[] = { + _(A8R8G8B8_UNORM, A8R8G8B8, S1, S1, S1, S1, X, Y, Z, W), + _(A1R5G5B5_UNORM, A1R5G5B5, S1, S1, S1, S1, X, Y, Z, W), + _(A4R4G4B4_UNORM, A4R4G4B4, S1, S1, S1, S1, X, Y, Z, W), + _(R5G6B5_UNORM , R5G6B5 , S1, S1, S1, ONE, X, Y, Z, W), + _(L8_UNORM , L8 , S1, S1, S1, ONE, X, X, X, X), + _(A8_UNORM , L8 , ZERO, ZERO, ZERO, S1, X, X, X, X), + _(I8_UNORM , L8 , S1, S1, S1, S1, X, X, X, X), + _(A8L8_UNORM , A8L8 , S1, S1, S1, S1, X, X, X, Y), +// _(Z16_UNORM , Z16 , S1, S1, S1, ONE, X, X, X, X), +// _(Z24S8_UNORM , Z24 , S1, S1, S1, ONE, X, X, X, X), + _(DXT1_RGB , DXT1 , S1, S1, S1, ONE, X, Y, Z, W), + _(DXT1_RGBA , DXT1 , S1, S1, S1, S1, X, Y, Z, W), + _(DXT3_RGBA , DXT3 , S1, S1, S1, S1, X, Y, Z, W), + _(DXT5_RGBA , DXT5 , S1, S1, S1, S1, X, Y, Z, W), + {}, +}; + +static struct nv30_texture_format * +nv30_fragtex_format(uint pipe_format) +{ + struct nv30_texture_format *tf = nv30_texture_formats; + char fs[128]; + + while (tf->defined) { + if (tf->pipe == pipe_format) + return tf; + tf++; + } + + NOUVEAU_ERR("unknown texture format %s\n", pf_name(pipe_format)); + return NULL; +} + + +static struct nouveau_stateobj * +nv30_fragtex_build(struct nv30_context *nv30, int unit) +{ + struct nv30_sampler_state *ps = nv30->tex_sampler[unit]; + struct nv30_miptree *nv30mt = nv30->tex_miptree[unit]; + struct pipe_texture *pt = &nv30mt->base; + struct nv30_texture_format *tf; + struct nouveau_stateobj *so; + uint32_t txf, txs , txp; + unsigned tex_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD; + + tf = nv30_fragtex_format(pt->format); + if (!tf) + assert(0); + + txf = tf->format; + txf |= ((pt->last_level>0) ? NV34TCL_TX_FORMAT_MIPMAP : 0); + txf |= log2i(pt->width[0]) << 20; + txf |= log2i(pt->height[0]) << 24; + txf |= log2i(pt->depth[0]) << 28; + txf |= NV34TCL_TX_FORMAT_NO_BORDER | 0x10000; + + switch (pt->target) { + case PIPE_TEXTURE_CUBE: + txf |= NV34TCL_TX_FORMAT_CUBIC; + /* fall-through */ + case PIPE_TEXTURE_2D: + txf |= NV34TCL_TX_FORMAT_DIMS_2D; + break; + case PIPE_TEXTURE_3D: + txf |= NV34TCL_TX_FORMAT_DIMS_3D; + break; + case PIPE_TEXTURE_1D: + txf |= NV34TCL_TX_FORMAT_DIMS_1D; + break; + default: + NOUVEAU_ERR("Unknown target %d\n", pt->target); + return NULL; + } + + if (!(pt->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) { + txp = 0; + } else { + txp = nv30mt->level[0].pitch; + txf |= (1<<13) /*FIXME: NV34TCL_TX_FORMAT_LINEAR ? 
*/; + } + + txs = tf->swizzle; + + so = so_new(16, 2); + so_method(so, nv30->screen->rankine, NV34TCL_TX_OFFSET(unit), 8); + so_reloc (so, nv30mt->buffer, 0, tex_flags | NOUVEAU_BO_LOW, 0, 0); + so_reloc (so, nv30mt->buffer, txf, tex_flags | NOUVEAU_BO_OR, + NV34TCL_TX_FORMAT_DMA0, NV34TCL_TX_FORMAT_DMA1); + so_data (so, ps->wrap); + so_data (so, NV34TCL_TX_ENABLE_ENABLE | ps->en); + so_data (so, txs); + so_data (so, ps->filt | 0x2000 /*voodoo*/); + so_data (so, (pt->width[0] << NV34TCL_TX_NPOT_SIZE_W_SHIFT) | + pt->height[0]); + so_data (so, ps->bcol); + + return so; +} + +static boolean +nv30_fragtex_validate(struct nv30_context *nv30) +{ + struct nv30_fragment_program *fp = nv30->fragprog; + struct nv30_state *state = &nv30->state; + struct nouveau_stateobj *so; + unsigned samplers, unit; + + samplers = state->fp_samplers & ~fp->samplers; + while (samplers) { + unit = ffs(samplers) - 1; + samplers &= ~(1 << unit); + + so = so_new(2, 0); + so_method(so, nv30->screen->rankine, NV34TCL_TX_ENABLE(unit), 1); + so_data (so, 0); + so_ref(so, &nv30->state.hw[NV30_STATE_FRAGTEX0 + unit]); + state->dirty |= (1ULL << (NV30_STATE_FRAGTEX0 + unit)); + } + + samplers = nv30->dirty_samplers & fp->samplers; + while (samplers) { + unit = ffs(samplers) - 1; + samplers &= ~(1 << unit); + + so = nv30_fragtex_build(nv30, unit); + so_ref(so, &nv30->state.hw[NV30_STATE_FRAGTEX0 + unit]); + state->dirty |= (1ULL << (NV30_STATE_FRAGTEX0 + unit)); + } + + nv30->state.fp_samplers = fp->samplers; + return FALSE; +} + +struct nv30_state_entry nv30_state_fragtex = { + .validate = nv30_fragtex_validate, + .dirty = { + .pipe = NV30_NEW_SAMPLER | NV30_NEW_FRAGPROG, + .hw = 0 + } +}; diff --git a/src/gallium/drivers/nv30/nv30_miptree.c b/src/gallium/drivers/nv30/nv30_miptree.c new file mode 100644 index 0000000000..b11ed8c24e --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_miptree.c @@ -0,0 +1,235 @@ +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" + +#include "nv30_context.h" + +static void +nv30_miptree_layout(struct nv30_miptree *nv30mt) +{ + struct pipe_texture *pt = &nv30mt->base; + uint width = pt->width[0], height = pt->height[0], depth = pt->depth[0]; + uint offset = 0; + int nr_faces, l, f; + uint wide_pitch = pt->tex_usage & (PIPE_TEXTURE_USAGE_SAMPLER | + PIPE_TEXTURE_USAGE_DEPTH_STENCIL | + PIPE_TEXTURE_USAGE_RENDER_TARGET | + PIPE_TEXTURE_USAGE_DISPLAY_TARGET | + PIPE_TEXTURE_USAGE_PRIMARY); + + if (pt->target == PIPE_TEXTURE_CUBE) { + nr_faces = 6; + } else + if (pt->target == PIPE_TEXTURE_3D) { + nr_faces = pt->depth[0]; + } else { + nr_faces = 1; + } + + for (l = 0; l <= pt->last_level; l++) { + pt->width[l] = width; + pt->height[l] = height; + pt->depth[l] = depth; + pt->nblocksx[l] = pf_get_nblocksx(&pt->block, width); + pt->nblocksy[l] = pf_get_nblocksy(&pt->block, height); + + if (wide_pitch && (pt->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) + nv30mt->level[l].pitch = align(pt->width[0] * pt->block.size, 64); + else + nv30mt->level[l].pitch = pt->width[l] * pt->block.size; + + nv30mt->level[l].image_offset = + CALLOC(nr_faces, sizeof(unsigned)); + + width = MAX2(1, width >> 1); + height = MAX2(1, height >> 1); + depth = MAX2(1, depth >> 1); + } + + for (f = 0; f < nr_faces; f++) { + for (l = 0; l < pt->last_level; l++) { + nv30mt->level[l].image_offset[f] = offset; + + if (!(pt->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR) && + pt->width[l + 1] > 1 && pt->height[l + 1] > 1) + offset += align(nv30mt->level[l].pitch * pt->height[l], 64); + else + offset += 
nv30mt->level[l].pitch * pt->height[l]; + } + + nv30mt->level[l].image_offset[f] = offset; + offset += nv30mt->level[l].pitch * pt->height[l]; + } + + nv30mt->total_size = offset; +} + +static struct pipe_texture * +nv30_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *pt) +{ + struct pipe_winsys *ws = pscreen->winsys; + struct nv30_miptree *mt; + + mt = MALLOC(sizeof(struct nv30_miptree)); + if (!mt) + return NULL; + mt->base = *pt; + mt->base.refcount = 1; + mt->base.screen = pscreen; + mt->shadow_tex = NULL; + mt->shadow_surface = NULL; + + /* Swizzled textures must be POT */ + if (pt->width[0] & (pt->width[0] - 1) || + pt->height[0] & (pt->height[0] - 1)) + mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR; + else + if (pt->tex_usage & (PIPE_TEXTURE_USAGE_PRIMARY | + PIPE_TEXTURE_USAGE_DISPLAY_TARGET | + PIPE_TEXTURE_USAGE_DEPTH_STENCIL)) + mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR; + else + if (pt->tex_usage & PIPE_TEXTURE_USAGE_DYNAMIC) + mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR; + else { + switch (pt->format) { + /* TODO: Figure out which formats can be swizzled */ + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_X8R8G8B8_UNORM: + case PIPE_FORMAT_R16_SNORM: + { + if (debug_get_bool_option("NOUVEAU_NO_SWIZZLE", FALSE)) + mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR; + break; + } + default: + mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR; + } + } + + nv30_miptree_layout(mt); + + mt->buffer = ws->buffer_create(ws, 256, + PIPE_BUFFER_USAGE_PIXEL | + NOUVEAU_BUFFER_USAGE_TEXTURE, + mt->total_size); + if (!mt->buffer) { + FREE(mt); + return NULL; + } + + return &mt->base; +} + +static struct pipe_texture * +nv30_miptree_blanket(struct pipe_screen *pscreen, const struct pipe_texture *pt, + const unsigned *stride, struct pipe_buffer *pb) +{ + struct nv30_miptree *mt; + + /* Only supports 2D, non-mipmapped textures for the moment */ + if (pt->target != PIPE_TEXTURE_2D || pt->last_level != 0 || + pt->depth[0] != 1) + return NULL; + + mt = CALLOC_STRUCT(nv30_miptree); + if (!mt) + return NULL; + + mt->base = *pt; + mt->base.refcount = 1; + mt->base.screen = pscreen; + mt->level[0].pitch = stride[0]; + mt->level[0].image_offset = CALLOC(1, sizeof(unsigned)); + + pipe_buffer_reference(pscreen, &mt->buffer, pb); + return &mt->base; +} + +static void +nv30_miptree_release(struct pipe_screen *pscreen, struct pipe_texture **ppt) +{ + struct pipe_texture *pt = *ppt; + struct nv30_miptree *mt = (struct nv30_miptree *)pt; + int l; + + *ppt = NULL; + if (--pt->refcount) + return; + + pipe_buffer_reference(pscreen, &mt->buffer, NULL); + for (l = 0; l <= pt->last_level; l++) { + if (mt->level[l].image_offset) + FREE(mt->level[l].image_offset); + } + + if (mt->shadow_tex) { + if (mt->shadow_surface) + pscreen->tex_surface_release(pscreen, &mt->shadow_surface); + nv30_miptree_release(pscreen, &mt->shadow_tex); + } + + FREE(mt); +} + +static struct pipe_surface * +nv30_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_texture *pt, + unsigned face, unsigned level, unsigned zslice, + unsigned flags) +{ + struct nv30_miptree *nv30mt = (struct nv30_miptree *)pt; + struct pipe_surface *ps; + + ps = CALLOC_STRUCT(pipe_surface); + if (!ps) + return NULL; + pipe_texture_reference(&ps->texture, pt); + ps->format = pt->format; + ps->width = pt->width[level]; + ps->height = pt->height[level]; + ps->block = pt->block; + ps->nblocksx = pt->nblocksx[level]; + ps->nblocksy = pt->nblocksy[level]; + ps->stride = nv30mt->level[level].pitch; + ps->usage = 
flags; + ps->status = PIPE_SURFACE_STATUS_DEFINED; + ps->refcount = 1; + ps->face = face; + ps->level = level; + ps->zslice = zslice; + + if (pt->target == PIPE_TEXTURE_CUBE) { + ps->offset = nv30mt->level[level].image_offset[face]; + } else + if (pt->target == PIPE_TEXTURE_3D) { + ps->offset = nv30mt->level[level].image_offset[zslice]; + } else { + ps->offset = nv30mt->level[level].image_offset[0]; + } + + return ps; +} + +static void +nv30_miptree_surface_del(struct pipe_screen *pscreen, + struct pipe_surface **psurface) +{ + struct pipe_surface *ps = *psurface; + + *psurface = NULL; + if (--ps->refcount > 0) + return; + + pipe_texture_reference(&ps->texture, NULL); + FREE(ps); +} + +void +nv30_screen_init_miptree_functions(struct pipe_screen *pscreen) +{ + pscreen->texture_create = nv30_miptree_create; + pscreen->texture_blanket = nv30_miptree_blanket; + pscreen->texture_release = nv30_miptree_release; + pscreen->get_tex_surface = nv30_miptree_surface_new; + pscreen->tex_surface_release = nv30_miptree_surface_del; +} diff --git a/src/gallium/drivers/nv30/nv30_query.c b/src/gallium/drivers/nv30/nv30_query.c new file mode 100644 index 0000000000..2f974cf5c4 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_query.c @@ -0,0 +1,122 @@ +#include "pipe/p_context.h" + +#include "nv30_context.h" + +struct nv30_query { + struct nouveau_resource *object; + unsigned type; + boolean ready; + uint64_t result; +}; + +static INLINE struct nv30_query * +nv30_query(struct pipe_query *pipe) +{ + return (struct nv30_query *)pipe; +} + +static struct pipe_query * +nv30_query_create(struct pipe_context *pipe, unsigned query_type) +{ + struct nv30_query *q; + + q = CALLOC(1, sizeof(struct nv30_query)); + q->type = query_type; + + return (struct pipe_query *)q; +} + +static void +nv30_query_destroy(struct pipe_context *pipe, struct pipe_query *pq) +{ + struct nv30_context *nv30 = nv30_context(pipe); + struct nv30_query *q = nv30_query(pq); + + if (q->object) + nv30->nvws->res_free(&q->object); + FREE(q); +} + +static void +nv30_query_begin(struct pipe_context *pipe, struct pipe_query *pq) +{ + struct nv30_context *nv30 = nv30_context(pipe); + struct nv30_query *q = nv30_query(pq); + + assert(q->type == PIPE_QUERY_OCCLUSION_COUNTER); + + /* Happens when end_query() is called, then another begin_query() + * without querying the result in-between. For now we'll wait for + * the existing query to notify completion, but it could be better. 
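+	 * (get_query_result() with wait == TRUE blocks on the notifier until
+	 * it reports NV_NOTIFY_STATE_STATUS_COMPLETED and then res_free()s
+	 * the old q->object, so the res_alloc() below gets a fresh slot in
+	 * the query heap.)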
+ */ + if (q->object) { + uint64_t tmp; + pipe->get_query_result(pipe, pq, 1, &tmp); + } + + if (nv30->nvws->res_alloc(nv30->screen->query_heap, 1, NULL, &q->object)) + assert(0); + nv30->nvws->notifier_reset(nv30->screen->query, q->object->start); + + BEGIN_RING(rankine, NV34TCL_QUERY_RESET, 1); + OUT_RING (1); + BEGIN_RING(rankine, NV34TCL_QUERY_UNK17CC, 1); + OUT_RING (1); + + q->ready = FALSE; +} + +static void +nv30_query_end(struct pipe_context *pipe, struct pipe_query *pq) +{ + struct nv30_context *nv30 = nv30_context(pipe); + struct nv30_query *q = nv30_query(pq); + + BEGIN_RING(rankine, NV34TCL_QUERY_GET, 1); + OUT_RING ((0x01 << NV34TCL_QUERY_GET_UNK24_SHIFT) | + ((q->object->start * 32) << NV34TCL_QUERY_GET_OFFSET_SHIFT)); + FIRE_RING(NULL); +} + +static boolean +nv30_query_result(struct pipe_context *pipe, struct pipe_query *pq, + boolean wait, uint64_t *result) +{ + struct nv30_context *nv30 = nv30_context(pipe); + struct nv30_query *q = nv30_query(pq); + struct nouveau_winsys *nvws = nv30->nvws; + + assert(q->object && q->type == PIPE_QUERY_OCCLUSION_COUNTER); + + if (!q->ready) { + unsigned status; + + status = nvws->notifier_status(nv30->screen->query, + q->object->start); + if (status != NV_NOTIFY_STATE_STATUS_COMPLETED) { + if (wait == FALSE) + return FALSE; + nvws->notifier_wait(nv30->screen->query, q->object->start, + NV_NOTIFY_STATE_STATUS_COMPLETED, + 0); + } + + q->result = nvws->notifier_retval(nv30->screen->query, + q->object->start); + q->ready = TRUE; + nvws->res_free(&q->object); + } + + *result = q->result; + return TRUE; +} + +void +nv30_init_query_functions(struct nv30_context *nv30) +{ + nv30->pipe.create_query = nv30_query_create; + nv30->pipe.destroy_query = nv30_query_destroy; + nv30->pipe.begin_query = nv30_query_begin; + nv30->pipe.end_query = nv30_query_end; + nv30->pipe.get_query_result = nv30_query_result; +} diff --git a/src/gallium/drivers/nv30/nv30_screen.c b/src/gallium/drivers/nv30/nv30_screen.c new file mode 100644 index 0000000000..c97a73f0b1 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_screen.c @@ -0,0 +1,401 @@ +#include "pipe/p_screen.h" +#include "util/u_simple_screen.h" + +#include "nv30_context.h" +#include "nv30_screen.h" + +#define NV30TCL_CHIPSET_3X_MASK 0x00000003 +#define NV34TCL_CHIPSET_3X_MASK 0x00000010 +#define NV35TCL_CHIPSET_3X_MASK 0x000001e0 + +static const char * +nv30_screen_get_name(struct pipe_screen *pscreen) +{ + struct nv30_screen *screen = nv30_screen(pscreen); + struct nouveau_device *dev = screen->nvws->channel->device; + static char buffer[128]; + + snprintf(buffer, sizeof(buffer), "NV%02X", dev->chipset); + return buffer; +} + +static const char * +nv30_screen_get_vendor(struct pipe_screen *pscreen) +{ + return "nouveau"; +} + +static int +nv30_screen_get_param(struct pipe_screen *pscreen, int param) +{ + switch (param) { + case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS: + return 16; + case PIPE_CAP_NPOT_TEXTURES: + return 0; + case PIPE_CAP_TWO_SIDED_STENCIL: + return 1; + case PIPE_CAP_GLSL: + return 0; + case PIPE_CAP_S3TC: + return 0; + case PIPE_CAP_ANISOTROPIC_FILTER: + return 1; + case PIPE_CAP_POINT_SPRITE: + return 1; + case PIPE_CAP_MAX_RENDER_TARGETS: + return 2; + case PIPE_CAP_OCCLUSION_QUERY: + return 1; + case PIPE_CAP_TEXTURE_SHADOW_MAP: + return 1; + case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: + return 13; + case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: + return 10; + case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: + return 13; + case PIPE_CAP_TEXTURE_MIRROR_CLAMP: + return 0; + case PIPE_CAP_TEXTURE_MIRROR_REPEAT: + return 
1; + case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS: + return 0; + case NOUVEAU_CAP_HW_VTXBUF: + case NOUVEAU_CAP_HW_IDXBUF: + return 1; + default: + NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param); + return 0; + } +} + +static float +nv30_screen_get_paramf(struct pipe_screen *pscreen, int param) +{ + switch (param) { + case PIPE_CAP_MAX_LINE_WIDTH: + case PIPE_CAP_MAX_LINE_WIDTH_AA: + return 10.0; + case PIPE_CAP_MAX_POINT_WIDTH: + case PIPE_CAP_MAX_POINT_WIDTH_AA: + return 64.0; + case PIPE_CAP_MAX_TEXTURE_ANISOTROPY: + return 8.0; + case PIPE_CAP_MAX_TEXTURE_LOD_BIAS: + return 4.0; + default: + NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param); + return 0.0; + } +} + +static boolean +nv30_screen_surface_format_supported(struct pipe_screen *pscreen, + enum pipe_format format, + enum pipe_texture_target target, + unsigned tex_usage, unsigned geom_flags) +{ + if (tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET) { + switch (format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_R5G6B5_UNORM: + case PIPE_FORMAT_Z24S8_UNORM: + case PIPE_FORMAT_Z16_UNORM: + return TRUE; + default: + break; + } + } else { + switch (format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_A1R5G5B5_UNORM: + case PIPE_FORMAT_A4R4G4B4_UNORM: + case PIPE_FORMAT_R5G6B5_UNORM: + case PIPE_FORMAT_L8_UNORM: + case PIPE_FORMAT_A8_UNORM: + case PIPE_FORMAT_I8_UNORM: + case PIPE_FORMAT_A8L8_UNORM: + case PIPE_FORMAT_Z16_UNORM: + case PIPE_FORMAT_Z24S8_UNORM: + return TRUE; + default: + break; + } + } + + return FALSE; +} + +static struct pipe_buffer * +nv30_surface_buffer(struct pipe_surface *surf) +{ + struct nv30_miptree *mt = (struct nv30_miptree *)surf->texture; + + return mt->buffer; +} + +static void * +nv30_surface_map(struct pipe_screen *screen, struct pipe_surface *surface, + unsigned flags ) +{ + struct pipe_winsys *ws = screen->winsys; + struct pipe_surface *surface_to_map; + void *map; + + if (!(surface->texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) { + struct nv30_miptree *mt = (struct nv30_miptree *)surface->texture; + + if (!mt->shadow_tex) { + unsigned old_tex_usage = surface->texture->tex_usage; + surface->texture->tex_usage = NOUVEAU_TEXTURE_USAGE_LINEAR | + PIPE_TEXTURE_USAGE_DYNAMIC; + mt->shadow_tex = screen->texture_create(screen, surface->texture); + surface->texture->tex_usage = old_tex_usage; + + assert(mt->shadow_tex->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR); + } + + mt->shadow_surface = screen->get_tex_surface + ( + screen, mt->shadow_tex, + surface->face, surface->level, surface->zslice, + surface->usage + ); + + surface_to_map = mt->shadow_surface; + } + else + surface_to_map = surface; + + assert(surface_to_map); + + map = ws->buffer_map(ws, nv30_surface_buffer(surface_to_map), flags); + if (!map) + return NULL; + + return map + surface_to_map->offset; +} + +static void +nv30_surface_unmap(struct pipe_screen *screen, struct pipe_surface *surface) +{ + struct pipe_winsys *ws = screen->winsys; + struct pipe_surface *surface_to_unmap; + + /* TODO: Copy from shadow just before push buffer is flushed instead. + There are probably some programs that map/unmap excessively + before rendering. 
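+	   (As written, every unmap of a swizzled surface blits the linear
+	   shadow back into place with eng2d->copy() and then releases the
+	   shadow surface; nv30_surface_map() re-creates it on the next map.)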
*/ + if (!(surface->texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) { + struct nv30_miptree *mt = (struct nv30_miptree *)surface->texture; + + assert(mt->shadow_tex); + + surface_to_unmap = mt->shadow_surface; + } + else + surface_to_unmap = surface; + + assert(surface_to_unmap); + + ws->buffer_unmap(ws, nv30_surface_buffer(surface_to_unmap)); + + if (surface_to_unmap != surface) { + struct nv30_screen *nvscreen = nv30_screen(screen); + + nvscreen->eng2d->copy(nvscreen->eng2d, surface, 0, 0, + surface_to_unmap, 0, 0, + surface->width, surface->height); + + screen->tex_surface_release(screen, &surface_to_unmap); + } +} + +static void +nv30_screen_destroy(struct pipe_screen *pscreen) +{ + struct nv30_screen *screen = nv30_screen(pscreen); + struct nouveau_winsys *nvws = screen->nvws; + + nvws->res_free(&screen->vp_exec_heap); + nvws->res_free(&screen->vp_data_heap); + nvws->res_free(&screen->query_heap); + nvws->notifier_free(&screen->query); + nvws->notifier_free(&screen->sync); + nvws->grobj_free(&screen->rankine); + + FREE(pscreen); +} + +struct pipe_screen * +nv30_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws) +{ + struct nv30_screen *screen = CALLOC_STRUCT(nv30_screen); + struct nouveau_stateobj *so; + unsigned rankine_class = 0; + unsigned chipset = nvws->channel->device->chipset; + int ret, i; + + if (!screen) + return NULL; + screen->nvws = nvws; + + /* 2D engine setup */ + screen->eng2d = nv04_surface_2d_init(nvws); + screen->eng2d->buf = nv30_surface_buffer; + + /* 3D object */ + switch (chipset & 0xf0) { + case 0x30: + if (NV30TCL_CHIPSET_3X_MASK & (1 << (chipset & 0x0f))) + rankine_class = 0x0397; + else + if (NV34TCL_CHIPSET_3X_MASK & (1 << (chipset & 0x0f))) + rankine_class = 0x0697; + else + if (NV35TCL_CHIPSET_3X_MASK & (1 << (chipset & 0x0f))) + rankine_class = 0x0497; + break; + default: + break; + } + + if (!rankine_class) { + NOUVEAU_ERR("Unknown nv3x chipset: nv%02x\n", chipset); + return NULL; + } + + ret = nvws->grobj_alloc(nvws, rankine_class, &screen->rankine); + if (ret) { + NOUVEAU_ERR("Error creating 3D object: %d\n", ret); + return FALSE; + } + + /* Notifier for sync purposes */ + ret = nvws->notifier_alloc(nvws, 1, &screen->sync); + if (ret) { + NOUVEAU_ERR("Error creating notifier object: %d\n", ret); + nv30_screen_destroy(&screen->pipe); + return NULL; + } + + /* Query objects */ + ret = nvws->notifier_alloc(nvws, 32, &screen->query); + if (ret) { + NOUVEAU_ERR("Error initialising query objects: %d\n", ret); + nv30_screen_destroy(&screen->pipe); + return NULL; + } + + ret = nvws->res_init(&screen->query_heap, 0, 32); + if (ret) { + NOUVEAU_ERR("Error initialising query object heap: %d\n", ret); + nv30_screen_destroy(&screen->pipe); + return NULL; + } + + /* Vtxprog resources */ + if (nvws->res_init(&screen->vp_exec_heap, 0, 256) || + nvws->res_init(&screen->vp_data_heap, 0, 256)) { + nv30_screen_destroy(&screen->pipe); + return NULL; + } + + /* Static rankine initialisation */ + so = so_new(128, 0); + so_method(so, screen->rankine, NV34TCL_DMA_NOTIFY, 1); + so_data (so, screen->sync->handle); + so_method(so, screen->rankine, NV34TCL_DMA_TEXTURE0, 2); + so_data (so, nvws->channel->vram->handle); + so_data (so, nvws->channel->gart->handle); + so_method(so, screen->rankine, NV34TCL_DMA_COLOR1, 1); + so_data (so, nvws->channel->vram->handle); + so_method(so, screen->rankine, NV34TCL_DMA_COLOR0, 2); + so_data (so, nvws->channel->vram->handle); + so_data (so, nvws->channel->vram->handle); + so_method(so, screen->rankine, NV34TCL_DMA_VTXBUF0, 
2); + so_data (so, nvws->channel->vram->handle); + so_data (so, nvws->channel->gart->handle); +/* so_method(so, screen->rankine, NV34TCL_DMA_FENCE, 2); + so_data (so, 0); + so_data (so, screen->query->handle);*/ + so_method(so, screen->rankine, NV34TCL_DMA_IN_MEMORY7, 1); + so_data (so, nvws->channel->vram->handle); + so_method(so, screen->rankine, NV34TCL_DMA_IN_MEMORY8, 1); + so_data (so, nvws->channel->vram->handle); + + for (i=1; i<8; i++) { + so_method(so, screen->rankine, NV34TCL_VIEWPORT_CLIP_HORIZ(i), 1); + so_data (so, 0); + so_method(so, screen->rankine, NV34TCL_VIEWPORT_CLIP_VERT(i), 1); + so_data (so, 0); + } + + so_method(so, screen->rankine, 0x220, 1); + so_data (so, 1); + + so_method(so, screen->rankine, 0x03b0, 1); + so_data (so, 0x00100000); + so_method(so, screen->rankine, 0x1454, 1); + so_data (so, 0); + so_method(so, screen->rankine, 0x1d80, 1); + so_data (so, 3); + so_method(so, screen->rankine, 0x1450, 1); + so_data (so, 0x00030004); + + /* NEW */ + so_method(so, screen->rankine, 0x1e98, 1); + so_data (so, 0); + so_method(so, screen->rankine, 0x17e0, 3); + so_data (so, fui(0.0)); + so_data (so, fui(0.0)); + so_data (so, fui(1.0)); + so_method(so, screen->rankine, 0x1f80, 16); + for (i=0; i<16; i++) { + so_data (so, (i==8) ? 0x0000ffff : 0); + } + + so_method(so, screen->rankine, 0x120, 3); + so_data (so, 0); + so_data (so, 1); + so_data (so, 2); + + so_method(so, screen->rankine, 0x1d88, 1); + so_data (so, 0x00001200); + + so_method(so, screen->rankine, NV34TCL_RC_ENABLE, 1); + so_data (so, 0); + + so_method(so, screen->rankine, NV34TCL_DEPTH_RANGE_NEAR, 2); + so_data (so, fui(0.0)); + so_data (so, fui(1.0)); + + so_method(so, screen->rankine, NV34TCL_MULTISAMPLE_CONTROL, 1); + so_data (so, 0xffff0000); + + /* enables use of vp rather than fixed-function somehow */ + so_method(so, screen->rankine, 0x1e94, 1); + so_data (so, 0x13); + + so_emit(nvws, so); + so_ref(NULL, &so); + nvws->push_flush(nvws, 0, NULL); + + screen->pipe.winsys = ws; + screen->pipe.destroy = nv30_screen_destroy; + + screen->pipe.get_name = nv30_screen_get_name; + screen->pipe.get_vendor = nv30_screen_get_vendor; + screen->pipe.get_param = nv30_screen_get_param; + screen->pipe.get_paramf = nv30_screen_get_paramf; + + screen->pipe.is_format_supported = nv30_screen_surface_format_supported; + + screen->pipe.surface_map = nv30_surface_map; + screen->pipe.surface_unmap = nv30_surface_unmap; + + nv30_screen_init_miptree_functions(&screen->pipe); + u_simple_screen_init(&screen->pipe); + + return &screen->pipe; +} diff --git a/src/gallium/drivers/nv30/nv30_screen.h b/src/gallium/drivers/nv30/nv30_screen.h new file mode 100644 index 0000000000..b11e470f94 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_screen.h @@ -0,0 +1,37 @@ +#ifndef __NV30_SCREEN_H__ +#define __NV30_SCREEN_H__ + +#include "pipe/p_screen.h" +#include "nv04/nv04_surface_2d.h" + +struct nv30_screen { + struct pipe_screen pipe; + + struct nouveau_winsys *nvws; + + unsigned cur_pctx; + + /* HW graphics objects */ + struct nv04_surface_2d *eng2d; + struct nouveau_grobj *rankine; + struct nouveau_notifier *sync; + + /* Query object resources */ + struct nouveau_notifier *query; + struct nouveau_resource *query_heap; + + /* Vtxprog resources */ + struct nouveau_resource *vp_exec_heap; + struct nouveau_resource *vp_data_heap; + + /* Current 3D state of channel */ + struct nouveau_stateobj *state[NV30_STATE_MAX]; +}; + +static INLINE struct nv30_screen * +nv30_screen(struct pipe_screen *screen) +{ + return (struct nv30_screen *)screen; +} + 
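+/* Convenience downcast used by nv30_screen.c and the rest of the driver,
+ * e.g.:
+ *
+ *	struct nv30_screen *screen = nv30_screen(pscreen);
+ *
+ * It relies on the pipe_screen being the first member of nv30_screen.
+ */
+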
+#endif diff --git a/src/gallium/drivers/nv30/nv30_shader.h b/src/gallium/drivers/nv30/nv30_shader.h new file mode 100644 index 0000000000..dd3a36f78f --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_shader.h @@ -0,0 +1,490 @@ +#ifndef __NV30_SHADER_H__ +#define __NV30_SHADER_H__ + +/* Vertex programs instruction set + * + * 128bit opcodes, split into 4 32-bit ones for ease of use. + * + * Non-native instructions + * ABS - MOV + NV40_VP_INST0_DEST_ABS + * POW - EX2 + MUL + LG2 + * SUB - ADD, second source negated + * SWZ - MOV + * XPD - + * + * Register access + * - Only one INPUT can be accessed per-instruction (move extras into TEMPs) + * - Only one CONST can be accessed per-instruction (move extras into TEMPs) + * + * Relative Addressing + * According to the value returned for + * MAX_PROGRAM_NATIVE_ADDRESS_REGISTERS_ARB + * + * there are only two address registers available. The destination in the + * ARL instruction is set to TEMP <n> (The temp isn't actually written). + * + * When using vanilla ARB_v_p, the proprietary driver will squish both the + * available ADDRESS regs into the first hardware reg in the X and Y + * components. + * + * To use an address reg as an index into consts, the CONST_SRC is set to + * (const_base + offset) and INDEX_CONST is set. + * + * To access the second address reg use ADDR_REG_SELECT_1. A particular + * component of the address regs is selected with ADDR_SWZ. + * + * Only one address register can be accessed per instruction. + * + * Conditional execution (see NV_vertex_program{2,3} for details) Conditional + * execution of an instruction is enabled by setting COND_TEST_ENABLE, and + * selecting the condition which will allow the test to pass with + * COND_{FL,LT,...}. It is possible to swizzle the values in the condition + * register, which allows for testing against an individual component. + * + * Branching: + * + * The BRA/CAL instructions seem to follow a slightly different opcode + * layout. The destination instruction ID (IADDR) overlaps a source field. + * Instruction ID's seem to be numbered based on the UPLOAD_FROM_ID FIFO + * command, and is incremented automatically on each UPLOAD_INST FIFO + * command. + * + * Conditional branching is achieved by using the condition tests described + * above. There doesn't appear to be dedicated looping instructions, but + * this can be done using a temp reg + conditional branching. + * + * Subroutines may be uploaded before the main program itself, but the first + * executed instruction is determined by the PROGRAM_START_ID FIFO command. 
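+ *
+ * As a rough example of the encoding (a guess, like much of the above):
+ * a plain "MOV R0, v[POS]" would put NV30_VP_INST_OP_MOV in the
+ * VEC_OPCODE field of DWORD 1, select the input via INPUT_SRC/IN_POS,
+ * and name the destination through DEST_TEMP_ID (DWORD 0) and the
+ * VTEMP/VDEST writemasks (DWORD 3).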
+ * + */ + +/* DWORD 0 */ + +#define NV30_VP_INST_ADDR_REG_SELECT_1 (1 << 24) +#define NV30_VP_INST_SRC2_ABS (1 << 23) /* guess */ +#define NV30_VP_INST_SRC1_ABS (1 << 22) /* guess */ +#define NV30_VP_INST_SRC0_ABS (1 << 21) /* guess */ +#define NV30_VP_INST_VEC_RESULT (1 << 20) +#define NV30_VP_INST_DEST_TEMP_ID_SHIFT 16 +#define NV30_VP_INST_DEST_TEMP_ID_MASK (0x0F << 16) +#define NV30_VP_INST_COND_UPDATE_ENABLE (1<<15) +#define NV30_VP_INST_VEC_DEST_TEMP_MASK (0xF << 16) +#define NV30_VP_INST_COND_TEST_ENABLE (1<<14) +#define NV30_VP_INST_COND_SHIFT 11 +#define NV30_VP_INST_COND_MASK (0x07 << 11) +# define NV30_VP_INST_COND_FL 0 /* guess */ +# define NV30_VP_INST_COND_LT 1 +# define NV30_VP_INST_COND_EQ 2 +# define NV30_VP_INST_COND_LE 3 +# define NV30_VP_INST_COND_GT 4 +# define NV30_VP_INST_COND_NE 5 +# define NV30_VP_INST_COND_GE 6 +# define NV30_VP_INST_COND_TR 7 /* guess */ +#define NV30_VP_INST_COND_SWZ_X_SHIFT 9 +#define NV30_VP_INST_COND_SWZ_X_MASK (0x03 << 9) +#define NV30_VP_INST_COND_SWZ_Y_SHIFT 7 +#define NV30_VP_INST_COND_SWZ_Y_MASK (0x03 << 7) +#define NV30_VP_INST_COND_SWZ_Z_SHIFT 5 +#define NV30_VP_INST_COND_SWZ_Z_MASK (0x03 << 5) +#define NV30_VP_INST_COND_SWZ_W_SHIFT 3 +#define NV30_VP_INST_COND_SWZ_W_MASK (0x03 << 3) +#define NV30_VP_INST_COND_SWZ_ALL_SHIFT 3 +#define NV30_VP_INST_COND_SWZ_ALL_MASK (0xFF << 3) +#define NV30_VP_INST_ADDR_SWZ_SHIFT 1 +#define NV30_VP_INST_ADDR_SWZ_MASK (0x03 << 1) +#define NV30_VP_INST_SCA_OPCODEH_SHIFT 0 +#define NV30_VP_INST_SCA_OPCODEH_MASK (0x01 << 0) + +/* DWORD 1 */ +#define NV30_VP_INST_SCA_OPCODEL_SHIFT 28 +#define NV30_VP_INST_SCA_OPCODEL_MASK (0x0F << 28) +# define NV30_VP_INST_OP_NOP 0x00 +# define NV30_VP_INST_OP_RCP 0x02 +# define NV30_VP_INST_OP_RCC 0x03 +# define NV30_VP_INST_OP_RSQ 0x04 +# define NV30_VP_INST_OP_EXP 0x05 +# define NV30_VP_INST_OP_LOG 0x06 +# define NV30_VP_INST_OP_LIT 0x07 +# define NV30_VP_INST_OP_BRA 0x09 +# define NV30_VP_INST_OP_CAL 0x0B +# define NV30_VP_INST_OP_RET 0x0C +# define NV30_VP_INST_OP_LG2 0x0D +# define NV30_VP_INST_OP_EX2 0x0E +# define NV30_VP_INST_OP_SIN 0x0F +# define NV30_VP_INST_OP_COS 0x10 +#define NV30_VP_INST_VEC_OPCODE_SHIFT 23 +#define NV30_VP_INST_VEC_OPCODE_MASK (0x1F << 23) +# define NV30_VP_INST_OP_NOPV 0x00 +# define NV30_VP_INST_OP_MOV 0x01 +# define NV30_VP_INST_OP_MUL 0x02 +# define NV30_VP_INST_OP_ADD 0x03 +# define NV30_VP_INST_OP_MAD 0x04 +# define NV30_VP_INST_OP_DP3 0x05 +# define NV30_VP_INST_OP_DP4 0x07 +# define NV30_VP_INST_OP_DPH 0x06 +# define NV30_VP_INST_OP_DST 0x08 +# define NV30_VP_INST_OP_MIN 0x09 +# define NV30_VP_INST_OP_MAX 0x0A +# define NV30_VP_INST_OP_SLT 0x0B +# define NV30_VP_INST_OP_SGE 0x0C +# define NV30_VP_INST_OP_ARL 0x0D +# define NV30_VP_INST_OP_FRC 0x0E +# define NV30_VP_INST_OP_FLR 0x0F +# define NV30_VP_INST_OP_SEQ 0x10 +# define NV30_VP_INST_OP_SFL 0x11 +# define NV30_VP_INST_OP_SGT 0x12 +# define NV30_VP_INST_OP_SLE 0x13 +# define NV30_VP_INST_OP_SNE 0x14 +# define NV30_VP_INST_OP_STR 0x15 +# define NV30_VP_INST_OP_SSG 0x16 +# define NV30_VP_INST_OP_ARR 0x17 +# define NV30_VP_INST_OP_ARA 0x18 +#define NV30_VP_INST_CONST_SRC_SHIFT 14 +#define NV30_VP_INST_CONST_SRC_MASK (0xFF << 14) +#define NV30_VP_INST_INPUT_SRC_SHIFT 9 /*NV20*/ +#define NV30_VP_INST_INPUT_SRC_MASK (0x0F << 9) /*NV20*/ +# define NV30_VP_INST_IN_POS 0 /* These seem to match the bindings specified in */ +# define NV30_VP_INST_IN_WEIGHT 1 /* the ARB_v_p spec (2.14.3.1) */ +# define NV30_VP_INST_IN_NORMAL 2 +# define NV30_VP_INST_IN_COL0 3 /* Should probably confirm 
them all though */ +# define NV30_VP_INST_IN_COL1 4 +# define NV30_VP_INST_IN_FOGC 5 +# define NV30_VP_INST_IN_TC0 8 +# define NV30_VP_INST_IN_TC(n) (8+n) +#define NV30_VP_INST_SRC0H_SHIFT 0 /*NV20*/ +#define NV30_VP_INST_SRC0H_MASK (0x1FF << 0) /*NV20*/ + +/* Please note: the IADDR fields overlap other fields because they are used + * only for branch instructions. See Branching: label above + * + * DWORD 2 + */ +#define NV30_VP_INST_SRC0L_SHIFT 26 /*NV20*/ +#define NV30_VP_INST_SRC0L_MASK (0x3F <<26) /* NV30_VP_SRC0_LOW_MASK << 26 */ +#define NV30_VP_INST_SRC1_SHIFT 11 /*NV20*/ +#define NV30_VP_INST_SRC1_MASK (0x7FFF<<11) /*NV20*/ +#define NV30_VP_INST_SRC2H_SHIFT 0 /*NV20*/ +#define NV30_VP_INST_SRC2H_MASK (0x7FF << 0) /* NV30_VP_SRC2_HIGH_MASK >> 4*/ +#define NV30_VP_INST_IADDR_SHIFT 2 +#define NV30_VP_INST_IADDR_MASK (0xF << 28) /* NV30_VP_SRC2_LOW_MASK << 28 */ + +/* DWORD 3 */ +#define NV30_VP_INST_SRC2L_SHIFT 28 /*NV20*/ +#define NV30_VP_INST_SRC2L_MASK (0x0F <<28) /*NV20*/ +#define NV30_VP_INST_STEMP_WRITEMASK_SHIFT 24 +#define NV30_VP_INST_STEMP_WRITEMASK_MASK (0x0F << 24) +#define NV30_VP_INST_VTEMP_WRITEMASK_SHIFT 20 +#define NV30_VP_INST_VTEMP_WRITEMASK_MASK (0x0F << 20) +#define NV30_VP_INST_SDEST_WRITEMASK_SHIFT 16 +#define NV30_VP_INST_SDEST_WRITEMASK_MASK (0x0F << 16) +#define NV30_VP_INST_VDEST_WRITEMASK_SHIFT 12 /*NV20*/ +#define NV30_VP_INST_VDEST_WRITEMASK_MASK (0x0F << 12) /*NV20*/ +#define NV30_VP_INST_DEST_SHIFT 2 +#define NV30_VP_INST_DEST_MASK (0x0F << 2) +# define NV30_VP_INST_DEST_POS 0 +# define NV30_VP_INST_DEST_BFC0 1 +# define NV30_VP_INST_DEST_BFC1 2 +# define NV30_VP_INST_DEST_COL0 3 +# define NV30_VP_INST_DEST_COL1 4 +# define NV30_VP_INST_DEST_FOGC 5 +# define NV30_VP_INST_DEST_PSZ 6 +# define NV30_VP_INST_DEST_TC(n) (8+n) + +#define NV30_VP_INST_LAST (1 << 0) + +/* Useful to split the source selection regs into their pieces */ +#define NV30_VP_SRC0_HIGH_SHIFT 6 +#define NV30_VP_SRC0_HIGH_MASK 0x00007FC0 +#define NV30_VP_SRC0_LOW_MASK 0x0000003F +#define NV30_VP_SRC2_HIGH_SHIFT 4 +#define NV30_VP_SRC2_HIGH_MASK 0x00007FF0 +#define NV30_VP_SRC2_LOW_MASK 0x0000000F + + +/* Source-register definition - matches NV20 exactly */ +#define NV30_VP_SRC_NEGATE (1<<14) +#define NV30_VP_SRC_SWZ_X_SHIFT 12 +#define NV30_VP_SRC_REG_SWZ_X_MASK (0x03 <<12) +#define NV30_VP_SRC_SWZ_Y_SHIFT 10 +#define NV30_VP_SRC_REG_SWZ_Y_MASK (0x03 <<10) +#define NV30_VP_SRC_SWZ_Z_SHIFT 8 +#define NV30_VP_SRC_REG_SWZ_Z_MASK (0x03 << 8) +#define NV30_VP_SRC_SWZ_W_SHIFT 6 +#define NV30_VP_SRC_REG_SWZ_W_MASK (0x03 << 6) +#define NV30_VP_SRC_REG_SWZ_ALL_SHIFT 6 +#define NV30_VP_SRC_REG_SWZ_ALL_MASK (0xFF << 6) +#define NV30_VP_SRC_TEMP_SRC_SHIFT 2 +#define NV30_VP_SRC_REG_TEMP_ID_MASK (0x0F << 0) +#define NV30_VP_SRC_REG_TYPE_SHIFT 0 +#define NV30_VP_SRC_REG_TYPE_MASK (0x03 << 0) +#define NV30_VP_SRC_REG_TYPE_TEMP 1 +#define NV30_VP_SRC_REG_TYPE_INPUT 2 +#define NV30_VP_SRC_REG_TYPE_CONST 3 /* guess */ + +/* + * Each fragment program opcode appears to be comprised of 4 32-bit values. + * + * 0 - Opcode, output reg/mask, ATTRIB source + * 1 - Source 0 + * 2 - Source 1 + * 3 - Source 2 + * + * There appears to be no special difference between result regs and temp regs. + * result.color == R0.xyzw + * result.depth == R1.z + * When the fragprog contains instructions to write depth, NV30_TCL_PRIMITIVE_3D_UNK1D78=0 + * otherwise it is set to 1. + * + * Constants are inserted directly after the instruction that uses them. 
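+ *
+ * (That layout is why nv30_fragprog_validate() patches constant values
+ * straight into fp->insn[] at each recorded nv30_fragment_program_data
+ * offset and re-uploads the whole program whenever a constant buffer
+ * changes, instead of using a separate constant upload path.)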
+ * + * It appears that it's not possible to use two input registers in one + * instruction as the input sourcing is done in the instruction dword + * and not the source selection dwords. As such instructions such as: + * + * ADD result.color, fragment.color, fragment.texcoord[0]; + * + * must be split into two MOV's and then an ADD (nvidia does this) but + * I'm not sure why it's not just one MOV and then source the second input + * in the ADD instruction.. + * + * Negation of the full source is done with NV30_FP_REG_NEGATE, arbitrary + * negation requires multiplication with a const. + * + * Arbitrary swizzling is supported with the exception of SWIZZLE_ZERO/SWIZZLE_ONE + * The temp/result regs appear to be initialised to (0.0, 0.0, 0.0, 0.0) as SWIZZLE_ZERO + * is implemented simply by not writing to the relevant components of the destination. + * + * Conditional execution + * TODO + * + * Non-native instructions: + * LIT + * LRP - MAD+MAD + * SUB - ADD, negate second source + * RSQ - LG2 + EX2 + * POW - LG2 + MUL + EX2 + * SCS - COS + SIN + * XPD + */ + +//== Opcode / Destination selection == +#define NV30_FP_OP_PROGRAM_END (1 << 0) +#define NV30_FP_OP_OUT_REG_SHIFT 1 +#define NV30_FP_OP_OUT_REG_MASK (31 << 1) /* uncertain */ +/* Needs to be set when writing outputs to get expected result.. */ +#define NV30_FP_OP_OUT_REG_HALF (1 << 7) +#define NV30_FP_OP_COND_WRITE_ENABLE (1 << 8) +#define NV30_FP_OP_OUTMASK_SHIFT 9 +#define NV30_FP_OP_OUTMASK_MASK (0xF << 9) +# define NV30_FP_OP_OUT_X (1<<9) +# define NV30_FP_OP_OUT_Y (1<<10) +# define NV30_FP_OP_OUT_Z (1<<11) +# define NV30_FP_OP_OUT_W (1<<12) +/* Uncertain about these, especially the input_src values.. it's possible that + * they can be dynamically changed. + */ +#define NV30_FP_OP_INPUT_SRC_SHIFT 13 +#define NV30_FP_OP_INPUT_SRC_MASK (15 << 13) +# define NV30_FP_OP_INPUT_SRC_POSITION 0x0 +# define NV30_FP_OP_INPUT_SRC_COL0 0x1 +# define NV30_FP_OP_INPUT_SRC_COL1 0x2 +# define NV30_FP_OP_INPUT_SRC_FOGC 0x3 +# define NV30_FP_OP_INPUT_SRC_TC0 0x4 +# define NV30_FP_OP_INPUT_SRC_TC(n) (0x4 + n) +#define NV30_FP_OP_TEX_UNIT_SHIFT 17 +#define NV30_FP_OP_TEX_UNIT_MASK (0xF << 17) /* guess */ +#define NV30_FP_OP_PRECISION_SHIFT 22 +#define NV30_FP_OP_PRECISION_MASK (3 << 22) +# define NV30_FP_PRECISION_FP32 0 +# define NV30_FP_PRECISION_FP16 1 +# define NV30_FP_PRECISION_FX12 2 +#define NV30_FP_OP_OPCODE_SHIFT 24 +#define NV30_FP_OP_OPCODE_MASK (0x3F << 24) +# define NV30_FP_OP_OPCODE_NOP 0x00 +# define NV30_FP_OP_OPCODE_MOV 0x01 +# define NV30_FP_OP_OPCODE_MUL 0x02 +# define NV30_FP_OP_OPCODE_ADD 0x03 +# define NV30_FP_OP_OPCODE_MAD 0x04 +# define NV30_FP_OP_OPCODE_DP3 0x05 +# define NV30_FP_OP_OPCODE_DP4 0x06 +# define NV30_FP_OP_OPCODE_DST 0x07 +# define NV30_FP_OP_OPCODE_MIN 0x08 +# define NV30_FP_OP_OPCODE_MAX 0x09 +# define NV30_FP_OP_OPCODE_SLT 0x0A +# define NV30_FP_OP_OPCODE_SGE 0x0B +# define NV30_FP_OP_OPCODE_SLE 0x0C +# define NV30_FP_OP_OPCODE_SGT 0x0D +# define NV30_FP_OP_OPCODE_SNE 0x0E +# define NV30_FP_OP_OPCODE_SEQ 0x0F +# define NV30_FP_OP_OPCODE_FRC 0x10 +# define NV30_FP_OP_OPCODE_FLR 0x11 +# define NV30_FP_OP_OPCODE_KIL 0x12 +# define NV30_FP_OP_OPCODE_PK4B 0x13 +# define NV30_FP_OP_OPCODE_UP4B 0x14 +# define NV30_FP_OP_OPCODE_DDX 0x15 /* can only write XY */ +# define NV30_FP_OP_OPCODE_DDY 0x16 /* can only write XY */ +# define NV30_FP_OP_OPCODE_TEX 0x17 +# define NV30_FP_OP_OPCODE_TXP 0x18 +# define NV30_FP_OP_OPCODE_TXD 0x19 +# define NV30_FP_OP_OPCODE_RCP 0x1A +# define NV30_FP_OP_OPCODE_RSQ 0x1B +# define 
NV30_FP_OP_OPCODE_EX2 0x1C +# define NV30_FP_OP_OPCODE_LG2 0x1D +# define NV30_FP_OP_OPCODE_LIT 0x1E +# define NV30_FP_OP_OPCODE_LRP 0x1F +# define NV30_FP_OP_OPCODE_STR 0x20 +# define NV30_FP_OP_OPCODE_SFL 0x21 +# define NV30_FP_OP_OPCODE_COS 0x22 +# define NV30_FP_OP_OPCODE_SIN 0x23 +# define NV30_FP_OP_OPCODE_PK2H 0x24 +# define NV30_FP_OP_OPCODE_UP2H 0x25 +# define NV30_FP_OP_OPCODE_POW 0x26 +# define NV30_FP_OP_OPCODE_PK4UB 0x27 +# define NV30_FP_OP_OPCODE_UP4UB 0x28 +# define NV30_FP_OP_OPCODE_PK2US 0x29 +# define NV30_FP_OP_OPCODE_UP2US 0x2A +# define NV30_FP_OP_OPCODE_DP2A 0x2E +# define NV30_FP_OP_OPCODE_TXB 0x31 +# define NV30_FP_OP_OPCODE_RFL 0x36 +# define NV30_FP_OP_OPCODE_DIV 0x3A +#define NV30_FP_OP_OUT_SAT (1 << 31) + +/* high order bits of SRC0 */ +#define NV30_FP_OP_OUT_ABS (1 << 29) +#define NV30_FP_OP_COND_SWZ_W_SHIFT 27 +#define NV30_FP_OP_COND_SWZ_W_MASK (3 << 27) +#define NV30_FP_OP_COND_SWZ_Z_SHIFT 25 +#define NV30_FP_OP_COND_SWZ_Z_MASK (3 << 25) +#define NV30_FP_OP_COND_SWZ_Y_SHIFT 23 +#define NV30_FP_OP_COND_SWZ_Y_MASK (3 << 23) +#define NV30_FP_OP_COND_SWZ_X_SHIFT 21 +#define NV30_FP_OP_COND_SWZ_X_MASK (3 << 21) +#define NV30_FP_OP_COND_SWZ_ALL_SHIFT 21 +#define NV30_FP_OP_COND_SWZ_ALL_MASK (0xFF << 21) +#define NV30_FP_OP_COND_SHIFT 18 +#define NV30_FP_OP_COND_MASK (0x07 << 18) +# define NV30_FP_OP_COND_FL 0 +# define NV30_FP_OP_COND_LT 1 +# define NV30_FP_OP_COND_EQ 2 +# define NV30_FP_OP_COND_LE 3 +# define NV30_FP_OP_COND_GT 4 +# define NV30_FP_OP_COND_NE 5 +# define NV30_FP_OP_COND_GE 6 +# define NV30_FP_OP_COND_TR 7 + +/* high order bits of SRC1 */ +#define NV30_FP_OP_DST_SCALE_SHIFT 28 +#define NV30_FP_OP_DST_SCALE_MASK (3 << 28) +#define NV30_FP_OP_DST_SCALE_1X 0 +#define NV30_FP_OP_DST_SCALE_2X 1 +#define NV30_FP_OP_DST_SCALE_4X 2 +#define NV30_FP_OP_DST_SCALE_8X 3 +#define NV30_FP_OP_DST_SCALE_INV_2X 5 +#define NV30_FP_OP_DST_SCALE_INV_4X 6 +#define NV30_FP_OP_DST_SCALE_INV_8X 7 + + +/* high order bits of SRC2 */ +#define NV30_FP_OP_INDEX_INPUT (1 << 30) + +//== Register selection == +#define NV30_FP_REG_TYPE_SHIFT 0 +#define NV30_FP_REG_TYPE_MASK (3 << 0) +# define NV30_FP_REG_TYPE_TEMP 0 +# define NV30_FP_REG_TYPE_INPUT 1 +# define NV30_FP_REG_TYPE_CONST 2 +#define NV30_FP_REG_SRC_SHIFT 2 /* uncertain */ +#define NV30_FP_REG_SRC_MASK (31 << 2) +#define NV30_FP_REG_SRC_HALF (1 << 8) +#define NV30_FP_REG_SWZ_ALL_SHIFT 9 +#define NV30_FP_REG_SWZ_ALL_MASK (255 << 9) +#define NV30_FP_REG_SWZ_X_SHIFT 9 +#define NV30_FP_REG_SWZ_X_MASK (3 << 9) +#define NV30_FP_REG_SWZ_Y_SHIFT 11 +#define NV30_FP_REG_SWZ_Y_MASK (3 << 11) +#define NV30_FP_REG_SWZ_Z_SHIFT 13 +#define NV30_FP_REG_SWZ_Z_MASK (3 << 13) +#define NV30_FP_REG_SWZ_W_SHIFT 15 +#define NV30_FP_REG_SWZ_W_MASK (3 << 15) +# define NV30_FP_SWIZZLE_X 0 +# define NV30_FP_SWIZZLE_Y 1 +# define NV30_FP_SWIZZLE_Z 2 +# define NV30_FP_SWIZZLE_W 3 +#define NV30_FP_REG_NEGATE (1 << 17) + +#define NV30SR_NONE 0 +#define NV30SR_OUTPUT 1 +#define NV30SR_INPUT 2 +#define NV30SR_TEMP 3 +#define NV30SR_CONST 4 + +struct nv30_sreg { + int type; + int index; + + int dst_scale; + + int negate; + int abs; + int swz[4]; + + int cc_update; + int cc_update_reg; + int cc_test; + int cc_test_reg; + int cc_swz[4]; +}; + +static INLINE struct nv30_sreg +nv30_sr(int type, int index) +{ + struct nv30_sreg temp = { + .type = type, + .index = index, + .dst_scale = DEF_SCALE, + .abs = 0, + .negate = 0, + .swz = { 0, 1, 2, 3 }, + .cc_update = 0, + .cc_update_reg = 0, + .cc_test = DEF_CTEST, + .cc_test_reg = 0, + .cc_swz = { 0, 1, 2, 3 }, 
+ }; + return temp; +} + +static INLINE struct nv30_sreg +nv30_sr_swz(struct nv30_sreg src, int x, int y, int z, int w) +{ + struct nv30_sreg dst = src; + + dst.swz[SWZ_X] = src.swz[x]; + dst.swz[SWZ_Y] = src.swz[y]; + dst.swz[SWZ_Z] = src.swz[z]; + dst.swz[SWZ_W] = src.swz[w]; + return dst; +} + +static INLINE struct nv30_sreg +nv30_sr_neg(struct nv30_sreg src) +{ + src.negate = !src.negate; + return src; +} + +static INLINE struct nv30_sreg +nv30_sr_abs(struct nv30_sreg src) +{ + src.abs = 1; + return src; +} + +static INLINE struct nv30_sreg +nv30_sr_scale(struct nv30_sreg src, int scale) +{ + src.dst_scale = scale; + return src; +} + +#endif diff --git a/src/gallium/drivers/nv30/nv30_state.c b/src/gallium/drivers/nv30/nv30_state.c new file mode 100644 index 0000000000..26147565a5 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_state.c @@ -0,0 +1,725 @@ +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" + +#include "tgsi/tgsi_parse.h" + +#include "nv30_context.h" +#include "nv30_state.h" + +static void * +nv30_blend_state_create(struct pipe_context *pipe, + const struct pipe_blend_state *cso) +{ + struct nv30_context *nv30 = nv30_context(pipe); + struct nouveau_grobj *rankine = nv30->screen->rankine; + struct nv30_blend_state *bso = CALLOC(1, sizeof(*bso)); + struct nouveau_stateobj *so = so_new(16, 0); + + if (cso->blend_enable) { + so_method(so, rankine, NV34TCL_BLEND_FUNC_ENABLE, 3); + so_data (so, 1); + so_data (so, (nvgl_blend_func(cso->alpha_src_factor) << 16) | + nvgl_blend_func(cso->rgb_src_factor)); + so_data (so, nvgl_blend_func(cso->alpha_dst_factor) << 16 | + nvgl_blend_func(cso->rgb_dst_factor)); + /* FIXME: Gallium assumes GL_EXT_blend_func_separate. + It is not the case for NV30 */ + so_method(so, rankine, NV34TCL_BLEND_EQUATION, 1); + so_data (so, nvgl_blend_eqn(cso->rgb_func)); + } else { + so_method(so, rankine, NV34TCL_BLEND_FUNC_ENABLE, 1); + so_data (so, 0); + } + + so_method(so, rankine, NV34TCL_COLOR_MASK, 1); + so_data (so, (((cso->colormask & PIPE_MASK_A) ? (0x01 << 24) : 0) | + ((cso->colormask & PIPE_MASK_R) ? (0x01 << 16) : 0) | + ((cso->colormask & PIPE_MASK_G) ? (0x01 << 8) : 0) | + ((cso->colormask & PIPE_MASK_B) ? (0x01 << 0) : 0))); + + if (cso->logicop_enable) { + so_method(so, rankine, NV34TCL_COLOR_LOGIC_OP_ENABLE, 2); + so_data (so, 1); + so_data (so, nvgl_logicop_func(cso->logicop_func)); + } else { + so_method(so, rankine, NV34TCL_COLOR_LOGIC_OP_ENABLE, 1); + so_data (so, 0); + } + + so_method(so, rankine, NV34TCL_DITHER_ENABLE, 1); + so_data (so, cso->dither ? 
1 : 0); + + so_ref(so, &bso->so); + bso->pipe = *cso; + return (void *)bso; +} + +static void +nv30_blend_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nv30_context *nv30 = nv30_context(pipe); + + nv30->blend = hwcso; + nv30->dirty |= NV30_NEW_BLEND; +} + +static void +nv30_blend_state_delete(struct pipe_context *pipe, void *hwcso) +{ + struct nv30_blend_state *bso = hwcso; + + so_ref(NULL, &bso->so); + FREE(bso); +} + + +static INLINE unsigned +wrap_mode(unsigned wrap) { + unsigned ret; + + switch (wrap) { + case PIPE_TEX_WRAP_REPEAT: + ret = NV34TCL_TX_WRAP_S_REPEAT; + break; + case PIPE_TEX_WRAP_MIRROR_REPEAT: + ret = NV34TCL_TX_WRAP_S_MIRRORED_REPEAT; + break; + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + ret = NV34TCL_TX_WRAP_S_CLAMP_TO_EDGE; + break; + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + ret = NV34TCL_TX_WRAP_S_CLAMP_TO_BORDER; + break; + case PIPE_TEX_WRAP_CLAMP: + ret = NV34TCL_TX_WRAP_S_CLAMP; + break; +/* case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: + ret = NV34TCL_TX_WRAP_S_MIRROR_CLAMP_TO_EDGE; + break; + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: + ret = NV34TCL_TX_WRAP_S_MIRROR_CLAMP_TO_BORDER; + break; + case PIPE_TEX_WRAP_MIRROR_CLAMP: + ret = NV34TCL_TX_WRAP_S_MIRROR_CLAMP; + break;*/ + default: + NOUVEAU_ERR("unknown wrap mode: %d\n", wrap); + ret = NV34TCL_TX_WRAP_S_REPEAT; + break; + } + + return ret >> NV34TCL_TX_WRAP_S_SHIFT; +} + +static void * +nv30_sampler_state_create(struct pipe_context *pipe, + const struct pipe_sampler_state *cso) +{ + struct nv30_sampler_state *ps; + uint32_t filter = 0; + + ps = MALLOC(sizeof(struct nv30_sampler_state)); + + ps->fmt = 0; + /* TODO: Not all RECTs formats have this bit set, bits 15-8 of format + are the tx format to use. We should store normalized coord flag + in sampler state structure, and set appropriate format in + nvxx_fragtex_build() + */ + /*NV34TCL_TX_FORMAT_RECT*/ + /*if (!cso->normalized_coords) { + ps->fmt |= (1<<14) ; + }*/ + + ps->wrap = ((wrap_mode(cso->wrap_s) << NV34TCL_TX_WRAP_S_SHIFT) | + (wrap_mode(cso->wrap_t) << NV34TCL_TX_WRAP_T_SHIFT) | + (wrap_mode(cso->wrap_r) << NV34TCL_TX_WRAP_R_SHIFT)); + + ps->en = 0; + + if (cso->max_anisotropy >= 8.0) { + ps->en |= NV34TCL_TX_ENABLE_ANISO_8X; + } else + if (cso->max_anisotropy >= 4.0) { + ps->en |= NV34TCL_TX_ENABLE_ANISO_4X; + } else + if (cso->max_anisotropy >= 2.0) { + ps->en |= NV34TCL_TX_ENABLE_ANISO_2X; + } + + switch (cso->mag_img_filter) { + case PIPE_TEX_FILTER_LINEAR: + filter |= NV34TCL_TX_FILTER_MAGNIFY_LINEAR; + break; + case PIPE_TEX_FILTER_NEAREST: + default: + filter |= NV34TCL_TX_FILTER_MAGNIFY_NEAREST; + break; + } + + switch (cso->min_img_filter) { + case PIPE_TEX_FILTER_LINEAR: + switch (cso->min_mip_filter) { + case PIPE_TEX_MIPFILTER_NEAREST: + filter |= NV34TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_NEAREST; + break; + case PIPE_TEX_MIPFILTER_LINEAR: + filter |= NV34TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_LINEAR; + break; + case PIPE_TEX_MIPFILTER_NONE: + default: + filter |= NV34TCL_TX_FILTER_MINIFY_LINEAR; + break; + } + break; + case PIPE_TEX_FILTER_NEAREST: + default: + switch (cso->min_mip_filter) { + case PIPE_TEX_MIPFILTER_NEAREST: + filter |= NV34TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_NEAREST; + break; + case PIPE_TEX_MIPFILTER_LINEAR: + filter |= NV34TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_LINEAR; + break; + case PIPE_TEX_MIPFILTER_NONE: + default: + filter |= NV34TCL_TX_FILTER_MINIFY_NEAREST; + break; + } + break; + } + + ps->filt = filter; + + { + float limit; + + limit = CLAMP(cso->lod_bias, -16.0, 15.0); + ps->filt |= (int)(cso->lod_bias * 
256.0) & 0x1fff; + + limit = CLAMP(cso->max_lod, 0.0, 15.0); + ps->en |= (int)(limit) << 14 /*NV34TCL_TX_ENABLE_MIPMAP_MAX_LOD_SHIFT*/; + + limit = CLAMP(cso->min_lod, 0.0, 15.0); + ps->en |= (int)(limit) << 26 /*NV34TCL_TX_ENABLE_MIPMAP_MIN_LOD_SHIFT*/; + } + + if (cso->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) { + switch (cso->compare_func) { + case PIPE_FUNC_NEVER: + ps->wrap |= NV34TCL_TX_WRAP_RCOMP_NEVER; + break; + case PIPE_FUNC_GREATER: + ps->wrap |= NV34TCL_TX_WRAP_RCOMP_GREATER; + break; + case PIPE_FUNC_EQUAL: + ps->wrap |= NV34TCL_TX_WRAP_RCOMP_EQUAL; + break; + case PIPE_FUNC_GEQUAL: + ps->wrap |= NV34TCL_TX_WRAP_RCOMP_GEQUAL; + break; + case PIPE_FUNC_LESS: + ps->wrap |= NV34TCL_TX_WRAP_RCOMP_LESS; + break; + case PIPE_FUNC_NOTEQUAL: + ps->wrap |= NV34TCL_TX_WRAP_RCOMP_NOTEQUAL; + break; + case PIPE_FUNC_LEQUAL: + ps->wrap |= NV34TCL_TX_WRAP_RCOMP_LEQUAL; + break; + case PIPE_FUNC_ALWAYS: + ps->wrap |= NV34TCL_TX_WRAP_RCOMP_ALWAYS; + break; + default: + break; + } + } + + ps->bcol = ((float_to_ubyte(cso->border_color[3]) << 24) | + (float_to_ubyte(cso->border_color[0]) << 16) | + (float_to_ubyte(cso->border_color[1]) << 8) | + (float_to_ubyte(cso->border_color[2]) << 0)); + + return (void *)ps; +} + +static void +nv30_sampler_state_bind(struct pipe_context *pipe, unsigned nr, void **sampler) +{ + struct nv30_context *nv30 = nv30_context(pipe); + unsigned unit; + + for (unit = 0; unit < nr; unit++) { + nv30->tex_sampler[unit] = sampler[unit]; + nv30->dirty_samplers |= (1 << unit); + } + + for (unit = nr; unit < nv30->nr_samplers; unit++) { + nv30->tex_sampler[unit] = NULL; + nv30->dirty_samplers |= (1 << unit); + } + + nv30->nr_samplers = nr; + nv30->dirty |= NV30_NEW_SAMPLER; +} + +static void +nv30_sampler_state_delete(struct pipe_context *pipe, void *hwcso) +{ + FREE(hwcso); +} + +static void +nv30_set_sampler_texture(struct pipe_context *pipe, unsigned nr, + struct pipe_texture **miptree) +{ + struct nv30_context *nv30 = nv30_context(pipe); + unsigned unit; + + for (unit = 0; unit < nr; unit++) { + pipe_texture_reference((struct pipe_texture **) + &nv30->tex_miptree[unit], miptree[unit]); + nv30->dirty_samplers |= (1 << unit); + } + + for (unit = nr; unit < nv30->nr_textures; unit++) { + pipe_texture_reference((struct pipe_texture **) + &nv30->tex_miptree[unit], NULL); + nv30->dirty_samplers |= (1 << unit); + } + + nv30->nr_textures = nr; + nv30->dirty |= NV30_NEW_SAMPLER; +} + +static void * +nv30_rasterizer_state_create(struct pipe_context *pipe, + const struct pipe_rasterizer_state *cso) +{ + struct nv30_context *nv30 = nv30_context(pipe); + struct nv30_rasterizer_state *rsso = CALLOC(1, sizeof(*rsso)); + struct nouveau_stateobj *so = so_new(32, 0); + struct nouveau_grobj *rankine = nv30->screen->rankine; + + /*XXX: ignored: + * light_twoside + * point_smooth -nohw + * multisample + */ + + so_method(so, rankine, NV34TCL_SHADE_MODEL, 1); + so_data (so, cso->flatshade ? NV34TCL_SHADE_MODEL_FLAT : + NV34TCL_SHADE_MODEL_SMOOTH); + + so_method(so, rankine, NV34TCL_LINE_WIDTH, 2); + so_data (so, (unsigned char)(cso->line_width * 8.0) & 0xff); + so_data (so, cso->line_smooth ? 1 : 0); + so_method(so, rankine, NV34TCL_LINE_STIPPLE_ENABLE, 2); + so_data (so, cso->line_stipple_enable ? 
1 : 0); + so_data (so, (cso->line_stipple_pattern << 16) | + cso->line_stipple_factor); + + so_method(so, rankine, NV34TCL_POINT_SIZE, 1); + so_data (so, fui(cso->point_size)); + + so_method(so, rankine, NV34TCL_POLYGON_MODE_FRONT, 6); + if (cso->front_winding == PIPE_WINDING_CCW) { + so_data(so, nvgl_polygon_mode(cso->fill_ccw)); + so_data(so, nvgl_polygon_mode(cso->fill_cw)); + switch (cso->cull_mode) { + case PIPE_WINDING_CCW: + so_data(so, NV34TCL_CULL_FACE_FRONT); + break; + case PIPE_WINDING_CW: + so_data(so, NV34TCL_CULL_FACE_BACK); + break; + case PIPE_WINDING_BOTH: + so_data(so, NV34TCL_CULL_FACE_FRONT_AND_BACK); + break; + default: + so_data(so, NV34TCL_CULL_FACE_BACK); + break; + } + so_data(so, NV34TCL_FRONT_FACE_CCW); + } else { + so_data(so, nvgl_polygon_mode(cso->fill_cw)); + so_data(so, nvgl_polygon_mode(cso->fill_ccw)); + switch (cso->cull_mode) { + case PIPE_WINDING_CCW: + so_data(so, NV34TCL_CULL_FACE_BACK); + break; + case PIPE_WINDING_CW: + so_data(so, NV34TCL_CULL_FACE_FRONT); + break; + case PIPE_WINDING_BOTH: + so_data(so, NV34TCL_CULL_FACE_FRONT_AND_BACK); + break; + default: + so_data(so, NV34TCL_CULL_FACE_BACK); + break; + } + so_data(so, NV34TCL_FRONT_FACE_CW); + } + so_data(so, cso->poly_smooth ? 1 : 0); + so_data(so, (cso->cull_mode != PIPE_WINDING_NONE) ? 1 : 0); + + so_method(so, rankine, NV34TCL_POLYGON_STIPPLE_ENABLE, 1); + so_data (so, cso->poly_stipple_enable ? 1 : 0); + + so_method(so, rankine, NV34TCL_POLYGON_OFFSET_POINT_ENABLE, 3); + if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_POINT) || + (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_POINT)) + so_data(so, 1); + else + so_data(so, 0); + if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_LINE) || + (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_LINE)) + so_data(so, 1); + else + so_data(so, 0); + if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_FILL) || + (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_FILL)) + so_data(so, 1); + else + so_data(so, 0); + if (cso->offset_cw || cso->offset_ccw) { + so_method(so, rankine, NV34TCL_POLYGON_OFFSET_FACTOR, 2); + so_data (so, fui(cso->offset_scale)); + so_data (so, fui(cso->offset_units * 2)); + } + + so_method(so, rankine, NV34TCL_POINT_SPRITE, 1); + if (cso->point_sprite) { + unsigned psctl = (1 << 0), i; + + for (i = 0; i < 8; i++) { + if (cso->sprite_coord_mode[i] != PIPE_SPRITE_COORD_NONE) + psctl |= (1 << (8 + i)); + } + + so_data(so, psctl); + } else { + so_data(so, 0); + } + + so_ref(so, &rsso->so); + rsso->pipe = *cso; + return (void *)rsso; +} + +static void +nv30_rasterizer_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nv30_context *nv30 = nv30_context(pipe); + + nv30->rasterizer = hwcso; + nv30->dirty |= NV30_NEW_RAST; + /*nv30->draw_dirty |= NV30_NEW_RAST;*/ +} + +static void +nv30_rasterizer_state_delete(struct pipe_context *pipe, void *hwcso) +{ + struct nv30_rasterizer_state *rsso = hwcso; + + so_ref(NULL, &rsso->so); + FREE(rsso); +} + +static void * +nv30_depth_stencil_alpha_state_create(struct pipe_context *pipe, + const struct pipe_depth_stencil_alpha_state *cso) +{ + struct nv30_context *nv30 = nv30_context(pipe); + struct nv30_zsa_state *zsaso = CALLOC(1, sizeof(*zsaso)); + struct nouveau_stateobj *so = so_new(32, 0); + struct nouveau_grobj *rankine = nv30->screen->rankine; + + so_method(so, rankine, NV34TCL_DEPTH_FUNC, 3); + so_data (so, nvgl_comparison_op(cso->depth.func)); + so_data (so, cso->depth.writemask ? 1 : 0); + so_data (so, cso->depth.enabled ? 
1 : 0); + + so_method(so, rankine, NV34TCL_ALPHA_FUNC_ENABLE, 3); + so_data (so, cso->alpha.enabled ? 1 : 0); + so_data (so, nvgl_comparison_op(cso->alpha.func)); + so_data (so, float_to_ubyte(cso->alpha.ref_value)); + + if (cso->stencil[0].enabled) { + so_method(so, rankine, NV34TCL_STENCIL_FRONT_ENABLE, 8); + so_data (so, cso->stencil[0].enabled ? 1 : 0); + so_data (so, cso->stencil[0].writemask); + so_data (so, nvgl_comparison_op(cso->stencil[0].func)); + so_data (so, cso->stencil[0].ref_value); + so_data (so, cso->stencil[0].valuemask); + so_data (so, nvgl_stencil_op(cso->stencil[0].fail_op)); + so_data (so, nvgl_stencil_op(cso->stencil[0].zfail_op)); + so_data (so, nvgl_stencil_op(cso->stencil[0].zpass_op)); + } else { + so_method(so, rankine, NV34TCL_STENCIL_FRONT_ENABLE, 1); + so_data (so, 0); + } + + if (cso->stencil[1].enabled) { + so_method(so, rankine, NV34TCL_STENCIL_BACK_ENABLE, 8); + so_data (so, cso->stencil[1].enabled ? 1 : 0); + so_data (so, cso->stencil[1].writemask); + so_data (so, nvgl_comparison_op(cso->stencil[1].func)); + so_data (so, cso->stencil[1].ref_value); + so_data (so, cso->stencil[1].valuemask); + so_data (so, nvgl_stencil_op(cso->stencil[1].fail_op)); + so_data (so, nvgl_stencil_op(cso->stencil[1].zfail_op)); + so_data (so, nvgl_stencil_op(cso->stencil[1].zpass_op)); + } else { + so_method(so, rankine, NV34TCL_STENCIL_BACK_ENABLE, 1); + so_data (so, 0); + } + + so_ref(so, &zsaso->so); + zsaso->pipe = *cso; + return (void *)zsaso; +} + +static void +nv30_depth_stencil_alpha_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nv30_context *nv30 = nv30_context(pipe); + + nv30->zsa = hwcso; + nv30->dirty |= NV30_NEW_ZSA; +} + +static void +nv30_depth_stencil_alpha_state_delete(struct pipe_context *pipe, void *hwcso) +{ + struct nv30_zsa_state *zsaso = hwcso; + + so_ref(NULL, &zsaso->so); + FREE(zsaso); +} + +static void * +nv30_vp_state_create(struct pipe_context *pipe, + const struct pipe_shader_state *cso) +{ + /*struct nv30_context *nv30 = nv30_context(pipe);*/ + struct nv30_vertex_program *vp; + + vp = CALLOC(1, sizeof(struct nv30_vertex_program)); + vp->pipe.tokens = tgsi_dup_tokens(cso->tokens); + /*vp->draw = draw_create_vertex_shader(nv30->draw, &vp->pipe);*/ + + return (void *)vp; +} + +static void +nv30_vp_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nv30_context *nv30 = nv30_context(pipe); + + nv30->vertprog = hwcso; + nv30->dirty |= NV30_NEW_VERTPROG; + /*nv30->draw_dirty |= NV30_NEW_VERTPROG;*/ +} + +static void +nv30_vp_state_delete(struct pipe_context *pipe, void *hwcso) +{ + struct nv30_context *nv30 = nv30_context(pipe); + struct nv30_vertex_program *vp = hwcso; + + /*draw_delete_vertex_shader(nv30->draw, vp->draw);*/ + nv30_vertprog_destroy(nv30, vp); + FREE((void*)vp->pipe.tokens); + FREE(vp); +} + +static void * +nv30_fp_state_create(struct pipe_context *pipe, + const struct pipe_shader_state *cso) +{ + struct nv30_fragment_program *fp; + + fp = CALLOC(1, sizeof(struct nv30_fragment_program)); + fp->pipe.tokens = tgsi_dup_tokens(cso->tokens); + + tgsi_scan_shader(fp->pipe.tokens, &fp->info); + + return (void *)fp; +} + +static void +nv30_fp_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nv30_context *nv30 = nv30_context(pipe); + + nv30->fragprog = hwcso; + nv30->dirty |= NV30_NEW_FRAGPROG; +} + +static void +nv30_fp_state_delete(struct pipe_context *pipe, void *hwcso) +{ + struct nv30_context *nv30 = nv30_context(pipe); + struct nv30_fragment_program *fp = hwcso; + + nv30_fragprog_destroy(nv30, 
fp); + FREE((void*)fp->pipe.tokens); + FREE(fp); +} + +static void +nv30_set_blend_color(struct pipe_context *pipe, + const struct pipe_blend_color *bcol) +{ + struct nv30_context *nv30 = nv30_context(pipe); + + nv30->blend_colour = *bcol; + nv30->dirty |= NV30_NEW_BCOL; +} + +static void +nv30_set_clip_state(struct pipe_context *pipe, + const struct pipe_clip_state *clip) +{ +} + +static void +nv30_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index, + const struct pipe_constant_buffer *buf ) +{ + struct nv30_context *nv30 = nv30_context(pipe); + + nv30->constbuf[shader] = buf->buffer; + nv30->constbuf_nr[shader] = buf->buffer->size / (4 * sizeof(float)); + + if (shader == PIPE_SHADER_VERTEX) { + nv30->dirty |= NV30_NEW_VERTPROG; + } else + if (shader == PIPE_SHADER_FRAGMENT) { + nv30->dirty |= NV30_NEW_FRAGPROG; + } +} + +static void +nv30_set_framebuffer_state(struct pipe_context *pipe, + const struct pipe_framebuffer_state *fb) +{ + struct nv30_context *nv30 = nv30_context(pipe); + + nv30->framebuffer = *fb; + nv30->dirty |= NV30_NEW_FB; +} + +static void +nv30_set_polygon_stipple(struct pipe_context *pipe, + const struct pipe_poly_stipple *stipple) +{ + struct nv30_context *nv30 = nv30_context(pipe); + + memcpy(nv30->stipple, stipple->stipple, 4 * 32); + nv30->dirty |= NV30_NEW_STIPPLE; +} + +static void +nv30_set_scissor_state(struct pipe_context *pipe, + const struct pipe_scissor_state *s) +{ + struct nv30_context *nv30 = nv30_context(pipe); + + nv30->scissor = *s; + nv30->dirty |= NV30_NEW_SCISSOR; +} + +static void +nv30_set_viewport_state(struct pipe_context *pipe, + const struct pipe_viewport_state *vpt) +{ + struct nv30_context *nv30 = nv30_context(pipe); + + nv30->viewport = *vpt; + nv30->dirty |= NV30_NEW_VIEWPORT; + /*nv30->draw_dirty |= NV30_NEW_VIEWPORT;*/ +} + +static void +nv30_set_vertex_buffers(struct pipe_context *pipe, unsigned count, + const struct pipe_vertex_buffer *vb) +{ + struct nv30_context *nv30 = nv30_context(pipe); + + memcpy(nv30->vtxbuf, vb, sizeof(*vb) * count); + nv30->vtxbuf_nr = count; + + nv30->dirty |= NV30_NEW_ARRAYS; + /*nv30->draw_dirty |= NV30_NEW_ARRAYS;*/ +} + +static void +nv30_set_vertex_elements(struct pipe_context *pipe, unsigned count, + const struct pipe_vertex_element *ve) +{ + struct nv30_context *nv30 = nv30_context(pipe); + + memcpy(nv30->vtxelt, ve, sizeof(*ve) * count); + nv30->vtxelt_nr = count; + + nv30->dirty |= NV30_NEW_ARRAYS; + /*nv30->draw_dirty |= NV30_NEW_ARRAYS;*/ +} + +static void +nv30_set_edgeflags(struct pipe_context *pipe, const unsigned *bitfield) +{ + struct nv30_context *nv30 = nv30_context(pipe); + + nv30->edgeflags = bitfield; + nv30->dirty |= NV30_NEW_ARRAYS; + /*nv30->draw_dirty |= NV30_NEW_ARRAYS;*/ +} + +void +nv30_init_state_functions(struct nv30_context *nv30) +{ + nv30->pipe.create_blend_state = nv30_blend_state_create; + nv30->pipe.bind_blend_state = nv30_blend_state_bind; + nv30->pipe.delete_blend_state = nv30_blend_state_delete; + + nv30->pipe.create_sampler_state = nv30_sampler_state_create; + nv30->pipe.bind_sampler_states = nv30_sampler_state_bind; + nv30->pipe.delete_sampler_state = nv30_sampler_state_delete; + nv30->pipe.set_sampler_textures = nv30_set_sampler_texture; + + nv30->pipe.create_rasterizer_state = nv30_rasterizer_state_create; + nv30->pipe.bind_rasterizer_state = nv30_rasterizer_state_bind; + nv30->pipe.delete_rasterizer_state = nv30_rasterizer_state_delete; + + nv30->pipe.create_depth_stencil_alpha_state = + nv30_depth_stencil_alpha_state_create; + 
nv30->pipe.bind_depth_stencil_alpha_state = + nv30_depth_stencil_alpha_state_bind; + nv30->pipe.delete_depth_stencil_alpha_state = + nv30_depth_stencil_alpha_state_delete; + + nv30->pipe.create_vs_state = nv30_vp_state_create; + nv30->pipe.bind_vs_state = nv30_vp_state_bind; + nv30->pipe.delete_vs_state = nv30_vp_state_delete; + + nv30->pipe.create_fs_state = nv30_fp_state_create; + nv30->pipe.bind_fs_state = nv30_fp_state_bind; + nv30->pipe.delete_fs_state = nv30_fp_state_delete; + + nv30->pipe.set_blend_color = nv30_set_blend_color; + nv30->pipe.set_clip_state = nv30_set_clip_state; + nv30->pipe.set_constant_buffer = nv30_set_constant_buffer; + nv30->pipe.set_framebuffer_state = nv30_set_framebuffer_state; + nv30->pipe.set_polygon_stipple = nv30_set_polygon_stipple; + nv30->pipe.set_scissor_state = nv30_set_scissor_state; + nv30->pipe.set_viewport_state = nv30_set_viewport_state; + + nv30->pipe.set_edgeflags = nv30_set_edgeflags; + nv30->pipe.set_vertex_buffers = nv30_set_vertex_buffers; + nv30->pipe.set_vertex_elements = nv30_set_vertex_elements; +} + diff --git a/src/gallium/drivers/nv30/nv30_state.h b/src/gallium/drivers/nv30/nv30_state.h new file mode 100644 index 0000000000..2023278e37 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_state.h @@ -0,0 +1,88 @@ +#ifndef __NV30_STATE_H__ +#define __NV30_STATE_H__ + +#include "pipe/p_state.h" +#include "tgsi/tgsi_scan.h" + +struct nv30_sampler_state { + uint32_t fmt; + uint32_t wrap; + uint32_t en; + uint32_t filt; + uint32_t bcol; +}; + +struct nv30_vertex_program_exec { + uint32_t data[4]; + boolean has_branch_offset; + int const_index; +}; + +struct nv30_vertex_program_data { + int index; /* immediates == -1 */ + float value[4]; +}; + +struct nv30_vertex_program { + struct pipe_shader_state pipe; + + boolean translated; + + struct nv30_vertex_program_exec *insns; + unsigned nr_insns; + struct nv30_vertex_program_data *consts; + unsigned nr_consts; + + struct nouveau_resource *exec; + unsigned exec_start; + struct nouveau_resource *data; + unsigned data_start; + unsigned data_start_min; + + uint32_t ir; + uint32_t or; + struct nouveau_stateobj *so; +}; + +struct nv30_fragment_program_data { + unsigned offset; + unsigned index; +}; + +struct nv30_fragment_program { + struct pipe_shader_state pipe; + struct tgsi_shader_info info; + + boolean translated; + boolean on_hw; + unsigned samplers; + + uint32_t *insn; + int insn_len; + + struct nv30_fragment_program_data *consts; + unsigned nr_consts; + + struct pipe_buffer *buffer; + + uint32_t fp_control; + uint32_t fp_reg_control; + struct nouveau_stateobj *so; +}; + +struct nv30_miptree { + struct pipe_texture base; + + struct pipe_buffer *buffer; + uint total_size; + + struct pipe_texture *shadow_tex; + struct pipe_surface *shadow_surface; + + struct { + uint pitch; + uint *image_offset; + } level[PIPE_MAX_TEXTURE_LEVELS]; +}; + +#endif diff --git a/src/gallium/drivers/nv30/nv30_state_blend.c b/src/gallium/drivers/nv30/nv30_state_blend.c new file mode 100644 index 0000000000..44d43e132a --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_state_blend.c @@ -0,0 +1,40 @@ +#include "nv30_context.h" + +static boolean +nv30_state_blend_validate(struct nv30_context *nv30) +{ + so_ref(nv30->blend->so, &nv30->state.hw[NV30_STATE_BLEND]); + return TRUE; +} + +struct nv30_state_entry nv30_state_blend = { + .validate = nv30_state_blend_validate, + .dirty = { + .pipe = NV30_NEW_BLEND, + .hw = NV30_STATE_BLEND + } +}; + +static boolean +nv30_state_blend_colour_validate(struct nv30_context *nv30) +{ + 
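/* Pack the float blend colour into a single A8R8G8B8 word and emit it with the NV34TCL_BLEND_COLOR method below; the result is cached as the NV30_STATE_BCOL stateobj. */ +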
struct nouveau_stateobj *so = so_new(2, 0); + struct pipe_blend_color *bcol = &nv30->blend_colour; + + so_method(so, nv30->screen->rankine, NV34TCL_BLEND_COLOR, 1); + so_data (so, ((float_to_ubyte(bcol->color[3]) << 24) | + (float_to_ubyte(bcol->color[0]) << 16) | + (float_to_ubyte(bcol->color[1]) << 8) | + (float_to_ubyte(bcol->color[2]) << 0))); + + so_ref(so, &nv30->state.hw[NV30_STATE_BCOL]); + return TRUE; +} + +struct nv30_state_entry nv30_state_blend_colour = { + .validate = nv30_state_blend_colour_validate, + .dirty = { + .pipe = NV30_NEW_BCOL, + .hw = NV30_STATE_BCOL + } +}; diff --git a/src/gallium/drivers/nv30/nv30_state_emit.c b/src/gallium/drivers/nv30/nv30_state_emit.c new file mode 100644 index 0000000000..f77b08ff69 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_state_emit.c @@ -0,0 +1,118 @@ +#include "nv30_context.h" +#include "nv30_state.h" + +static struct nv30_state_entry *render_states[] = { + &nv30_state_framebuffer, + &nv30_state_rasterizer, + &nv30_state_scissor, + &nv30_state_stipple, + &nv30_state_fragprog, + &nv30_state_fragtex, + &nv30_state_vertprog, + &nv30_state_blend, + &nv30_state_blend_colour, + &nv30_state_zsa, + &nv30_state_viewport, + &nv30_state_vbo, + NULL +}; + +static void +nv30_state_do_validate(struct nv30_context *nv30, + struct nv30_state_entry **states) +{ + const struct pipe_framebuffer_state *fb = &nv30->framebuffer; + unsigned i; + + for (i = 0; i < fb->nr_cbufs; i++) + fb->cbufs[i]->status = PIPE_SURFACE_STATUS_DEFINED; + if (fb->zsbuf) + fb->zsbuf->status = PIPE_SURFACE_STATUS_DEFINED; + + while (*states) { + struct nv30_state_entry *e = *states; + + if (nv30->dirty & e->dirty.pipe) { + if (e->validate(nv30)) { + nv30->state.dirty |= (1ULL << e->dirty.hw); + } + } + + states++; + } + nv30->dirty = 0; +} + +void +nv30_state_emit(struct nv30_context *nv30) +{ + struct nv30_state *state = &nv30->state; + struct nv30_screen *screen = nv30->screen; + unsigned i, samplers; + uint64_t states; + + if (nv30->pctx_id != screen->cur_pctx) { + for (i = 0; i < NV30_STATE_MAX; i++) { + if (state->hw[i] && screen->state[i] != state->hw[i]) + state->dirty |= (1ULL << i); + } + + screen->cur_pctx = nv30->pctx_id; + } + + for (i = 0, states = state->dirty; states; i++) { + if (!(states & (1ULL << i))) + continue; + so_ref (state->hw[i], &nv30->screen->state[i]); + if (state->hw[i]) + so_emit(nv30->nvws, nv30->screen->state[i]); + states &= ~(1ULL << i); + } + + state->dirty = 0; + + so_emit_reloc_markers(nv30->nvws, state->hw[NV30_STATE_FB]); + for (i = 0, samplers = state->fp_samplers; i < 16 && samplers; i++) { + if (!(samplers & (1 << i))) + continue; + so_emit_reloc_markers(nv30->nvws, + state->hw[NV30_STATE_FRAGTEX0+i]); + samplers &= ~(1ULL << i); + } + so_emit_reloc_markers(nv30->nvws, state->hw[NV30_STATE_FRAGPROG]); + if (state->hw[NV30_STATE_VTXBUF] /*&& nv30->render_mode == HW*/) + so_emit_reloc_markers(nv30->nvws, state->hw[NV30_STATE_VTXBUF]); +} + +boolean +nv30_state_validate(struct nv30_context *nv30) +{ +#if 0 + boolean was_sw = nv30->fallback_swtnl ? TRUE : FALSE; + + if (nv30->render_mode != HW) { + /* Don't even bother trying to go back to hw if none + * of the states that caused swtnl previously have changed. 
+ */ + if ((nv30->fallback_swtnl & nv30->dirty) + != nv30->fallback_swtnl) + return FALSE; + + /* Attempt to go to hwtnl again */ + nv30->pipe.flush(&nv30->pipe, 0, NULL); + nv30->dirty |= (NV30_NEW_VIEWPORT | + NV30_NEW_VERTPROG | + NV30_NEW_ARRAYS); + nv30->render_mode = HW; + } +#endif + nv30_state_do_validate(nv30, render_states); +#if 0 + if (nv30->fallback_swtnl || nv30->fallback_swrast) + return FALSE; + + if (was_sw) + NOUVEAU_ERR("swtnl->hw\n"); +#endif + return TRUE; +} diff --git a/src/gallium/drivers/nv30/nv30_state_fb.c b/src/gallium/drivers/nv30/nv30_state_fb.c new file mode 100644 index 0000000000..77368cb205 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_state_fb.c @@ -0,0 +1,144 @@ +#include "nv30_context.h" +#include "nouveau/nouveau_util.h" + +static boolean +nv30_state_framebuffer_validate(struct nv30_context *nv30) +{ + struct pipe_framebuffer_state *fb = &nv30->framebuffer; + struct pipe_surface *rt[2], *zeta = NULL; + uint32_t rt_enable, rt_format; + int i, colour_format = 0, zeta_format = 0; + struct nouveau_stateobj *so = so_new(64, 10); + unsigned rt_flags = NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM; + unsigned w = fb->width; + unsigned h = fb->height; + struct nv30_miptree *nv30mt; + + rt_enable = 0; + for (i = 0; i < fb->nr_cbufs; i++) { + if (colour_format) { + assert(colour_format == fb->cbufs[i]->format); + } else { + colour_format = fb->cbufs[i]->format; + rt_enable |= (NV34TCL_RT_ENABLE_COLOR0 << i); + rt[i] = fb->cbufs[i]; + } + } + + if (rt_enable & NV34TCL_RT_ENABLE_COLOR1) + rt_enable |= NV34TCL_RT_ENABLE_MRT; + + if (fb->zsbuf) { + zeta_format = fb->zsbuf->format; + zeta = fb->zsbuf; + } + + if (!(rt[0]->texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) { + assert(!(fb->width & (fb->width - 1)) && !(fb->height & (fb->height - 1))); + for (i = 1; i < fb->nr_cbufs; i++) + assert(!(rt[i]->texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)); + + /* FIXME: NV34TCL_RT_FORMAT_LOG2_[WIDTH/HEIGHT] */ + rt_format = NV34TCL_RT_FORMAT_TYPE_SWIZZLED | + log2i(fb->width) << 16 /*NV34TCL_RT_FORMAT_LOG2_WIDTH_SHIFT*/ | + log2i(fb->height) << 24 /*NV34TCL_RT_FORMAT_LOG2_HEIGHT_SHIFT*/; + } + else + rt_format = NV34TCL_RT_FORMAT_TYPE_LINEAR; + + switch (colour_format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case 0: + rt_format |= NV34TCL_RT_FORMAT_COLOR_A8R8G8B8; + break; + case PIPE_FORMAT_R5G6B5_UNORM: + rt_format |= NV34TCL_RT_FORMAT_COLOR_R5G6B5; + break; + default: + assert(0); + } + + switch (zeta_format) { + case PIPE_FORMAT_Z16_UNORM: + rt_format |= NV34TCL_RT_FORMAT_ZETA_Z16; + break; + case PIPE_FORMAT_Z24S8_UNORM: + case 0: + rt_format |= NV34TCL_RT_FORMAT_ZETA_Z24S8; + break; + default: + assert(0); + } + + if (rt_enable & NV34TCL_RT_ENABLE_COLOR0) { + uint32_t pitch = rt[0]->stride; + if (zeta) { + pitch |= (zeta->stride << 16); + } else { + pitch |= (pitch << 16); + } + + nv30mt = (struct nv30_miptree *)rt[0]->texture; + so_method(so, nv30->screen->rankine, NV34TCL_DMA_COLOR0, 1); + so_reloc (so, nv30mt->buffer, 0, rt_flags | NOUVEAU_BO_OR, + nv30->nvws->channel->vram->handle, + nv30->nvws->channel->gart->handle); + so_method(so, nv30->screen->rankine, NV34TCL_COLOR0_PITCH, 2); + so_data (so, pitch); + so_reloc (so, nv30mt->buffer, rt[0]->offset, rt_flags | + NOUVEAU_BO_LOW, 0, 0); + } + + if (rt_enable & NV34TCL_RT_ENABLE_COLOR1) { + nv30mt = (struct nv30_miptree *)rt[1]->texture; + so_method(so, nv30->screen->rankine, NV34TCL_DMA_COLOR1, 1); + so_reloc (so, nv30mt->buffer, 0, rt_flags | NOUVEAU_BO_OR, + nv30->nvws->channel->vram->handle, + 
nv30->nvws->channel->gart->handle); + so_method(so, nv30->screen->rankine, NV34TCL_COLOR1_OFFSET, 2); + so_reloc (so, nv30mt->buffer, rt[1]->offset, rt_flags | + NOUVEAU_BO_LOW, 0, 0); + so_data (so, rt[1]->stride); + } + + if (zeta_format) { + nv30mt = (struct nv30_miptree *)zeta->texture; + so_method(so, nv30->screen->rankine, NV34TCL_DMA_ZETA, 1); + so_reloc (so, nv30mt->buffer, 0, rt_flags | NOUVEAU_BO_OR, + nv30->nvws->channel->vram->handle, + nv30->nvws->channel->gart->handle); + so_method(so, nv30->screen->rankine, NV34TCL_ZETA_OFFSET, 1); + so_reloc (so, nv30mt->buffer, zeta->offset, rt_flags | + NOUVEAU_BO_LOW, 0, 0); + /* TODO: allocate LMA depth buffer */ + } + + so_method(so, nv30->screen->rankine, NV34TCL_RT_ENABLE, 1); + so_data (so, rt_enable); + so_method(so, nv30->screen->rankine, NV34TCL_RT_HORIZ, 3); + so_data (so, (w << 16) | 0); + so_data (so, (h << 16) | 0); + so_data (so, rt_format); + so_method(so, nv30->screen->rankine, NV34TCL_VIEWPORT_HORIZ, 2); + so_data (so, (w << 16) | 0); + so_data (so, (h << 16) | 0); + so_method(so, nv30->screen->rankine, NV34TCL_VIEWPORT_CLIP_HORIZ(0), 2); + so_data (so, ((w - 1) << 16) | 0); + so_data (so, ((h - 1) << 16) | 0); + so_method(so, nv30->screen->rankine, 0x1d88, 1); + so_data (so, (1 << 12) | h); + /* Wonder why this is needed, context should all be set to zero on init */ + so_method(so, nv30->screen->rankine, NV34TCL_VIEWPORT_TX_ORIGIN, 1); + so_data (so, 0); + + so_ref(so, &nv30->state.hw[NV30_STATE_FB]); + return TRUE; +} + +struct nv30_state_entry nv30_state_framebuffer = { + .validate = nv30_state_framebuffer_validate, + .dirty = { + .pipe = NV30_NEW_FB, + .hw = NV30_STATE_FB + } +}; diff --git a/src/gallium/drivers/nv30/nv30_state_rasterizer.c b/src/gallium/drivers/nv30/nv30_state_rasterizer.c new file mode 100644 index 0000000000..6d1b60e043 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_state_rasterizer.c @@ -0,0 +1,17 @@ +#include "nv30_context.h" + +static boolean +nv30_state_rasterizer_validate(struct nv30_context *nv30) +{ + so_ref(nv30->rasterizer->so, + &nv30->state.hw[NV30_STATE_RAST]); + return TRUE; +} + +struct nv30_state_entry nv30_state_rasterizer = { + .validate = nv30_state_rasterizer_validate, + .dirty = { + .pipe = NV30_NEW_RAST, + .hw = NV30_STATE_RAST + } +}; diff --git a/src/gallium/drivers/nv30/nv30_state_scissor.c b/src/gallium/drivers/nv30/nv30_state_scissor.c new file mode 100644 index 0000000000..1db9bc1795 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_state_scissor.c @@ -0,0 +1,35 @@ +#include "nv30_context.h" + +static boolean +nv30_state_scissor_validate(struct nv30_context *nv30) +{ + struct pipe_rasterizer_state *rast = &nv30->rasterizer->pipe; + struct pipe_scissor_state *s = &nv30->scissor; + struct nouveau_stateobj *so; + + if (nv30->state.hw[NV30_STATE_SCISSOR] && + (rast->scissor == 0 && nv30->state.scissor_enabled == 0)) + return FALSE; + nv30->state.scissor_enabled = rast->scissor; + + so = so_new(3, 0); + so_method(so, nv30->screen->rankine, NV34TCL_SCISSOR_HORIZ, 2); + if (nv30->state.scissor_enabled) { + so_data (so, ((s->maxx - s->minx) << 16) | s->minx); + so_data (so, ((s->maxy - s->miny) << 16) | s->miny); + } else { + so_data (so, 4096 << 16); + so_data (so, 4096 << 16); + } + + so_ref(so, &nv30->state.hw[NV30_STATE_SCISSOR]); + return TRUE; +} + +struct nv30_state_entry nv30_state_scissor = { + .validate = nv30_state_scissor_validate, + .dirty = { + .pipe = NV30_NEW_SCISSOR | NV30_NEW_RAST, + .hw = NV30_STATE_SCISSOR + } +}; diff --git 
a/src/gallium/drivers/nv30/nv30_state_stipple.c b/src/gallium/drivers/nv30/nv30_state_stipple.c new file mode 100644 index 0000000000..41b42813b4 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_state_stipple.c @@ -0,0 +1,39 @@ +#include "nv30_context.h" + +static boolean +nv30_state_stipple_validate(struct nv30_context *nv30) +{ + struct pipe_rasterizer_state *rast = &nv30->rasterizer->pipe; + struct nouveau_grobj *rankine = nv30->screen->rankine; + struct nouveau_stateobj *so; + + if (nv30->state.hw[NV30_STATE_STIPPLE] && + (rast->poly_stipple_enable == 0 && nv30->state.stipple_enabled == 0)) + return FALSE; + + if (rast->poly_stipple_enable) { + unsigned i; + + so = so_new(35, 0); + so_method(so, rankine, NV34TCL_POLYGON_STIPPLE_ENABLE, 1); + so_data (so, 1); + so_method(so, rankine, NV34TCL_POLYGON_STIPPLE_PATTERN(0), 32); + for (i = 0; i < 32; i++) + so_data(so, nv30->stipple[i]); + } else { + so = so_new(2, 0); + so_method(so, rankine, NV34TCL_POLYGON_STIPPLE_ENABLE, 1); + so_data (so, 0); + } + + so_ref(so, &nv30->state.hw[NV30_STATE_STIPPLE]); + return TRUE; +} + +struct nv30_state_entry nv30_state_stipple = { + .validate = nv30_state_stipple_validate, + .dirty = { + .pipe = NV30_NEW_STIPPLE | NV30_NEW_RAST, + .hw = NV30_STATE_STIPPLE, + } +}; diff --git a/src/gallium/drivers/nv30/nv30_state_viewport.c b/src/gallium/drivers/nv30/nv30_state_viewport.c new file mode 100644 index 0000000000..951d40ebfd --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_state_viewport.c @@ -0,0 +1,70 @@ +#include "nv30_context.h" + +static boolean +nv30_state_viewport_validate(struct nv30_context *nv30) +{ + struct pipe_viewport_state *vpt = &nv30->viewport; + struct nouveau_stateobj *so; + unsigned bypass; + + if (/*nv30->render_mode == HW &&*/ !nv30->rasterizer->pipe.bypass_clipping) + bypass = 0; + else + bypass = 1; + + if (nv30->state.hw[NV30_STATE_VIEWPORT] && + (bypass || !(nv30->dirty & NV30_NEW_VIEWPORT)) && + nv30->state.viewport_bypass == bypass) + return FALSE; + nv30->state.viewport_bypass = bypass; + + so = so_new(11, 0); + if (!bypass) { + so_method(so, nv30->screen->rankine, + NV34TCL_VIEWPORT_TRANSLATE_X, 8); + so_data (so, fui(vpt->translate[0])); + so_data (so, fui(vpt->translate[1])); + so_data (so, fui(vpt->translate[2])); + so_data (so, fui(vpt->translate[3])); + so_data (so, fui(vpt->scale[0])); + so_data (so, fui(vpt->scale[1])); + so_data (so, fui(vpt->scale[2])); + so_data (so, fui(vpt->scale[3])); +/* so_method(so, nv30->screen->rankine, 0x1d78, 1); + so_data (so, 1); +*/ } else { + so_method(so, nv30->screen->rankine, + NV34TCL_VIEWPORT_TRANSLATE_X, 8); + so_data (so, fui(0.0)); + so_data (so, fui(0.0)); + so_data (so, fui(0.0)); + so_data (so, fui(0.0)); + so_data (so, fui(1.0)); + so_data (so, fui(1.0)); + so_data (so, fui(1.0)); + so_data (so, fui(0.0)); + /* Not entirely certain what this is yet. The DDX also uses this + * value, as it fixes rendering when you pass + * pre-transformed vertices to the GPU. My best guess is that + * this bypasses some culling/clipping stage. Might be worth + * noting that points/lines are unaffected by whatever this + * value fixes, only filled polygons are affected. 
+ */ +/* so_method(so, nv30->screen->rankine, 0x1d78, 1); + so_data (so, 0x110); +*/ } + /* TODO/FIXME: never saw value 0x0110 in renouveau dumps, only 0x0001 */ + so_method(so, nv30->screen->rankine, 0x1d78, 1); + so_data (so, 1); + + so_ref(so, &nv30->state.hw[NV30_STATE_VIEWPORT]); + return TRUE; +} + +struct nv30_state_entry nv30_state_viewport = { + .validate = nv30_state_viewport_validate, + .dirty = { + .pipe = NV30_NEW_VIEWPORT | NV30_NEW_RAST, + .hw = NV30_STATE_VIEWPORT + } +}; diff --git a/src/gallium/drivers/nv30/nv30_state_zsa.c b/src/gallium/drivers/nv30/nv30_state_zsa.c new file mode 100644 index 0000000000..0940b7269b --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_state_zsa.c @@ -0,0 +1,17 @@ +#include "nv30_context.h" + +static boolean +nv30_state_zsa_validate(struct nv30_context *nv30) +{ + so_ref(nv30->zsa->so, + &nv30->state.hw[NV30_STATE_ZSA]); + return TRUE; +} + +struct nv30_state_entry nv30_state_zsa = { + .validate = nv30_state_zsa_validate, + .dirty = { + .pipe = NV30_NEW_ZSA, + .hw = NV30_STATE_ZSA + } +}; diff --git a/src/gallium/drivers/nv30/nv30_surface.c b/src/gallium/drivers/nv30/nv30_surface.c new file mode 100644 index 0000000000..0f8dc12045 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_surface.c @@ -0,0 +1,72 @@ + +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#include "nv30_context.h" +#include "pipe/p_defines.h" +#include "pipe/internal/p_winsys_screen.h" +#include "pipe/p_inlines.h" +#include "util/u_tile.h" + +static void +nv30_surface_copy(struct pipe_context *pipe, boolean do_flip, + struct pipe_surface *dest, unsigned destx, unsigned desty, + struct pipe_surface *src, unsigned srcx, unsigned srcy, + unsigned width, unsigned height) +{ + struct nv30_context *nv30 = nv30_context(pipe); + struct nv04_surface_2d *eng2d = nv30->screen->eng2d; + + if (do_flip) { + desty += height; + while (height--) { + eng2d->copy(eng2d, dest, destx, desty--, src, + srcx, srcy++, width, 1); + } + return; + } + + eng2d->copy(eng2d, dest, destx, desty, src, srcx, srcy, width, height); +} + +static void +nv30_surface_fill(struct pipe_context *pipe, struct pipe_surface *dest, + unsigned destx, unsigned desty, unsigned width, + unsigned height, unsigned value) +{ + struct nv30_context *nv30 = nv30_context(pipe); + struct nv04_surface_2d *eng2d = nv30->screen->eng2d; + + eng2d->fill(eng2d, dest, destx, desty, width, height, value); +} + +void +nv30_init_surface_functions(struct nv30_context *nv30) +{ + nv30->pipe.surface_copy = nv30_surface_copy; + nv30->pipe.surface_fill = nv30_surface_fill; +} diff --git a/src/gallium/drivers/nv30/nv30_vbo.c b/src/gallium/drivers/nv30/nv30_vbo.c new file mode 100644 index 0000000000..2d6d48ac16 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_vbo.c @@ -0,0 +1,556 @@ +#include "pipe/p_context.h" +#include "pipe/p_state.h" + +#include "nv30_context.h" +#include "nv30_state.h" + +#include "nouveau/nouveau_channel.h" +#include "nouveau/nouveau_pushbuf.h" +#include "nouveau/nouveau_util.h" + +#define FORCE_SWTNL 0 + +static INLINE int +nv30_vbo_format_to_hw(enum pipe_format pipe, unsigned *fmt, unsigned *ncomp) +{ + switch (pipe) { + case PIPE_FORMAT_R32_FLOAT: + case PIPE_FORMAT_R32G32_FLOAT: + case PIPE_FORMAT_R32G32B32_FLOAT: + case PIPE_FORMAT_R32G32B32A32_FLOAT: + *fmt = NV34TCL_VTXFMT_TYPE_FLOAT; + break; + case PIPE_FORMAT_R8_UNORM: + case PIPE_FORMAT_R8G8_UNORM: + case PIPE_FORMAT_R8G8B8_UNORM: + case PIPE_FORMAT_R8G8B8A8_UNORM: + *fmt = NV34TCL_VTXFMT_TYPE_UBYTE; + break; + case PIPE_FORMAT_R16_SSCALED: + case PIPE_FORMAT_R16G16_SSCALED: + case PIPE_FORMAT_R16G16B16_SSCALED: + case PIPE_FORMAT_R16G16B16A16_SSCALED: + *fmt = NV34TCL_VTXFMT_TYPE_USHORT; + break; + default: + NOUVEAU_ERR("Unknown format %s\n", pf_name(pipe)); + return 1; + } + + switch (pipe) { + case PIPE_FORMAT_R8_UNORM: + case PIPE_FORMAT_R32_FLOAT: + case PIPE_FORMAT_R16_SSCALED: + *ncomp = 1; + break; + case PIPE_FORMAT_R8G8_UNORM: + case PIPE_FORMAT_R32G32_FLOAT: + case PIPE_FORMAT_R16G16_SSCALED: + *ncomp = 2; + break; + case PIPE_FORMAT_R8G8B8_UNORM: + case PIPE_FORMAT_R32G32B32_FLOAT: + case PIPE_FORMAT_R16G16B16_SSCALED: + *ncomp = 3; + break; + case PIPE_FORMAT_R8G8B8A8_UNORM: + case PIPE_FORMAT_R32G32B32A32_FLOAT: + case PIPE_FORMAT_R16G16B16A16_SSCALED: + *ncomp = 4; + break; + default: + NOUVEAU_ERR("Unknown format %s\n", pf_name(pipe)); + return 1; + } + + return 0; +} + +static boolean +nv30_vbo_set_idxbuf(struct nv30_context *nv30, struct pipe_buffer *ib, + unsigned ib_size) +{ + struct pipe_screen *pscreen = &nv30->screen->pipe; + unsigned type; + + if (!ib) { + nv30->idxbuf = NULL; + nv30->idxbuf_format = 0xdeadbeef; + return FALSE; + } + + if (!pscreen->get_param(pscreen, NOUVEAU_CAP_HW_IDXBUF) || ib_size == 1) + return FALSE; + + switch (ib_size) { + case 2: 
+ type = NV34TCL_IDXBUF_FORMAT_TYPE_U16; + break; + case 4: + type = NV34TCL_IDXBUF_FORMAT_TYPE_U32; + break; + default: + return FALSE; + } + + if (ib != nv30->idxbuf || + type != nv30->idxbuf_format) { + nv30->dirty |= NV30_NEW_ARRAYS; + nv30->idxbuf = ib; + nv30->idxbuf_format = type; + } + + return TRUE; +} + +static boolean +nv30_vbo_static_attrib(struct nv30_context *nv30, struct nouveau_stateobj *so, + int attrib, struct pipe_vertex_element *ve, + struct pipe_vertex_buffer *vb) +{ + struct pipe_winsys *ws = nv30->pipe.winsys; + struct nouveau_grobj *rankine = nv30->screen->rankine; + unsigned type, ncomp; + void *map; + + if (nv30_vbo_format_to_hw(ve->src_format, &type, &ncomp)) + return FALSE; + + map = ws->buffer_map(ws, vb->buffer, PIPE_BUFFER_USAGE_CPU_READ); + map += vb->buffer_offset + ve->src_offset; + + switch (type) { + case NV34TCL_VTXFMT_TYPE_FLOAT: + { + float *v = map; + + switch (ncomp) { + case 4: + so_method(so, rankine, NV34TCL_VTX_ATTR_4F_X(attrib), 4); + so_data (so, fui(v[0])); + so_data (so, fui(v[1])); + so_data (so, fui(v[2])); + so_data (so, fui(v[3])); + break; + case 3: + so_method(so, rankine, NV34TCL_VTX_ATTR_3F_X(attrib), 3); + so_data (so, fui(v[0])); + so_data (so, fui(v[1])); + so_data (so, fui(v[2])); + break; + case 2: + so_method(so, rankine, NV34TCL_VTX_ATTR_2F_X(attrib), 2); + so_data (so, fui(v[0])); + so_data (so, fui(v[1])); + break; + case 1: + so_method(so, rankine, NV34TCL_VTX_ATTR_1F(attrib), 1); + so_data (so, fui(v[0])); + break; + default: + ws->buffer_unmap(ws, vb->buffer); + return FALSE; + } + } + break; + default: + ws->buffer_unmap(ws, vb->buffer); + return FALSE; + } + + ws->buffer_unmap(ws, vb->buffer); + + return TRUE; +} + +boolean +nv30_draw_arrays(struct pipe_context *pipe, + unsigned mode, unsigned start, unsigned count) +{ + struct nv30_context *nv30 = nv30_context(pipe); + struct nouveau_channel *chan = nv30->nvws->channel; + unsigned restart = 0; + + nv30_vbo_set_idxbuf(nv30, NULL, 0); + if (FORCE_SWTNL || !nv30_state_validate(nv30)) { + /*return nv30_draw_elements_swtnl(pipe, NULL, 0, + mode, start, count);*/ + return FALSE; + } + + while (count) { + unsigned vc, nr; + + nv30_state_emit(nv30); + + vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 256, + mode, start, count, &restart); + if (!vc) { + FIRE_RING(NULL); + continue; + } + + BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1); + OUT_RING (nvgl_primitive(mode)); + + nr = (vc & 0xff); + if (nr) { + BEGIN_RING(rankine, NV34TCL_VB_VERTEX_BATCH, 1); + OUT_RING (((nr - 1) << 24) | start); + start += nr; + } + + nr = vc >> 8; + while (nr) { + unsigned push = nr > 2047 ? 
2047 : nr; + + nr -= push; + + BEGIN_RING_NI(rankine, NV34TCL_VB_VERTEX_BATCH, push); + while (push--) { + OUT_RING(((0x100 - 1) << 24) | start); + start += 0x100; + } + } + + BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1); + OUT_RING (0); + + count -= vc; + start = restart; + } + + pipe->flush(pipe, 0, NULL); + return TRUE; +} + +static INLINE void +nv30_draw_elements_u08(struct nv30_context *nv30, void *ib, + unsigned mode, unsigned start, unsigned count) +{ + struct nouveau_channel *chan = nv30->nvws->channel; + + while (count) { + uint8_t *elts = (uint8_t *)ib + start; + unsigned vc, push, restart = 0; + + nv30_state_emit(nv30); + + vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 2, + mode, start, count, &restart); + if (vc == 0) { + FIRE_RING(NULL); + continue; + } + count -= vc; + + BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1); + OUT_RING (nvgl_primitive(mode)); + + if (vc & 1) { + BEGIN_RING(rankine, NV34TCL_VB_ELEMENT_U32, 1); + OUT_RING (elts[0]); + elts++; vc--; + } + + while (vc) { + unsigned i; + + push = MIN2(vc, 2047 * 2); + + BEGIN_RING_NI(rankine, NV34TCL_VB_ELEMENT_U16, push >> 1); + for (i = 0; i < push; i+=2) + OUT_RING((elts[i+1] << 16) | elts[i]); + + vc -= push; + elts += push; + } + + BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1); + OUT_RING (0); + + start = restart; + } +} + +static INLINE void +nv30_draw_elements_u16(struct nv30_context *nv30, void *ib, + unsigned mode, unsigned start, unsigned count) +{ + struct nouveau_channel *chan = nv30->nvws->channel; + + while (count) { + uint16_t *elts = (uint16_t *)ib + start; + unsigned vc, push, restart = 0; + + nv30_state_emit(nv30); + + vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 2, + mode, start, count, &restart); + if (vc == 0) { + FIRE_RING(NULL); + continue; + } + count -= vc; + + BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1); + OUT_RING (nvgl_primitive(mode)); + + if (vc & 1) { + BEGIN_RING(rankine, NV34TCL_VB_ELEMENT_U32, 1); + OUT_RING (elts[0]); + elts++; vc--; + } + + while (vc) { + unsigned i; + + push = MIN2(vc, 2047 * 2); + + BEGIN_RING_NI(rankine, NV34TCL_VB_ELEMENT_U16, push >> 1); + for (i = 0; i < push; i+=2) + OUT_RING((elts[i+1] << 16) | elts[i]); + + vc -= push; + elts += push; + } + + BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1); + OUT_RING (0); + + start = restart; + } +} + +static INLINE void +nv30_draw_elements_u32(struct nv30_context *nv30, void *ib, + unsigned mode, unsigned start, unsigned count) +{ + struct nouveau_channel *chan = nv30->nvws->channel; + + while (count) { + uint32_t *elts = (uint32_t *)ib + start; + unsigned vc, push, restart = 0; + + nv30_state_emit(nv30); + + vc = nouveau_vbuf_split(chan->pushbuf->remaining, 5, 1, + mode, start, count, &restart); + if (vc == 0) { + FIRE_RING(NULL); + continue; + } + count -= vc; + + BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1); + OUT_RING (nvgl_primitive(mode)); + + while (vc) { + push = MIN2(vc, 2047); + + BEGIN_RING_NI(rankine, NV34TCL_VB_ELEMENT_U32, push); + OUT_RINGp (elts, push); + + vc -= push; + elts += push; + } + + BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1); + OUT_RING (0); + + start = restart; + } +} + +static boolean +nv30_draw_elements_inline(struct pipe_context *pipe, + struct pipe_buffer *ib, unsigned ib_size, + unsigned mode, unsigned start, unsigned count) +{ + struct nv30_context *nv30 = nv30_context(pipe); + struct pipe_winsys *ws = pipe->winsys; + void *map; + + map = ws->buffer_map(ws, ib, PIPE_BUFFER_USAGE_CPU_READ); + if (!ib) { + NOUVEAU_ERR("failed mapping ib\n"); + return 
FALSE; + } + + switch (ib_size) { + case 1: + nv30_draw_elements_u08(nv30, map, mode, start, count); + break; + case 2: + nv30_draw_elements_u16(nv30, map, mode, start, count); + break; + case 4: + nv30_draw_elements_u32(nv30, map, mode, start, count); + break; + default: + NOUVEAU_ERR("invalid idxbuf fmt %d\n", ib_size); + break; + } + + ws->buffer_unmap(ws, ib); + return TRUE; +} + +static boolean +nv30_draw_elements_vbo(struct pipe_context *pipe, + unsigned mode, unsigned start, unsigned count) +{ + struct nv30_context *nv30 = nv30_context(pipe); + struct nouveau_channel *chan = nv30->nvws->channel; + unsigned restart = 0; + + while (count) { + unsigned nr, vc; + + nv30_state_emit(nv30); + + vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 256, + mode, start, count, &restart); + if (!vc) { + FIRE_RING(NULL); + continue; + } + + BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1); + OUT_RING (nvgl_primitive(mode)); + + nr = (vc & 0xff); + if (nr) { + BEGIN_RING(rankine, NV34TCL_VB_INDEX_BATCH, 1); + OUT_RING (((nr - 1) << 24) | start); + start += nr; + } + + nr = vc >> 8; + while (nr) { + unsigned push = nr > 2047 ? 2047 : nr; + + nr -= push; + + BEGIN_RING_NI(rankine, NV34TCL_VB_INDEX_BATCH, push); + while (push--) { + OUT_RING(((0x100 - 1) << 24) | start); + start += 0x100; + } + } + + BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1); + OUT_RING (0); + + count -= vc; + start = restart; + } + + return TRUE; +} + +boolean +nv30_draw_elements(struct pipe_context *pipe, + struct pipe_buffer *indexBuffer, unsigned indexSize, + unsigned mode, unsigned start, unsigned count) +{ + struct nv30_context *nv30 = nv30_context(pipe); + boolean idxbuf; + + idxbuf = nv30_vbo_set_idxbuf(nv30, indexBuffer, indexSize); + if (FORCE_SWTNL || !nv30_state_validate(nv30)) { + /*return nv30_draw_elements_swtnl(pipe, NULL, 0, + mode, start, count);*/ + return FALSE; + } + + if (idxbuf) { + nv30_draw_elements_vbo(pipe, mode, start, count); + } else { + nv30_draw_elements_inline(pipe, indexBuffer, indexSize, + mode, start, count); + } + + pipe->flush(pipe, 0, NULL); + return TRUE; +} + +static boolean +nv30_vbo_validate(struct nv30_context *nv30) +{ + struct nouveau_stateobj *vtxbuf, *vtxfmt, *sattr = NULL; + struct nouveau_grobj *rankine = nv30->screen->rankine; + struct pipe_buffer *ib = nv30->idxbuf; + unsigned ib_format = nv30->idxbuf_format; + unsigned vb_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD; + int hw; + + if (nv30->edgeflags) { + /*nv30->fallback_swtnl |= NV30_NEW_ARRAYS;*/ + return FALSE; + } + + vtxbuf = so_new(20, 18); + so_method(vtxbuf, rankine, NV34TCL_VTXBUF_ADDRESS(0), nv30->vtxelt_nr); + vtxfmt = so_new(17, 0); + so_method(vtxfmt, rankine, NV34TCL_VTXFMT(0), nv30->vtxelt_nr); + + for (hw = 0; hw < nv30->vtxelt_nr; hw++) { + struct pipe_vertex_element *ve; + struct pipe_vertex_buffer *vb; + unsigned type, ncomp; + + ve = &nv30->vtxelt[hw]; + vb = &nv30->vtxbuf[ve->vertex_buffer_index]; + + if (!vb->stride) { + if (!sattr) + sattr = so_new(16 * 5, 0); + + if (nv30_vbo_static_attrib(nv30, sattr, hw, ve, vb)) { + so_data(vtxbuf, 0); + so_data(vtxfmt, NV34TCL_VTXFMT_TYPE_FLOAT); + continue; + } + } + + if (nv30_vbo_format_to_hw(ve->src_format, &type, &ncomp)) { + /*nv30->fallback_swtnl |= NV30_NEW_ARRAYS;*/ + so_ref(NULL, &vtxbuf); + so_ref(NULL, &vtxfmt); + return FALSE; + } + + so_reloc(vtxbuf, vb->buffer, vb->buffer_offset + ve->src_offset, + vb_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_OR, + 0, NV34TCL_VTXBUF_ADDRESS_DMA1); + so_data (vtxfmt, ((vb->stride << 
NV34TCL_VTXFMT_STRIDE_SHIFT) | + (ncomp << NV34TCL_VTXFMT_SIZE_SHIFT) | type)); + } + + if (ib) { + so_method(vtxbuf, rankine, NV34TCL_IDXBUF_ADDRESS, 2); + so_reloc (vtxbuf, ib, 0, vb_flags | NOUVEAU_BO_LOW, 0, 0); + so_reloc (vtxbuf, ib, ib_format, vb_flags | NOUVEAU_BO_OR, + 0, NV34TCL_IDXBUF_FORMAT_DMA1); + } + + so_method(vtxbuf, rankine, 0x1710, 1); + so_data (vtxbuf, 0); + + so_ref(vtxbuf, &nv30->state.hw[NV30_STATE_VTXBUF]); + nv30->state.dirty |= (1ULL << NV30_STATE_VTXBUF); + so_ref(vtxfmt, &nv30->state.hw[NV30_STATE_VTXFMT]); + nv30->state.dirty |= (1ULL << NV30_STATE_VTXFMT); + so_ref(sattr, &nv30->state.hw[NV30_STATE_VTXATTR]); + nv30->state.dirty |= (1ULL << NV30_STATE_VTXATTR); + return FALSE; +} + +struct nv30_state_entry nv30_state_vbo = { + .validate = nv30_vbo_validate, + .dirty = { + .pipe = NV30_NEW_ARRAYS, + .hw = 0, + } +}; diff --git a/src/gallium/drivers/nv30/nv30_vertprog.c b/src/gallium/drivers/nv30/nv30_vertprog.c new file mode 100644 index 0000000000..d262725057 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_vertprog.c @@ -0,0 +1,838 @@ +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_dump.h" + +#include "nv30_context.h" +#include "nv30_state.h" + +/* TODO (at least...): + * 1. Indexed consts + ARL + * 2. Arb. swz/negation + * 3. NV_vp11, NV_vp2, NV_vp3 features + * - extra arith opcodes + * - branching + * - texture sampling + * - indexed attribs + * - indexed results + * 4. bugs + */ + +#define SWZ_X 0 +#define SWZ_Y 1 +#define SWZ_Z 2 +#define SWZ_W 3 +#define MASK_X 8 +#define MASK_Y 4 +#define MASK_Z 2 +#define MASK_W 1 +#define MASK_ALL (MASK_X|MASK_Y|MASK_Z|MASK_W) +#define DEF_SCALE 0 +#define DEF_CTEST 0 +#include "nv30_shader.h" + +#define swz(s,x,y,z,w) nv30_sr_swz((s), SWZ_##x, SWZ_##y, SWZ_##z, SWZ_##w) +#define neg(s) nv30_sr_neg((s)) +#define abs(s) nv30_sr_abs((s)) + +struct nv30_vpc { + struct nv30_vertex_program *vp; + + struct nv30_vertex_program_exec *vpi; + + unsigned output_map[PIPE_MAX_SHADER_OUTPUTS]; + + int high_temp; + int temp_temp_count; + + struct nv30_sreg *imm; + unsigned nr_imm; +}; + +static struct nv30_sreg +temp(struct nv30_vpc *vpc) +{ + int idx; + + idx = vpc->temp_temp_count++; + idx += vpc->high_temp + 1; + return nv30_sr(NV30SR_TEMP, idx); +} + +static struct nv30_sreg +constant(struct nv30_vpc *vpc, int pipe, float x, float y, float z, float w) +{ + struct nv30_vertex_program *vp = vpc->vp; + struct nv30_vertex_program_data *vpd; + int idx; + + if (pipe >= 0) { + for (idx = 0; idx < vp->nr_consts; idx++) { + if (vp->consts[idx].index == pipe) + return nv30_sr(NV30SR_CONST, idx); + } + } + + idx = vp->nr_consts++; + vp->consts = realloc(vp->consts, sizeof(*vpd) * vp->nr_consts); + vpd = &vp->consts[idx]; + + vpd->index = pipe; + vpd->value[0] = x; + vpd->value[1] = y; + vpd->value[2] = z; + vpd->value[3] = w; + return nv30_sr(NV30SR_CONST, idx); +} + +#define arith(cc,s,o,d,m,s0,s1,s2) \ + nv30_vp_arith((cc), (s), NV30_VP_INST_##o, (d), (m), (s0), (s1), (s2)) + +static void +emit_src(struct nv30_vpc *vpc, uint32_t *hw, int pos, struct nv30_sreg src) +{ + struct nv30_vertex_program *vp = vpc->vp; + uint32_t sr = 0; + + switch (src.type) { + case NV30SR_TEMP: + sr |= (NV30_VP_SRC_REG_TYPE_TEMP << NV30_VP_SRC_REG_TYPE_SHIFT); + sr |= (src.index << NV30_VP_SRC_TEMP_SRC_SHIFT); + break; + case NV30SR_INPUT: + sr |= (NV30_VP_SRC_REG_TYPE_INPUT << + NV30_VP_SRC_REG_TYPE_SHIFT); + vp->ir |= (1 << 
src.index); + hw[1] |= (src.index << NV30_VP_INST_INPUT_SRC_SHIFT); + break; + case NV30SR_CONST: + sr |= (NV30_VP_SRC_REG_TYPE_CONST << + NV30_VP_SRC_REG_TYPE_SHIFT); + assert(vpc->vpi->const_index == -1 || + vpc->vpi->const_index == src.index); + vpc->vpi->const_index = src.index; + break; + case NV30SR_NONE: + sr |= (NV30_VP_SRC_REG_TYPE_INPUT << + NV30_VP_SRC_REG_TYPE_SHIFT); + break; + default: + assert(0); + } + + if (src.negate) + sr |= NV30_VP_SRC_NEGATE; + + if (src.abs) + hw[0] |= (1 << (21 + pos)); + + sr |= ((src.swz[0] << NV30_VP_SRC_SWZ_X_SHIFT) | + (src.swz[1] << NV30_VP_SRC_SWZ_Y_SHIFT) | + (src.swz[2] << NV30_VP_SRC_SWZ_Z_SHIFT) | + (src.swz[3] << NV30_VP_SRC_SWZ_W_SHIFT)); + +/* + * |VVV| + * d�.�b + * \u/ + * + */ + + switch (pos) { + case 0: + hw[1] |= ((sr & NV30_VP_SRC0_HIGH_MASK) >> + NV30_VP_SRC0_HIGH_SHIFT) << NV30_VP_INST_SRC0H_SHIFT; + hw[2] |= (sr & NV30_VP_SRC0_LOW_MASK) << + NV30_VP_INST_SRC0L_SHIFT; + break; + case 1: + hw[2] |= sr << NV30_VP_INST_SRC1_SHIFT; + break; + case 2: + hw[2] |= ((sr & NV30_VP_SRC2_HIGH_MASK) >> + NV30_VP_SRC2_HIGH_SHIFT) << NV30_VP_INST_SRC2H_SHIFT; + hw[3] |= (sr & NV30_VP_SRC2_LOW_MASK) << + NV30_VP_INST_SRC2L_SHIFT; + break; + default: + assert(0); + } +} + +static void +emit_dst(struct nv30_vpc *vpc, uint32_t *hw, int slot, struct nv30_sreg dst) +{ + struct nv30_vertex_program *vp = vpc->vp; + + switch (dst.type) { + case NV30SR_TEMP: + hw[0] |= (dst.index << NV30_VP_INST_DEST_TEMP_ID_SHIFT); + break; + case NV30SR_OUTPUT: + switch (dst.index) { + case NV30_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break; + case NV30_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break; + case NV30_VP_INST_DEST_BFC0 : vp->or |= (1 << 2); break; + case NV30_VP_INST_DEST_BFC1 : vp->or |= (1 << 3); break; + case NV30_VP_INST_DEST_FOGC : vp->or |= (1 << 4); break; + case NV30_VP_INST_DEST_PSZ : vp->or |= (1 << 5); break; + case NV30_VP_INST_DEST_TC(0): vp->or |= (1 << 14); break; + case NV30_VP_INST_DEST_TC(1): vp->or |= (1 << 15); break; + case NV30_VP_INST_DEST_TC(2): vp->or |= (1 << 16); break; + case NV30_VP_INST_DEST_TC(3): vp->or |= (1 << 17); break; + case NV30_VP_INST_DEST_TC(4): vp->or |= (1 << 18); break; + case NV30_VP_INST_DEST_TC(5): vp->or |= (1 << 19); break; + case NV30_VP_INST_DEST_TC(6): vp->or |= (1 << 20); break; + case NV30_VP_INST_DEST_TC(7): vp->or |= (1 << 21); break; + default: + break; + } + + hw[3] |= (dst.index << NV30_VP_INST_DEST_SHIFT); + hw[0] |= NV30_VP_INST_VEC_DEST_TEMP_MASK | (1<<20); + + /*XXX: no way this is entirely correct, someone needs to + * figure out what exactly it is. 
+ */ + hw[3] |= 0x800; + break; + default: + assert(0); + } +} + +static void +nv30_vp_arith(struct nv30_vpc *vpc, int slot, int op, + struct nv30_sreg dst, int mask, + struct nv30_sreg s0, struct nv30_sreg s1, + struct nv30_sreg s2) +{ + struct nv30_vertex_program *vp = vpc->vp; + uint32_t *hw; + + vp->insns = realloc(vp->insns, ++vp->nr_insns * sizeof(*vpc->vpi)); + vpc->vpi = &vp->insns[vp->nr_insns - 1]; + memset(vpc->vpi, 0, sizeof(*vpc->vpi)); + vpc->vpi->const_index = -1; + + hw = vpc->vpi->data; + + hw[0] |= (NV30_VP_INST_COND_TR << NV30_VP_INST_COND_SHIFT); + hw[0] |= ((0 << NV30_VP_INST_COND_SWZ_X_SHIFT) | + (1 << NV30_VP_INST_COND_SWZ_Y_SHIFT) | + (2 << NV30_VP_INST_COND_SWZ_Z_SHIFT) | + (3 << NV30_VP_INST_COND_SWZ_W_SHIFT)); + + hw[1] |= (op << NV30_VP_INST_VEC_OPCODE_SHIFT); +// hw[3] |= NV30_VP_INST_SCA_DEST_TEMP_MASK; +// hw[3] |= (mask << NV30_VP_INST_VEC_WRITEMASK_SHIFT); + + if (dst.type == NV30SR_OUTPUT) { + if (slot) + hw[3] |= (mask << NV30_VP_INST_SDEST_WRITEMASK_SHIFT); + else + hw[3] |= (mask << NV30_VP_INST_VDEST_WRITEMASK_SHIFT); + } else { + if (slot) + hw[3] |= (mask << NV30_VP_INST_STEMP_WRITEMASK_SHIFT); + else + hw[3] |= (mask << NV30_VP_INST_VTEMP_WRITEMASK_SHIFT); + } + + emit_dst(vpc, hw, slot, dst); + emit_src(vpc, hw, 0, s0); + emit_src(vpc, hw, 1, s1); + emit_src(vpc, hw, 2, s2); +} + +static INLINE struct nv30_sreg +tgsi_src(struct nv30_vpc *vpc, const struct tgsi_full_src_register *fsrc) { + struct nv30_sreg src; + + switch (fsrc->SrcRegister.File) { + case TGSI_FILE_INPUT: + src = nv30_sr(NV30SR_INPUT, fsrc->SrcRegister.Index); + break; + case TGSI_FILE_CONSTANT: + src = constant(vpc, fsrc->SrcRegister.Index, 0, 0, 0, 0); + break; + case TGSI_FILE_IMMEDIATE: + src = vpc->imm[fsrc->SrcRegister.Index]; + break; + case TGSI_FILE_TEMPORARY: + if (vpc->high_temp < fsrc->SrcRegister.Index) + vpc->high_temp = fsrc->SrcRegister.Index; + src = nv30_sr(NV30SR_TEMP, fsrc->SrcRegister.Index); + break; + default: + NOUVEAU_ERR("bad src file\n"); + break; + } + + src.abs = fsrc->SrcRegisterExtMod.Absolute; + src.negate = fsrc->SrcRegister.Negate; + src.swz[0] = fsrc->SrcRegister.SwizzleX; + src.swz[1] = fsrc->SrcRegister.SwizzleY; + src.swz[2] = fsrc->SrcRegister.SwizzleZ; + src.swz[3] = fsrc->SrcRegister.SwizzleW; + return src; +} + +static INLINE struct nv30_sreg +tgsi_dst(struct nv30_vpc *vpc, const struct tgsi_full_dst_register *fdst) { + struct nv30_sreg dst; + + switch (fdst->DstRegister.File) { + case TGSI_FILE_OUTPUT: + dst = nv30_sr(NV30SR_OUTPUT, + vpc->output_map[fdst->DstRegister.Index]); + + break; + case TGSI_FILE_TEMPORARY: + dst = nv30_sr(NV30SR_TEMP, fdst->DstRegister.Index); + if (vpc->high_temp < dst.index) + vpc->high_temp = dst.index; + break; + default: + NOUVEAU_ERR("bad dst file\n"); + break; + } + + return dst; +} + +static INLINE int +tgsi_mask(uint tgsi) +{ + int mask = 0; + + if (tgsi & TGSI_WRITEMASK_X) mask |= MASK_X; + if (tgsi & TGSI_WRITEMASK_Y) mask |= MASK_Y; + if (tgsi & TGSI_WRITEMASK_Z) mask |= MASK_Z; + if (tgsi & TGSI_WRITEMASK_W) mask |= MASK_W; + return mask; +} + +static boolean +nv30_vertprog_parse_instruction(struct nv30_vpc *vpc, + const struct tgsi_full_instruction *finst) +{ + struct nv30_sreg src[3], dst, tmp; + struct nv30_sreg none = nv30_sr(NV30SR_NONE, 0); + int mask; + int ai = -1, ci = -1; + int i; + + if (finst->Instruction.Opcode == TGSI_OPCODE_END) + return TRUE; + + vpc->temp_temp_count = 0; + for (i = 0; i < finst->Instruction.NumSrcRegs; i++) { + const struct tgsi_full_src_register *fsrc; + + fsrc = 
&finst->FullSrcRegisters[i]; + if (fsrc->SrcRegister.File == TGSI_FILE_TEMPORARY) { + src[i] = tgsi_src(vpc, fsrc); + } + } + + for (i = 0; i < finst->Instruction.NumSrcRegs; i++) { + const struct tgsi_full_src_register *fsrc; + + fsrc = &finst->FullSrcRegisters[i]; + switch (fsrc->SrcRegister.File) { + case TGSI_FILE_INPUT: + if (ai == -1 || ai == fsrc->SrcRegister.Index) { + ai = fsrc->SrcRegister.Index; + src[i] = tgsi_src(vpc, fsrc); + } else { + src[i] = temp(vpc); + arith(vpc, 0, OP_MOV, src[i], MASK_ALL, + tgsi_src(vpc, fsrc), none, none); + } + break; + /*XXX: index comparison is broken now that consts come from + * two different register files. + */ + case TGSI_FILE_CONSTANT: + case TGSI_FILE_IMMEDIATE: + if (ci == -1 || ci == fsrc->SrcRegister.Index) { + ci = fsrc->SrcRegister.Index; + src[i] = tgsi_src(vpc, fsrc); + } else { + src[i] = temp(vpc); + arith(vpc, 0, OP_MOV, src[i], MASK_ALL, + tgsi_src(vpc, fsrc), none, none); + } + break; + case TGSI_FILE_TEMPORARY: + /* handled above */ + break; + default: + NOUVEAU_ERR("bad src file\n"); + return FALSE; + } + } + + dst = tgsi_dst(vpc, &finst->FullDstRegisters[0]); + mask = tgsi_mask(finst->FullDstRegisters[0].DstRegister.WriteMask); + + switch (finst->Instruction.Opcode) { + case TGSI_OPCODE_ABS: + arith(vpc, 0, OP_MOV, dst, mask, abs(src[0]), none, none); + break; + case TGSI_OPCODE_ADD: + arith(vpc, 0, OP_ADD, dst, mask, src[0], none, src[1]); + break; + case TGSI_OPCODE_ARL: + arith(vpc, 0, OP_ARL, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_DP3: + arith(vpc, 0, OP_DP3, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_DP4: + arith(vpc, 0, OP_DP4, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_DPH: + arith(vpc, 0, OP_DPH, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_DST: + arith(vpc, 0, OP_DST, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_EX2: + arith(vpc, 1, OP_EX2, dst, mask, none, none, src[0]); + break; + case TGSI_OPCODE_EXP: + arith(vpc, 1, OP_EXP, dst, mask, none, none, src[0]); + break; + case TGSI_OPCODE_FLR: + arith(vpc, 0, OP_FLR, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_FRC: + arith(vpc, 0, OP_FRC, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_LG2: + arith(vpc, 1, OP_LG2, dst, mask, none, none, src[0]); + break; + case TGSI_OPCODE_LIT: + arith(vpc, 1, OP_LIT, dst, mask, none, none, src[0]); + break; + case TGSI_OPCODE_LOG: + arith(vpc, 1, OP_LOG, dst, mask, none, none, src[0]); + break; + case TGSI_OPCODE_MAD: + arith(vpc, 0, OP_MAD, dst, mask, src[0], src[1], src[2]); + break; + case TGSI_OPCODE_MAX: + arith(vpc, 0, OP_MAX, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_MIN: + arith(vpc, 0, OP_MIN, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_MOV: + arith(vpc, 0, OP_MOV, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_MUL: + arith(vpc, 0, OP_MUL, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_POW: + tmp = temp(vpc); + arith(vpc, 1, OP_LG2, tmp, MASK_X, none, none, + swz(src[0], X, X, X, X)); + arith(vpc, 0, OP_MUL, tmp, MASK_X, swz(tmp, X, X, X, X), + swz(src[1], X, X, X, X), none); + arith(vpc, 1, OP_EX2, dst, mask, none, none, + swz(tmp, X, X, X, X)); + break; + case TGSI_OPCODE_RCP: + arith(vpc, 1, OP_RCP, dst, mask, none, none, src[0]); + break; + case TGSI_OPCODE_RET: + break; + case TGSI_OPCODE_RSQ: + arith(vpc, 1, OP_RSQ, dst, mask, none, none, src[0]); + break; + case TGSI_OPCODE_SGE: + arith(vpc, 0, OP_SGE, dst, mask, src[0], src[1], 
none); + break; + case TGSI_OPCODE_SGT: + arith(vpc, 0, OP_SGT, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_SLT: + arith(vpc, 0, OP_SLT, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_SUB: + arith(vpc, 0, OP_ADD, dst, mask, src[0], none, neg(src[1])); + break; + case TGSI_OPCODE_XPD: + tmp = temp(vpc); + arith(vpc, 0, OP_MUL, tmp, mask, + swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none); + arith(vpc, 0, OP_MAD, dst, (mask & ~MASK_W), + swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y), + neg(tmp)); + break; + default: + NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode); + return FALSE; + } + + return TRUE; +} + +static boolean +nv30_vertprog_parse_decl_output(struct nv30_vpc *vpc, + const struct tgsi_full_declaration *fdec) +{ + int hw; + + switch (fdec->Semantic.SemanticName) { + case TGSI_SEMANTIC_POSITION: + hw = NV30_VP_INST_DEST_POS; + break; + case TGSI_SEMANTIC_COLOR: + if (fdec->Semantic.SemanticIndex == 0) { + hw = NV30_VP_INST_DEST_COL0; + } else + if (fdec->Semantic.SemanticIndex == 1) { + hw = NV30_VP_INST_DEST_COL1; + } else { + NOUVEAU_ERR("bad colour semantic index\n"); + return FALSE; + } + break; + case TGSI_SEMANTIC_BCOLOR: + if (fdec->Semantic.SemanticIndex == 0) { + hw = NV30_VP_INST_DEST_BFC0; + } else + if (fdec->Semantic.SemanticIndex == 1) { + hw = NV30_VP_INST_DEST_BFC1; + } else { + NOUVEAU_ERR("bad bcolour semantic index\n"); + return FALSE; + } + break; + case TGSI_SEMANTIC_FOG: + hw = NV30_VP_INST_DEST_FOGC; + break; + case TGSI_SEMANTIC_PSIZE: + hw = NV30_VP_INST_DEST_PSZ; + break; + case TGSI_SEMANTIC_GENERIC: + if (fdec->Semantic.SemanticIndex <= 7) { + hw = NV30_VP_INST_DEST_TC(fdec->Semantic.SemanticIndex); + } else { + NOUVEAU_ERR("bad generic semantic index\n"); + return FALSE; + } + break; + default: + NOUVEAU_ERR("bad output semantic\n"); + return FALSE; + } + + vpc->output_map[fdec->DeclarationRange.First] = hw; + return TRUE; +} + +static boolean +nv30_vertprog_prepare(struct nv30_vpc *vpc) +{ + struct tgsi_parse_context p; + int nr_imm = 0; + + tgsi_parse_init(&p, vpc->vp->pipe.tokens); + while (!tgsi_parse_end_of_tokens(&p)) { + const union tgsi_full_token *tok = &p.FullToken; + + tgsi_parse_token(&p); + switch(tok->Token.Type) { + case TGSI_TOKEN_TYPE_IMMEDIATE: + nr_imm++; + break; + default: + break; + } + } + tgsi_parse_free(&p); + + if (nr_imm) { + vpc->imm = CALLOC(nr_imm, sizeof(struct nv30_sreg)); + assert(vpc->imm); + } + + return TRUE; +} + +static void +nv30_vertprog_translate(struct nv30_context *nv30, + struct nv30_vertex_program *vp) +{ + struct tgsi_parse_context parse; + struct nv30_vpc *vpc = NULL; + + tgsi_dump(vp->pipe.tokens,0); + + vpc = CALLOC(1, sizeof(struct nv30_vpc)); + if (!vpc) + return; + vpc->vp = vp; + vpc->high_temp = -1; + + if (!nv30_vertprog_prepare(vpc)) { + FREE(vpc); + return; + } + + tgsi_parse_init(&parse, vp->pipe.tokens); + + while (!tgsi_parse_end_of_tokens(&parse)) { + tgsi_parse_token(&parse); + + switch (parse.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_DECLARATION: + { + const struct tgsi_full_declaration *fdec; + fdec = &parse.FullToken.FullDeclaration; + switch (fdec->Declaration.File) { + case TGSI_FILE_OUTPUT: + if (!nv30_vertprog_parse_decl_output(vpc, fdec)) + goto out_err; + break; + default: + break; + } + } + break; + case TGSI_TOKEN_TYPE_IMMEDIATE: + { + const struct tgsi_full_immediate *imm; + + imm = &parse.FullToken.FullImmediate; + assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32); + assert(imm->Immediate.NrTokens == 4 + 1); + 
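/* Float immediates are stored as driver-internal vertex program constants; index -1 (see nv30_vertex_program_data) marks them as immediates rather than entries in the user constant buffer. */ +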
vpc->imm[vpc->nr_imm++] = + constant(vpc, -1, + imm->u.ImmediateFloat32[0].Float, + imm->u.ImmediateFloat32[1].Float, + imm->u.ImmediateFloat32[2].Float, + imm->u.ImmediateFloat32[3].Float); + } + break; + case TGSI_TOKEN_TYPE_INSTRUCTION: + { + const struct tgsi_full_instruction *finst; + finst = &parse.FullToken.FullInstruction; + if (!nv30_vertprog_parse_instruction(vpc, finst)) + goto out_err; + } + break; + default: + break; + } + } + + vp->insns[vp->nr_insns - 1].data[3] |= NV30_VP_INST_LAST; + vp->translated = TRUE; +out_err: + tgsi_parse_free(&parse); + FREE(vpc); +} + +static boolean +nv30_vertprog_validate(struct nv30_context *nv30) +{ + struct nouveau_winsys *nvws = nv30->nvws; + struct pipe_winsys *ws = nv30->pipe.winsys; + struct nouveau_grobj *rankine = nv30->screen->rankine; + struct nv30_vertex_program *vp; + struct pipe_buffer *constbuf; + boolean upload_code = FALSE, upload_data = FALSE; + int i; + + vp = nv30->vertprog; + constbuf = nv30->constbuf[PIPE_SHADER_VERTEX]; + + /* Translate TGSI shader into hw bytecode */ + if (!vp->translated) { + nv30_vertprog_translate(nv30, vp); + if (!vp->translated) + return FALSE; + } + + /* Allocate hw vtxprog exec slots */ + if (!vp->exec) { + struct nouveau_resource *heap = nv30->screen->vp_exec_heap; + struct nouveau_stateobj *so; + uint vplen = vp->nr_insns; + + if (nvws->res_alloc(heap, vplen, vp, &vp->exec)) { + while (heap->next && heap->size < vplen) { + struct nv30_vertex_program *evict; + + evict = heap->next->priv; + nvws->res_free(&evict->exec); + } + + if (nvws->res_alloc(heap, vplen, vp, &vp->exec)) + assert(0); + } + + so = so_new(2, 0); + so_method(so, rankine, NV34TCL_VP_START_FROM_ID, 1); + so_data (so, vp->exec->start); + so_ref(so, &vp->so); + + upload_code = TRUE; + } + + /* Allocate hw vtxprog const slots */ + if (vp->nr_consts && !vp->data) { + struct nouveau_resource *heap = nv30->screen->vp_data_heap; + + if (nvws->res_alloc(heap, vp->nr_consts, vp, &vp->data)) { + while (heap->next && heap->size < vp->nr_consts) { + struct nv30_vertex_program *evict; + + evict = heap->next->priv; + nvws->res_free(&evict->data); + } + + if (nvws->res_alloc(heap, vp->nr_consts, vp, &vp->data)) + assert(0); + } + + /*XXX: handle this some day */ + assert(vp->data->start >= vp->data_start_min); + + upload_data = TRUE; + if (vp->data_start != vp->data->start) + upload_code = TRUE; + } + + /* If exec or data segments moved we need to patch the program to + * fixup offsets and register IDs. 
+ */ + if (vp->exec_start != vp->exec->start) { + for (i = 0; i < vp->nr_insns; i++) { + struct nv30_vertex_program_exec *vpi = &vp->insns[i]; + + if (vpi->has_branch_offset) { + assert(0); + } + } + + vp->exec_start = vp->exec->start; + } + + if (vp->nr_consts && vp->data_start != vp->data->start) { + for (i = 0; i < vp->nr_insns; i++) { + struct nv30_vertex_program_exec *vpi = &vp->insns[i]; + + if (vpi->const_index >= 0) { + vpi->data[1] &= ~NV30_VP_INST_CONST_SRC_MASK; + vpi->data[1] |= + (vpi->const_index + vp->data->start) << + NV30_VP_INST_CONST_SRC_SHIFT; + + } + } + + vp->data_start = vp->data->start; + } + + /* Update + Upload constant values */ + if (vp->nr_consts) { + float *map = NULL; + + if (constbuf) { + map = ws->buffer_map(ws, constbuf, + PIPE_BUFFER_USAGE_CPU_READ); + } + + for (i = 0; i < vp->nr_consts; i++) { + struct nv30_vertex_program_data *vpd = &vp->consts[i]; + + if (vpd->index >= 0) { + if (!upload_data && + !memcmp(vpd->value, &map[vpd->index * 4], + 4 * sizeof(float))) + continue; + memcpy(vpd->value, &map[vpd->index * 4], + 4 * sizeof(float)); + } + + BEGIN_RING(rankine, NV34TCL_VP_UPLOAD_CONST_ID, 5); + OUT_RING (i + vp->data->start); + OUT_RINGp ((uint32_t *)vpd->value, 4); + } + + if (constbuf) { + ws->buffer_unmap(ws, constbuf); + } + } + + /* Upload vtxprog */ + if (upload_code) { +#if 0 + for (i = 0; i < vp->nr_insns; i++) { + NOUVEAU_MSG("VP inst %d: 0x%08x 0x%08x 0x%08x 0x%08x\n", + i, vp->insns[i].data[0], vp->insns[i].data[1], + vp->insns[i].data[2], vp->insns[i].data[3]); + } +#endif + BEGIN_RING(rankine, NV34TCL_VP_UPLOAD_FROM_ID, 1); + OUT_RING (vp->exec->start); + for (i = 0; i < vp->nr_insns; i++) { + BEGIN_RING(rankine, NV34TCL_VP_UPLOAD_INST(0), 4); + OUT_RINGp (vp->insns[i].data, 4); + } + } + + if (vp->so != nv30->state.hw[NV30_STATE_VERTPROG]) { + so_ref(vp->so, &nv30->state.hw[NV30_STATE_VERTPROG]); + return TRUE; + } + + return FALSE; +} + +void +nv30_vertprog_destroy(struct nv30_context *nv30, struct nv30_vertex_program *vp) +{ + struct nouveau_winsys *nvws = nv30->screen->nvws; + + vp->translated = FALSE; + + if (vp->nr_insns) { + FREE(vp->insns); + vp->insns = NULL; + vp->nr_insns = 0; + } + + if (vp->nr_consts) { + FREE(vp->consts); + vp->consts = NULL; + vp->nr_consts = 0; + } + + nvws->res_free(&vp->exec); + vp->exec_start = 0; + nvws->res_free(&vp->data); + vp->data_start = 0; + vp->data_start_min = 0; + + vp->ir = vp->or = 0; + so_ref(NULL, &vp->so); +} + +struct nv30_state_entry nv30_state_vertprog = { + .validate = nv30_vertprog_validate, + .dirty = { + .pipe = NV30_NEW_VERTPROG /*| NV30_NEW_UCP*/, + .hw = NV30_STATE_VERTPROG, + } +}; diff --git a/src/gallium/drivers/nv40/Makefile b/src/gallium/drivers/nv40/Makefile new file mode 100644 index 0000000000..8c738aefa6 --- /dev/null +++ b/src/gallium/drivers/nv40/Makefile @@ -0,0 +1,28 @@ +TOP = ../../../.. 
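Returning to the "Update + Upload constant values" loop above: the driver keeps a shadow copy of each float[4] in vp->consts[] and memcmp()s it against the mapped user constant buffer, so only constants that actually changed get re-sent (immediates, stored with index == -1, are pushed unconditionally). A minimal standalone sketch of that dedup pattern follows, leaving out the upload_data forcing; the upload callback here stands in for the BEGIN_RING/OUT_RINGp sequence and is not a real driver entry point.

/* Sketch: re-upload only the constants whose values changed since the
 * last validate pass. */
#include <string.h>

struct cached_const {
        int   index;      /* slot in the user constant buffer, or -1 */
        float value[4];   /* last value pushed to the hardware */
};

static void
sync_constants(struct cached_const *consts, unsigned nr,
               const float *user_buf,
               void (*upload)(unsigned slot, const float v[4]))
{
        unsigned i;

        for (i = 0; i < nr; i++) {
                struct cached_const *c = &consts[i];

                if (c->index >= 0) {
                        const float *src = &user_buf[c->index * 4];

                        if (!memcmp(c->value, src, 4 * sizeof(float)))
                                continue;   /* unchanged, skip the upload */
                        memcpy(c->value, src, 4 * sizeof(float));
                }
                upload(i, c->value);
        }
}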
+include $(TOP)/configs/current + +LIBNAME = nv40 + +C_SOURCES = \ + nv40_clear.c \ + nv40_context.c \ + nv40_draw.c \ + nv40_fragprog.c \ + nv40_fragtex.c \ + nv40_miptree.c \ + nv40_query.c \ + nv40_screen.c \ + nv40_state.c \ + nv40_state_blend.c \ + nv40_state_emit.c \ + nv40_state_fb.c \ + nv40_state_rasterizer.c \ + nv40_state_scissor.c \ + nv40_state_stipple.c \ + nv40_state_viewport.c \ + nv40_state_zsa.c \ + nv40_surface.c \ + nv40_vbo.c \ + nv40_vertprog.c + +include ../../Makefile.template diff --git a/src/gallium/drivers/nv40/nv40_clear.c b/src/gallium/drivers/nv40/nv40_clear.c new file mode 100644 index 0000000000..59efd620e3 --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_clear.c @@ -0,0 +1,13 @@ +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "nv40_context.h" + +void +nv40_clear(struct pipe_context *pipe, struct pipe_surface *ps, + unsigned clearValue) +{ + pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, clearValue); + ps->status = PIPE_SURFACE_STATUS_CLEAR; +} diff --git a/src/gallium/drivers/nv40/nv40_context.c b/src/gallium/drivers/nv40/nv40_context.c new file mode 100644 index 0000000000..5d325f5067 --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_context.c @@ -0,0 +1,72 @@ +#include "draw/draw_context.h" +#include "pipe/p_defines.h" +#include "pipe/internal/p_winsys_screen.h" + +#include "nv40_context.h" +#include "nv40_screen.h" + +static void +nv40_flush(struct pipe_context *pipe, unsigned flags, + struct pipe_fence_handle **fence) +{ + struct nv40_context *nv40 = nv40_context(pipe); + + if (flags & PIPE_FLUSH_TEXTURE_CACHE) { + BEGIN_RING(curie, 0x1fd8, 1); + OUT_RING (2); + BEGIN_RING(curie, 0x1fd8, 1); + OUT_RING (1); + } + + FIRE_RING(fence); +} + +static void +nv40_destroy(struct pipe_context *pipe) +{ + struct nv40_context *nv40 = nv40_context(pipe); + + if (nv40->draw) + draw_destroy(nv40->draw); + FREE(nv40); +} + +struct pipe_context * +nv40_create(struct pipe_screen *pscreen, unsigned pctx_id) +{ + struct nv40_screen *screen = nv40_screen(pscreen); + struct pipe_winsys *ws = pscreen->winsys; + struct nv40_context *nv40; + struct nouveau_winsys *nvws = screen->nvws; + + nv40 = CALLOC(1, sizeof(struct nv40_context)); + if (!nv40) + return NULL; + nv40->screen = screen; + nv40->pctx_id = pctx_id; + + nv40->nvws = nvws; + + nv40->pipe.winsys = ws; + nv40->pipe.screen = pscreen; + nv40->pipe.destroy = nv40_destroy; + nv40->pipe.draw_arrays = nv40_draw_arrays; + nv40->pipe.draw_elements = nv40_draw_elements; + nv40->pipe.clear = nv40_clear; + nv40->pipe.flush = nv40_flush; + + nv40_init_query_functions(nv40); + nv40_init_surface_functions(nv40); + nv40_init_state_functions(nv40); + + /* Create, configure, and install fallback swtnl path */ + nv40->draw = draw_create(); + draw_wide_point_threshold(nv40->draw, 9999999.0); + draw_wide_line_threshold(nv40->draw, 9999999.0); + draw_enable_line_stipple(nv40->draw, FALSE); + draw_enable_point_sprites(nv40->draw, FALSE); + draw_set_rasterize_stage(nv40->draw, nv40_draw_render_stage(nv40)); + + return &nv40->pipe; +} + diff --git a/src/gallium/drivers/nv40/nv40_context.h b/src/gallium/drivers/nv40/nv40_context.h new file mode 100644 index 0000000000..adcfbdd85a --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_context.h @@ -0,0 +1,233 @@ +#ifndef __NV40_CONTEXT_H__ +#define __NV40_CONTEXT_H__ + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_compiler.h" + +#include "util/u_memory.h" +#include "util/u_math.h" 
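A usage note for the nv40_flush() hook wired up in nv40_context.c above: PIPE_FLUSH_TEXTURE_CACHE is the flag a state tracker would pass after rendering into a texture and before sampling from it, which triggers the 0x1fd8 write pair. The following is a hypothetical caller-side sketch, assuming only the pipe_context returned by nv40_create(); passing NULL for the fence skips fence creation, as the driver itself does internally.

/* Hypothetical caller: make render-to-texture results visible to the
 * texture units via the flush hook shown above. */
#include "pipe/p_context.h"
#include "pipe/p_defines.h"

static void
finish_render_to_texture(struct pipe_context *pipe)
{
        /* PIPE_FLUSH_TEXTURE_CACHE hits the texture-cache nudge in
         * nv40_flush(); NULL means no fence is requested. */
        pipe->flush(pipe, PIPE_FLUSH_TEXTURE_CACHE, NULL);
}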
+ +#include "draw/draw_vertex.h" + +#include "nouveau/nouveau_winsys.h" +#include "nouveau/nouveau_gldefs.h" + +#define NOUVEAU_PUSH_CONTEXT(ctx) \ + struct nv40_screen *ctx = nv40->screen +#include "nouveau/nouveau_push.h" +#include "nouveau/nouveau_stateobj.h" + +#include "nv40_state.h" + +#define NOUVEAU_ERR(fmt, args...) \ + fprintf(stderr, "%s:%d - "fmt, __func__, __LINE__, ##args); +#define NOUVEAU_MSG(fmt, args...) \ + fprintf(stderr, "nouveau: "fmt, ##args); + +enum nv40_state_index { + NV40_STATE_FB = 0, + NV40_STATE_VIEWPORT = 1, + NV40_STATE_BLEND = 2, + NV40_STATE_RAST = 3, + NV40_STATE_ZSA = 4, + NV40_STATE_BCOL = 5, + NV40_STATE_CLIP = 6, + NV40_STATE_SCISSOR = 7, + NV40_STATE_STIPPLE = 8, + NV40_STATE_FRAGPROG = 9, + NV40_STATE_VERTPROG = 10, + NV40_STATE_FRAGTEX0 = 11, + NV40_STATE_FRAGTEX1 = 12, + NV40_STATE_FRAGTEX2 = 13, + NV40_STATE_FRAGTEX3 = 14, + NV40_STATE_FRAGTEX4 = 15, + NV40_STATE_FRAGTEX5 = 16, + NV40_STATE_FRAGTEX6 = 17, + NV40_STATE_FRAGTEX7 = 18, + NV40_STATE_FRAGTEX8 = 19, + NV40_STATE_FRAGTEX9 = 20, + NV40_STATE_FRAGTEX10 = 21, + NV40_STATE_FRAGTEX11 = 22, + NV40_STATE_FRAGTEX12 = 23, + NV40_STATE_FRAGTEX13 = 24, + NV40_STATE_FRAGTEX14 = 25, + NV40_STATE_FRAGTEX15 = 26, + NV40_STATE_VERTTEX0 = 27, + NV40_STATE_VERTTEX1 = 28, + NV40_STATE_VERTTEX2 = 29, + NV40_STATE_VERTTEX3 = 30, + NV40_STATE_VTXBUF = 31, + NV40_STATE_VTXFMT = 32, + NV40_STATE_VTXATTR = 33, + NV40_STATE_MAX = 34 +}; + +#include "nv40_screen.h" + +#define NV40_NEW_BLEND (1 << 0) +#define NV40_NEW_RAST (1 << 1) +#define NV40_NEW_ZSA (1 << 2) +#define NV40_NEW_SAMPLER (1 << 3) +#define NV40_NEW_FB (1 << 4) +#define NV40_NEW_STIPPLE (1 << 5) +#define NV40_NEW_SCISSOR (1 << 6) +#define NV40_NEW_VIEWPORT (1 << 7) +#define NV40_NEW_BCOL (1 << 8) +#define NV40_NEW_VERTPROG (1 << 9) +#define NV40_NEW_FRAGPROG (1 << 10) +#define NV40_NEW_ARRAYS (1 << 11) +#define NV40_NEW_UCP (1 << 12) + +struct nv40_rasterizer_state { + struct pipe_rasterizer_state pipe; + struct nouveau_stateobj *so; +}; + +struct nv40_zsa_state { + struct pipe_depth_stencil_alpha_state pipe; + struct nouveau_stateobj *so; +}; + +struct nv40_blend_state { + struct pipe_blend_state pipe; + struct nouveau_stateobj *so; +}; + + +struct nv40_state { + unsigned scissor_enabled; + unsigned stipple_enabled; + unsigned viewport_bypass; + unsigned fp_samplers; + + uint64_t dirty; + struct nouveau_stateobj *hw[NV40_STATE_MAX]; +}; + +struct nv40_context { + struct pipe_context pipe; + + struct nouveau_winsys *nvws; + struct nv40_screen *screen; + unsigned pctx_id; + + struct draw_context *draw; + + /* HW state derived from pipe states */ + struct nv40_state state; + struct { + struct nv40_vertex_program *vertprog; + + unsigned nr_attribs; + unsigned hw[PIPE_MAX_SHADER_INPUTS]; + unsigned draw[PIPE_MAX_SHADER_INPUTS]; + unsigned emit[PIPE_MAX_SHADER_INPUTS]; + } swtnl; + + enum { + HW, SWTNL, SWRAST + } render_mode; + unsigned fallback_swtnl; + unsigned fallback_swrast; + + /* Context state */ + unsigned dirty, draw_dirty; + struct pipe_scissor_state scissor; + unsigned stipple[32]; + struct pipe_clip_state clip; + struct nv40_vertex_program *vertprog; + struct nv40_fragment_program *fragprog; + struct pipe_buffer *constbuf[PIPE_SHADER_TYPES]; + unsigned constbuf_nr[PIPE_SHADER_TYPES]; + struct nv40_rasterizer_state *rasterizer; + struct nv40_zsa_state *zsa; + struct nv40_blend_state *blend; + struct pipe_blend_color blend_colour; + struct pipe_viewport_state viewport; + struct pipe_framebuffer_state framebuffer; + struct pipe_buffer *idxbuf; 
+ unsigned idxbuf_format; + struct nv40_sampler_state *tex_sampler[PIPE_MAX_SAMPLERS]; + struct nv40_miptree *tex_miptree[PIPE_MAX_SAMPLERS]; + unsigned nr_samplers; + unsigned nr_textures; + unsigned dirty_samplers; + struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS]; + unsigned vtxbuf_nr; + struct pipe_vertex_element vtxelt[PIPE_MAX_ATTRIBS]; + unsigned vtxelt_nr; + const unsigned *edgeflags; +}; + +static INLINE struct nv40_context * +nv40_context(struct pipe_context *pipe) +{ + return (struct nv40_context *)pipe; +} + +struct nv40_state_entry { + boolean (*validate)(struct nv40_context *nv40); + struct { + unsigned pipe; + unsigned hw; + } dirty; +}; + +extern void nv40_init_state_functions(struct nv40_context *nv40); +extern void nv40_init_surface_functions(struct nv40_context *nv40); +extern void nv40_init_query_functions(struct nv40_context *nv40); + +extern void nv40_screen_init_miptree_functions(struct pipe_screen *pscreen); + +/* nv40_draw.c */ +extern struct draw_stage *nv40_draw_render_stage(struct nv40_context *nv40); +extern boolean nv40_draw_elements_swtnl(struct pipe_context *pipe, + struct pipe_buffer *idxbuf, + unsigned ib_size, unsigned mode, + unsigned start, unsigned count); + +/* nv40_vertprog.c */ +extern void nv40_vertprog_destroy(struct nv40_context *, + struct nv40_vertex_program *); + +/* nv40_fragprog.c */ +extern void nv40_fragprog_destroy(struct nv40_context *, + struct nv40_fragment_program *); + +/* nv40_fragtex.c */ +extern void nv40_fragtex_bind(struct nv40_context *); + +/* nv40_state.c and friends */ +extern boolean nv40_state_validate(struct nv40_context *nv40); +extern boolean nv40_state_validate_swtnl(struct nv40_context *nv40); +extern void nv40_state_emit(struct nv40_context *nv40); +extern struct nv40_state_entry nv40_state_rasterizer; +extern struct nv40_state_entry nv40_state_scissor; +extern struct nv40_state_entry nv40_state_stipple; +extern struct nv40_state_entry nv40_state_fragprog; +extern struct nv40_state_entry nv40_state_vertprog; +extern struct nv40_state_entry nv40_state_blend; +extern struct nv40_state_entry nv40_state_blend_colour; +extern struct nv40_state_entry nv40_state_zsa; +extern struct nv40_state_entry nv40_state_viewport; +extern struct nv40_state_entry nv40_state_framebuffer; +extern struct nv40_state_entry nv40_state_fragtex; +extern struct nv40_state_entry nv40_state_vbo; +extern struct nv40_state_entry nv40_state_vtxfmt; + +/* nv40_vbo.c */ +extern boolean nv40_draw_arrays(struct pipe_context *, unsigned mode, + unsigned start, unsigned count); +extern boolean nv40_draw_elements(struct pipe_context *pipe, + struct pipe_buffer *indexBuffer, + unsigned indexSize, + unsigned mode, unsigned start, + unsigned count); + +/* nv40_clear.c */ +extern void nv40_clear(struct pipe_context *pipe, struct pipe_surface *ps, + unsigned clearValue); + +#endif diff --git a/src/gallium/drivers/nv40/nv40_draw.c b/src/gallium/drivers/nv40/nv40_draw.c new file mode 100644 index 0000000000..c83ff91d7e --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_draw.c @@ -0,0 +1,349 @@ +#include "pipe/p_shader_tokens.h" + +#include "util/u_pack_color.h" + +#include "draw/draw_context.h" +#include "draw/draw_vertex.h" +#include "draw/draw_pipe.h" + +#include "nv40_context.h" +#define NV40_SHADER_NO_FUCKEDNESS +#include "nv40_shader.h" + +/* Simple, but crappy, swtnl path, hopefully we wont need to hit this very + * often at all. Uses "quadro style" vertex submission + a fixed vertex + * layout to avoid the need to generate a vertex program or vtxfmt. 
+ */ + +struct nv40_render_stage { + struct draw_stage stage; + struct nv40_context *nv40; + unsigned prim; +}; + +static INLINE struct nv40_render_stage * +nv40_render_stage(struct draw_stage *stage) +{ + return (struct nv40_render_stage *)stage; +} + +static INLINE void +nv40_render_vertex(struct nv40_context *nv40, const struct vertex_header *v) +{ + unsigned i; + + for (i = 0; i < nv40->swtnl.nr_attribs; i++) { + unsigned idx = nv40->swtnl.draw[i]; + unsigned hw = nv40->swtnl.hw[i]; + + switch (nv40->swtnl.emit[i]) { + case EMIT_OMIT: + break; + case EMIT_1F: + BEGIN_RING(curie, NV40TCL_VTX_ATTR_1F(hw), 1); + OUT_RING (fui(v->data[idx][0])); + break; + case EMIT_2F: + BEGIN_RING(curie, NV40TCL_VTX_ATTR_2F_X(hw), 2); + OUT_RING (fui(v->data[idx][0])); + OUT_RING (fui(v->data[idx][1])); + break; + case EMIT_3F: + BEGIN_RING(curie, NV40TCL_VTX_ATTR_3F_X(hw), 3); + OUT_RING (fui(v->data[idx][0])); + OUT_RING (fui(v->data[idx][1])); + OUT_RING (fui(v->data[idx][2])); + break; + case EMIT_4F: + BEGIN_RING(curie, NV40TCL_VTX_ATTR_4F_X(hw), 4); + OUT_RING (fui(v->data[idx][0])); + OUT_RING (fui(v->data[idx][1])); + OUT_RING (fui(v->data[idx][2])); + OUT_RING (fui(v->data[idx][3])); + break; + case EMIT_4UB: + BEGIN_RING(curie, NV40TCL_VTX_ATTR_4UB(hw), 1); + OUT_RING (pack_ub4(float_to_ubyte(v->data[idx][0]), + float_to_ubyte(v->data[idx][1]), + float_to_ubyte(v->data[idx][2]), + float_to_ubyte(v->data[idx][3]))); + break; + default: + assert(0); + break; + } + } +} + +static INLINE void +nv40_render_prim(struct draw_stage *stage, struct prim_header *prim, + unsigned mode, unsigned count) +{ + struct nv40_render_stage *rs = nv40_render_stage(stage); + struct nv40_context *nv40 = rs->nv40; + struct nouveau_pushbuf *pb = nv40->nvws->channel->pushbuf; + unsigned i; + + /* Ensure there's room for 4xfloat32 + potentially 3 begin/end */ + if (pb->remaining < ((count * 20) + 6)) { + if (rs->prim != NV40TCL_BEGIN_END_STOP) { + NOUVEAU_ERR("AIII, missed flush\n"); + assert(0); + } + FIRE_RING(NULL); + nv40_state_emit(nv40); + } + + /* Switch primitive modes if necessary */ + if (rs->prim != mode) { + if (rs->prim != NV40TCL_BEGIN_END_STOP) { + BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); + OUT_RING (NV40TCL_BEGIN_END_STOP); + } + + BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); + OUT_RING (mode); + rs->prim = mode; + } + + /* Emit vertex data */ + for (i = 0; i < count; i++) + nv40_render_vertex(nv40, prim->v[i]); + + /* If it's likely we'll need to empty the push buffer soon, finish + * off the primitive now. 
+ */ + if (pb->remaining < ((count * 20) + 6)) { + BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); + OUT_RING (NV40TCL_BEGIN_END_STOP); + rs->prim = NV40TCL_BEGIN_END_STOP; + } +} + +static void +nv40_render_point(struct draw_stage *draw, struct prim_header *prim) +{ + nv40_render_prim(draw, prim, NV40TCL_BEGIN_END_POINTS, 1); +} + +static void +nv40_render_line(struct draw_stage *draw, struct prim_header *prim) +{ + nv40_render_prim(draw, prim, NV40TCL_BEGIN_END_LINES, 2); +} + +static void +nv40_render_tri(struct draw_stage *draw, struct prim_header *prim) +{ + nv40_render_prim(draw, prim, NV40TCL_BEGIN_END_TRIANGLES, 3); +} + +static void +nv40_render_flush(struct draw_stage *draw, unsigned flags) +{ + struct nv40_render_stage *rs = nv40_render_stage(draw); + struct nv40_context *nv40 = rs->nv40; + + if (rs->prim != NV40TCL_BEGIN_END_STOP) { + BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); + OUT_RING (NV40TCL_BEGIN_END_STOP); + rs->prim = NV40TCL_BEGIN_END_STOP; + } +} + +static void +nv40_render_reset_stipple_counter(struct draw_stage *draw) +{ +} + +static void +nv40_render_destroy(struct draw_stage *draw) +{ + FREE(draw); +} + +static INLINE void +emit_mov(struct nv40_vertex_program *vp, + unsigned dst, unsigned src, unsigned vor, unsigned mask) +{ + struct nv40_vertex_program_exec *inst; + + vp->insns = realloc(vp->insns, + sizeof(struct nv40_vertex_program_exec) * + ++vp->nr_insns); + inst = &vp->insns[vp->nr_insns - 1]; + + inst->data[0] = 0x401f9c6c; + inst->data[1] = 0x0040000d | (src << 8); + inst->data[2] = 0x8106c083; + inst->data[3] = 0x6041ff80 | (dst << 2) | (mask << 13); + inst->const_index = -1; + inst->has_branch_offset = FALSE; + + vp->ir |= (1 << src); + if (vor != ~0) + vp->or |= (1 << vor); +} + +static struct nv40_vertex_program * +create_drawvp(struct nv40_context *nv40) +{ + struct nv40_vertex_program *vp = CALLOC_STRUCT(nv40_vertex_program); + unsigned i; + + emit_mov(vp, NV40_VP_INST_DEST_POS, 0, ~0, 0xf); + emit_mov(vp, NV40_VP_INST_DEST_COL0, 3, 0, 0xf); + emit_mov(vp, NV40_VP_INST_DEST_COL1, 4, 1, 0xf); + emit_mov(vp, NV40_VP_INST_DEST_BFC0, 3, 2, 0xf); + emit_mov(vp, NV40_VP_INST_DEST_BFC1, 4, 3, 0xf); + emit_mov(vp, NV40_VP_INST_DEST_FOGC, 5, 4, 0x8); + for (i = 0; i < 8; i++) + emit_mov(vp, NV40_VP_INST_DEST_TC(i), 8 + i, 14 + i, 0xf); + + vp->insns[vp->nr_insns - 1].data[3] |= 1; + vp->translated = TRUE; + return vp; +} + +struct draw_stage * +nv40_draw_render_stage(struct nv40_context *nv40) +{ + struct nv40_render_stage *render = CALLOC_STRUCT(nv40_render_stage); + + if (!nv40->swtnl.vertprog) + nv40->swtnl.vertprog = create_drawvp(nv40); + + render->nv40 = nv40; + render->stage.draw = nv40->draw; + render->stage.point = nv40_render_point; + render->stage.line = nv40_render_line; + render->stage.tri = nv40_render_tri; + render->stage.flush = nv40_render_flush; + render->stage.reset_stipple_counter = nv40_render_reset_stipple_counter; + render->stage.destroy = nv40_render_destroy; + + return &render->stage; +} + +boolean +nv40_draw_elements_swtnl(struct pipe_context *pipe, + struct pipe_buffer *idxbuf, unsigned idxbuf_size, + unsigned mode, unsigned start, unsigned count) +{ + struct nv40_context *nv40 = nv40_context(pipe); + struct pipe_winsys *ws = pipe->winsys; + unsigned i; + void *map; + + if (!nv40_state_validate_swtnl(nv40)) + return FALSE; + nv40->state.dirty &= ~(1ULL << NV40_STATE_VTXBUF); + nv40_state_emit(nv40); + + for (i = 0; i < nv40->vtxbuf_nr; i++) { + map = ws->buffer_map(ws, nv40->vtxbuf[i].buffer, + PIPE_BUFFER_USAGE_CPU_READ); + 
draw_set_mapped_vertex_buffer(nv40->draw, i, map); + } + + if (idxbuf) { + map = ws->buffer_map(ws, idxbuf, PIPE_BUFFER_USAGE_CPU_READ); + draw_set_mapped_element_buffer(nv40->draw, idxbuf_size, map); + } else { + draw_set_mapped_element_buffer(nv40->draw, 0, NULL); + } + + if (nv40->constbuf[PIPE_SHADER_VERTEX]) { + const unsigned nr = nv40->constbuf_nr[PIPE_SHADER_VERTEX]; + + map = ws->buffer_map(ws, nv40->constbuf[PIPE_SHADER_VERTEX], + PIPE_BUFFER_USAGE_CPU_READ); + draw_set_mapped_constant_buffer(nv40->draw, map, nr); + } + + draw_arrays(nv40->draw, mode, start, count); + + for (i = 0; i < nv40->vtxbuf_nr; i++) + ws->buffer_unmap(ws, nv40->vtxbuf[i].buffer); + + if (idxbuf) + ws->buffer_unmap(ws, idxbuf); + + if (nv40->constbuf[PIPE_SHADER_VERTEX]) + ws->buffer_unmap(ws, nv40->constbuf[PIPE_SHADER_VERTEX]); + + draw_flush(nv40->draw); + pipe->flush(pipe, 0, NULL); + + return TRUE; +} + +static INLINE void +emit_attrib(struct nv40_context *nv40, unsigned hw, unsigned emit, + unsigned semantic, unsigned index) +{ + unsigned draw_out = draw_find_vs_output(nv40->draw, semantic, index); + unsigned a = nv40->swtnl.nr_attribs++; + + nv40->swtnl.hw[a] = hw; + nv40->swtnl.emit[a] = emit; + nv40->swtnl.draw[a] = draw_out; +} + +static boolean +nv40_state_vtxfmt_validate(struct nv40_context *nv40) +{ + struct nv40_fragment_program *fp = nv40->fragprog; + unsigned colour = 0, texcoords = 0, fog = 0, i; + + /* Determine needed fragprog inputs */ + for (i = 0; i < fp->info.num_inputs; i++) { + switch (fp->info.input_semantic_name[i]) { + case TGSI_SEMANTIC_POSITION: + break; + case TGSI_SEMANTIC_COLOR: + colour |= (1 << fp->info.input_semantic_index[i]); + break; + case TGSI_SEMANTIC_GENERIC: + texcoords |= (1 << fp->info.input_semantic_index[i]); + break; + case TGSI_SEMANTIC_FOG: + fog = 1; + break; + default: + assert(0); + } + } + + nv40->swtnl.nr_attribs = 0; + + /* Map draw vtxprog output to hw attribute IDs */ + for (i = 0; i < 2; i++) { + if (!(colour & (1 << i))) + continue; + emit_attrib(nv40, 3 + i, EMIT_4UB, TGSI_SEMANTIC_COLOR, i); + } + + for (i = 0; i < 8; i++) { + if (!(texcoords & (1 << i))) + continue; + emit_attrib(nv40, 8 + i, EMIT_4F, TGSI_SEMANTIC_GENERIC, i); + } + + if (fog) { + emit_attrib(nv40, 5, EMIT_1F, TGSI_SEMANTIC_FOG, 0); + } + + emit_attrib(nv40, 0, EMIT_3F, TGSI_SEMANTIC_POSITION, 0); + + return FALSE; +} + +struct nv40_state_entry nv40_state_vtxfmt = { + .validate = nv40_state_vtxfmt_validate, + .dirty = { + .pipe = NV40_NEW_ARRAYS | NV40_NEW_FRAGPROG, + .hw = 0 + } +}; + diff --git a/src/gallium/drivers/nv40/nv40_fragprog.c b/src/gallium/drivers/nv40/nv40_fragprog.c new file mode 100644 index 0000000000..91dcbebda0 --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_fragprog.c @@ -0,0 +1,991 @@ +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" + +#include "nv40_context.h" + +#define SWZ_X 0 +#define SWZ_Y 1 +#define SWZ_Z 2 +#define SWZ_W 3 +#define MASK_X 1 +#define MASK_Y 2 +#define MASK_Z 4 +#define MASK_W 8 +#define MASK_ALL (MASK_X|MASK_Y|MASK_Z|MASK_W) +#define DEF_SCALE NV40_FP_OP_DST_SCALE_1X +#define DEF_CTEST NV40_FP_OP_COND_TR +#include "nv40_shader.h" + +#define swz(s,x,y,z,w) nv40_sr_swz((s), SWZ_##x, SWZ_##y, SWZ_##z, SWZ_##w) +#define neg(s) nv40_sr_neg((s)) +#define abs(s) nv40_sr_abs((s)) +#define scale(s,v) nv40_sr_scale((s), NV40_FP_OP_DST_SCALE_##v) + +#define MAX_CONSTS 128 +#define MAX_IMM 32 +struct nv40_fpc { + 
struct nv40_fragment_program *fp; + + uint attrib_map[PIPE_MAX_SHADER_INPUTS]; + + unsigned r_temps; + unsigned r_temps_discard; + struct nv40_sreg r_result[PIPE_MAX_SHADER_OUTPUTS]; + struct nv40_sreg *r_temp; + + int num_regs; + + unsigned inst_offset; + unsigned have_const; + + struct { + int pipe; + float vals[4]; + } consts[MAX_CONSTS]; + int nr_consts; + + struct nv40_sreg imm[MAX_IMM]; + unsigned nr_imm; +}; + +static INLINE struct nv40_sreg +temp(struct nv40_fpc *fpc) +{ + int idx = ffs(~fpc->r_temps) - 1; + + if (idx < 0) { + NOUVEAU_ERR("out of temps!!\n"); + assert(0); + return nv40_sr(NV40SR_TEMP, 0); + } + + fpc->r_temps |= (1 << idx); + fpc->r_temps_discard |= (1 << idx); + return nv40_sr(NV40SR_TEMP, idx); +} + +static INLINE void +release_temps(struct nv40_fpc *fpc) +{ + fpc->r_temps &= ~fpc->r_temps_discard; + fpc->r_temps_discard = 0; +} + +static INLINE struct nv40_sreg +constant(struct nv40_fpc *fpc, int pipe, float vals[4]) +{ + int idx; + + if (fpc->nr_consts == MAX_CONSTS) + assert(0); + idx = fpc->nr_consts++; + + fpc->consts[idx].pipe = pipe; + if (pipe == -1) + memcpy(fpc->consts[idx].vals, vals, 4 * sizeof(float)); + return nv40_sr(NV40SR_CONST, idx); +} + +#define arith(cc,s,o,d,m,s0,s1,s2) \ + nv40_fp_arith((cc), (s), NV40_FP_OP_OPCODE_##o, \ + (d), (m), (s0), (s1), (s2)) +#define tex(cc,s,o,u,d,m,s0,s1,s2) \ + nv40_fp_tex((cc), (s), NV40_FP_OP_OPCODE_##o, (u), \ + (d), (m), (s0), none, none) + +static void +grow_insns(struct nv40_fpc *fpc, int size) +{ + struct nv40_fragment_program *fp = fpc->fp; + + fp->insn_len += size; + fp->insn = realloc(fp->insn, sizeof(uint32_t) * fp->insn_len); +} + +static void +emit_src(struct nv40_fpc *fpc, int pos, struct nv40_sreg src) +{ + struct nv40_fragment_program *fp = fpc->fp; + uint32_t *hw = &fp->insn[fpc->inst_offset]; + uint32_t sr = 0; + + switch (src.type) { + case NV40SR_INPUT: + sr |= (NV40_FP_REG_TYPE_INPUT << NV40_FP_REG_TYPE_SHIFT); + hw[0] |= (src.index << NV40_FP_OP_INPUT_SRC_SHIFT); + break; + case NV40SR_OUTPUT: + sr |= NV40_FP_REG_SRC_HALF; + /* fall-through */ + case NV40SR_TEMP: + sr |= (NV40_FP_REG_TYPE_TEMP << NV40_FP_REG_TYPE_SHIFT); + sr |= (src.index << NV40_FP_REG_SRC_SHIFT); + break; + case NV40SR_CONST: + if (!fpc->have_const) { + grow_insns(fpc, 4); + fpc->have_const = 1; + } + + hw = &fp->insn[fpc->inst_offset]; + if (fpc->consts[src.index].pipe >= 0) { + struct nv40_fragment_program_data *fpd; + + fp->consts = realloc(fp->consts, ++fp->nr_consts * + sizeof(*fpd)); + fpd = &fp->consts[fp->nr_consts - 1]; + fpd->offset = fpc->inst_offset + 4; + fpd->index = fpc->consts[src.index].pipe; + memset(&fp->insn[fpd->offset], 0, sizeof(uint32_t) * 4); + } else { + memcpy(&fp->insn[fpc->inst_offset + 4], + fpc->consts[src.index].vals, + sizeof(uint32_t) * 4); + } + + sr |= (NV40_FP_REG_TYPE_CONST << NV40_FP_REG_TYPE_SHIFT); + break; + case NV40SR_NONE: + sr |= (NV40_FP_REG_TYPE_INPUT << NV40_FP_REG_TYPE_SHIFT); + break; + default: + assert(0); + } + + if (src.negate) + sr |= NV40_FP_REG_NEGATE; + + if (src.abs) + hw[1] |= (1 << (29 + pos)); + + sr |= ((src.swz[0] << NV40_FP_REG_SWZ_X_SHIFT) | + (src.swz[1] << NV40_FP_REG_SWZ_Y_SHIFT) | + (src.swz[2] << NV40_FP_REG_SWZ_Z_SHIFT) | + (src.swz[3] << NV40_FP_REG_SWZ_W_SHIFT)); + + hw[pos + 1] |= sr; +} + +static void +emit_dst(struct nv40_fpc *fpc, struct nv40_sreg dst) +{ + struct nv40_fragment_program *fp = fpc->fp; + uint32_t *hw = &fp->insn[fpc->inst_offset]; + + switch (dst.type) { + case NV40SR_TEMP: + if (fpc->num_regs < (dst.index + 1)) + fpc->num_regs 
= dst.index + 1; + break; + case NV40SR_OUTPUT: + if (dst.index == 1) { + fp->fp_control |= 0xe; + } else { + hw[0] |= NV40_FP_OP_OUT_REG_HALF; + } + break; + case NV40SR_NONE: + hw[0] |= (1 << 30); + break; + default: + assert(0); + } + + hw[0] |= (dst.index << NV40_FP_OP_OUT_REG_SHIFT); +} + +static void +nv40_fp_arith(struct nv40_fpc *fpc, int sat, int op, + struct nv40_sreg dst, int mask, + struct nv40_sreg s0, struct nv40_sreg s1, struct nv40_sreg s2) +{ + struct nv40_fragment_program *fp = fpc->fp; + uint32_t *hw; + + fpc->inst_offset = fp->insn_len; + fpc->have_const = 0; + grow_insns(fpc, 4); + hw = &fp->insn[fpc->inst_offset]; + memset(hw, 0, sizeof(uint32_t) * 4); + + if (op == NV40_FP_OP_OPCODE_KIL) + fp->fp_control |= NV40TCL_FP_CONTROL_KIL; + hw[0] |= (op << NV40_FP_OP_OPCODE_SHIFT); + hw[0] |= (mask << NV40_FP_OP_OUTMASK_SHIFT); + hw[2] |= (dst.dst_scale << NV40_FP_OP_DST_SCALE_SHIFT); + + if (sat) + hw[0] |= NV40_FP_OP_OUT_SAT; + + if (dst.cc_update) + hw[0] |= NV40_FP_OP_COND_WRITE_ENABLE; + hw[1] |= (dst.cc_test << NV40_FP_OP_COND_SHIFT); + hw[1] |= ((dst.cc_swz[0] << NV40_FP_OP_COND_SWZ_X_SHIFT) | + (dst.cc_swz[1] << NV40_FP_OP_COND_SWZ_Y_SHIFT) | + (dst.cc_swz[2] << NV40_FP_OP_COND_SWZ_Z_SHIFT) | + (dst.cc_swz[3] << NV40_FP_OP_COND_SWZ_W_SHIFT)); + + emit_dst(fpc, dst); + emit_src(fpc, 0, s0); + emit_src(fpc, 1, s1); + emit_src(fpc, 2, s2); +} + +static void +nv40_fp_tex(struct nv40_fpc *fpc, int sat, int op, int unit, + struct nv40_sreg dst, int mask, + struct nv40_sreg s0, struct nv40_sreg s1, struct nv40_sreg s2) +{ + struct nv40_fragment_program *fp = fpc->fp; + + nv40_fp_arith(fpc, sat, op, dst, mask, s0, s1, s2); + + fp->insn[fpc->inst_offset] |= (unit << NV40_FP_OP_TEX_UNIT_SHIFT); + fp->samplers |= (1 << unit); +} + +static INLINE struct nv40_sreg +tgsi_src(struct nv40_fpc *fpc, const struct tgsi_full_src_register *fsrc) +{ + struct nv40_sreg src; + + switch (fsrc->SrcRegister.File) { + case TGSI_FILE_INPUT: + src = nv40_sr(NV40SR_INPUT, + fpc->attrib_map[fsrc->SrcRegister.Index]); + break; + case TGSI_FILE_CONSTANT: + src = constant(fpc, fsrc->SrcRegister.Index, NULL); + break; + case TGSI_FILE_IMMEDIATE: + assert(fsrc->SrcRegister.Index < fpc->nr_imm); + src = fpc->imm[fsrc->SrcRegister.Index]; + break; + case TGSI_FILE_TEMPORARY: + src = fpc->r_temp[fsrc->SrcRegister.Index]; + break; + /* NV40 fragprog result regs are just temps, so this is simple */ + case TGSI_FILE_OUTPUT: + src = fpc->r_result[fsrc->SrcRegister.Index]; + break; + default: + NOUVEAU_ERR("bad src file\n"); + break; + } + + src.abs = fsrc->SrcRegisterExtMod.Absolute; + src.negate = fsrc->SrcRegister.Negate; + src.swz[0] = fsrc->SrcRegister.SwizzleX; + src.swz[1] = fsrc->SrcRegister.SwizzleY; + src.swz[2] = fsrc->SrcRegister.SwizzleZ; + src.swz[3] = fsrc->SrcRegister.SwizzleW; + return src; +} + +static INLINE struct nv40_sreg +tgsi_dst(struct nv40_fpc *fpc, const struct tgsi_full_dst_register *fdst) { + switch (fdst->DstRegister.File) { + case TGSI_FILE_OUTPUT: + return fpc->r_result[fdst->DstRegister.Index]; + case TGSI_FILE_TEMPORARY: + return fpc->r_temp[fdst->DstRegister.Index]; + case TGSI_FILE_NULL: + return nv40_sr(NV40SR_NONE, 0); + default: + NOUVEAU_ERR("bad dst file %d\n", fdst->DstRegister.File); + return nv40_sr(NV40SR_NONE, 0); + } +} + +static INLINE int +tgsi_mask(uint tgsi) +{ + int mask = 0; + + if (tgsi & TGSI_WRITEMASK_X) mask |= MASK_X; + if (tgsi & TGSI_WRITEMASK_Y) mask |= MASK_Y; + if (tgsi & TGSI_WRITEMASK_Z) mask |= MASK_Z; + if (tgsi & TGSI_WRITEMASK_W) mask |= 
MASK_W; + return mask; +} + +static boolean +src_native_swz(struct nv40_fpc *fpc, const struct tgsi_full_src_register *fsrc, + struct nv40_sreg *src) +{ + const struct nv40_sreg none = nv40_sr(NV40SR_NONE, 0); + struct nv40_sreg tgsi = tgsi_src(fpc, fsrc); + uint mask = 0, zero_mask = 0, one_mask = 0, neg_mask = 0; + uint neg[4] = { fsrc->SrcRegisterExtSwz.NegateX, + fsrc->SrcRegisterExtSwz.NegateY, + fsrc->SrcRegisterExtSwz.NegateZ, + fsrc->SrcRegisterExtSwz.NegateW }; + uint c; + + for (c = 0; c < 4; c++) { + switch (tgsi_util_get_full_src_register_extswizzle(fsrc, c)) { + case TGSI_EXTSWIZZLE_X: + case TGSI_EXTSWIZZLE_Y: + case TGSI_EXTSWIZZLE_Z: + case TGSI_EXTSWIZZLE_W: + mask |= (1 << c); + break; + case TGSI_EXTSWIZZLE_ZERO: + zero_mask |= (1 << c); + tgsi.swz[c] = SWZ_X; + break; + case TGSI_EXTSWIZZLE_ONE: + one_mask |= (1 << c); + tgsi.swz[c] = SWZ_X; + break; + default: + assert(0); + } + + if (!tgsi.negate && neg[c]) + neg_mask |= (1 << c); + } + + if (mask == MASK_ALL && !neg_mask) + return TRUE; + + *src = temp(fpc); + + if (mask) + arith(fpc, 0, MOV, *src, mask, tgsi, none, none); + + if (zero_mask) + arith(fpc, 0, SFL, *src, zero_mask, *src, none, none); + + if (one_mask) + arith(fpc, 0, STR, *src, one_mask, *src, none, none); + + if (neg_mask) { + struct nv40_sreg one = temp(fpc); + arith(fpc, 0, STR, one, neg_mask, one, none, none); + arith(fpc, 0, MUL, *src, neg_mask, *src, neg(one), none); + } + + return FALSE; +} + +static boolean +nv40_fragprog_parse_instruction(struct nv40_fpc *fpc, + const struct tgsi_full_instruction *finst) +{ + const struct nv40_sreg none = nv40_sr(NV40SR_NONE, 0); + struct nv40_sreg src[3], dst, tmp; + int mask, sat, unit; + int ai = -1, ci = -1, ii = -1; + int i; + + if (finst->Instruction.Opcode == TGSI_OPCODE_END) + return TRUE; + + for (i = 0; i < finst->Instruction.NumSrcRegs; i++) { + const struct tgsi_full_src_register *fsrc; + + fsrc = &finst->FullSrcRegisters[i]; + if (fsrc->SrcRegister.File == TGSI_FILE_TEMPORARY) { + src[i] = tgsi_src(fpc, fsrc); + } + } + + for (i = 0; i < finst->Instruction.NumSrcRegs; i++) { + const struct tgsi_full_src_register *fsrc; + + fsrc = &finst->FullSrcRegisters[i]; + + switch (fsrc->SrcRegister.File) { + case TGSI_FILE_INPUT: + case TGSI_FILE_CONSTANT: + case TGSI_FILE_TEMPORARY: + if (!src_native_swz(fpc, fsrc, &src[i])) + continue; + break; + default: + break; + } + + switch (fsrc->SrcRegister.File) { + case TGSI_FILE_INPUT: + if (ai == -1 || ai == fsrc->SrcRegister.Index) { + ai = fsrc->SrcRegister.Index; + src[i] = tgsi_src(fpc, fsrc); + } else { + src[i] = temp(fpc); + arith(fpc, 0, MOV, src[i], MASK_ALL, + tgsi_src(fpc, fsrc), none, none); + } + break; + case TGSI_FILE_CONSTANT: + if ((ci == -1 && ii == -1) || + ci == fsrc->SrcRegister.Index) { + ci = fsrc->SrcRegister.Index; + src[i] = tgsi_src(fpc, fsrc); + } else { + src[i] = temp(fpc); + arith(fpc, 0, MOV, src[i], MASK_ALL, + tgsi_src(fpc, fsrc), none, none); + } + break; + case TGSI_FILE_IMMEDIATE: + if ((ci == -1 && ii == -1) || + ii == fsrc->SrcRegister.Index) { + ii = fsrc->SrcRegister.Index; + src[i] = tgsi_src(fpc, fsrc); + } else { + src[i] = temp(fpc); + arith(fpc, 0, MOV, src[i], MASK_ALL, + tgsi_src(fpc, fsrc), none, none); + } + break; + case TGSI_FILE_TEMPORARY: + /* handled above */ + break; + case TGSI_FILE_SAMPLER: + unit = fsrc->SrcRegister.Index; + break; + case TGSI_FILE_OUTPUT: + break; + default: + NOUVEAU_ERR("bad src file\n"); + return FALSE; + } + } + + dst = tgsi_dst(fpc, &finst->FullDstRegisters[0]); + mask = 
tgsi_mask(finst->FullDstRegisters[0].DstRegister.WriteMask); + sat = (finst->Instruction.Saturate == TGSI_SAT_ZERO_ONE); + + switch (finst->Instruction.Opcode) { + case TGSI_OPCODE_ABS: + arith(fpc, sat, MOV, dst, mask, abs(src[0]), none, none); + break; + case TGSI_OPCODE_ADD: + arith(fpc, sat, ADD, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_CMP: + tmp = temp(fpc); + arith(fpc, sat, MOV, dst, mask, src[2], none, none); + tmp.cc_update = 1; + arith(fpc, 0, MOV, tmp, 0xf, src[0], none, none); + dst.cc_test = NV40_VP_INST_COND_LT; + arith(fpc, sat, MOV, dst, mask, src[1], none, none); + break; + case TGSI_OPCODE_COS: + arith(fpc, sat, COS, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_DDX: + if (mask & (MASK_Z | MASK_W)) { + tmp = temp(fpc); + arith(fpc, sat, DDX, tmp, MASK_X | MASK_Y, + swz(src[0], Z, W, Z, W), none, none); + arith(fpc, 0, MOV, tmp, MASK_Z | MASK_W, + swz(tmp, X, Y, X, Y), none, none); + arith(fpc, sat, DDX, tmp, MASK_X | MASK_Y, src[0], + none, none); + arith(fpc, 0, MOV, dst, mask, tmp, none, none); + } else { + arith(fpc, sat, DDX, dst, mask, src[0], none, none); + } + break; + case TGSI_OPCODE_DDY: + if (mask & (MASK_Z | MASK_W)) { + tmp = temp(fpc); + arith(fpc, sat, DDY, tmp, MASK_X | MASK_Y, + swz(src[0], Z, W, Z, W), none, none); + arith(fpc, 0, MOV, tmp, MASK_Z | MASK_W, + swz(tmp, X, Y, X, Y), none, none); + arith(fpc, sat, DDY, tmp, MASK_X | MASK_Y, src[0], + none, none); + arith(fpc, 0, MOV, dst, mask, tmp, none, none); + } else { + arith(fpc, sat, DDY, dst, mask, src[0], none, none); + } + break; + case TGSI_OPCODE_DP3: + arith(fpc, sat, DP3, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_DP4: + arith(fpc, sat, DP4, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_DPH: + tmp = temp(fpc); + arith(fpc, 0, DP3, tmp, MASK_X, src[0], src[1], none); + arith(fpc, sat, ADD, dst, mask, swz(tmp, X, X, X, X), + swz(src[1], W, W, W, W), none); + break; + case TGSI_OPCODE_DST: + arith(fpc, sat, DST, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_EX2: + arith(fpc, sat, EX2, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_FLR: + arith(fpc, sat, FLR, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_FRC: + arith(fpc, sat, FRC, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_KILP: + arith(fpc, 0, KIL, none, 0, none, none, none); + break; + case TGSI_OPCODE_KIL: + dst = nv40_sr(NV40SR_NONE, 0); + dst.cc_update = 1; + arith(fpc, 0, MOV, dst, MASK_ALL, src[0], none, none); + dst.cc_update = 0; dst.cc_test = NV40_FP_OP_COND_LT; + arith(fpc, 0, KIL, dst, 0, none, none, none); + break; + case TGSI_OPCODE_LG2: + arith(fpc, sat, LG2, dst, mask, src[0], none, none); + break; +// case TGSI_OPCODE_LIT: + case TGSI_OPCODE_LRP: + tmp = temp(fpc); + arith(fpc, 0, MAD, tmp, mask, neg(src[0]), src[2], src[2]); + arith(fpc, sat, MAD, dst, mask, src[0], src[1], tmp); + break; + case TGSI_OPCODE_MAD: + arith(fpc, sat, MAD, dst, mask, src[0], src[1], src[2]); + break; + case TGSI_OPCODE_MAX: + arith(fpc, sat, MAX, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_MIN: + arith(fpc, sat, MIN, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_MOV: + arith(fpc, sat, MOV, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_MUL: + arith(fpc, sat, MUL, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_NOISE1: + case TGSI_OPCODE_NOISE2: + case TGSI_OPCODE_NOISE3: + case TGSI_OPCODE_NOISE4: + arith(fpc, sat, SFL, dst, mask, none, none, none); + break; + 
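A note on the LRP lowering earlier in this switch: lrp(a, b, c) = a*b + (1 - a)*c is expressed with two MADs and no literal 1.0, since tmp = -a*c + c equals (1 - a)*c. A scalar reference check, purely illustrative and not driver code:

/* Scalar reference for the two-MAD LRP lowering used above. */
static float
lrp_reference(float a, float b, float c)
{
        return a * b + (1.0f - a) * c;
}

static float
lrp_two_mad(float a, float b, float c)
{
        float tmp = -a * c + c;   /* MAD tmp, -src0, src2, src2 => (1-a)*c */
        return a * b + tmp;       /* MAD dst,  src0, src1, tmp            */
}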
case TGSI_OPCODE_POW: + tmp = temp(fpc); + arith(fpc, 0, LG2, tmp, MASK_X, + swz(src[0], X, X, X, X), none, none); + arith(fpc, 0, MUL, tmp, MASK_X, swz(tmp, X, X, X, X), + swz(src[1], X, X, X, X), none); + arith(fpc, sat, EX2, dst, mask, + swz(tmp, X, X, X, X), none, none); + break; + case TGSI_OPCODE_RCP: + arith(fpc, sat, RCP, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_RET: + assert(0); + break; + case TGSI_OPCODE_RFL: + tmp = temp(fpc); + arith(fpc, 0, DP3, tmp, MASK_X, src[0], src[0], none); + arith(fpc, 0, DP3, tmp, MASK_Y, src[0], src[1], none); + arith(fpc, 0, DIV, scale(tmp, 2X), MASK_Z, + swz(tmp, Y, Y, Y, Y), swz(tmp, X, X, X, X), none); + arith(fpc, sat, MAD, dst, mask, + swz(tmp, Z, Z, Z, Z), src[0], neg(src[1])); + break; + case TGSI_OPCODE_RSQ: + tmp = temp(fpc); + arith(fpc, 0, LG2, scale(tmp, INV_2X), MASK_X, + abs(swz(src[0], X, X, X, X)), none, none); + arith(fpc, sat, EX2, dst, mask, + neg(swz(tmp, X, X, X, X)), none, none); + break; + case TGSI_OPCODE_SCS: + if (mask & MASK_X) { + arith(fpc, sat, COS, dst, MASK_X, + swz(src[0], X, X, X, X), none, none); + } + if (mask & MASK_Y) { + arith(fpc, sat, SIN, dst, MASK_Y, + swz(src[0], X, X, X, X), none, none); + } + break; + case TGSI_OPCODE_SEQ: + arith(fpc, sat, SEQ, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_SFL: + arith(fpc, sat, SFL, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_SGE: + arith(fpc, sat, SGE, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_SGT: + arith(fpc, sat, SGT, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_SIN: + arith(fpc, sat, SIN, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_SLE: + arith(fpc, sat, SLE, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_SLT: + arith(fpc, sat, SLT, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_SNE: + arith(fpc, sat, SNE, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_STR: + arith(fpc, sat, STR, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_SUB: + arith(fpc, sat, ADD, dst, mask, src[0], neg(src[1]), none); + break; + case TGSI_OPCODE_TEX: + tex(fpc, sat, TEX, unit, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_TXB: + tex(fpc, sat, TXB, unit, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_TXP: + tex(fpc, sat, TXP, unit, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_XPD: + tmp = temp(fpc); + arith(fpc, 0, MUL, tmp, mask, + swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none); + arith(fpc, sat, MAD, dst, (mask & ~MASK_W), + swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y), + neg(tmp)); + break; + default: + NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode); + return FALSE; + } + + release_temps(fpc); + return TRUE; +} + +static boolean +nv40_fragprog_parse_decl_attrib(struct nv40_fpc *fpc, + const struct tgsi_full_declaration *fdec) +{ + int hw; + + switch (fdec->Semantic.SemanticName) { + case TGSI_SEMANTIC_POSITION: + hw = NV40_FP_OP_INPUT_SRC_POSITION; + break; + case TGSI_SEMANTIC_COLOR: + if (fdec->Semantic.SemanticIndex == 0) { + hw = NV40_FP_OP_INPUT_SRC_COL0; + } else + if (fdec->Semantic.SemanticIndex == 1) { + hw = NV40_FP_OP_INPUT_SRC_COL1; + } else { + NOUVEAU_ERR("bad colour semantic index\n"); + return FALSE; + } + break; + case TGSI_SEMANTIC_FOG: + hw = NV40_FP_OP_INPUT_SRC_FOGC; + break; + case TGSI_SEMANTIC_GENERIC: + if (fdec->Semantic.SemanticIndex <= 7) { + hw = NV40_FP_OP_INPUT_SRC_TC(fdec->Semantic. 
+ SemanticIndex); + } else { + NOUVEAU_ERR("bad generic semantic index\n"); + return FALSE; + } + break; + default: + NOUVEAU_ERR("bad input semantic\n"); + return FALSE; + } + + fpc->attrib_map[fdec->DeclarationRange.First] = hw; + return TRUE; +} + +static boolean +nv40_fragprog_parse_decl_output(struct nv40_fpc *fpc, + const struct tgsi_full_declaration *fdec) +{ + unsigned idx = fdec->DeclarationRange.First; + unsigned hw; + + switch (fdec->Semantic.SemanticName) { + case TGSI_SEMANTIC_POSITION: + hw = 1; + break; + case TGSI_SEMANTIC_COLOR: + switch (fdec->Semantic.SemanticIndex) { + case 0: hw = 0; break; + case 1: hw = 2; break; + case 2: hw = 3; break; + case 3: hw = 4; break; + default: + NOUVEAU_ERR("bad rcol index\n"); + return FALSE; + } + break; + default: + NOUVEAU_ERR("bad output semantic\n"); + return FALSE; + } + + fpc->r_result[idx] = nv40_sr(NV40SR_OUTPUT, hw); + fpc->r_temps |= (1 << hw); + return TRUE; +} + +static boolean +nv40_fragprog_prepare(struct nv40_fpc *fpc) +{ + struct tgsi_parse_context p; + int high_temp = -1, i; + + tgsi_parse_init(&p, fpc->fp->pipe.tokens); + while (!tgsi_parse_end_of_tokens(&p)) { + const union tgsi_full_token *tok = &p.FullToken; + + tgsi_parse_token(&p); + switch(tok->Token.Type) { + case TGSI_TOKEN_TYPE_DECLARATION: + { + const struct tgsi_full_declaration *fdec; + fdec = &p.FullToken.FullDeclaration; + switch (fdec->Declaration.File) { + case TGSI_FILE_INPUT: + if (!nv40_fragprog_parse_decl_attrib(fpc, fdec)) + goto out_err; + break; + case TGSI_FILE_OUTPUT: + if (!nv40_fragprog_parse_decl_output(fpc, fdec)) + goto out_err; + break; + case TGSI_FILE_TEMPORARY: + if (fdec->DeclarationRange.Last > high_temp) { + high_temp = + fdec->DeclarationRange.Last; + } + break; + default: + break; + } + } + break; + case TGSI_TOKEN_TYPE_IMMEDIATE: + { + struct tgsi_full_immediate *imm; + float vals[4]; + + imm = &p.FullToken.FullImmediate; + assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32); + assert(fpc->nr_imm < MAX_IMM); + + vals[0] = imm->u.ImmediateFloat32[0].Float; + vals[1] = imm->u.ImmediateFloat32[1].Float; + vals[2] = imm->u.ImmediateFloat32[2].Float; + vals[3] = imm->u.ImmediateFloat32[3].Float; + fpc->imm[fpc->nr_imm++] = constant(fpc, -1, vals); + } + break; + default: + break; + } + } + tgsi_parse_free(&p); + + if (++high_temp) { + fpc->r_temp = CALLOC(high_temp, sizeof(struct nv40_sreg)); + for (i = 0; i < high_temp; i++) + fpc->r_temp[i] = temp(fpc); + fpc->r_temps_discard = 0; + } + + return TRUE; + +out_err: + if (fpc->r_temp) + FREE(fpc->r_temp); + tgsi_parse_free(&p); + return FALSE; +} + +static void +nv40_fragprog_translate(struct nv40_context *nv40, + struct nv40_fragment_program *fp) +{ + struct tgsi_parse_context parse; + struct nv40_fpc *fpc = NULL; + + fpc = CALLOC(1, sizeof(struct nv40_fpc)); + if (!fpc) + return; + fpc->fp = fp; + fpc->num_regs = 2; + + if (!nv40_fragprog_prepare(fpc)) { + FREE(fpc); + return; + } + + tgsi_parse_init(&parse, fp->pipe.tokens); + + while (!tgsi_parse_end_of_tokens(&parse)) { + tgsi_parse_token(&parse); + + switch (parse.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_INSTRUCTION: + { + const struct tgsi_full_instruction *finst; + + finst = &parse.FullToken.FullInstruction; + if (!nv40_fragprog_parse_instruction(fpc, finst)) + goto out_err; + } + break; + default: + break; + } + } + + fp->fp_control |= fpc->num_regs << NV40TCL_FP_CONTROL_TEMP_COUNT_SHIFT; + + /* Terminate final instruction */ + fp->insn[fpc->inst_offset] |= 0x00000001; + + /* Append NOP + END instruction, may or may not 
be necessary. */ + fpc->inst_offset = fp->insn_len; + grow_insns(fpc, 4); + fp->insn[fpc->inst_offset + 0] = 0x00000001; + fp->insn[fpc->inst_offset + 1] = 0x00000000; + fp->insn[fpc->inst_offset + 2] = 0x00000000; + fp->insn[fpc->inst_offset + 3] = 0x00000000; + + fp->translated = TRUE; +out_err: + tgsi_parse_free(&parse); + if (fpc->r_temp) + FREE(fpc->r_temp); + FREE(fpc); +} + +static void +nv40_fragprog_upload(struct nv40_context *nv40, + struct nv40_fragment_program *fp) +{ + struct pipe_winsys *ws = nv40->pipe.winsys; + const uint32_t le = 1; + uint32_t *map; + int i; + + map = ws->buffer_map(ws, fp->buffer, PIPE_BUFFER_USAGE_CPU_WRITE); + +#if 0 + for (i = 0; i < fp->insn_len; i++) { + fflush(stdout); fflush(stderr); + NOUVEAU_ERR("%d 0x%08x\n", i, fp->insn[i]); + fflush(stdout); fflush(stderr); + } +#endif + + if ((*(const uint8_t *)&le)) { + for (i = 0; i < fp->insn_len; i++) { + map[i] = fp->insn[i]; + } + } else { + /* Weird swapping for big-endian chips */ + for (i = 0; i < fp->insn_len; i++) { + map[i] = ((fp->insn[i] & 0xffff) << 16) | + ((fp->insn[i] >> 16) & 0xffff); + } + } + + ws->buffer_unmap(ws, fp->buffer); +} + +static boolean +nv40_fragprog_validate(struct nv40_context *nv40) +{ + struct nv40_fragment_program *fp = nv40->fragprog; + struct pipe_buffer *constbuf = + nv40->constbuf[PIPE_SHADER_FRAGMENT]; + struct pipe_winsys *ws = nv40->pipe.winsys; + struct nouveau_stateobj *so; + boolean new_consts = FALSE; + int i; + + if (fp->translated) + goto update_constants; + + nv40->fallback_swrast &= ~NV40_NEW_FRAGPROG; + nv40_fragprog_translate(nv40, fp); + if (!fp->translated) { + nv40->fallback_swrast |= NV40_NEW_FRAGPROG; + return FALSE; + } + + fp->buffer = ws->buffer_create(ws, 0x100, 0, fp->insn_len * 4); + nv40_fragprog_upload(nv40, fp); + + so = so_new(4, 1); + so_method(so, nv40->screen->curie, NV40TCL_FP_ADDRESS, 1); + so_reloc (so, fp->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | + NOUVEAU_BO_RD | NOUVEAU_BO_LOW | NOUVEAU_BO_OR, + NV40TCL_FP_ADDRESS_DMA0, NV40TCL_FP_ADDRESS_DMA1); + so_method(so, nv40->screen->curie, NV40TCL_FP_CONTROL, 1); + so_data (so, fp->fp_control); + so_ref(so, &fp->so); + +update_constants: + if (fp->nr_consts) { + float *map; + + map = ws->buffer_map(ws, constbuf, PIPE_BUFFER_USAGE_CPU_READ); + for (i = 0; i < fp->nr_consts; i++) { + struct nv40_fragment_program_data *fpd = &fp->consts[i]; + uint32_t *p = &fp->insn[fpd->offset]; + uint32_t *cb = (uint32_t *)&map[fpd->index * 4]; + + if (!memcmp(p, cb, 4 * sizeof(float))) + continue; + memcpy(p, cb, 4 * sizeof(float)); + new_consts = TRUE; + } + ws->buffer_unmap(ws, constbuf); + + if (new_consts) + nv40_fragprog_upload(nv40, fp); + } + + if (new_consts || fp->so != nv40->state.hw[NV40_STATE_FRAGPROG]) { + so_ref(fp->so, &nv40->state.hw[NV40_STATE_FRAGPROG]); + return TRUE; + } + + return FALSE; +} + +void +nv40_fragprog_destroy(struct nv40_context *nv40, + struct nv40_fragment_program *fp) +{ + if (fp->insn_len) + FREE(fp->insn); +} + +struct nv40_state_entry nv40_state_fragprog = { + .validate = nv40_fragprog_validate, + .dirty = { + .pipe = NV40_NEW_FRAGPROG, + .hw = NV40_STATE_FRAGPROG + } +}; + diff --git a/src/gallium/drivers/nv40/nv40_fragtex.c b/src/gallium/drivers/nv40/nv40_fragtex.c new file mode 100644 index 0000000000..0227d22620 --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_fragtex.c @@ -0,0 +1,168 @@ +#include "nv40_context.h" + +#define _(m,tf,ts0x,ts0y,ts0z,ts0w,ts1x,ts1y,ts1z,ts1w,sx,sy,sz,sw) \ +{ \ + TRUE, \ + PIPE_FORMAT_##m, \ + NV40TCL_TEX_FORMAT_FORMAT_##tf, \ 
+ (NV40TCL_TEX_SWIZZLE_S0_X_##ts0x | NV40TCL_TEX_SWIZZLE_S0_Y_##ts0y | \ + NV40TCL_TEX_SWIZZLE_S0_Z_##ts0z | NV40TCL_TEX_SWIZZLE_S0_W_##ts0w | \ + NV40TCL_TEX_SWIZZLE_S1_X_##ts1x | NV40TCL_TEX_SWIZZLE_S1_Y_##ts1y | \ + NV40TCL_TEX_SWIZZLE_S1_Z_##ts1z | NV40TCL_TEX_SWIZZLE_S1_W_##ts1w), \ + ((NV40TCL_TEX_FILTER_SIGNED_RED*sx) | (NV40TCL_TEX_FILTER_SIGNED_GREEN*sy) | \ + (NV40TCL_TEX_FILTER_SIGNED_BLUE*sz) | (NV40TCL_TEX_FILTER_SIGNED_ALPHA*sw)) \ +} + +struct nv40_texture_format { + boolean defined; + uint pipe; + int format; + int swizzle; + int sign; +}; + +static struct nv40_texture_format +nv40_texture_formats[] = { + _(A8R8G8B8_UNORM, A8R8G8B8, S1, S1, S1, S1, X, Y, Z, W, 0, 0, 0, 0), + _(A1R5G5B5_UNORM, A1R5G5B5, S1, S1, S1, S1, X, Y, Z, W, 0, 0, 0, 0), + _(A4R4G4B4_UNORM, A4R4G4B4, S1, S1, S1, S1, X, Y, Z, W, 0, 0, 0, 0), + _(R5G6B5_UNORM , R5G6B5 , S1, S1, S1, ONE, X, Y, Z, W, 0, 0, 0, 0), + _(L8_UNORM , L8 , S1, S1, S1, ONE, X, X, X, X, 0, 0, 0, 0), + _(A8_UNORM , L8 , ZERO, ZERO, ZERO, S1, X, X, X, X, 0, 0, 0, 0), + _(R16_SNORM , A16 , ZERO, ZERO, S1, ONE, X, X, X, Y, 1, 1, 1, 1), + _(I8_UNORM , L8 , S1, S1, S1, S1, X, X, X, X, 0, 0, 0, 0), + _(A8L8_UNORM , A8L8 , S1, S1, S1, S1, X, X, X, Y, 0, 0, 0, 0), + _(Z16_UNORM , Z16 , S1, S1, S1, ONE, X, X, X, X, 0, 0, 0, 0), + _(Z24S8_UNORM , Z24 , S1, S1, S1, ONE, X, X, X, X, 0, 0, 0, 0), + _(DXT1_RGB , DXT1 , S1, S1, S1, ONE, X, Y, Z, W, 0, 0, 0, 0), + _(DXT1_RGBA , DXT1 , S1, S1, S1, S1, X, Y, Z, W, 0, 0, 0, 0), + _(DXT3_RGBA , DXT3 , S1, S1, S1, S1, X, Y, Z, W, 0, 0, 0, 0), + _(DXT5_RGBA , DXT5 , S1, S1, S1, S1, X, Y, Z, W, 0, 0, 0, 0), + {}, +}; + +static struct nv40_texture_format * +nv40_fragtex_format(uint pipe_format) +{ + struct nv40_texture_format *tf = nv40_texture_formats; + + while (tf->defined) { + if (tf->pipe == pipe_format) + return tf; + tf++; + } + + NOUVEAU_ERR("unknown texture format %s\n", pf_name(pipe_format)); + return NULL; +} + + +static struct nouveau_stateobj * +nv40_fragtex_build(struct nv40_context *nv40, int unit) +{ + struct nv40_sampler_state *ps = nv40->tex_sampler[unit]; + struct nv40_miptree *nv40mt = nv40->tex_miptree[unit]; + struct pipe_texture *pt = &nv40mt->base; + struct nv40_texture_format *tf; + struct nouveau_stateobj *so; + uint32_t txf, txs, txp; + unsigned tex_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD; + + tf = nv40_fragtex_format(pt->format); + if (!tf) + assert(0); + + txf = ps->fmt; + txf |= tf->format | 0x8000; + txf |= ((pt->last_level + 1) << NV40TCL_TEX_FORMAT_MIPMAP_COUNT_SHIFT); + + if (1) /* XXX */ + txf |= NV40TCL_TEX_FORMAT_NO_BORDER; + + switch (pt->target) { + case PIPE_TEXTURE_CUBE: + txf |= NV40TCL_TEX_FORMAT_CUBIC; + /* fall-through */ + case PIPE_TEXTURE_2D: + txf |= NV40TCL_TEX_FORMAT_DIMS_2D; + break; + case PIPE_TEXTURE_3D: + txf |= NV40TCL_TEX_FORMAT_DIMS_3D; + break; + case PIPE_TEXTURE_1D: + txf |= NV40TCL_TEX_FORMAT_DIMS_1D; + break; + default: + NOUVEAU_ERR("Unknown target %d\n", pt->target); + return NULL; + } + + if (!(pt->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) { + txp = 0; + } else { + txp = nv40mt->level[0].pitch; + txf |= NV40TCL_TEX_FORMAT_LINEAR; + } + + txs = tf->swizzle; + + so = so_new(16, 2); + so_method(so, nv40->screen->curie, NV40TCL_TEX_OFFSET(unit), 8); + so_reloc (so, nv40mt->buffer, 0, tex_flags | NOUVEAU_BO_LOW, 0, 0); + so_reloc (so, nv40mt->buffer, txf, tex_flags | NOUVEAU_BO_OR, + NV40TCL_TEX_FORMAT_DMA0, NV40TCL_TEX_FORMAT_DMA1); + so_data (so, ps->wrap); + so_data (so, NV40TCL_TEX_ENABLE_ENABLE | ps->en); + so_data (so, 
txs); + so_data (so, ps->filt | tf->sign | 0x2000 /*voodoo*/); + so_data (so, (pt->width[0] << NV40TCL_TEX_SIZE0_W_SHIFT) | + pt->height[0]); + so_data (so, ps->bcol); + so_method(so, nv40->screen->curie, NV40TCL_TEX_SIZE1(unit), 1); + so_data (so, (pt->depth[0] << NV40TCL_TEX_SIZE1_DEPTH_SHIFT) | txp); + + return so; +} + +static boolean +nv40_fragtex_validate(struct nv40_context *nv40) +{ + struct nv40_fragment_program *fp = nv40->fragprog; + struct nv40_state *state = &nv40->state; + struct nouveau_stateobj *so; + unsigned samplers, unit; + + samplers = state->fp_samplers & ~fp->samplers; + while (samplers) { + unit = ffs(samplers) - 1; + samplers &= ~(1 << unit); + + so = so_new(2, 0); + so_method(so, nv40->screen->curie, NV40TCL_TEX_ENABLE(unit), 1); + so_data (so, 0); + so_ref(so, &nv40->state.hw[NV40_STATE_FRAGTEX0 + unit]); + state->dirty |= (1ULL << (NV40_STATE_FRAGTEX0 + unit)); + } + + samplers = nv40->dirty_samplers & fp->samplers; + while (samplers) { + unit = ffs(samplers) - 1; + samplers &= ~(1 << unit); + + so = nv40_fragtex_build(nv40, unit); + so_ref(so, &nv40->state.hw[NV40_STATE_FRAGTEX0 + unit]); + state->dirty |= (1ULL << (NV40_STATE_FRAGTEX0 + unit)); + } + + nv40->state.fp_samplers = fp->samplers; + return FALSE; +} + +struct nv40_state_entry nv40_state_fragtex = { + .validate = nv40_fragtex_validate, + .dirty = { + .pipe = NV40_NEW_SAMPLER | NV40_NEW_FRAGPROG, + .hw = 0 + } +}; + diff --git a/src/gallium/drivers/nv40/nv40_miptree.c b/src/gallium/drivers/nv40/nv40_miptree.c new file mode 100644 index 0000000000..e38b1e7f5c --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_miptree.c @@ -0,0 +1,238 @@ +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" + +#include "nv40_context.h" + +static void +nv40_miptree_layout(struct nv40_miptree *mt) +{ + struct pipe_texture *pt = &mt->base; + uint width = pt->width[0], height = pt->height[0], depth = pt->depth[0]; + uint offset = 0; + int nr_faces, l, f; + uint wide_pitch = pt->tex_usage & (PIPE_TEXTURE_USAGE_SAMPLER | + PIPE_TEXTURE_USAGE_DEPTH_STENCIL | + PIPE_TEXTURE_USAGE_RENDER_TARGET | + PIPE_TEXTURE_USAGE_DISPLAY_TARGET | + PIPE_TEXTURE_USAGE_PRIMARY); + + if (pt->target == PIPE_TEXTURE_CUBE) { + nr_faces = 6; + } else + if (pt->target == PIPE_TEXTURE_3D) { + nr_faces = pt->depth[0]; + } else { + nr_faces = 1; + } + + for (l = 0; l <= pt->last_level; l++) { + pt->width[l] = width; + pt->height[l] = height; + pt->depth[l] = depth; + pt->nblocksx[l] = pf_get_nblocksx(&pt->block, width); + pt->nblocksy[l] = pf_get_nblocksy(&pt->block, height); + + if (wide_pitch && (pt->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) + mt->level[l].pitch = align(pt->width[0] * pt->block.size, 64); + else + mt->level[l].pitch = pt->width[l] * pt->block.size; + + mt->level[l].image_offset = + CALLOC(nr_faces, sizeof(unsigned)); + + width = MAX2(1, width >> 1); + height = MAX2(1, height >> 1); + depth = MAX2(1, depth >> 1); + } + + for (f = 0; f < nr_faces; f++) { + for (l = 0; l < pt->last_level; l++) { + mt->level[l].image_offset[f] = offset; + + if (!(pt->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR) && + pt->width[l + 1] > 1 && pt->height[l + 1] > 1) + offset += align(mt->level[l].pitch * pt->height[l], 64); + else + offset += mt->level[l].pitch * pt->height[l]; + } + + mt->level[l].image_offset[f] = offset; + offset += mt->level[l].pitch * pt->height[l]; + } + + mt->total_size = offset; +} + +static struct pipe_texture * +nv40_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *pt) +{ + struct 
pipe_winsys *ws = pscreen->winsys; + struct nv40_miptree *mt; + unsigned buf_usage = PIPE_BUFFER_USAGE_PIXEL | + NOUVEAU_BUFFER_USAGE_TEXTURE; + + mt = MALLOC(sizeof(struct nv40_miptree)); + if (!mt) + return NULL; + mt->base = *pt; + mt->base.refcount = 1; + mt->base.screen = pscreen; + mt->shadow_tex = NULL; + mt->shadow_surface = NULL; + + /* Swizzled textures must be POT */ + if (pt->width[0] & (pt->width[0] - 1) || + pt->height[0] & (pt->height[0] - 1)) + mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR; + else + if (pt->tex_usage & (PIPE_TEXTURE_USAGE_PRIMARY | + PIPE_TEXTURE_USAGE_DISPLAY_TARGET | + PIPE_TEXTURE_USAGE_DEPTH_STENCIL)) + mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR; + else + if (pt->tex_usage & PIPE_TEXTURE_USAGE_DYNAMIC) + mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR; + else { + switch (pt->format) { + /* TODO: Figure out which formats can be swizzled */ + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_X8R8G8B8_UNORM: + case PIPE_FORMAT_R16_SNORM: + { + if (debug_get_bool_option("NOUVEAU_NO_SWIZZLE", FALSE)) + mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR; + break; + } + default: + mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR; + } + } + + if (pt->tex_usage & PIPE_TEXTURE_USAGE_DYNAMIC) + buf_usage |= PIPE_BUFFER_USAGE_CPU_READ_WRITE; + + nv40_miptree_layout(mt); + + mt->buffer = ws->buffer_create(ws, 256, buf_usage, mt->total_size); + if (!mt->buffer) { + FREE(mt); + return NULL; + } + + return &mt->base; +} + +static struct pipe_texture * +nv40_miptree_blanket(struct pipe_screen *pscreen, const struct pipe_texture *pt, + const unsigned *stride, struct pipe_buffer *pb) +{ + struct nv40_miptree *mt; + + /* Only supports 2D, non-mipmapped textures for the moment */ + if (pt->target != PIPE_TEXTURE_2D || pt->last_level != 0 || + pt->depth[0] != 1) + return NULL; + + mt = CALLOC_STRUCT(nv40_miptree); + if (!mt) + return NULL; + + mt->base = *pt; + mt->base.refcount = 1; + mt->base.screen = pscreen; + mt->level[0].pitch = stride[0]; + mt->level[0].image_offset = CALLOC(1, sizeof(unsigned)); + + pipe_buffer_reference(pscreen, &mt->buffer, pb); + return &mt->base; +} + +static void +nv40_miptree_release(struct pipe_screen *pscreen, struct pipe_texture **ppt) +{ + struct pipe_texture *pt = *ppt; + struct nv40_miptree *mt = (struct nv40_miptree *)pt; + int l; + + *ppt = NULL; + if (--pt->refcount) + return; + + pipe_buffer_reference(pscreen, &mt->buffer, NULL); + for (l = 0; l <= pt->last_level; l++) { + if (mt->level[l].image_offset) + FREE(mt->level[l].image_offset); + } + + if (mt->shadow_tex) { + if (mt->shadow_surface) + pscreen->tex_surface_release(pscreen, &mt->shadow_surface); + nv40_miptree_release(pscreen, &mt->shadow_tex); + } + + FREE(mt); +} + +static struct pipe_surface * +nv40_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_texture *pt, + unsigned face, unsigned level, unsigned zslice, + unsigned flags) +{ + struct nv40_miptree *mt = (struct nv40_miptree *)pt; + struct pipe_surface *ps; + + ps = CALLOC_STRUCT(pipe_surface); + if (!ps) + return NULL; + pipe_texture_reference(&ps->texture, pt); + ps->format = pt->format; + ps->width = pt->width[level]; + ps->height = pt->height[level]; + ps->block = pt->block; + ps->nblocksx = pt->nblocksx[level]; + ps->nblocksy = pt->nblocksy[level]; + ps->stride = mt->level[level].pitch; + ps->usage = flags; + ps->status = PIPE_SURFACE_STATUS_DEFINED; + ps->refcount = 1; + ps->face = face; + ps->level = level; + ps->zslice = zslice; + + if (pt->target == PIPE_TEXTURE_CUBE) { + 
ps->offset = mt->level[level].image_offset[face]; + } else + if (pt->target == PIPE_TEXTURE_3D) { + ps->offset = mt->level[level].image_offset[zslice]; + } else { + ps->offset = mt->level[level].image_offset[0]; + } + + return ps; +} + +static void +nv40_miptree_surface_del(struct pipe_screen *pscreen, + struct pipe_surface **psurface) +{ + struct pipe_surface *ps = *psurface; + + *psurface = NULL; + if (--ps->refcount > 0) + return; + + pipe_texture_reference(&ps->texture, NULL); + FREE(ps); +} + +void +nv40_screen_init_miptree_functions(struct pipe_screen *pscreen) +{ + pscreen->texture_create = nv40_miptree_create; + pscreen->texture_blanket = nv40_miptree_blanket; + pscreen->texture_release = nv40_miptree_release; + pscreen->get_tex_surface = nv40_miptree_surface_new; + pscreen->tex_surface_release = nv40_miptree_surface_del; +} + diff --git a/src/gallium/drivers/nv40/nv40_query.c b/src/gallium/drivers/nv40/nv40_query.c new file mode 100644 index 0000000000..9b9a43f49d --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_query.c @@ -0,0 +1,122 @@ +#include "pipe/p_context.h" + +#include "nv40_context.h" + +struct nv40_query { + struct nouveau_resource *object; + unsigned type; + boolean ready; + uint64_t result; +}; + +static INLINE struct nv40_query * +nv40_query(struct pipe_query *pipe) +{ + return (struct nv40_query *)pipe; +} + +static struct pipe_query * +nv40_query_create(struct pipe_context *pipe, unsigned query_type) +{ + struct nv40_query *q; + + q = CALLOC(1, sizeof(struct nv40_query)); + q->type = query_type; + + return (struct pipe_query *)q; +} + +static void +nv40_query_destroy(struct pipe_context *pipe, struct pipe_query *pq) +{ + struct nv40_context *nv40 = nv40_context(pipe); + struct nv40_query *q = nv40_query(pq); + + if (q->object) + nv40->nvws->res_free(&q->object); + FREE(q); +} + +static void +nv40_query_begin(struct pipe_context *pipe, struct pipe_query *pq) +{ + struct nv40_context *nv40 = nv40_context(pipe); + struct nv40_query *q = nv40_query(pq); + + assert(q->type == PIPE_QUERY_OCCLUSION_COUNTER); + + /* Happens when end_query() is called, then another begin_query() + * without querying the result in-between. For now we'll wait for + * the existing query to notify completion, but it could be better. 
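For context, the occlusion-query hooks installed at the end of nv40_query.c are driven from the state tracker through the standard pipe_context entry points. A minimal usage sketch, not part of this patch; the draw in the middle is a placeholder and error handling is omitted:

        struct pipe_query *q;
        uint64_t samples;

        q = pipe->create_query(pipe, PIPE_QUERY_OCCLUSION_COUNTER);
        pipe->begin_query(pipe, q);
        /* ... issue the draws whose passing samples should be counted ... */
        pipe->end_query(pipe, q);
        if (pipe->get_query_result(pipe, q, TRUE, &samples)) {
                /* samples now holds the occlusion counter value */
        }
        pipe->destroy_query(pipe, q);

Passing TRUE as the wait argument ends up in the notifier_wait() path of nv40_query_result().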
+ */ + if (q->object) { + uint64_t tmp; + pipe->get_query_result(pipe, pq, 1, &tmp); + } + + if (nv40->nvws->res_alloc(nv40->screen->query_heap, 1, NULL, &q->object)) + assert(0); + nv40->nvws->notifier_reset(nv40->screen->query, q->object->start); + + BEGIN_RING(curie, NV40TCL_QUERY_RESET, 1); + OUT_RING (1); + BEGIN_RING(curie, NV40TCL_QUERY_UNK17CC, 1); + OUT_RING (1); + + q->ready = FALSE; +} + +static void +nv40_query_end(struct pipe_context *pipe, struct pipe_query *pq) +{ + struct nv40_context *nv40 = nv40_context(pipe); + struct nv40_query *q = nv40_query(pq); + + BEGIN_RING(curie, NV40TCL_QUERY_GET, 1); + OUT_RING ((0x01 << NV40TCL_QUERY_GET_UNK24_SHIFT) | + ((q->object->start * 32) << NV40TCL_QUERY_GET_OFFSET_SHIFT)); + FIRE_RING(NULL); +} + +static boolean +nv40_query_result(struct pipe_context *pipe, struct pipe_query *pq, + boolean wait, uint64_t *result) +{ + struct nv40_context *nv40 = nv40_context(pipe); + struct nv40_query *q = nv40_query(pq); + struct nouveau_winsys *nvws = nv40->nvws; + + assert(q->object && q->type == PIPE_QUERY_OCCLUSION_COUNTER); + + if (!q->ready) { + unsigned status; + + status = nvws->notifier_status(nv40->screen->query, + q->object->start); + if (status != NV_NOTIFY_STATE_STATUS_COMPLETED) { + if (wait == FALSE) + return FALSE; + nvws->notifier_wait(nv40->screen->query, q->object->start, + NV_NOTIFY_STATE_STATUS_COMPLETED, + 0); + } + + q->result = nvws->notifier_retval(nv40->screen->query, + q->object->start); + q->ready = TRUE; + nvws->res_free(&q->object); + } + + *result = q->result; + return TRUE; +} + +void +nv40_init_query_functions(struct nv40_context *nv40) +{ + nv40->pipe.create_query = nv40_query_create; + nv40->pipe.destroy_query = nv40_query_destroy; + nv40->pipe.begin_query = nv40_query_begin; + nv40->pipe.end_query = nv40_query_end; + nv40->pipe.get_query_result = nv40_query_result; +} diff --git a/src/gallium/drivers/nv40/nv40_screen.c b/src/gallium/drivers/nv40/nv40_screen.c new file mode 100644 index 0000000000..2372bc8441 --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_screen.c @@ -0,0 +1,383 @@ +#include "pipe/p_screen.h" +#include "util/u_simple_screen.h" + +#include "nv40_context.h" +#include "nv40_screen.h" + +#define NV4X_GRCLASS4097_CHIPSETS 0x00000baf +#define NV4X_GRCLASS4497_CHIPSETS 0x00005450 +#define NV6X_GRCLASS4497_CHIPSETS 0x00000088 + +static const char * +nv40_screen_get_name(struct pipe_screen *pscreen) +{ + struct nv40_screen *screen = nv40_screen(pscreen); + struct nouveau_device *dev = screen->nvws->channel->device; + static char buffer[128]; + + snprintf(buffer, sizeof(buffer), "NV%02X", dev->chipset); + return buffer; +} + +static const char * +nv40_screen_get_vendor(struct pipe_screen *pscreen) +{ + return "nouveau"; +} + +static int +nv40_screen_get_param(struct pipe_screen *pscreen, int param) +{ + struct nv40_screen *screen = nv40_screen(pscreen); + + switch (param) { + case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS: + return 16; + case PIPE_CAP_NPOT_TEXTURES: + return 1; + case PIPE_CAP_TWO_SIDED_STENCIL: + return 1; + case PIPE_CAP_GLSL: + return 0; + case PIPE_CAP_S3TC: + return 1; + case PIPE_CAP_ANISOTROPIC_FILTER: + return 1; + case PIPE_CAP_POINT_SPRITE: + return 1; + case PIPE_CAP_MAX_RENDER_TARGETS: + return 4; + case PIPE_CAP_OCCLUSION_QUERY: + return 1; + case PIPE_CAP_TEXTURE_SHADOW_MAP: + return 1; + case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: + return 13; + case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: + return 10; + case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: + return 13; + case PIPE_CAP_TEXTURE_MIRROR_CLAMP: + 
case PIPE_CAP_TEXTURE_MIRROR_REPEAT: + return 1; + case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS: + return 0; /* We have 4 - but unsupported currently */ + case NOUVEAU_CAP_HW_VTXBUF: + return 1; + case NOUVEAU_CAP_HW_IDXBUF: + if (screen->curie->grclass == NV40TCL) + return 1; + return 0; + default: + NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param); + return 0; + } +} + +static float +nv40_screen_get_paramf(struct pipe_screen *pscreen, int param) +{ + switch (param) { + case PIPE_CAP_MAX_LINE_WIDTH: + case PIPE_CAP_MAX_LINE_WIDTH_AA: + return 10.0; + case PIPE_CAP_MAX_POINT_WIDTH: + case PIPE_CAP_MAX_POINT_WIDTH_AA: + return 64.0; + case PIPE_CAP_MAX_TEXTURE_ANISOTROPY: + return 16.0; + case PIPE_CAP_MAX_TEXTURE_LOD_BIAS: + return 16.0; + default: + NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param); + return 0.0; + } +} + +static boolean +nv40_screen_surface_format_supported(struct pipe_screen *pscreen, + enum pipe_format format, + enum pipe_texture_target target, + unsigned tex_usage, unsigned geom_flags) +{ + if (tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET) { + switch (format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_R5G6B5_UNORM: + case PIPE_FORMAT_Z24S8_UNORM: + case PIPE_FORMAT_Z16_UNORM: + return TRUE; + default: + break; + } + } else { + switch (format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_A1R5G5B5_UNORM: + case PIPE_FORMAT_A4R4G4B4_UNORM: + case PIPE_FORMAT_R5G6B5_UNORM: + case PIPE_FORMAT_R16_SNORM: + case PIPE_FORMAT_L8_UNORM: + case PIPE_FORMAT_A8_UNORM: + case PIPE_FORMAT_I8_UNORM: + case PIPE_FORMAT_A8L8_UNORM: + case PIPE_FORMAT_Z16_UNORM: + case PIPE_FORMAT_Z24S8_UNORM: + case PIPE_FORMAT_DXT1_RGB: + case PIPE_FORMAT_DXT1_RGBA: + case PIPE_FORMAT_DXT3_RGBA: + case PIPE_FORMAT_DXT5_RGBA: + return TRUE; + default: + break; + } + } + + return FALSE; +} + +static struct pipe_buffer * +nv40_surface_buffer(struct pipe_surface *surf) +{ + struct nv40_miptree *mt = (struct nv40_miptree *)surf->texture; + + return mt->buffer; +} + +static void * +nv40_surface_map(struct pipe_screen *screen, struct pipe_surface *surface, + unsigned flags ) +{ + struct pipe_winsys *ws = screen->winsys; + struct pipe_surface *surface_to_map; + void *map; + + if (!(surface->texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) { + struct nv40_miptree *mt = (struct nv40_miptree *)surface->texture; + + if (!mt->shadow_tex) { + unsigned old_tex_usage = surface->texture->tex_usage; + surface->texture->tex_usage = NOUVEAU_TEXTURE_USAGE_LINEAR | + PIPE_TEXTURE_USAGE_DYNAMIC; + mt->shadow_tex = screen->texture_create(screen, surface->texture); + surface->texture->tex_usage = old_tex_usage; + + assert(mt->shadow_tex->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR); + } + + mt->shadow_surface = screen->get_tex_surface + ( + screen, mt->shadow_tex, + surface->face, surface->level, surface->zslice, + surface->usage + ); + + surface_to_map = mt->shadow_surface; + } + else + surface_to_map = surface; + + assert(surface_to_map); + map = ws->buffer_map(ws, nv40_surface_buffer(surface_to_map), flags); + if (!map) + return NULL; + + return map + surface_to_map->offset; +} + +static void +nv40_surface_unmap(struct pipe_screen *screen, struct pipe_surface *surface) +{ + struct pipe_winsys *ws = screen->winsys; + struct pipe_surface *surface_to_unmap; + + /* TODO: Copy from shadow just before push buffer is flushed instead. + There are probably some programs that map/unmap excessively + before rendering. 
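The surface_map()/surface_unmap() pair here hides swizzled storage behind a linear shadow texture: mapping a swizzled surface returns a pointer into the shadow, and unmapping blits the shadow back through the 2D engine. A caller-side sketch, assuming the pipe_screen hooks installed later in this file (fragment only, headers and error handling omitted):

        void *map = screen->surface_map(screen, surf,
                                        PIPE_BUFFER_USAGE_CPU_READ_WRITE);
        if (map) {
                memset(map, 0, surf->stride);         /* e.g. clear the first row */
                screen->surface_unmap(screen, surf);  /* swizzled case: shadow copied back here */
        }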
*/ + if (!(surface->texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) { + struct nv40_miptree *mt = (struct nv40_miptree *)surface->texture; + + assert(mt->shadow_tex); + + surface_to_unmap = mt->shadow_surface; + } + else + surface_to_unmap = surface; + + assert(surface_to_unmap); + + ws->buffer_unmap(ws, nv40_surface_buffer(surface_to_unmap)); + + if (surface_to_unmap != surface) { + struct nv40_screen *nvscreen = nv40_screen(screen); + + nvscreen->eng2d->copy(nvscreen->eng2d, surface, 0, 0, + surface_to_unmap, 0, 0, + surface->width, surface->height); + + screen->tex_surface_release(screen, &surface_to_unmap); + } +} + +static void +nv40_screen_destroy(struct pipe_screen *pscreen) +{ + struct nv40_screen *screen = nv40_screen(pscreen); + struct nouveau_winsys *nvws = screen->nvws; + + nvws->res_free(&screen->vp_exec_heap); + nvws->res_free(&screen->vp_data_heap); + nvws->res_free(&screen->query_heap); + nvws->notifier_free(&screen->query); + nvws->notifier_free(&screen->sync); + nvws->grobj_free(&screen->curie); + + FREE(pscreen); +} + +struct pipe_screen * +nv40_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws) +{ + struct nv40_screen *screen = CALLOC_STRUCT(nv40_screen); + struct nouveau_stateobj *so; + unsigned curie_class; + unsigned chipset = nvws->channel->device->chipset; + int ret; + + if (!screen) + return NULL; + screen->nvws = nvws; + + /* 2D engine setup */ + screen->eng2d = nv04_surface_2d_init(nvws); + screen->eng2d->buf = nv40_surface_buffer; + + /* 3D object */ + switch (chipset & 0xf0) { + case 0x40: + if (NV4X_GRCLASS4097_CHIPSETS & (1 << (chipset & 0x0f))) + curie_class = NV40TCL; + else + if (NV4X_GRCLASS4497_CHIPSETS & (1 << (chipset & 0x0f))) + curie_class = NV44TCL; + break; + case 0x60: + if (NV6X_GRCLASS4497_CHIPSETS & (1 << (chipset & 0x0f))) + curie_class = NV44TCL; + break; + default: + break; + } + + if (!curie_class) { + NOUVEAU_ERR("Unknown nv4x chipset: nv%02x\n", chipset); + return NULL; + } + + ret = nvws->grobj_alloc(nvws, curie_class, &screen->curie); + if (ret) { + NOUVEAU_ERR("Error creating 3D object: %d\n", ret); + return FALSE; + } + + /* Notifier for sync purposes */ + ret = nvws->notifier_alloc(nvws, 1, &screen->sync); + if (ret) { + NOUVEAU_ERR("Error creating notifier object: %d\n", ret); + nv40_screen_destroy(&screen->pipe); + return NULL; + } + + /* Query objects */ + ret = nvws->notifier_alloc(nvws, 32, &screen->query); + if (ret) { + NOUVEAU_ERR("Error initialising query objects: %d\n", ret); + nv40_screen_destroy(&screen->pipe); + return NULL; + } + + ret = nvws->res_init(&screen->query_heap, 0, 32); + if (ret) { + NOUVEAU_ERR("Error initialising query object heap: %d\n", ret); + nv40_screen_destroy(&screen->pipe); + return NULL; + } + + /* Vtxprog resources */ + if (nvws->res_init(&screen->vp_exec_heap, 0, 512) || + nvws->res_init(&screen->vp_data_heap, 0, 256)) { + nv40_screen_destroy(&screen->pipe); + return NULL; + } + + /* Static curie initialisation */ + so = so_new(128, 0); + so_method(so, screen->curie, NV40TCL_DMA_NOTIFY, 1); + so_data (so, screen->sync->handle); + so_method(so, screen->curie, NV40TCL_DMA_TEXTURE0, 2); + so_data (so, nvws->channel->vram->handle); + so_data (so, nvws->channel->gart->handle); + so_method(so, screen->curie, NV40TCL_DMA_COLOR1, 1); + so_data (so, nvws->channel->vram->handle); + so_method(so, screen->curie, NV40TCL_DMA_COLOR0, 2); + so_data (so, nvws->channel->vram->handle); + so_data (so, nvws->channel->vram->handle); + so_method(so, screen->curie, NV40TCL_DMA_VTXBUF0, 2); + 
so_data (so, nvws->channel->vram->handle); + so_data (so, nvws->channel->gart->handle); + so_method(so, screen->curie, NV40TCL_DMA_FENCE, 2); + so_data (so, 0); + so_data (so, screen->query->handle); + so_method(so, screen->curie, NV40TCL_DMA_UNK01AC, 2); + so_data (so, nvws->channel->vram->handle); + so_data (so, nvws->channel->vram->handle); + so_method(so, screen->curie, NV40TCL_DMA_COLOR2, 2); + so_data (so, nvws->channel->vram->handle); + so_data (so, nvws->channel->vram->handle); + + so_method(so, screen->curie, 0x1ea4, 3); + so_data (so, 0x00000010); + so_data (so, 0x01000100); + so_data (so, 0xff800006); + + /* vtxprog output routing */ + so_method(so, screen->curie, 0x1fc4, 1); + so_data (so, 0x06144321); + so_method(so, screen->curie, 0x1fc8, 2); + so_data (so, 0xedcba987); + so_data (so, 0x00000021); + so_method(so, screen->curie, 0x1fd0, 1); + so_data (so, 0x00171615); + so_method(so, screen->curie, 0x1fd4, 1); + so_data (so, 0x001b1a19); + + so_method(so, screen->curie, 0x1ef8, 1); + so_data (so, 0x0020ffff); + so_method(so, screen->curie, 0x1d64, 1); + so_data (so, 0x00d30000); + so_method(so, screen->curie, 0x1e94, 1); + so_data (so, 0x00000001); + + so_emit(nvws, so); + so_ref(NULL, &so); + nvws->push_flush(nvws, 0, NULL); + + screen->pipe.winsys = ws; + screen->pipe.destroy = nv40_screen_destroy; + + screen->pipe.get_name = nv40_screen_get_name; + screen->pipe.get_vendor = nv40_screen_get_vendor; + screen->pipe.get_param = nv40_screen_get_param; + screen->pipe.get_paramf = nv40_screen_get_paramf; + + screen->pipe.is_format_supported = nv40_screen_surface_format_supported; + + screen->pipe.surface_map = nv40_surface_map; + screen->pipe.surface_unmap = nv40_surface_unmap; + + nv40_screen_init_miptree_functions(&screen->pipe); + u_simple_screen_init(&screen->pipe); + + return &screen->pipe; +} + diff --git a/src/gallium/drivers/nv40/nv40_screen.h b/src/gallium/drivers/nv40/nv40_screen.h new file mode 100644 index 0000000000..4500aa0e5c --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_screen.h @@ -0,0 +1,37 @@ +#ifndef __NV40_SCREEN_H__ +#define __NV40_SCREEN_H__ + +#include "pipe/p_screen.h" +#include "nv04/nv04_surface_2d.h" + +struct nv40_screen { + struct pipe_screen pipe; + + struct nouveau_winsys *nvws; + + unsigned cur_pctx; + + /* HW graphics objects */ + struct nv04_surface_2d *eng2d; + struct nouveau_grobj *curie; + struct nouveau_notifier *sync; + + /* Query object resources */ + struct nouveau_notifier *query; + struct nouveau_resource *query_heap; + + /* Vtxprog resources */ + struct nouveau_resource *vp_exec_heap; + struct nouveau_resource *vp_data_heap; + + /* Current 3D state of channel */ + struct nouveau_stateobj *state[NV40_STATE_MAX]; +}; + +static INLINE struct nv40_screen * +nv40_screen(struct pipe_screen *screen) +{ + return (struct nv40_screen *)screen; +} + +#endif diff --git a/src/gallium/drivers/nv40/nv40_shader.h b/src/gallium/drivers/nv40/nv40_shader.h new file mode 100644 index 0000000000..854dccf548 --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_shader.h @@ -0,0 +1,556 @@ +#ifndef __NV40_SHADER_H__ +#define __NV40_SHADER_H__ + +/* Vertex programs instruction set + * + * The NV40 instruction set is very similar to NV30. Most fields are in + * a slightly different position in the instruction however. + * + * Merged instructions + * In some cases it is possible to put two instructions into one opcode + * slot. The rules for when this is OK is not entirely clear to me yet. 
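The NV40_VP_INST_* defines below are combined with the usual shift-and-mask idiom when the four instruction words are emitted. Purely as an illustration of that idiom (field names are taken from the defines that follow; a real instruction needs many more fields filled in, which the vertex-program translator elsewhere in this driver takes care of):

        uint32_t hw[4] = { 0, 0, 0, 0 };

        hw[1] |= (NV40_VP_INST_OP_MOV << NV40_VP_INST_VEC_OPCODE_SHIFT) &
                 NV40_VP_INST_VEC_OPCODE_MASK;
        hw[3] |= NV40_VP_INST_VEC_WRITEMASK_X | NV40_VP_INST_VEC_WRITEMASK_Y |
                 NV40_VP_INST_VEC_WRITEMASK_Z | NV40_VP_INST_VEC_WRITEMASK_W;
        hw[3] |= (NV40_VP_INST_DEST_POS << NV40_VP_INST_DEST_SHIFT) &
                 NV40_VP_INST_DEST_MASK;
        hw[3] |= NV40_VP_INST_LAST;    /* last instruction in the program */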
+ * + * There are separate writemasks and dest temp register fields for each + * grouping of instructions. There is however only one field with the + * ID of a result register. Writing to temp/result regs is selected by + * setting VEC_RESULT/SCA_RESULT. + * + * Temporary registers + * The source/dest temp register fields have been extended by 1 bit, to + * give a total of 32 temporary registers. + * + * Relative Addressing + * NV40 can use an address register to index into vertex attribute regs. + * This is done by putting the offset value into INPUT_SRC and setting + * the INDEX_INPUT flag. + * + * Conditional execution (see NV_vertex_program{2,3} for details) + * There is a second condition code register on NV40, it's use is enabled + * by setting the COND_REG_SELECT_1 flag. + * + * Texture lookup + * TODO + */ + +/* ---- OPCODE BITS 127:96 / data DWORD 0 --- */ +#define NV40_VP_INST_VEC_RESULT (1 << 30) +/* uncertain.. */ +#define NV40_VP_INST_COND_UPDATE_ENABLE ((1 << 14)|1<<29) +/* use address reg as index into attribs */ +#define NV40_VP_INST_INDEX_INPUT (1 << 27) +#define NV40_VP_INST_COND_REG_SELECT_1 (1 << 25) +#define NV40_VP_INST_ADDR_REG_SELECT_1 (1 << 24) +#define NV40_VP_INST_SRC2_ABS (1 << 23) +#define NV40_VP_INST_SRC1_ABS (1 << 22) +#define NV40_VP_INST_SRC0_ABS (1 << 21) +#define NV40_VP_INST_VEC_DEST_TEMP_SHIFT 15 +#define NV40_VP_INST_VEC_DEST_TEMP_MASK (0x1F << 15) +#define NV40_VP_INST_COND_TEST_ENABLE (1 << 13) +#define NV40_VP_INST_COND_SHIFT 10 +#define NV40_VP_INST_COND_MASK (0x7 << 10) +# define NV40_VP_INST_COND_FL 0 +# define NV40_VP_INST_COND_LT 1 +# define NV40_VP_INST_COND_EQ 2 +# define NV40_VP_INST_COND_LE 3 +# define NV40_VP_INST_COND_GT 4 +# define NV40_VP_INST_COND_NE 5 +# define NV40_VP_INST_COND_GE 6 +# define NV40_VP_INST_COND_TR 7 +#define NV40_VP_INST_COND_SWZ_X_SHIFT 8 +#define NV40_VP_INST_COND_SWZ_X_MASK (3 << 8) +#define NV40_VP_INST_COND_SWZ_Y_SHIFT 6 +#define NV40_VP_INST_COND_SWZ_Y_MASK (3 << 6) +#define NV40_VP_INST_COND_SWZ_Z_SHIFT 4 +#define NV40_VP_INST_COND_SWZ_Z_MASK (3 << 4) +#define NV40_VP_INST_COND_SWZ_W_SHIFT 2 +#define NV40_VP_INST_COND_SWZ_W_MASK (3 << 2) +#define NV40_VP_INST_COND_SWZ_ALL_SHIFT 2 +#define NV40_VP_INST_COND_SWZ_ALL_MASK (0xFF << 2) +#define NV40_VP_INST_ADDR_SWZ_SHIFT 0 +#define NV40_VP_INST_ADDR_SWZ_MASK (0x03 << 0) +#define NV40_VP_INST0_KNOWN ( \ + NV40_VP_INST_INDEX_INPUT | \ + NV40_VP_INST_COND_REG_SELECT_1 | \ + NV40_VP_INST_ADDR_REG_SELECT_1 | \ + NV40_VP_INST_SRC2_ABS | \ + NV40_VP_INST_SRC1_ABS | \ + NV40_VP_INST_SRC0_ABS | \ + NV40_VP_INST_VEC_DEST_TEMP_MASK | \ + NV40_VP_INST_COND_TEST_ENABLE | \ + NV40_VP_INST_COND_MASK | \ + NV40_VP_INST_COND_SWZ_ALL_MASK | \ + NV40_VP_INST_ADDR_SWZ_MASK) + +/* ---- OPCODE BITS 95:64 / data DWORD 1 --- */ +#define NV40_VP_INST_VEC_OPCODE_SHIFT 22 +#define NV40_VP_INST_VEC_OPCODE_MASK (0x1F << 22) +# define NV40_VP_INST_OP_NOP 0x00 +# define NV40_VP_INST_OP_MOV 0x01 +# define NV40_VP_INST_OP_MUL 0x02 +# define NV40_VP_INST_OP_ADD 0x03 +# define NV40_VP_INST_OP_MAD 0x04 +# define NV40_VP_INST_OP_DP3 0x05 +# define NV40_VP_INST_OP_DPH 0x06 +# define NV40_VP_INST_OP_DP4 0x07 +# define NV40_VP_INST_OP_DST 0x08 +# define NV40_VP_INST_OP_MIN 0x09 +# define NV40_VP_INST_OP_MAX 0x0A +# define NV40_VP_INST_OP_SLT 0x0B +# define NV40_VP_INST_OP_SGE 0x0C +# define NV40_VP_INST_OP_ARL 0x0D +# define NV40_VP_INST_OP_FRC 0x0E +# define NV40_VP_INST_OP_FLR 0x0F +# define NV40_VP_INST_OP_SEQ 0x10 +# define NV40_VP_INST_OP_SFL 0x11 +# define NV40_VP_INST_OP_SGT 0x12 +# define 
NV40_VP_INST_OP_SLE 0x13 +# define NV40_VP_INST_OP_SNE 0x14 +# define NV40_VP_INST_OP_STR 0x15 +# define NV40_VP_INST_OP_SSG 0x16 +# define NV40_VP_INST_OP_ARR 0x17 +# define NV40_VP_INST_OP_ARA 0x18 +# define NV40_VP_INST_OP_TXL 0x19 +#define NV40_VP_INST_SCA_OPCODE_SHIFT 27 +#define NV40_VP_INST_SCA_OPCODE_MASK (0x1F << 27) +# define NV40_VP_INST_OP_NOP 0x00 +# define NV40_VP_INST_OP_MOV 0x01 +# define NV40_VP_INST_OP_RCP 0x02 +# define NV40_VP_INST_OP_RCC 0x03 +# define NV40_VP_INST_OP_RSQ 0x04 +# define NV40_VP_INST_OP_EXP 0x05 +# define NV40_VP_INST_OP_LOG 0x06 +# define NV40_VP_INST_OP_LIT 0x07 +# define NV40_VP_INST_OP_BRA 0x09 +# define NV40_VP_INST_OP_CAL 0x0B +# define NV40_VP_INST_OP_RET 0x0C +# define NV40_VP_INST_OP_LG2 0x0D +# define NV40_VP_INST_OP_EX2 0x0E +# define NV40_VP_INST_OP_SIN 0x0F +# define NV40_VP_INST_OP_COS 0x10 +# define NV40_VP_INST_OP_PUSHA 0x13 +# define NV40_VP_INST_OP_POPA 0x14 +#define NV40_VP_INST_CONST_SRC_SHIFT 12 +#define NV40_VP_INST_CONST_SRC_MASK (0xFF << 12) +#define NV40_VP_INST_INPUT_SRC_SHIFT 8 +#define NV40_VP_INST_INPUT_SRC_MASK (0x0F << 8) +# define NV40_VP_INST_IN_POS 0 +# define NV40_VP_INST_IN_WEIGHT 1 +# define NV40_VP_INST_IN_NORMAL 2 +# define NV40_VP_INST_IN_COL0 3 +# define NV40_VP_INST_IN_COL1 4 +# define NV40_VP_INST_IN_FOGC 5 +# define NV40_VP_INST_IN_TC0 8 +# define NV40_VP_INST_IN_TC(n) (8+n) +#define NV40_VP_INST_SRC0H_SHIFT 0 +#define NV40_VP_INST_SRC0H_MASK (0xFF << 0) +#define NV40_VP_INST1_KNOWN ( \ + NV40_VP_INST_VEC_OPCODE_MASK | \ + NV40_VP_INST_SCA_OPCODE_MASK | \ + NV40_VP_INST_CONST_SRC_MASK | \ + NV40_VP_INST_INPUT_SRC_MASK | \ + NV40_VP_INST_SRC0H_MASK \ + ) + +/* ---- OPCODE BITS 63:32 / data DWORD 2 --- */ +#define NV40_VP_INST_SRC0L_SHIFT 23 +#define NV40_VP_INST_SRC0L_MASK (0x1FF << 23) +#define NV40_VP_INST_SRC1_SHIFT 6 +#define NV40_VP_INST_SRC1_MASK (0x1FFFF << 6) +#define NV40_VP_INST_SRC2H_SHIFT 0 +#define NV40_VP_INST_SRC2H_MASK (0x3F << 0) +#define NV40_VP_INST_IADDRH_SHIFT 0 +#define NV40_VP_INST_IADDRH_MASK (0x1F << 0) + +/* ---- OPCODE BITS 31:0 / data DWORD 3 --- */ +#define NV40_VP_INST_IADDRL_SHIFT 29 +#define NV40_VP_INST_IADDRL_MASK (7 << 29) +#define NV40_VP_INST_SRC2L_SHIFT 21 +#define NV40_VP_INST_SRC2L_MASK (0x7FF << 21) +#define NV40_VP_INST_SCA_WRITEMASK_SHIFT 17 +#define NV40_VP_INST_SCA_WRITEMASK_MASK (0xF << 17) +# define NV40_VP_INST_SCA_WRITEMASK_X (1 << 20) +# define NV40_VP_INST_SCA_WRITEMASK_Y (1 << 19) +# define NV40_VP_INST_SCA_WRITEMASK_Z (1 << 18) +# define NV40_VP_INST_SCA_WRITEMASK_W (1 << 17) +#define NV40_VP_INST_VEC_WRITEMASK_SHIFT 13 +#define NV40_VP_INST_VEC_WRITEMASK_MASK (0xF << 13) +# define NV40_VP_INST_VEC_WRITEMASK_X (1 << 16) +# define NV40_VP_INST_VEC_WRITEMASK_Y (1 << 15) +# define NV40_VP_INST_VEC_WRITEMASK_Z (1 << 14) +# define NV40_VP_INST_VEC_WRITEMASK_W (1 << 13) +#define NV40_VP_INST_SCA_RESULT (1 << 12) +#define NV40_VP_INST_SCA_DEST_TEMP_SHIFT 7 +#define NV40_VP_INST_SCA_DEST_TEMP_MASK (0x1F << 7) +#define NV40_VP_INST_DEST_SHIFT 2 +#define NV40_VP_INST_DEST_MASK (31 << 2) +# define NV40_VP_INST_DEST_POS 0 +# define NV40_VP_INST_DEST_COL0 1 +# define NV40_VP_INST_DEST_COL1 2 +# define NV40_VP_INST_DEST_BFC0 3 +# define NV40_VP_INST_DEST_BFC1 4 +# define NV40_VP_INST_DEST_FOGC 5 +# define NV40_VP_INST_DEST_PSZ 6 +# define NV40_VP_INST_DEST_TC0 7 +# define NV40_VP_INST_DEST_TC(n) (7+n) +# define NV40_VP_INST_DEST_TEMP 0x1F +#define NV40_VP_INST_INDEX_CONST (1 << 1) +#define NV40_VP_INST_LAST (1 << 0) +#define NV40_VP_INST3_KNOWN ( \ + 
NV40_VP_INST_SRC2L_MASK |\ + NV40_VP_INST_SCA_WRITEMASK_MASK |\ + NV40_VP_INST_VEC_WRITEMASK_MASK |\ + NV40_VP_INST_SCA_DEST_TEMP_MASK |\ + NV40_VP_INST_DEST_MASK |\ + NV40_VP_INST_INDEX_CONST) + +/* Useful to split the source selection regs into their pieces */ +#define NV40_VP_SRC0_HIGH_SHIFT 9 +#define NV40_VP_SRC0_HIGH_MASK 0x0001FE00 +#define NV40_VP_SRC0_LOW_MASK 0x000001FF +#define NV40_VP_SRC2_HIGH_SHIFT 11 +#define NV40_VP_SRC2_HIGH_MASK 0x0001F800 +#define NV40_VP_SRC2_LOW_MASK 0x000007FF + +/* Source selection - these are the bits you fill NV40_VP_INST_SRCn with */ +#define NV40_VP_SRC_NEGATE (1 << 16) +#define NV40_VP_SRC_SWZ_X_SHIFT 14 +#define NV40_VP_SRC_SWZ_X_MASK (3 << 14) +#define NV40_VP_SRC_SWZ_Y_SHIFT 12 +#define NV40_VP_SRC_SWZ_Y_MASK (3 << 12) +#define NV40_VP_SRC_SWZ_Z_SHIFT 10 +#define NV40_VP_SRC_SWZ_Z_MASK (3 << 10) +#define NV40_VP_SRC_SWZ_W_SHIFT 8 +#define NV40_VP_SRC_SWZ_W_MASK (3 << 8) +#define NV40_VP_SRC_SWZ_ALL_SHIFT 8 +#define NV40_VP_SRC_SWZ_ALL_MASK (0xFF << 8) +#define NV40_VP_SRC_TEMP_SRC_SHIFT 2 +#define NV40_VP_SRC_TEMP_SRC_MASK (0x1F << 2) +#define NV40_VP_SRC_REG_TYPE_SHIFT 0 +#define NV40_VP_SRC_REG_TYPE_MASK (3 << 0) +# define NV40_VP_SRC_REG_TYPE_UNK0 0 +# define NV40_VP_SRC_REG_TYPE_TEMP 1 +# define NV40_VP_SRC_REG_TYPE_INPUT 2 +# define NV40_VP_SRC_REG_TYPE_CONST 3 + + +/* + * Each fragment program opcode appears to be comprised of 4 32-bit values. + * + * 0 - Opcode, output reg/mask, ATTRIB source + * 1 - Source 0 + * 2 - Source 1 + * 3 - Source 2 + * + * There appears to be no special difference between result regs and temp regs. + * result.color == R0.xyzw + * result.depth == R1.z + * When the fragprog contains instructions to write depth, + * NV30_TCL_PRIMITIVE_3D_UNK1D78=0 otherwise it is set to 1. + * + * Constants are inserted directly after the instruction that uses them. + * + * It appears that it's not possible to use two input registers in one + * instruction as the input sourcing is done in the instruction dword + * and not the source selection dwords. As such instructions such as: + * + * ADD result.color, fragment.color, fragment.texcoord[0]; + * + * must be split into two MOV's and then an ADD (nvidia does this) but + * I'm not sure why it's not just one MOV and then source the second input + * in the ADD instruction.. + * + * Negation of the full source is done with NV30_FP_REG_NEGATE, arbitrary + * negation requires multiplication with a const. + * + * Arbitrary swizzling is supported with the exception of SWIZZLE_ZERO and + * SWIZZLE_ONE. + * + * The temp/result regs appear to be initialised to (0.0, 0.0, 0.0, 0.0) as + * SWIZZLE_ZERO is implemented simply by not writing to the relevant components + * of the destination. + * + * Looping + * Loops appear to be fairly expensive on NV40 at least, the proprietary + * driver goes to a lot of effort to avoid using the native looping + * instructions. If the total number of *executed* instructions between + * REP/ENDREP or LOOP/ENDLOOP is <=500, the driver will unroll the loop. + * The maximum loop count is 255. + * + * Conditional execution + * TODO + * + * Non-native instructions: + * LIT + * LRP - MAD+MAD + * SUB - ADD, negate second source + * RSQ - LG2 + EX2 + * POW - LG2 + MUL + EX2 + * SCS - COS + SIN + * XPD + * DP2 - MUL + ADD + * NRM + */ + +//== Opcode / Destination selection == +#define NV40_FP_OP_PROGRAM_END (1 << 0) +#define NV40_FP_OP_OUT_REG_SHIFT 1 +#define NV40_FP_OP_OUT_REG_MASK (63 << 1) +/* Needs to be set when writing outputs to get expected result.. 
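To make the non-native instruction list above concrete: the LRP and SUB expansions follow directly from full-source negation, which the hardware does provide. Using placeholder register names:

        LRP d, a, b, c  =  a*b + (1 - a)*c  =  a*b + (c - a*c), i.e.

        MAD t, -a, c, c     # t = c - a*c = (1 - a)*c
        MAD d,  a, b, t     # d = a*b + (1 - a)*c

        SUB d, a, b  =  ADD d, a, -b   # second source negated via the
                                       # per-source negate bit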
*/ +#define NV40_FP_OP_OUT_REG_HALF (1 << 7) +#define NV40_FP_OP_COND_WRITE_ENABLE (1 << 8) +#define NV40_FP_OP_OUTMASK_SHIFT 9 +#define NV40_FP_OP_OUTMASK_MASK (0xF << 9) +# define NV40_FP_OP_OUT_X (1 << 9) +# define NV40_FP_OP_OUT_Y (1 <<10) +# define NV40_FP_OP_OUT_Z (1 <<11) +# define NV40_FP_OP_OUT_W (1 <<12) +/* Uncertain about these, especially the input_src values.. it's possible that + * they can be dynamically changed. + */ +#define NV40_FP_OP_INPUT_SRC_SHIFT 13 +#define NV40_FP_OP_INPUT_SRC_MASK (15 << 13) +# define NV40_FP_OP_INPUT_SRC_POSITION 0x0 +# define NV40_FP_OP_INPUT_SRC_COL0 0x1 +# define NV40_FP_OP_INPUT_SRC_COL1 0x2 +# define NV40_FP_OP_INPUT_SRC_FOGC 0x3 +# define NV40_FP_OP_INPUT_SRC_TC0 0x4 +# define NV40_FP_OP_INPUT_SRC_TC(n) (0x4 + n) +# define NV40_FP_OP_INPUT_SRC_FACING 0xE +#define NV40_FP_OP_TEX_UNIT_SHIFT 17 +#define NV40_FP_OP_TEX_UNIT_MASK (0xF << 17) +#define NV40_FP_OP_PRECISION_SHIFT 22 +#define NV40_FP_OP_PRECISION_MASK (3 << 22) +# define NV40_FP_PRECISION_FP32 0 +# define NV40_FP_PRECISION_FP16 1 +# define NV40_FP_PRECISION_FX12 2 +#define NV40_FP_OP_OPCODE_SHIFT 24 +#define NV40_FP_OP_OPCODE_MASK (0x3F << 24) +# define NV40_FP_OP_OPCODE_NOP 0x00 +# define NV40_FP_OP_OPCODE_MOV 0x01 +# define NV40_FP_OP_OPCODE_MUL 0x02 +# define NV40_FP_OP_OPCODE_ADD 0x03 +# define NV40_FP_OP_OPCODE_MAD 0x04 +# define NV40_FP_OP_OPCODE_DP3 0x05 +# define NV40_FP_OP_OPCODE_DP4 0x06 +# define NV40_FP_OP_OPCODE_DST 0x07 +# define NV40_FP_OP_OPCODE_MIN 0x08 +# define NV40_FP_OP_OPCODE_MAX 0x09 +# define NV40_FP_OP_OPCODE_SLT 0x0A +# define NV40_FP_OP_OPCODE_SGE 0x0B +# define NV40_FP_OP_OPCODE_SLE 0x0C +# define NV40_FP_OP_OPCODE_SGT 0x0D +# define NV40_FP_OP_OPCODE_SNE 0x0E +# define NV40_FP_OP_OPCODE_SEQ 0x0F +# define NV40_FP_OP_OPCODE_FRC 0x10 +# define NV40_FP_OP_OPCODE_FLR 0x11 +# define NV40_FP_OP_OPCODE_KIL 0x12 +# define NV40_FP_OP_OPCODE_PK4B 0x13 +# define NV40_FP_OP_OPCODE_UP4B 0x14 +/* DDX/DDY can only write to XY */ +# define NV40_FP_OP_OPCODE_DDX 0x15 +# define NV40_FP_OP_OPCODE_DDY 0x16 +# define NV40_FP_OP_OPCODE_TEX 0x17 +# define NV40_FP_OP_OPCODE_TXP 0x18 +# define NV40_FP_OP_OPCODE_TXD 0x19 +# define NV40_FP_OP_OPCODE_RCP 0x1A +# define NV40_FP_OP_OPCODE_EX2 0x1C +# define NV40_FP_OP_OPCODE_LG2 0x1D +# define NV40_FP_OP_OPCODE_STR 0x20 +# define NV40_FP_OP_OPCODE_SFL 0x21 +# define NV40_FP_OP_OPCODE_COS 0x22 +# define NV40_FP_OP_OPCODE_SIN 0x23 +# define NV40_FP_OP_OPCODE_PK2H 0x24 +# define NV40_FP_OP_OPCODE_UP2H 0x25 +# define NV40_FP_OP_OPCODE_PK4UB 0x27 +# define NV40_FP_OP_OPCODE_UP4UB 0x28 +# define NV40_FP_OP_OPCODE_PK2US 0x29 +# define NV40_FP_OP_OPCODE_UP2US 0x2A +# define NV40_FP_OP_OPCODE_DP2A 0x2E +# define NV40_FP_OP_OPCODE_TXL 0x2F +# define NV40_FP_OP_OPCODE_TXB 0x31 +# define NV40_FP_OP_OPCODE_DIV 0x3A +# define NV40_FP_OP_OPCODE_UNK_LIT 0x3C +/* The use of these instructions appears to be indicated by bit 31 of DWORD 2.*/ +# define NV40_FP_OP_BRA_OPCODE_BRK 0x0 +# define NV40_FP_OP_BRA_OPCODE_CAL 0x1 +# define NV40_FP_OP_BRA_OPCODE_IF 0x2 +# define NV40_FP_OP_BRA_OPCODE_LOOP 0x3 +# define NV40_FP_OP_BRA_OPCODE_REP 0x4 +# define NV40_FP_OP_BRA_OPCODE_RET 0x5 +#define NV40_FP_OP_OUT_SAT (1 << 31) + +/* high order bits of SRC0 */ +#define NV40_FP_OP_OUT_ABS (1 << 29) +#define NV40_FP_OP_COND_SWZ_W_SHIFT 27 +#define NV40_FP_OP_COND_SWZ_W_MASK (3 << 27) +#define NV40_FP_OP_COND_SWZ_Z_SHIFT 25 +#define NV40_FP_OP_COND_SWZ_Z_MASK (3 << 25) +#define NV40_FP_OP_COND_SWZ_Y_SHIFT 23 +#define NV40_FP_OP_COND_SWZ_Y_MASK (3 << 23) +#define 
NV40_FP_OP_COND_SWZ_X_SHIFT 21 +#define NV40_FP_OP_COND_SWZ_X_MASK (3 << 21) +#define NV40_FP_OP_COND_SWZ_ALL_SHIFT 21 +#define NV40_FP_OP_COND_SWZ_ALL_MASK (0xFF << 21) +#define NV40_FP_OP_COND_SHIFT 18 +#define NV40_FP_OP_COND_MASK (0x07 << 18) +# define NV40_FP_OP_COND_FL 0 +# define NV40_FP_OP_COND_LT 1 +# define NV40_FP_OP_COND_EQ 2 +# define NV40_FP_OP_COND_LE 3 +# define NV40_FP_OP_COND_GT 4 +# define NV40_FP_OP_COND_NE 5 +# define NV40_FP_OP_COND_GE 6 +# define NV40_FP_OP_COND_TR 7 + +/* high order bits of SRC1 */ +#define NV40_FP_OP_OPCODE_IS_BRANCH (1<<31) +#define NV40_FP_OP_DST_SCALE_SHIFT 28 +#define NV40_FP_OP_DST_SCALE_MASK (3 << 28) +#define NV40_FP_OP_DST_SCALE_1X 0 +#define NV40_FP_OP_DST_SCALE_2X 1 +#define NV40_FP_OP_DST_SCALE_4X 2 +#define NV40_FP_OP_DST_SCALE_8X 3 +#define NV40_FP_OP_DST_SCALE_INV_2X 5 +#define NV40_FP_OP_DST_SCALE_INV_4X 6 +#define NV40_FP_OP_DST_SCALE_INV_8X 7 + +/* SRC1 LOOP */ +#define NV40_FP_OP_LOOP_INCR_SHIFT 19 +#define NV40_FP_OP_LOOP_INCR_MASK (0xFF << 19) +#define NV40_FP_OP_LOOP_INDEX_SHIFT 10 +#define NV40_FP_OP_LOOP_INDEX_MASK (0xFF << 10) +#define NV40_FP_OP_LOOP_COUNT_SHIFT 2 +#define NV40_FP_OP_LOOP_COUNT_MASK (0xFF << 2) + +/* SRC1 IF */ +#define NV40_FP_OP_ELSE_ID_SHIFT 2 +#define NV40_FP_OP_ELSE_ID_MASK (0xFF << 2) + +/* SRC1 CAL */ +#define NV40_FP_OP_IADDR_SHIFT 2 +#define NV40_FP_OP_IADDR_MASK (0xFF << 2) + +/* SRC1 REP + * I have no idea why there are 3 count values here.. but they + * have always been filled with the same value in my tests so + * far.. + */ +#define NV40_FP_OP_REP_COUNT1_SHIFT 2 +#define NV40_FP_OP_REP_COUNT1_MASK (0xFF << 2) +#define NV40_FP_OP_REP_COUNT2_SHIFT 10 +#define NV40_FP_OP_REP_COUNT2_MASK (0xFF << 10) +#define NV40_FP_OP_REP_COUNT3_SHIFT 19 +#define NV40_FP_OP_REP_COUNT3_MASK (0xFF << 19) + +/* SRC2 REP/IF */ +#define NV40_FP_OP_END_ID_SHIFT 2 +#define NV40_FP_OP_END_ID_MASK (0xFF << 2) + +// SRC2 high-order +#define NV40_FP_OP_INDEX_INPUT (1 << 30) +#define NV40_FP_OP_ADDR_INDEX_SHIFT 19 +#define NV40_FP_OP_ADDR_INDEX_MASK (0xF << 19) + +//== Register selection == +#define NV40_FP_REG_TYPE_SHIFT 0 +#define NV40_FP_REG_TYPE_MASK (3 << 0) +# define NV40_FP_REG_TYPE_TEMP 0 +# define NV40_FP_REG_TYPE_INPUT 1 +# define NV40_FP_REG_TYPE_CONST 2 +#define NV40_FP_REG_SRC_SHIFT 2 +#define NV40_FP_REG_SRC_MASK (63 << 2) +#define NV40_FP_REG_SRC_HALF (1 << 8) +#define NV40_FP_REG_SWZ_ALL_SHIFT 9 +#define NV40_FP_REG_SWZ_ALL_MASK (255 << 9) +#define NV40_FP_REG_SWZ_X_SHIFT 9 +#define NV40_FP_REG_SWZ_X_MASK (3 << 9) +#define NV40_FP_REG_SWZ_Y_SHIFT 11 +#define NV40_FP_REG_SWZ_Y_MASK (3 << 11) +#define NV40_FP_REG_SWZ_Z_SHIFT 13 +#define NV40_FP_REG_SWZ_Z_MASK (3 << 13) +#define NV40_FP_REG_SWZ_W_SHIFT 15 +#define NV40_FP_REG_SWZ_W_MASK (3 << 15) +# define NV40_FP_SWIZZLE_X 0 +# define NV40_FP_SWIZZLE_Y 1 +# define NV40_FP_SWIZZLE_Z 2 +# define NV40_FP_SWIZZLE_W 3 +#define NV40_FP_REG_NEGATE (1 << 17) + +#ifndef NV40_SHADER_NO_FUCKEDNESS +#define NV40SR_NONE 0 +#define NV40SR_OUTPUT 1 +#define NV40SR_INPUT 2 +#define NV40SR_TEMP 3 +#define NV40SR_CONST 4 + +struct nv40_sreg { + int type; + int index; + + int dst_scale; + + int negate; + int abs; + int swz[4]; + + int cc_update; + int cc_update_reg; + int cc_test; + int cc_test_reg; + int cc_swz[4]; +}; + +static INLINE struct nv40_sreg +nv40_sr(int type, int index) +{ + struct nv40_sreg temp = { + .type = type, + .index = index, + .dst_scale = DEF_SCALE, + .abs = 0, + .negate = 0, + .swz = { 0, 1, 2, 3 }, + .cc_update = 0, + .cc_update_reg = 0, + .cc_test = 
DEF_CTEST, + .cc_test_reg = 0, + .cc_swz = { 0, 1, 2, 3 }, + }; + return temp; +} + +static INLINE struct nv40_sreg +nv40_sr_swz(struct nv40_sreg src, int x, int y, int z, int w) +{ + struct nv40_sreg dst = src; + + dst.swz[SWZ_X] = src.swz[x]; + dst.swz[SWZ_Y] = src.swz[y]; + dst.swz[SWZ_Z] = src.swz[z]; + dst.swz[SWZ_W] = src.swz[w]; + return dst; +} + +static INLINE struct nv40_sreg +nv40_sr_neg(struct nv40_sreg src) +{ + src.negate = !src.negate; + return src; +} + +static INLINE struct nv40_sreg +nv40_sr_abs(struct nv40_sreg src) +{ + src.abs = 1; + return src; +} + +static INLINE struct nv40_sreg +nv40_sr_scale(struct nv40_sreg src, int scale) +{ + src.dst_scale = scale; + return src; +} +#endif + +#endif diff --git a/src/gallium/drivers/nv40/nv40_state.c b/src/gallium/drivers/nv40/nv40_state.c new file mode 100644 index 0000000000..2eff25aa83 --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_state.c @@ -0,0 +1,740 @@ +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" + +#include "draw/draw_context.h" + +#include "tgsi/tgsi_parse.h" + +#include "nv40_context.h" +#include "nv40_state.h" + +static void * +nv40_blend_state_create(struct pipe_context *pipe, + const struct pipe_blend_state *cso) +{ + struct nv40_context *nv40 = nv40_context(pipe); + struct nouveau_grobj *curie = nv40->screen->curie; + struct nv40_blend_state *bso = CALLOC(1, sizeof(*bso)); + struct nouveau_stateobj *so = so_new(16, 0); + + if (cso->blend_enable) { + so_method(so, curie, NV40TCL_BLEND_ENABLE, 3); + so_data (so, 1); + so_data (so, (nvgl_blend_func(cso->alpha_src_factor) << 16) | + nvgl_blend_func(cso->rgb_src_factor)); + so_data (so, nvgl_blend_func(cso->alpha_dst_factor) << 16 | + nvgl_blend_func(cso->rgb_dst_factor)); + so_method(so, curie, NV40TCL_BLEND_EQUATION, 1); + so_data (so, nvgl_blend_eqn(cso->alpha_func) << 16 | + nvgl_blend_eqn(cso->rgb_func)); + } else { + so_method(so, curie, NV40TCL_BLEND_ENABLE, 1); + so_data (so, 0); + } + + so_method(so, curie, NV40TCL_COLOR_MASK, 1); + so_data (so, (((cso->colormask & PIPE_MASK_A) ? (0x01 << 24) : 0) | + ((cso->colormask & PIPE_MASK_R) ? (0x01 << 16) : 0) | + ((cso->colormask & PIPE_MASK_G) ? (0x01 << 8) : 0) | + ((cso->colormask & PIPE_MASK_B) ? (0x01 << 0) : 0))); + + if (cso->logicop_enable) { + so_method(so, curie, NV40TCL_COLOR_LOGIC_OP_ENABLE, 2); + so_data (so, 1); + so_data (so, nvgl_logicop_func(cso->logicop_func)); + } else { + so_method(so, curie, NV40TCL_COLOR_LOGIC_OP_ENABLE, 1); + so_data (so, 0); + } + + so_method(so, curie, NV40TCL_DITHER_ENABLE, 1); + so_data (so, cso->dither ? 
1 : 0); + + so_ref(so, &bso->so); + bso->pipe = *cso; + return (void *)bso; +} + +static void +nv40_blend_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nv40_context *nv40 = nv40_context(pipe); + + nv40->blend = hwcso; + nv40->dirty |= NV40_NEW_BLEND; +} + +static void +nv40_blend_state_delete(struct pipe_context *pipe, void *hwcso) +{ + struct nv40_blend_state *bso = hwcso; + + so_ref(NULL, &bso->so); + FREE(bso); +} + + +static INLINE unsigned +wrap_mode(unsigned wrap) { + unsigned ret; + + switch (wrap) { + case PIPE_TEX_WRAP_REPEAT: + ret = NV40TCL_TEX_WRAP_S_REPEAT; + break; + case PIPE_TEX_WRAP_MIRROR_REPEAT: + ret = NV40TCL_TEX_WRAP_S_MIRRORED_REPEAT; + break; + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + ret = NV40TCL_TEX_WRAP_S_CLAMP_TO_EDGE; + break; + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + ret = NV40TCL_TEX_WRAP_S_CLAMP_TO_BORDER; + break; + case PIPE_TEX_WRAP_CLAMP: + ret = NV40TCL_TEX_WRAP_S_CLAMP; + break; + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: + ret = NV40TCL_TEX_WRAP_S_MIRROR_CLAMP_TO_EDGE; + break; + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: + ret = NV40TCL_TEX_WRAP_S_MIRROR_CLAMP_TO_BORDER; + break; + case PIPE_TEX_WRAP_MIRROR_CLAMP: + ret = NV40TCL_TEX_WRAP_S_MIRROR_CLAMP; + break; + default: + NOUVEAU_ERR("unknown wrap mode: %d\n", wrap); + ret = NV40TCL_TEX_WRAP_S_REPEAT; + break; + } + + return ret >> NV40TCL_TEX_WRAP_S_SHIFT; +} + +static void * +nv40_sampler_state_create(struct pipe_context *pipe, + const struct pipe_sampler_state *cso) +{ + struct nv40_sampler_state *ps; + uint32_t filter = 0; + + ps = MALLOC(sizeof(struct nv40_sampler_state)); + + ps->fmt = 0; + if (!cso->normalized_coords) + ps->fmt |= NV40TCL_TEX_FORMAT_RECT; + + ps->wrap = ((wrap_mode(cso->wrap_s) << NV40TCL_TEX_WRAP_S_SHIFT) | + (wrap_mode(cso->wrap_t) << NV40TCL_TEX_WRAP_T_SHIFT) | + (wrap_mode(cso->wrap_r) << NV40TCL_TEX_WRAP_R_SHIFT)); + + ps->en = 0; + if (cso->max_anisotropy >= 2.0) { + /* no idea, binary driver sets it, works without it.. meh.. 
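The LOD-related fields packed a little further down are fixed point in steps of 1/256. A few worked values, assuming the conversions as written:

        lod_bias = -0.5  ->  (int)(-0.5 * 256) = -128  ->  & 0x1fff = 0x1f80
        max_lod  =  4.0  ->  (int)( 4.0 * 256) = 1024  ->  << 7     = 0x20000
        min_lod  =  1.0  ->  (int)( 1.0 * 256) =  256  ->  << 19    = 0x8000000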
*/ + ps->wrap |= (1 << 5); + + if (cso->max_anisotropy >= 16.0) { + ps->en |= NV40TCL_TEX_ENABLE_ANISO_16X; + } else + if (cso->max_anisotropy >= 12.0) { + ps->en |= NV40TCL_TEX_ENABLE_ANISO_12X; + } else + if (cso->max_anisotropy >= 10.0) { + ps->en |= NV40TCL_TEX_ENABLE_ANISO_10X; + } else + if (cso->max_anisotropy >= 8.0) { + ps->en |= NV40TCL_TEX_ENABLE_ANISO_8X; + } else + if (cso->max_anisotropy >= 6.0) { + ps->en |= NV40TCL_TEX_ENABLE_ANISO_6X; + } else + if (cso->max_anisotropy >= 4.0) { + ps->en |= NV40TCL_TEX_ENABLE_ANISO_4X; + } else { + ps->en |= NV40TCL_TEX_ENABLE_ANISO_2X; + } + } + + switch (cso->mag_img_filter) { + case PIPE_TEX_FILTER_LINEAR: + filter |= NV40TCL_TEX_FILTER_MAG_LINEAR; + break; + case PIPE_TEX_FILTER_NEAREST: + default: + filter |= NV40TCL_TEX_FILTER_MAG_NEAREST; + break; + } + + switch (cso->min_img_filter) { + case PIPE_TEX_FILTER_LINEAR: + switch (cso->min_mip_filter) { + case PIPE_TEX_MIPFILTER_NEAREST: + filter |= NV40TCL_TEX_FILTER_MIN_LINEAR_MIPMAP_NEAREST; + break; + case PIPE_TEX_MIPFILTER_LINEAR: + filter |= NV40TCL_TEX_FILTER_MIN_LINEAR_MIPMAP_LINEAR; + break; + case PIPE_TEX_MIPFILTER_NONE: + default: + filter |= NV40TCL_TEX_FILTER_MIN_LINEAR; + break; + } + break; + case PIPE_TEX_FILTER_NEAREST: + default: + switch (cso->min_mip_filter) { + case PIPE_TEX_MIPFILTER_NEAREST: + filter |= NV40TCL_TEX_FILTER_MIN_NEAREST_MIPMAP_NEAREST; + break; + case PIPE_TEX_MIPFILTER_LINEAR: + filter |= NV40TCL_TEX_FILTER_MIN_NEAREST_MIPMAP_LINEAR; + break; + case PIPE_TEX_MIPFILTER_NONE: + default: + filter |= NV40TCL_TEX_FILTER_MIN_NEAREST; + break; + } + break; + } + + ps->filt = filter; + + { + float limit; + + limit = CLAMP(cso->lod_bias, -16.0, 15.0); + ps->filt |= (int)(cso->lod_bias * 256.0) & 0x1fff; + + limit = CLAMP(cso->max_lod, 0.0, 15.0); + ps->en |= (int)(limit * 256.0) << 7; + + limit = CLAMP(cso->min_lod, 0.0, 15.0); + ps->en |= (int)(limit * 256.0) << 19; + } + + + if (cso->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) { + switch (cso->compare_func) { + case PIPE_FUNC_NEVER: + ps->wrap |= NV40TCL_TEX_WRAP_RCOMP_NEVER; + break; + case PIPE_FUNC_GREATER: + ps->wrap |= NV40TCL_TEX_WRAP_RCOMP_GREATER; + break; + case PIPE_FUNC_EQUAL: + ps->wrap |= NV40TCL_TEX_WRAP_RCOMP_EQUAL; + break; + case PIPE_FUNC_GEQUAL: + ps->wrap |= NV40TCL_TEX_WRAP_RCOMP_GEQUAL; + break; + case PIPE_FUNC_LESS: + ps->wrap |= NV40TCL_TEX_WRAP_RCOMP_LESS; + break; + case PIPE_FUNC_NOTEQUAL: + ps->wrap |= NV40TCL_TEX_WRAP_RCOMP_NOTEQUAL; + break; + case PIPE_FUNC_LEQUAL: + ps->wrap |= NV40TCL_TEX_WRAP_RCOMP_LEQUAL; + break; + case PIPE_FUNC_ALWAYS: + ps->wrap |= NV40TCL_TEX_WRAP_RCOMP_ALWAYS; + break; + default: + break; + } + } + + ps->bcol = ((float_to_ubyte(cso->border_color[3]) << 24) | + (float_to_ubyte(cso->border_color[0]) << 16) | + (float_to_ubyte(cso->border_color[1]) << 8) | + (float_to_ubyte(cso->border_color[2]) << 0)); + + return (void *)ps; +} + +static void +nv40_sampler_state_bind(struct pipe_context *pipe, unsigned nr, void **sampler) +{ + struct nv40_context *nv40 = nv40_context(pipe); + unsigned unit; + + for (unit = 0; unit < nr; unit++) { + nv40->tex_sampler[unit] = sampler[unit]; + nv40->dirty_samplers |= (1 << unit); + } + + for (unit = nr; unit < nv40->nr_samplers; unit++) { + nv40->tex_sampler[unit] = NULL; + nv40->dirty_samplers |= (1 << unit); + } + + nv40->nr_samplers = nr; + nv40->dirty |= NV40_NEW_SAMPLER; +} + +static void +nv40_sampler_state_delete(struct pipe_context *pipe, void *hwcso) +{ + FREE(hwcso); +} + +static void 
+nv40_set_sampler_texture(struct pipe_context *pipe, unsigned nr, + struct pipe_texture **miptree) +{ + struct nv40_context *nv40 = nv40_context(pipe); + unsigned unit; + + for (unit = 0; unit < nr; unit++) { + pipe_texture_reference((struct pipe_texture **) + &nv40->tex_miptree[unit], miptree[unit]); + nv40->dirty_samplers |= (1 << unit); + } + + for (unit = nr; unit < nv40->nr_textures; unit++) { + pipe_texture_reference((struct pipe_texture **) + &nv40->tex_miptree[unit], NULL); + nv40->dirty_samplers |= (1 << unit); + } + + nv40->nr_textures = nr; + nv40->dirty |= NV40_NEW_SAMPLER; +} + +static void * +nv40_rasterizer_state_create(struct pipe_context *pipe, + const struct pipe_rasterizer_state *cso) +{ + struct nv40_context *nv40 = nv40_context(pipe); + struct nv40_rasterizer_state *rsso = CALLOC(1, sizeof(*rsso)); + struct nouveau_stateobj *so = so_new(32, 0); + struct nouveau_grobj *curie = nv40->screen->curie; + + /*XXX: ignored: + * light_twoside + * point_smooth -nohw + * multisample + */ + + so_method(so, curie, NV40TCL_SHADE_MODEL, 1); + so_data (so, cso->flatshade ? NV40TCL_SHADE_MODEL_FLAT : + NV40TCL_SHADE_MODEL_SMOOTH); + + so_method(so, curie, NV40TCL_LINE_WIDTH, 2); + so_data (so, (unsigned char)(cso->line_width * 8.0) & 0xff); + so_data (so, cso->line_smooth ? 1 : 0); + so_method(so, curie, NV40TCL_LINE_STIPPLE_ENABLE, 2); + so_data (so, cso->line_stipple_enable ? 1 : 0); + so_data (so, (cso->line_stipple_pattern << 16) | + cso->line_stipple_factor); + + so_method(so, curie, NV40TCL_POINT_SIZE, 1); + so_data (so, fui(cso->point_size)); + + so_method(so, curie, NV40TCL_POLYGON_MODE_FRONT, 6); + if (cso->front_winding == PIPE_WINDING_CCW) { + so_data(so, nvgl_polygon_mode(cso->fill_ccw)); + so_data(so, nvgl_polygon_mode(cso->fill_cw)); + switch (cso->cull_mode) { + case PIPE_WINDING_CCW: + so_data(so, NV40TCL_CULL_FACE_FRONT); + break; + case PIPE_WINDING_CW: + so_data(so, NV40TCL_CULL_FACE_BACK); + break; + case PIPE_WINDING_BOTH: + so_data(so, NV40TCL_CULL_FACE_FRONT_AND_BACK); + break; + default: + so_data(so, NV40TCL_CULL_FACE_BACK); + break; + } + so_data(so, NV40TCL_FRONT_FACE_CCW); + } else { + so_data(so, nvgl_polygon_mode(cso->fill_cw)); + so_data(so, nvgl_polygon_mode(cso->fill_ccw)); + switch (cso->cull_mode) { + case PIPE_WINDING_CCW: + so_data(so, NV40TCL_CULL_FACE_BACK); + break; + case PIPE_WINDING_CW: + so_data(so, NV40TCL_CULL_FACE_FRONT); + break; + case PIPE_WINDING_BOTH: + so_data(so, NV40TCL_CULL_FACE_FRONT_AND_BACK); + break; + default: + so_data(so, NV40TCL_CULL_FACE_BACK); + break; + } + so_data(so, NV40TCL_FRONT_FACE_CW); + } + so_data(so, cso->poly_smooth ? 1 : 0); + so_data(so, (cso->cull_mode != PIPE_WINDING_NONE) ? 1 : 0); + + so_method(so, curie, NV40TCL_POLYGON_STIPPLE_ENABLE, 1); + so_data (so, cso->poly_stipple_enable ? 
1 : 0); + + so_method(so, curie, NV40TCL_POLYGON_OFFSET_POINT_ENABLE, 3); + if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_POINT) || + (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_POINT)) + so_data(so, 1); + else + so_data(so, 0); + if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_LINE) || + (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_LINE)) + so_data(so, 1); + else + so_data(so, 0); + if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_FILL) || + (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_FILL)) + so_data(so, 1); + else + so_data(so, 0); + if (cso->offset_cw || cso->offset_ccw) { + so_method(so, curie, NV40TCL_POLYGON_OFFSET_FACTOR, 2); + so_data (so, fui(cso->offset_scale)); + so_data (so, fui(cso->offset_units * 2)); + } + + so_method(so, curie, NV40TCL_POINT_SPRITE, 1); + if (cso->point_sprite) { + unsigned psctl = (1 << 0), i; + + for (i = 0; i < 8; i++) { + if (cso->sprite_coord_mode[i] != PIPE_SPRITE_COORD_NONE) + psctl |= (1 << (8 + i)); + } + + so_data(so, psctl); + } else { + so_data(so, 0); + } + + so_ref(so, &rsso->so); + rsso->pipe = *cso; + return (void *)rsso; +} + +static void +nv40_rasterizer_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nv40_context *nv40 = nv40_context(pipe); + + nv40->rasterizer = hwcso; + nv40->dirty |= NV40_NEW_RAST; + nv40->draw_dirty |= NV40_NEW_RAST; +} + +static void +nv40_rasterizer_state_delete(struct pipe_context *pipe, void *hwcso) +{ + struct nv40_rasterizer_state *rsso = hwcso; + + so_ref(NULL, &rsso->so); + FREE(rsso); +} + +static void * +nv40_depth_stencil_alpha_state_create(struct pipe_context *pipe, + const struct pipe_depth_stencil_alpha_state *cso) +{ + struct nv40_context *nv40 = nv40_context(pipe); + struct nv40_zsa_state *zsaso = CALLOC(1, sizeof(*zsaso)); + struct nouveau_stateobj *so = so_new(32, 0); + struct nouveau_grobj *curie = nv40->screen->curie; + + so_method(so, curie, NV40TCL_DEPTH_FUNC, 3); + so_data (so, nvgl_comparison_op(cso->depth.func)); + so_data (so, cso->depth.writemask ? 1 : 0); + so_data (so, cso->depth.enabled ? 1 : 0); + + so_method(so, curie, NV40TCL_ALPHA_TEST_ENABLE, 3); + so_data (so, cso->alpha.enabled ? 1 : 0); + so_data (so, nvgl_comparison_op(cso->alpha.func)); + so_data (so, float_to_ubyte(cso->alpha.ref_value)); + + if (cso->stencil[0].enabled) { + so_method(so, curie, NV40TCL_STENCIL_FRONT_ENABLE, 8); + so_data (so, cso->stencil[0].enabled ? 1 : 0); + so_data (so, cso->stencil[0].writemask); + so_data (so, nvgl_comparison_op(cso->stencil[0].func)); + so_data (so, cso->stencil[0].ref_value); + so_data (so, cso->stencil[0].valuemask); + so_data (so, nvgl_stencil_op(cso->stencil[0].fail_op)); + so_data (so, nvgl_stencil_op(cso->stencil[0].zfail_op)); + so_data (so, nvgl_stencil_op(cso->stencil[0].zpass_op)); + } else { + so_method(so, curie, NV40TCL_STENCIL_FRONT_ENABLE, 1); + so_data (so, 0); + } + + if (cso->stencil[1].enabled) { + so_method(so, curie, NV40TCL_STENCIL_BACK_ENABLE, 8); + so_data (so, cso->stencil[1].enabled ? 
1 : 0); + so_data (so, cso->stencil[1].writemask); + so_data (so, nvgl_comparison_op(cso->stencil[1].func)); + so_data (so, cso->stencil[1].ref_value); + so_data (so, cso->stencil[1].valuemask); + so_data (so, nvgl_stencil_op(cso->stencil[1].fail_op)); + so_data (so, nvgl_stencil_op(cso->stencil[1].zfail_op)); + so_data (so, nvgl_stencil_op(cso->stencil[1].zpass_op)); + } else { + so_method(so, curie, NV40TCL_STENCIL_BACK_ENABLE, 1); + so_data (so, 0); + } + + so_ref(so, &zsaso->so); + zsaso->pipe = *cso; + return (void *)zsaso; +} + +static void +nv40_depth_stencil_alpha_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nv40_context *nv40 = nv40_context(pipe); + + nv40->zsa = hwcso; + nv40->dirty |= NV40_NEW_ZSA; +} + +static void +nv40_depth_stencil_alpha_state_delete(struct pipe_context *pipe, void *hwcso) +{ + struct nv40_zsa_state *zsaso = hwcso; + + so_ref(NULL, &zsaso->so); + FREE(zsaso); +} + +static void * +nv40_vp_state_create(struct pipe_context *pipe, + const struct pipe_shader_state *cso) +{ + struct nv40_context *nv40 = nv40_context(pipe); + struct nv40_vertex_program *vp; + + vp = CALLOC(1, sizeof(struct nv40_vertex_program)); + vp->pipe.tokens = tgsi_dup_tokens(cso->tokens); + vp->draw = draw_create_vertex_shader(nv40->draw, &vp->pipe); + + return (void *)vp; +} + +static void +nv40_vp_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nv40_context *nv40 = nv40_context(pipe); + + nv40->vertprog = hwcso; + nv40->dirty |= NV40_NEW_VERTPROG; + nv40->draw_dirty |= NV40_NEW_VERTPROG; +} + +static void +nv40_vp_state_delete(struct pipe_context *pipe, void *hwcso) +{ + struct nv40_context *nv40 = nv40_context(pipe); + struct nv40_vertex_program *vp = hwcso; + + draw_delete_vertex_shader(nv40->draw, vp->draw); + nv40_vertprog_destroy(nv40, vp); + FREE((void*)vp->pipe.tokens); + FREE(vp); +} + +static void * +nv40_fp_state_create(struct pipe_context *pipe, + const struct pipe_shader_state *cso) +{ + struct nv40_fragment_program *fp; + + fp = CALLOC(1, sizeof(struct nv40_fragment_program)); + fp->pipe.tokens = tgsi_dup_tokens(cso->tokens); + + tgsi_scan_shader(fp->pipe.tokens, &fp->info); + + return (void *)fp; +} + +static void +nv40_fp_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nv40_context *nv40 = nv40_context(pipe); + + nv40->fragprog = hwcso; + nv40->dirty |= NV40_NEW_FRAGPROG; +} + +static void +nv40_fp_state_delete(struct pipe_context *pipe, void *hwcso) +{ + struct nv40_context *nv40 = nv40_context(pipe); + struct nv40_fragment_program *fp = hwcso; + + nv40_fragprog_destroy(nv40, fp); + FREE((void*)fp->pipe.tokens); + FREE(fp); +} + +static void +nv40_set_blend_color(struct pipe_context *pipe, + const struct pipe_blend_color *bcol) +{ + struct nv40_context *nv40 = nv40_context(pipe); + + nv40->blend_colour = *bcol; + nv40->dirty |= NV40_NEW_BCOL; +} + +static void +nv40_set_clip_state(struct pipe_context *pipe, + const struct pipe_clip_state *clip) +{ + struct nv40_context *nv40 = nv40_context(pipe); + + nv40->clip = *clip; + nv40->dirty |= NV40_NEW_UCP; + nv40->draw_dirty |= NV40_NEW_UCP; +} + +static void +nv40_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index, + const struct pipe_constant_buffer *buf ) +{ + struct nv40_context *nv40 = nv40_context(pipe); + + nv40->constbuf[shader] = buf->buffer; + nv40->constbuf_nr[shader] = buf->buffer->size / (4 * sizeof(float)); + + if (shader == PIPE_SHADER_VERTEX) { + nv40->dirty |= NV40_NEW_VERTPROG; + } else + if (shader == PIPE_SHADER_FRAGMENT) { + nv40->dirty |= 
NV40_NEW_FRAGPROG; + } +} + +static void +nv40_set_framebuffer_state(struct pipe_context *pipe, + const struct pipe_framebuffer_state *fb) +{ + struct nv40_context *nv40 = nv40_context(pipe); + + nv40->framebuffer = *fb; + nv40->dirty |= NV40_NEW_FB; +} + +static void +nv40_set_polygon_stipple(struct pipe_context *pipe, + const struct pipe_poly_stipple *stipple) +{ + struct nv40_context *nv40 = nv40_context(pipe); + + memcpy(nv40->stipple, stipple->stipple, 4 * 32); + nv40->dirty |= NV40_NEW_STIPPLE; +} + +static void +nv40_set_scissor_state(struct pipe_context *pipe, + const struct pipe_scissor_state *s) +{ + struct nv40_context *nv40 = nv40_context(pipe); + + nv40->scissor = *s; + nv40->dirty |= NV40_NEW_SCISSOR; +} + +static void +nv40_set_viewport_state(struct pipe_context *pipe, + const struct pipe_viewport_state *vpt) +{ + struct nv40_context *nv40 = nv40_context(pipe); + + nv40->viewport = *vpt; + nv40->dirty |= NV40_NEW_VIEWPORT; + nv40->draw_dirty |= NV40_NEW_VIEWPORT; +} + +static void +nv40_set_vertex_buffers(struct pipe_context *pipe, unsigned count, + const struct pipe_vertex_buffer *vb) +{ + struct nv40_context *nv40 = nv40_context(pipe); + + memcpy(nv40->vtxbuf, vb, sizeof(*vb) * count); + nv40->vtxbuf_nr = count; + + nv40->dirty |= NV40_NEW_ARRAYS; + nv40->draw_dirty |= NV40_NEW_ARRAYS; +} + +static void +nv40_set_vertex_elements(struct pipe_context *pipe, unsigned count, + const struct pipe_vertex_element *ve) +{ + struct nv40_context *nv40 = nv40_context(pipe); + + memcpy(nv40->vtxelt, ve, sizeof(*ve) * count); + nv40->vtxelt_nr = count; + + nv40->dirty |= NV40_NEW_ARRAYS; + nv40->draw_dirty |= NV40_NEW_ARRAYS; +} + +static void +nv40_set_edgeflags(struct pipe_context *pipe, const unsigned *bitfield) +{ + struct nv40_context *nv40 = nv40_context(pipe); + + nv40->edgeflags = bitfield; + nv40->dirty |= NV40_NEW_ARRAYS; + nv40->draw_dirty |= NV40_NEW_ARRAYS; +} + +void +nv40_init_state_functions(struct nv40_context *nv40) +{ + nv40->pipe.create_blend_state = nv40_blend_state_create; + nv40->pipe.bind_blend_state = nv40_blend_state_bind; + nv40->pipe.delete_blend_state = nv40_blend_state_delete; + + nv40->pipe.create_sampler_state = nv40_sampler_state_create; + nv40->pipe.bind_sampler_states = nv40_sampler_state_bind; + nv40->pipe.delete_sampler_state = nv40_sampler_state_delete; + nv40->pipe.set_sampler_textures = nv40_set_sampler_texture; + + nv40->pipe.create_rasterizer_state = nv40_rasterizer_state_create; + nv40->pipe.bind_rasterizer_state = nv40_rasterizer_state_bind; + nv40->pipe.delete_rasterizer_state = nv40_rasterizer_state_delete; + + nv40->pipe.create_depth_stencil_alpha_state = + nv40_depth_stencil_alpha_state_create; + nv40->pipe.bind_depth_stencil_alpha_state = + nv40_depth_stencil_alpha_state_bind; + nv40->pipe.delete_depth_stencil_alpha_state = + nv40_depth_stencil_alpha_state_delete; + + nv40->pipe.create_vs_state = nv40_vp_state_create; + nv40->pipe.bind_vs_state = nv40_vp_state_bind; + nv40->pipe.delete_vs_state = nv40_vp_state_delete; + + nv40->pipe.create_fs_state = nv40_fp_state_create; + nv40->pipe.bind_fs_state = nv40_fp_state_bind; + nv40->pipe.delete_fs_state = nv40_fp_state_delete; + + nv40->pipe.set_blend_color = nv40_set_blend_color; + nv40->pipe.set_clip_state = nv40_set_clip_state; + nv40->pipe.set_constant_buffer = nv40_set_constant_buffer; + nv40->pipe.set_framebuffer_state = nv40_set_framebuffer_state; + nv40->pipe.set_polygon_stipple = nv40_set_polygon_stipple; + nv40->pipe.set_scissor_state = nv40_set_scissor_state; + 
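The entry points wired up in nv40_init_state_functions() follow the Gallium CSO pattern: immutable state objects are created once, bound cheaply per draw and deleted at teardown, while the plain set_*() calls copy small parameter blocks into the context. A minimal caller-side sketch for the blend hooks (not part of this patch; headers and error handling omitted):

        struct pipe_blend_state templ;
        void *cso;

        memset(&templ, 0, sizeof(templ));
        templ.colormask = PIPE_MASK_R | PIPE_MASK_G | PIPE_MASK_B | PIPE_MASK_A;

        cso = pipe->create_blend_state(pipe, &templ);
        pipe->bind_blend_state(pipe, cso);
        /* ... draw calls ... */
        pipe->delete_blend_state(pipe, cso);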
nv40->pipe.set_viewport_state = nv40_set_viewport_state; + + nv40->pipe.set_edgeflags = nv40_set_edgeflags; + nv40->pipe.set_vertex_buffers = nv40_set_vertex_buffers; + nv40->pipe.set_vertex_elements = nv40_set_vertex_elements; +} + diff --git a/src/gallium/drivers/nv40/nv40_state.h b/src/gallium/drivers/nv40/nv40_state.h new file mode 100644 index 0000000000..9c55903ae3 --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_state.h @@ -0,0 +1,91 @@ +#ifndef __NV40_STATE_H__ +#define __NV40_STATE_H__ + +#include "pipe/p_state.h" +#include "tgsi/tgsi_scan.h" + +struct nv40_sampler_state { + uint32_t fmt; + uint32_t wrap; + uint32_t en; + uint32_t filt; + uint32_t bcol; +}; + +struct nv40_vertex_program_exec { + uint32_t data[4]; + boolean has_branch_offset; + int const_index; +}; + +struct nv40_vertex_program_data { + int index; /* immediates == -1 */ + float value[4]; +}; + +struct nv40_vertex_program { + struct pipe_shader_state pipe; + + struct draw_vertex_shader *draw; + + boolean translated; + + struct pipe_clip_state ucp; + + struct nv40_vertex_program_exec *insns; + unsigned nr_insns; + struct nv40_vertex_program_data *consts; + unsigned nr_consts; + + struct nouveau_resource *exec; + unsigned exec_start; + struct nouveau_resource *data; + unsigned data_start; + unsigned data_start_min; + + uint32_t ir; + uint32_t or; + uint32_t clip_ctrl; + struct nouveau_stateobj *so; +}; + +struct nv40_fragment_program_data { + unsigned offset; + unsigned index; +}; + +struct nv40_fragment_program { + struct pipe_shader_state pipe; + struct tgsi_shader_info info; + + boolean translated; + unsigned samplers; + + uint32_t *insn; + int insn_len; + + struct nv40_fragment_program_data *consts; + unsigned nr_consts; + + struct pipe_buffer *buffer; + + uint32_t fp_control; + struct nouveau_stateobj *so; +}; + +struct nv40_miptree { + struct pipe_texture base; + + struct pipe_buffer *buffer; + uint total_size; + + struct pipe_texture *shadow_tex; + struct pipe_surface *shadow_surface; + + struct { + uint pitch; + uint *image_offset; + } level[PIPE_MAX_TEXTURE_LEVELS]; +}; + +#endif diff --git a/src/gallium/drivers/nv40/nv40_state_blend.c b/src/gallium/drivers/nv40/nv40_state_blend.c new file mode 100644 index 0000000000..95e6d7394f --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_state_blend.c @@ -0,0 +1,40 @@ +#include "nv40_context.h" + +static boolean +nv40_state_blend_validate(struct nv40_context *nv40) +{ + so_ref(nv40->blend->so, &nv40->state.hw[NV40_STATE_BLEND]); + return TRUE; +} + +struct nv40_state_entry nv40_state_blend = { + .validate = nv40_state_blend_validate, + .dirty = { + .pipe = NV40_NEW_BLEND, + .hw = NV40_STATE_BLEND + } +}; + +static boolean +nv40_state_blend_colour_validate(struct nv40_context *nv40) +{ + struct nouveau_stateobj *so = so_new(2, 0); + struct pipe_blend_color *bcol = &nv40->blend_colour; + + so_method(so, nv40->screen->curie, NV40TCL_BLEND_COLOR, 1); + so_data (so, ((float_to_ubyte(bcol->color[3]) << 24) | + (float_to_ubyte(bcol->color[0]) << 16) | + (float_to_ubyte(bcol->color[1]) << 8) | + (float_to_ubyte(bcol->color[2]) << 0))); + + so_ref(so, &nv40->state.hw[NV40_STATE_BCOL]); + return TRUE; +} + +struct nv40_state_entry nv40_state_blend_colour = { + .validate = nv40_state_blend_colour_validate, + .dirty = { + .pipe = NV40_NEW_BCOL, + .hw = NV40_STATE_BCOL + } +}; diff --git a/src/gallium/drivers/nv40/nv40_state_emit.c b/src/gallium/drivers/nv40/nv40_state_emit.c new file mode 100644 index 0000000000..ce859def10 --- /dev/null +++ 
b/src/gallium/drivers/nv40/nv40_state_emit.c @@ -0,0 +1,184 @@ +#include "nv40_context.h" +#include "nv40_state.h" +#include "draw/draw_context.h" + +static struct nv40_state_entry *render_states[] = { + &nv40_state_framebuffer, + &nv40_state_rasterizer, + &nv40_state_scissor, + &nv40_state_stipple, + &nv40_state_fragprog, + &nv40_state_fragtex, + &nv40_state_vertprog, + &nv40_state_blend, + &nv40_state_blend_colour, + &nv40_state_zsa, + &nv40_state_viewport, + &nv40_state_vbo, + NULL +}; + +static struct nv40_state_entry *swtnl_states[] = { + &nv40_state_framebuffer, + &nv40_state_rasterizer, + &nv40_state_scissor, + &nv40_state_stipple, + &nv40_state_fragprog, + &nv40_state_fragtex, + &nv40_state_vertprog, + &nv40_state_blend, + &nv40_state_blend_colour, + &nv40_state_zsa, + &nv40_state_viewport, + &nv40_state_vtxfmt, + NULL +}; + +static void +nv40_state_do_validate(struct nv40_context *nv40, + struct nv40_state_entry **states) +{ + const struct pipe_framebuffer_state *fb = &nv40->framebuffer; + unsigned i; + + for (i = 0; i < fb->nr_cbufs; i++) + fb->cbufs[i]->status = PIPE_SURFACE_STATUS_DEFINED; + if (fb->zsbuf) + fb->zsbuf->status = PIPE_SURFACE_STATUS_DEFINED; + + while (*states) { + struct nv40_state_entry *e = *states; + + if (nv40->dirty & e->dirty.pipe) { + if (e->validate(nv40)) + nv40->state.dirty |= (1ULL << e->dirty.hw); + } + + states++; + } + nv40->dirty = 0; +} + +void +nv40_state_emit(struct nv40_context *nv40) +{ + struct nv40_state *state = &nv40->state; + struct nv40_screen *screen = nv40->screen; + unsigned i, samplers; + uint64_t states; + + if (nv40->pctx_id != screen->cur_pctx) { + for (i = 0; i < NV40_STATE_MAX; i++) { + if (state->hw[i] && screen->state[i] != state->hw[i]) + state->dirty |= (1ULL << i); + } + + screen->cur_pctx = nv40->pctx_id; + } + + for (i = 0, states = state->dirty; states; i++) { + if (!(states & (1ULL << i))) + continue; + so_ref (state->hw[i], &nv40->screen->state[i]); + if (state->hw[i]) + so_emit(nv40->nvws, nv40->screen->state[i]); + states &= ~(1ULL << i); + } + + if (state->dirty & ((1ULL << NV40_STATE_FRAGPROG) | + (1ULL << NV40_STATE_FRAGTEX0))) { + BEGIN_RING(curie, NV40TCL_TEX_CACHE_CTL, 1); + OUT_RING (2); + BEGIN_RING(curie, NV40TCL_TEX_CACHE_CTL, 1); + OUT_RING (1); + } + + state->dirty = 0; + + so_emit_reloc_markers(nv40->nvws, state->hw[NV40_STATE_FB]); + for (i = 0, samplers = state->fp_samplers; i < 16 && samplers; i++) { + if (!(samplers & (1 << i))) + continue; + so_emit_reloc_markers(nv40->nvws, + state->hw[NV40_STATE_FRAGTEX0+i]); + samplers &= ~(1ULL << i); + } + so_emit_reloc_markers(nv40->nvws, state->hw[NV40_STATE_FRAGPROG]); + if (state->hw[NV40_STATE_VTXBUF] && nv40->render_mode == HW) + so_emit_reloc_markers(nv40->nvws, state->hw[NV40_STATE_VTXBUF]); +} + +boolean +nv40_state_validate(struct nv40_context *nv40) +{ + boolean was_sw = nv40->fallback_swtnl ? TRUE : FALSE; + + if (nv40->render_mode != HW) { + /* Don't even bother trying to go back to hw if none + * of the states that caused swtnl previously have changed. 
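The core of nv40_state_emit.c is table-driven: nv40_state_do_validate() walks a NULL-terminated array of state entries, calls an entry's validate hook only when one of its pipe-level dirty bits is set, and on success flags the entry's hardware slot in a 64-bit mask that nv40_state_emit() later walks to emit the cached state objects. A stripped-down sketch of that loop, with placeholder types standing in for the driver's context and state-entry structs:

```c
#include <stdint.h>
#include <stdbool.h>

struct ex_ctx {
	unsigned pipe_dirty;  /* which pipe-level states changed            */
	uint64_t hw_dirty;    /* which hardware state objects need emitting */
};

struct ex_state_entry {
	bool   (*validate)(struct ex_ctx *ctx);
	unsigned pipe_bits;   /* pipe-level triggers, e.g. an NV40_NEW_* mask */
	unsigned hw_slot;     /* hardware slot index, e.g. an NV40_STATE_* id */
};

static void
ex_do_validate(struct ex_ctx *ctx, struct ex_state_entry **states)
{
	while (*states) {
		struct ex_state_entry *e = *states++;

		if (!(ctx->pipe_dirty & e->pipe_bits))
			continue;
		if (e->validate(ctx))
			ctx->hw_dirty |= 1ULL << e->hw_slot;
	}
	ctx->pipe_dirty = 0;
}
```

Keeping pipe-level and hardware-level dirty bits separate is what lets several pipe states (scissor and rasterizer, for example) funnel into a single hardware slot.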
+ */ + if ((nv40->fallback_swtnl & nv40->dirty) + != nv40->fallback_swtnl) + return FALSE; + + /* Attempt to go to hwtnl again */ + nv40->pipe.flush(&nv40->pipe, 0, NULL); + nv40->dirty |= (NV40_NEW_VIEWPORT | + NV40_NEW_VERTPROG | + NV40_NEW_ARRAYS); + nv40->render_mode = HW; + } + + nv40_state_do_validate(nv40, render_states); + if (nv40->fallback_swtnl || nv40->fallback_swrast) + return FALSE; + + if (was_sw) + NOUVEAU_ERR("swtnl->hw\n"); + + return TRUE; +} + +boolean +nv40_state_validate_swtnl(struct nv40_context *nv40) +{ + struct draw_context *draw = nv40->draw; + + /* Setup for swtnl */ + if (nv40->render_mode == HW) { + NOUVEAU_ERR("hw->swtnl 0x%08x\n", nv40->fallback_swtnl); + nv40->pipe.flush(&nv40->pipe, 0, NULL); + nv40->dirty |= (NV40_NEW_VIEWPORT | + NV40_NEW_VERTPROG | + NV40_NEW_ARRAYS); + nv40->render_mode = SWTNL; + } + + if (nv40->draw_dirty & NV40_NEW_VERTPROG) + draw_bind_vertex_shader(draw, nv40->vertprog->draw); + + if (nv40->draw_dirty & NV40_NEW_RAST) + draw_set_rasterizer_state(draw, &nv40->rasterizer->pipe); + + if (nv40->draw_dirty & NV40_NEW_UCP) + draw_set_clip_state(draw, &nv40->clip); + + if (nv40->draw_dirty & NV40_NEW_VIEWPORT) + draw_set_viewport_state(draw, &nv40->viewport); + + if (nv40->draw_dirty & NV40_NEW_ARRAYS) { + draw_set_edgeflags(draw, nv40->edgeflags); + draw_set_vertex_buffers(draw, nv40->vtxbuf_nr, nv40->vtxbuf); + draw_set_vertex_elements(draw, nv40->vtxelt_nr, nv40->vtxelt); + } + + nv40_state_do_validate(nv40, swtnl_states); + if (nv40->fallback_swrast) { + NOUVEAU_ERR("swtnl->swrast 0x%08x\n", nv40->fallback_swrast); + return FALSE; + } + + nv40->draw_dirty = 0; + return TRUE; +} + diff --git a/src/gallium/drivers/nv40/nv40_state_fb.c b/src/gallium/drivers/nv40/nv40_state_fb.c new file mode 100644 index 0000000000..454abad31f --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_state_fb.c @@ -0,0 +1,162 @@ +#include "nv40_context.h" +#include "nouveau/nouveau_util.h" + +static struct pipe_buffer * +nv40_surface_buffer(struct pipe_surface *surface) +{ + struct nv40_miptree *mt = (struct nv40_miptree *)surface->texture; + return mt->buffer; +} + +static boolean +nv40_state_framebuffer_validate(struct nv40_context *nv40) +{ + struct pipe_framebuffer_state *fb = &nv40->framebuffer; + struct pipe_surface *rt[4], *zeta; + uint32_t rt_enable, rt_format; + int i, colour_format = 0, zeta_format = 0; + struct nouveau_stateobj *so = so_new(64, 10); + unsigned rt_flags = NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM; + unsigned w = fb->width; + unsigned h = fb->height; + + rt_enable = 0; + for (i = 0; i < fb->nr_cbufs; i++) { + if (colour_format) { + assert(colour_format == fb->cbufs[i]->format); + } else { + colour_format = fb->cbufs[i]->format; + rt_enable |= (NV40TCL_RT_ENABLE_COLOR0 << i); + rt[i] = fb->cbufs[i]; + } + } + + if (rt_enable & (NV40TCL_RT_ENABLE_COLOR1 | NV40TCL_RT_ENABLE_COLOR2 | + NV40TCL_RT_ENABLE_COLOR3)) + rt_enable |= NV40TCL_RT_ENABLE_MRT; + + if (fb->zsbuf) { + zeta_format = fb->zsbuf->format; + zeta = fb->zsbuf; + } + + if (!(rt[0]->texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) { + assert(!(fb->width & (fb->width - 1)) && !(fb->height & (fb->height - 1))); + for (i = 1; i < fb->nr_cbufs; i++) + assert(!(rt[i]->texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)); + + rt_format = NV40TCL_RT_FORMAT_TYPE_SWIZZLED | + log2i(fb->width) << NV40TCL_RT_FORMAT_LOG2_WIDTH_SHIFT | + log2i(fb->height) << NV40TCL_RT_FORMAT_LOG2_HEIGHT_SHIFT; + } + else + rt_format = NV40TCL_RT_FORMAT_TYPE_LINEAR; + + switch (colour_format) { + case 
PIPE_FORMAT_A8R8G8B8_UNORM: + case 0: + rt_format |= NV40TCL_RT_FORMAT_COLOR_A8R8G8B8; + break; + case PIPE_FORMAT_R5G6B5_UNORM: + rt_format |= NV40TCL_RT_FORMAT_COLOR_R5G6B5; + break; + default: + assert(0); + } + + switch (zeta_format) { + case PIPE_FORMAT_Z16_UNORM: + rt_format |= NV40TCL_RT_FORMAT_ZETA_Z16; + break; + case PIPE_FORMAT_Z24S8_UNORM: + case 0: + rt_format |= NV40TCL_RT_FORMAT_ZETA_Z24S8; + break; + default: + assert(0); + } + + if (rt_enable & NV40TCL_RT_ENABLE_COLOR0) { + so_method(so, nv40->screen->curie, NV40TCL_DMA_COLOR0, 1); + so_reloc (so, nv40_surface_buffer(rt[0]), 0, rt_flags | NOUVEAU_BO_OR, + nv40->nvws->channel->vram->handle, + nv40->nvws->channel->gart->handle); + so_method(so, nv40->screen->curie, NV40TCL_COLOR0_PITCH, 2); + so_data (so, rt[0]->stride); + so_reloc (so, nv40_surface_buffer(rt[0]), rt[0]->offset, rt_flags | + NOUVEAU_BO_LOW, 0, 0); + } + + if (rt_enable & NV40TCL_RT_ENABLE_COLOR1) { + so_method(so, nv40->screen->curie, NV40TCL_DMA_COLOR1, 1); + so_reloc (so, nv40_surface_buffer(rt[1]), 0, rt_flags | NOUVEAU_BO_OR, + nv40->nvws->channel->vram->handle, + nv40->nvws->channel->gart->handle); + so_method(so, nv40->screen->curie, NV40TCL_COLOR1_OFFSET, 2); + so_reloc (so, nv40_surface_buffer(rt[1]), rt[1]->offset, rt_flags | + NOUVEAU_BO_LOW, 0, 0); + so_data (so, rt[1]->stride); + } + + if (rt_enable & NV40TCL_RT_ENABLE_COLOR2) { + so_method(so, nv40->screen->curie, NV40TCL_DMA_COLOR2, 1); + so_reloc (so, nv40_surface_buffer(rt[2]), 0, rt_flags | NOUVEAU_BO_OR, + nv40->nvws->channel->vram->handle, + nv40->nvws->channel->gart->handle); + so_method(so, nv40->screen->curie, NV40TCL_COLOR2_OFFSET, 1); + so_reloc (so, nv40_surface_buffer(rt[2]), rt[2]->offset, rt_flags | + NOUVEAU_BO_LOW, 0, 0); + so_method(so, nv40->screen->curie, NV40TCL_COLOR2_PITCH, 1); + so_data (so, rt[2]->stride); + } + + if (rt_enable & NV40TCL_RT_ENABLE_COLOR3) { + so_method(so, nv40->screen->curie, NV40TCL_DMA_COLOR3, 1); + so_reloc (so, nv40_surface_buffer(rt[3]), 0, rt_flags | NOUVEAU_BO_OR, + nv40->nvws->channel->vram->handle, + nv40->nvws->channel->gart->handle); + so_method(so, nv40->screen->curie, NV40TCL_COLOR3_OFFSET, 1); + so_reloc (so, nv40_surface_buffer(rt[3]), rt[3]->offset, rt_flags | + NOUVEAU_BO_LOW, 0, 0); + so_method(so, nv40->screen->curie, NV40TCL_COLOR3_PITCH, 1); + so_data (so, rt[3]->stride); + } + + if (zeta_format) { + so_method(so, nv40->screen->curie, NV40TCL_DMA_ZETA, 1); + so_reloc (so, nv40_surface_buffer(zeta), 0, rt_flags | NOUVEAU_BO_OR, + nv40->nvws->channel->vram->handle, + nv40->nvws->channel->gart->handle); + so_method(so, nv40->screen->curie, NV40TCL_ZETA_OFFSET, 1); + so_reloc (so, nv40_surface_buffer(zeta), zeta->offset, rt_flags | + NOUVEAU_BO_LOW, 0, 0); + so_method(so, nv40->screen->curie, NV40TCL_ZETA_PITCH, 1); + so_data (so, zeta->stride); + } + + so_method(so, nv40->screen->curie, NV40TCL_RT_ENABLE, 1); + so_data (so, rt_enable); + so_method(so, nv40->screen->curie, NV40TCL_RT_HORIZ, 3); + so_data (so, (w << 16) | 0); + so_data (so, (h << 16) | 0); + so_data (so, rt_format); + so_method(so, nv40->screen->curie, NV40TCL_VIEWPORT_HORIZ, 2); + so_data (so, (w << 16) | 0); + so_data (so, (h << 16) | 0); + so_method(so, nv40->screen->curie, NV40TCL_VIEWPORT_CLIP_HORIZ(0), 2); + so_data (so, ((w - 1) << 16) | 0); + so_data (so, ((h - 1) << 16) | 0); + so_method(so, nv40->screen->curie, 0x1d88, 1); + so_data (so, (1 << 12) | h); + + so_ref(so, &nv40->state.hw[NV40_STATE_FB]); + return TRUE; +} + +struct nv40_state_entry 
nv40_state_framebuffer = { + .validate = nv40_state_framebuffer_validate, + .dirty = { + .pipe = NV40_NEW_FB, + .hw = NV40_STATE_FB + } +}; diff --git a/src/gallium/drivers/nv40/nv40_state_rasterizer.c b/src/gallium/drivers/nv40/nv40_state_rasterizer.c new file mode 100644 index 0000000000..9ecda5990f --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_state_rasterizer.c @@ -0,0 +1,17 @@ +#include "nv40_context.h" + +static boolean +nv40_state_rasterizer_validate(struct nv40_context *nv40) +{ + so_ref(nv40->rasterizer->so, + &nv40->state.hw[NV40_STATE_RAST]); + return TRUE; +} + +struct nv40_state_entry nv40_state_rasterizer = { + .validate = nv40_state_rasterizer_validate, + .dirty = { + .pipe = NV40_NEW_RAST, + .hw = NV40_STATE_RAST + } +}; diff --git a/src/gallium/drivers/nv40/nv40_state_scissor.c b/src/gallium/drivers/nv40/nv40_state_scissor.c new file mode 100644 index 0000000000..285239ef41 --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_state_scissor.c @@ -0,0 +1,35 @@ +#include "nv40_context.h" + +static boolean +nv40_state_scissor_validate(struct nv40_context *nv40) +{ + struct pipe_rasterizer_state *rast = &nv40->rasterizer->pipe; + struct pipe_scissor_state *s = &nv40->scissor; + struct nouveau_stateobj *so; + + if (nv40->state.hw[NV40_STATE_SCISSOR] && + (rast->scissor == 0 && nv40->state.scissor_enabled == 0)) + return FALSE; + nv40->state.scissor_enabled = rast->scissor; + + so = so_new(3, 0); + so_method(so, nv40->screen->curie, NV40TCL_SCISSOR_HORIZ, 2); + if (nv40->state.scissor_enabled) { + so_data (so, ((s->maxx - s->minx) << 16) | s->minx); + so_data (so, ((s->maxy - s->miny) << 16) | s->miny); + } else { + so_data (so, 4096 << 16); + so_data (so, 4096 << 16); + } + + so_ref(so, &nv40->state.hw[NV40_STATE_SCISSOR]); + return TRUE; +} + +struct nv40_state_entry nv40_state_scissor = { + .validate = nv40_state_scissor_validate, + .dirty = { + .pipe = NV40_NEW_SCISSOR | NV40_NEW_RAST, + .hw = NV40_STATE_SCISSOR + } +}; diff --git a/src/gallium/drivers/nv40/nv40_state_stipple.c b/src/gallium/drivers/nv40/nv40_state_stipple.c new file mode 100644 index 0000000000..b51024ad9b --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_state_stipple.c @@ -0,0 +1,39 @@ +#include "nv40_context.h" + +static boolean +nv40_state_stipple_validate(struct nv40_context *nv40) +{ + struct pipe_rasterizer_state *rast = &nv40->rasterizer->pipe; + struct nouveau_grobj *curie = nv40->screen->curie; + struct nouveau_stateobj *so; + + if (nv40->state.hw[NV40_STATE_STIPPLE] && + (rast->poly_stipple_enable == 0 && nv40->state.stipple_enabled == 0)) + return FALSE; + + if (rast->poly_stipple_enable) { + unsigned i; + + so = so_new(35, 0); + so_method(so, curie, NV40TCL_POLYGON_STIPPLE_ENABLE, 1); + so_data (so, 1); + so_method(so, curie, NV40TCL_POLYGON_STIPPLE_PATTERN(0), 32); + for (i = 0; i < 32; i++) + so_data(so, nv40->stipple[i]); + } else { + so = so_new(2, 0); + so_method(so, curie, NV40TCL_POLYGON_STIPPLE_ENABLE, 1); + so_data (so, 0); + } + + so_ref(so, &nv40->state.hw[NV40_STATE_STIPPLE]); + return TRUE; +} + +struct nv40_state_entry nv40_state_stipple = { + .validate = nv40_state_stipple_validate, + .dirty = { + .pipe = NV40_NEW_STIPPLE | NV40_NEW_RAST, + .hw = NV40_STATE_STIPPLE, + } +}; diff --git a/src/gallium/drivers/nv40/nv40_state_viewport.c b/src/gallium/drivers/nv40/nv40_state_viewport.c new file mode 100644 index 0000000000..869a55b405 --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_state_viewport.c @@ -0,0 +1,67 @@ +#include "nv40_context.h" + +static boolean 
+nv40_state_viewport_validate(struct nv40_context *nv40) +{ + struct pipe_viewport_state *vpt = &nv40->viewport; + struct nouveau_stateobj *so; + unsigned bypass; + + if (nv40->render_mode == HW && !nv40->rasterizer->pipe.bypass_clipping) + bypass = 0; + else + bypass = 1; + + if (nv40->state.hw[NV40_STATE_VIEWPORT] && + (bypass || !(nv40->dirty & NV40_NEW_VIEWPORT)) && + nv40->state.viewport_bypass == bypass) + return FALSE; + nv40->state.viewport_bypass = bypass; + + so = so_new(11, 0); + if (!bypass) { + so_method(so, nv40->screen->curie, + NV40TCL_VIEWPORT_TRANSLATE_X, 8); + so_data (so, fui(vpt->translate[0])); + so_data (so, fui(vpt->translate[1])); + so_data (so, fui(vpt->translate[2])); + so_data (so, fui(vpt->translate[3])); + so_data (so, fui(vpt->scale[0])); + so_data (so, fui(vpt->scale[1])); + so_data (so, fui(vpt->scale[2])); + so_data (so, fui(vpt->scale[3])); + so_method(so, nv40->screen->curie, 0x1d78, 1); + so_data (so, 1); + } else { + so_method(so, nv40->screen->curie, + NV40TCL_VIEWPORT_TRANSLATE_X, 8); + so_data (so, fui(0.0)); + so_data (so, fui(0.0)); + so_data (so, fui(0.0)); + so_data (so, fui(0.0)); + so_data (so, fui(1.0)); + so_data (so, fui(1.0)); + so_data (so, fui(1.0)); + so_data (so, fui(0.0)); + /* Not entirely certain what this is yet. The DDX uses this + * value also as it fixes rendering when you pass + * pre-transformed vertices to the GPU. My best gusss is that + * this bypasses some culling/clipping stage. Might be worth + * noting that points/lines are uneffected by whatever this + * value fixes, only filled polygons are effected. + */ + so_method(so, nv40->screen->curie, 0x1d78, 1); + so_data (so, 0x110); + } + + so_ref(so, &nv40->state.hw[NV40_STATE_VIEWPORT]); + return TRUE; +} + +struct nv40_state_entry nv40_state_viewport = { + .validate = nv40_state_viewport_validate, + .dirty = { + .pipe = NV40_NEW_VIEWPORT | NV40_NEW_RAST, + .hw = NV40_STATE_VIEWPORT + } +}; diff --git a/src/gallium/drivers/nv40/nv40_state_zsa.c b/src/gallium/drivers/nv40/nv40_state_zsa.c new file mode 100644 index 0000000000..fb760677c8 --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_state_zsa.c @@ -0,0 +1,17 @@ +#include "nv40_context.h" + +static boolean +nv40_state_zsa_validate(struct nv40_context *nv40) +{ + so_ref(nv40->zsa->so, + &nv40->state.hw[NV40_STATE_ZSA]); + return TRUE; +} + +struct nv40_state_entry nv40_state_zsa = { + .validate = nv40_state_zsa_validate, + .dirty = { + .pipe = NV40_NEW_ZSA, + .hw = NV40_STATE_ZSA + } +}; diff --git a/src/gallium/drivers/nv40/nv40_surface.c b/src/gallium/drivers/nv40/nv40_surface.c new file mode 100644 index 0000000000..c4a5fb20d9 --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_surface.c @@ -0,0 +1,72 @@ + +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
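The viewport and blend-colour validators above write floats straight into 32-bit method data words via fui(). A standalone illustration of that bit-for-bit reinterpretation; ex_fui() is a local stand-in written for this sketch, not Mesa's actual helper:

```c
#include <stdint.h>
#include <stdio.h>

/* Reinterpret a float's IEEE-754 bits as a uint32_t so the value can be
 * written into a 32-bit method data word, the way fui() is used above. */
static uint32_t
ex_fui(float f)
{
	union { float f; uint32_t ui; } u;
	u.f = f;
	return u.ui;
}

int
main(void)
{
	/* 1.0f -> 0x3f800000: the word a viewport scale register would receive. */
	printf("0x%08x\n", ex_fui(1.0f));
	return 0;
}
```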
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "nv40_context.h" +#include "pipe/p_defines.h" +#include "pipe/internal/p_winsys_screen.h" +#include "pipe/p_inlines.h" +#include "util/u_tile.h" + +static void +nv40_surface_copy(struct pipe_context *pipe, boolean do_flip, + struct pipe_surface *dest, unsigned destx, unsigned desty, + struct pipe_surface *src, unsigned srcx, unsigned srcy, + unsigned width, unsigned height) +{ + struct nv40_context *nv40 = nv40_context(pipe); + struct nv04_surface_2d *eng2d = nv40->screen->eng2d; + + if (do_flip) { + desty += height; + while (height--) { + eng2d->copy(eng2d, dest, destx, desty--, src, + srcx, srcy++, width, 1); + } + return; + } + + eng2d->copy(eng2d, dest, destx, desty, src, srcx, srcy, width, height); +} + +static void +nv40_surface_fill(struct pipe_context *pipe, struct pipe_surface *dest, + unsigned destx, unsigned desty, unsigned width, + unsigned height, unsigned value) +{ + struct nv40_context *nv40 = nv40_context(pipe); + struct nv04_surface_2d *eng2d = nv40->screen->eng2d; + + eng2d->fill(eng2d, dest, destx, desty, width, height, value); +} + +void +nv40_init_surface_functions(struct nv40_context *nv40) +{ + nv40->pipe.surface_copy = nv40_surface_copy; + nv40->pipe.surface_fill = nv40_surface_fill; +} diff --git a/src/gallium/drivers/nv40/nv40_vbo.c b/src/gallium/drivers/nv40/nv40_vbo.c new file mode 100644 index 0000000000..8f1834628f --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_vbo.c @@ -0,0 +1,555 @@ +#include "pipe/p_context.h" +#include "pipe/p_state.h" + +#include "nv40_context.h" +#include "nv40_state.h" + +#include "nouveau/nouveau_channel.h" +#include "nouveau/nouveau_pushbuf.h" +#include "nouveau/nouveau_util.h" + +#define FORCE_SWTNL 0 + +static INLINE int +nv40_vbo_format_to_hw(enum pipe_format pipe, unsigned *fmt, unsigned *ncomp) +{ + switch (pipe) { + case PIPE_FORMAT_R32_FLOAT: + case PIPE_FORMAT_R32G32_FLOAT: + case PIPE_FORMAT_R32G32B32_FLOAT: + case PIPE_FORMAT_R32G32B32A32_FLOAT: + *fmt = NV40TCL_VTXFMT_TYPE_FLOAT; + break; + case PIPE_FORMAT_R8_UNORM: + case PIPE_FORMAT_R8G8_UNORM: + case PIPE_FORMAT_R8G8B8_UNORM: + case PIPE_FORMAT_R8G8B8A8_UNORM: + *fmt = NV40TCL_VTXFMT_TYPE_UBYTE; + break; + case PIPE_FORMAT_R16_SSCALED: + case PIPE_FORMAT_R16G16_SSCALED: + case PIPE_FORMAT_R16G16B16_SSCALED: + case PIPE_FORMAT_R16G16B16A16_SSCALED: + *fmt = NV40TCL_VTXFMT_TYPE_USHORT; + break; + default: + NOUVEAU_ERR("Unknown format %s\n", pf_name(pipe)); + return 1; + } + + switch (pipe) { + case PIPE_FORMAT_R8_UNORM: + case PIPE_FORMAT_R32_FLOAT: + case PIPE_FORMAT_R16_SSCALED: + *ncomp = 1; + break; + case PIPE_FORMAT_R8G8_UNORM: + case PIPE_FORMAT_R32G32_FLOAT: + case PIPE_FORMAT_R16G16_SSCALED: + *ncomp = 2; + break; + case PIPE_FORMAT_R8G8B8_UNORM: + case PIPE_FORMAT_R32G32B32_FLOAT: + case PIPE_FORMAT_R16G16B16_SSCALED: + *ncomp = 3; + break; + case PIPE_FORMAT_R8G8B8A8_UNORM: + case PIPE_FORMAT_R32G32B32A32_FLOAT: + case PIPE_FORMAT_R16G16B16A16_SSCALED: + *ncomp = 4; + break; 
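nv40_vbo_format_to_hw() answers two questions per vertex format, the hardware element type and the component count, using two parallel switches. The same mapping can also be written as a single table that keeps both answers on one row; a sketch of that alternative layout, with placeholder enums standing in for pipe_format and the NV40TCL_VTXFMT_TYPE_* values:

```c
#include <stddef.h>

/* Illustrative stand-ins for pipe_format and the hardware type codes. */
enum ex_format { EX_R32_FLOAT, EX_R32G32_FLOAT, EX_R8G8B8A8_UNORM, EX_R16G16_SSCALED };
enum ex_hw_type { EX_TYPE_FLOAT, EX_TYPE_UBYTE, EX_TYPE_USHORT };

struct ex_vtx_format {
	enum ex_format  pipe;
	enum ex_hw_type hw;
	unsigned        ncomp;
};

static const struct ex_vtx_format ex_formats[] = {
	{ EX_R32_FLOAT,      EX_TYPE_FLOAT,  1 },
	{ EX_R32G32_FLOAT,   EX_TYPE_FLOAT,  2 },
	{ EX_R8G8B8A8_UNORM, EX_TYPE_UBYTE,  4 },
	{ EX_R16G16_SSCALED, EX_TYPE_USHORT, 2 },
};

/* Returns 0 on success, 1 for a format with no hardware equivalent,
 * mirroring the driver's convention. */
static int
ex_format_to_hw(enum ex_format fmt, unsigned *hw, unsigned *ncomp)
{
	size_t i;

	for (i = 0; i < sizeof(ex_formats) / sizeof(ex_formats[0]); i++) {
		if (ex_formats[i].pipe != fmt)
			continue;
		*hw = ex_formats[i].hw;
		*ncomp = ex_formats[i].ncomp;
		return 0;
	}
	return 1;
}
```

Either structure expresses the same mapping; the switch form used in the driver avoids the linear scan, while the table form keeps type and component count visibly paired per format.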
+ default: + NOUVEAU_ERR("Unknown format %s\n", pf_name(pipe)); + return 1; + } + + return 0; +} + +static boolean +nv40_vbo_set_idxbuf(struct nv40_context *nv40, struct pipe_buffer *ib, + unsigned ib_size) +{ + struct pipe_screen *pscreen = &nv40->screen->pipe; + unsigned type; + + if (!ib) { + nv40->idxbuf = NULL; + nv40->idxbuf_format = 0xdeadbeef; + return FALSE; + } + + if (!pscreen->get_param(pscreen, NOUVEAU_CAP_HW_IDXBUF) || ib_size == 1) + return FALSE; + + switch (ib_size) { + case 2: + type = NV40TCL_IDXBUF_FORMAT_TYPE_U16; + break; + case 4: + type = NV40TCL_IDXBUF_FORMAT_TYPE_U32; + break; + default: + return FALSE; + } + + if (ib != nv40->idxbuf || + type != nv40->idxbuf_format) { + nv40->dirty |= NV40_NEW_ARRAYS; + nv40->idxbuf = ib; + nv40->idxbuf_format = type; + } + + return TRUE; +} + +static boolean +nv40_vbo_static_attrib(struct nv40_context *nv40, struct nouveau_stateobj *so, + int attrib, struct pipe_vertex_element *ve, + struct pipe_vertex_buffer *vb) +{ + struct pipe_winsys *ws = nv40->pipe.winsys; + struct nouveau_grobj *curie = nv40->screen->curie; + unsigned type, ncomp; + void *map; + + if (nv40_vbo_format_to_hw(ve->src_format, &type, &ncomp)) + return FALSE; + + map = ws->buffer_map(ws, vb->buffer, PIPE_BUFFER_USAGE_CPU_READ); + map += vb->buffer_offset + ve->src_offset; + + switch (type) { + case NV40TCL_VTXFMT_TYPE_FLOAT: + { + float *v = map; + + switch (ncomp) { + case 4: + so_method(so, curie, NV40TCL_VTX_ATTR_4F_X(attrib), 4); + so_data (so, fui(v[0])); + so_data (so, fui(v[1])); + so_data (so, fui(v[2])); + so_data (so, fui(v[3])); + break; + case 3: + so_method(so, curie, NV40TCL_VTX_ATTR_3F_X(attrib), 3); + so_data (so, fui(v[0])); + so_data (so, fui(v[1])); + so_data (so, fui(v[2])); + break; + case 2: + so_method(so, curie, NV40TCL_VTX_ATTR_2F_X(attrib), 2); + so_data (so, fui(v[0])); + so_data (so, fui(v[1])); + break; + case 1: + so_method(so, curie, NV40TCL_VTX_ATTR_1F(attrib), 1); + so_data (so, fui(v[0])); + break; + default: + ws->buffer_unmap(ws, vb->buffer); + return FALSE; + } + } + break; + default: + ws->buffer_unmap(ws, vb->buffer); + return FALSE; + } + + ws->buffer_unmap(ws, vb->buffer); + + return TRUE; +} + +boolean +nv40_draw_arrays(struct pipe_context *pipe, + unsigned mode, unsigned start, unsigned count) +{ + struct nv40_context *nv40 = nv40_context(pipe); + struct nouveau_channel *chan = nv40->nvws->channel; + unsigned restart; + + nv40_vbo_set_idxbuf(nv40, NULL, 0); + if (FORCE_SWTNL || !nv40_state_validate(nv40)) { + return nv40_draw_elements_swtnl(pipe, NULL, 0, + mode, start, count); + } + + while (count) { + unsigned vc, nr; + + nv40_state_emit(nv40); + + vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 256, + mode, start, count, &restart); + if (!vc) { + FIRE_RING(NULL); + continue; + } + + BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); + OUT_RING (nvgl_primitive(mode)); + + nr = (vc & 0xff); + if (nr) { + BEGIN_RING(curie, NV40TCL_VB_VERTEX_BATCH, 1); + OUT_RING (((nr - 1) << 24) | start); + start += nr; + } + + nr = vc >> 8; + while (nr) { + unsigned push = nr > 2047 ? 
2047 : nr; + + nr -= push; + + BEGIN_RING_NI(curie, NV40TCL_VB_VERTEX_BATCH, push); + while (push--) { + OUT_RING(((0x100 - 1) << 24) | start); + start += 0x100; + } + } + + BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); + OUT_RING (0); + + count -= vc; + start = restart; + } + + pipe->flush(pipe, 0, NULL); + return TRUE; +} + +static INLINE void +nv40_draw_elements_u08(struct nv40_context *nv40, void *ib, + unsigned mode, unsigned start, unsigned count) +{ + struct nouveau_channel *chan = nv40->nvws->channel; + + while (count) { + uint8_t *elts = (uint8_t *)ib + start; + unsigned vc, push, restart; + + nv40_state_emit(nv40); + + vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 2, + mode, start, count, &restart); + if (vc == 0) { + FIRE_RING(NULL); + continue; + } + count -= vc; + + BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); + OUT_RING (nvgl_primitive(mode)); + + if (vc & 1) { + BEGIN_RING(curie, NV40TCL_VB_ELEMENT_U32, 1); + OUT_RING (elts[0]); + elts++; vc--; + } + + while (vc) { + unsigned i; + + push = MIN2(vc, 2047 * 2); + + BEGIN_RING_NI(curie, NV40TCL_VB_ELEMENT_U16, push >> 1); + for (i = 0; i < push; i+=2) + OUT_RING((elts[i+1] << 16) | elts[i]); + + vc -= push; + elts += push; + } + + BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); + OUT_RING (0); + + start = restart; + } +} + +static INLINE void +nv40_draw_elements_u16(struct nv40_context *nv40, void *ib, + unsigned mode, unsigned start, unsigned count) +{ + struct nouveau_channel *chan = nv40->nvws->channel; + + while (count) { + uint16_t *elts = (uint16_t *)ib + start; + unsigned vc, push, restart; + + nv40_state_emit(nv40); + + vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 2, + mode, start, count, &restart); + if (vc == 0) { + FIRE_RING(NULL); + continue; + } + count -= vc; + + BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); + OUT_RING (nvgl_primitive(mode)); + + if (vc & 1) { + BEGIN_RING(curie, NV40TCL_VB_ELEMENT_U32, 1); + OUT_RING (elts[0]); + elts++; vc--; + } + + while (vc) { + unsigned i; + + push = MIN2(vc, 2047 * 2); + + BEGIN_RING_NI(curie, NV40TCL_VB_ELEMENT_U16, push >> 1); + for (i = 0; i < push; i+=2) + OUT_RING((elts[i+1] << 16) | elts[i]); + + vc -= push; + elts += push; + } + + BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); + OUT_RING (0); + + start = restart; + } +} + +static INLINE void +nv40_draw_elements_u32(struct nv40_context *nv40, void *ib, + unsigned mode, unsigned start, unsigned count) +{ + struct nouveau_channel *chan = nv40->nvws->channel; + + while (count) { + uint32_t *elts = (uint32_t *)ib + start; + unsigned vc, push, restart; + + nv40_state_emit(nv40); + + vc = nouveau_vbuf_split(chan->pushbuf->remaining, 5, 1, + mode, start, count, &restart); + if (vc == 0) { + FIRE_RING(NULL); + continue; + } + count -= vc; + + BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); + OUT_RING (nvgl_primitive(mode)); + + while (vc) { + push = MIN2(vc, 2047); + + BEGIN_RING_NI(curie, NV40TCL_VB_ELEMENT_U32, push); + OUT_RINGp (elts, push); + + vc -= push; + elts += push; + } + + BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); + OUT_RING (0); + + start = restart; + } +} + +static boolean +nv40_draw_elements_inline(struct pipe_context *pipe, + struct pipe_buffer *ib, unsigned ib_size, + unsigned mode, unsigned start, unsigned count) +{ + struct nv40_context *nv40 = nv40_context(pipe); + struct pipe_winsys *ws = pipe->winsys; + void *map; + + map = ws->buffer_map(ws, ib, PIPE_BUFFER_USAGE_CPU_READ); + if (!ib) { + NOUVEAU_ERR("failed mapping ib\n"); + return FALSE; + } + + switch (ib_size) { + case 1: + nv40_draw_elements_u08(nv40, map, mode, 
start, count); + break; + case 2: + nv40_draw_elements_u16(nv40, map, mode, start, count); + break; + case 4: + nv40_draw_elements_u32(nv40, map, mode, start, count); + break; + default: + NOUVEAU_ERR("invalid idxbuf fmt %d\n", ib_size); + break; + } + + ws->buffer_unmap(ws, ib); + return TRUE; +} + +static boolean +nv40_draw_elements_vbo(struct pipe_context *pipe, + unsigned mode, unsigned start, unsigned count) +{ + struct nv40_context *nv40 = nv40_context(pipe); + struct nouveau_channel *chan = nv40->nvws->channel; + unsigned restart; + + while (count) { + unsigned nr, vc; + + nv40_state_emit(nv40); + + vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 256, + mode, start, count, &restart); + if (!vc) { + FIRE_RING(NULL); + continue; + } + + BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); + OUT_RING (nvgl_primitive(mode)); + + nr = (vc & 0xff); + if (nr) { + BEGIN_RING(curie, NV40TCL_VB_INDEX_BATCH, 1); + OUT_RING (((nr - 1) << 24) | start); + start += nr; + } + + nr = vc >> 8; + while (nr) { + unsigned push = nr > 2047 ? 2047 : nr; + + nr -= push; + + BEGIN_RING_NI(curie, NV40TCL_VB_INDEX_BATCH, push); + while (push--) { + OUT_RING(((0x100 - 1) << 24) | start); + start += 0x100; + } + } + + BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); + OUT_RING (0); + + count -= vc; + start = restart; + } + + return TRUE; +} + +boolean +nv40_draw_elements(struct pipe_context *pipe, + struct pipe_buffer *indexBuffer, unsigned indexSize, + unsigned mode, unsigned start, unsigned count) +{ + struct nv40_context *nv40 = nv40_context(pipe); + boolean idxbuf; + + idxbuf = nv40_vbo_set_idxbuf(nv40, indexBuffer, indexSize); + if (FORCE_SWTNL || !nv40_state_validate(nv40)) { + return nv40_draw_elements_swtnl(pipe, NULL, 0, + mode, start, count); + } + + if (idxbuf) { + nv40_draw_elements_vbo(pipe, mode, start, count); + } else { + nv40_draw_elements_inline(pipe, indexBuffer, indexSize, + mode, start, count); + } + + pipe->flush(pipe, 0, NULL); + return TRUE; +} + +static boolean +nv40_vbo_validate(struct nv40_context *nv40) +{ + struct nouveau_stateobj *vtxbuf, *vtxfmt, *sattr = NULL; + struct nouveau_grobj *curie = nv40->screen->curie; + struct pipe_buffer *ib = nv40->idxbuf; + unsigned ib_format = nv40->idxbuf_format; + unsigned vb_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD; + int hw; + + if (nv40->edgeflags) { + nv40->fallback_swtnl |= NV40_NEW_ARRAYS; + return FALSE; + } + + vtxbuf = so_new(20, 18); + so_method(vtxbuf, curie, NV40TCL_VTXBUF_ADDRESS(0), nv40->vtxelt_nr); + vtxfmt = so_new(17, 0); + so_method(vtxfmt, curie, NV40TCL_VTXFMT(0), nv40->vtxelt_nr); + + for (hw = 0; hw < nv40->vtxelt_nr; hw++) { + struct pipe_vertex_element *ve; + struct pipe_vertex_buffer *vb; + unsigned type, ncomp; + + ve = &nv40->vtxelt[hw]; + vb = &nv40->vtxbuf[ve->vertex_buffer_index]; + + if (!vb->stride) { + if (!sattr) + sattr = so_new(16 * 5, 0); + + if (nv40_vbo_static_attrib(nv40, sattr, hw, ve, vb)) { + so_data(vtxbuf, 0); + so_data(vtxfmt, NV40TCL_VTXFMT_TYPE_FLOAT); + continue; + } + } + + if (nv40_vbo_format_to_hw(ve->src_format, &type, &ncomp)) { + nv40->fallback_swtnl |= NV40_NEW_ARRAYS; + so_ref(NULL, &vtxbuf); + so_ref(NULL, &vtxfmt); + return FALSE; + } + + so_reloc(vtxbuf, vb->buffer, vb->buffer_offset + ve->src_offset, + vb_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_OR, + 0, NV40TCL_VTXBUF_ADDRESS_DMA1); + so_data (vtxfmt, ((vb->stride << NV40TCL_VTXFMT_STRIDE_SHIFT) | + (ncomp << NV40TCL_VTXFMT_SIZE_SHIFT) | type)); + } + + if (ib) { + so_method(vtxbuf, curie, NV40TCL_IDXBUF_ADDRESS, 2); + so_reloc (vtxbuf, 
ib, 0, vb_flags | NOUVEAU_BO_LOW, 0, 0); + so_reloc (vtxbuf, ib, ib_format, vb_flags | NOUVEAU_BO_OR, + 0, NV40TCL_IDXBUF_FORMAT_DMA1); + } + + so_method(vtxbuf, curie, 0x1710, 1); + so_data (vtxbuf, 0); + + so_ref(vtxbuf, &nv40->state.hw[NV40_STATE_VTXBUF]); + nv40->state.dirty |= (1ULL << NV40_STATE_VTXBUF); + so_ref(vtxfmt, &nv40->state.hw[NV40_STATE_VTXFMT]); + nv40->state.dirty |= (1ULL << NV40_STATE_VTXFMT); + so_ref(sattr, &nv40->state.hw[NV40_STATE_VTXATTR]); + nv40->state.dirty |= (1ULL << NV40_STATE_VTXATTR); + return FALSE; +} + +struct nv40_state_entry nv40_state_vbo = { + .validate = nv40_vbo_validate, + .dirty = { + .pipe = NV40_NEW_ARRAYS, + .hw = 0, + } +}; + diff --git a/src/gallium/drivers/nv40/nv40_vertprog.c b/src/gallium/drivers/nv40/nv40_vertprog.c new file mode 100644 index 0000000000..0862386638 --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_vertprog.c @@ -0,0 +1,1070 @@ +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" + +#include "nv40_context.h" +#include "nv40_state.h" + +/* TODO (at least...): + * 1. Indexed consts + ARL + * 3. NV_vp11, NV_vp2, NV_vp3 features + * - extra arith opcodes + * - branching + * - texture sampling + * - indexed attribs + * - indexed results + * 4. bugs + */ + +#define SWZ_X 0 +#define SWZ_Y 1 +#define SWZ_Z 2 +#define SWZ_W 3 +#define MASK_X 8 +#define MASK_Y 4 +#define MASK_Z 2 +#define MASK_W 1 +#define MASK_ALL (MASK_X|MASK_Y|MASK_Z|MASK_W) +#define DEF_SCALE 0 +#define DEF_CTEST 0 +#include "nv40_shader.h" + +#define swz(s,x,y,z,w) nv40_sr_swz((s), SWZ_##x, SWZ_##y, SWZ_##z, SWZ_##w) +#define neg(s) nv40_sr_neg((s)) +#define abs(s) nv40_sr_abs((s)) + +#define NV40_VP_INST_DEST_CLIP(n) ((~0 - 6) + (n)) + +struct nv40_vpc { + struct nv40_vertex_program *vp; + + struct nv40_vertex_program_exec *vpi; + + unsigned r_temps; + unsigned r_temps_discard; + struct nv40_sreg r_result[PIPE_MAX_SHADER_OUTPUTS]; + struct nv40_sreg *r_address; + struct nv40_sreg *r_temp; + + struct nv40_sreg *imm; + unsigned nr_imm; + + unsigned hpos_idx; +}; + +static struct nv40_sreg +temp(struct nv40_vpc *vpc) +{ + int idx = ffs(~vpc->r_temps) - 1; + + if (idx < 0) { + NOUVEAU_ERR("out of temps!!\n"); + assert(0); + return nv40_sr(NV40SR_TEMP, 0); + } + + vpc->r_temps |= (1 << idx); + vpc->r_temps_discard |= (1 << idx); + return nv40_sr(NV40SR_TEMP, idx); +} + +static INLINE void +release_temps(struct nv40_vpc *vpc) +{ + vpc->r_temps &= ~vpc->r_temps_discard; + vpc->r_temps_discard = 0; +} + +static struct nv40_sreg +constant(struct nv40_vpc *vpc, int pipe, float x, float y, float z, float w) +{ + struct nv40_vertex_program *vp = vpc->vp; + struct nv40_vertex_program_data *vpd; + int idx; + + if (pipe >= 0) { + for (idx = 0; idx < vp->nr_consts; idx++) { + if (vp->consts[idx].index == pipe) + return nv40_sr(NV40SR_CONST, idx); + } + } + + idx = vp->nr_consts++; + vp->consts = realloc(vp->consts, sizeof(*vpd) * vp->nr_consts); + vpd = &vp->consts[idx]; + + vpd->index = pipe; + vpd->value[0] = x; + vpd->value[1] = y; + vpd->value[2] = z; + vpd->value[3] = w; + return nv40_sr(NV40SR_CONST, idx); +} + +#define arith(cc,s,o,d,m,s0,s1,s2) \ + nv40_vp_arith((cc), (s), NV40_VP_INST_##o, (d), (m), (s0), (s1), (s2)) + +static void +emit_src(struct nv40_vpc *vpc, uint32_t *hw, int pos, struct nv40_sreg src) +{ + struct nv40_vertex_program *vp = vpc->vp; + uint32_t sr = 0; + + switch (src.type) { + case NV40SR_TEMP: + sr |= 
(NV40_VP_SRC_REG_TYPE_TEMP << NV40_VP_SRC_REG_TYPE_SHIFT); + sr |= (src.index << NV40_VP_SRC_TEMP_SRC_SHIFT); + break; + case NV40SR_INPUT: + sr |= (NV40_VP_SRC_REG_TYPE_INPUT << + NV40_VP_SRC_REG_TYPE_SHIFT); + vp->ir |= (1 << src.index); + hw[1] |= (src.index << NV40_VP_INST_INPUT_SRC_SHIFT); + break; + case NV40SR_CONST: + sr |= (NV40_VP_SRC_REG_TYPE_CONST << + NV40_VP_SRC_REG_TYPE_SHIFT); + assert(vpc->vpi->const_index == -1 || + vpc->vpi->const_index == src.index); + vpc->vpi->const_index = src.index; + break; + case NV40SR_NONE: + sr |= (NV40_VP_SRC_REG_TYPE_INPUT << + NV40_VP_SRC_REG_TYPE_SHIFT); + break; + default: + assert(0); + } + + if (src.negate) + sr |= NV40_VP_SRC_NEGATE; + + if (src.abs) + hw[0] |= (1 << (21 + pos)); + + sr |= ((src.swz[0] << NV40_VP_SRC_SWZ_X_SHIFT) | + (src.swz[1] << NV40_VP_SRC_SWZ_Y_SHIFT) | + (src.swz[2] << NV40_VP_SRC_SWZ_Z_SHIFT) | + (src.swz[3] << NV40_VP_SRC_SWZ_W_SHIFT)); + + switch (pos) { + case 0: + hw[1] |= ((sr & NV40_VP_SRC0_HIGH_MASK) >> + NV40_VP_SRC0_HIGH_SHIFT) << NV40_VP_INST_SRC0H_SHIFT; + hw[2] |= (sr & NV40_VP_SRC0_LOW_MASK) << + NV40_VP_INST_SRC0L_SHIFT; + break; + case 1: + hw[2] |= sr << NV40_VP_INST_SRC1_SHIFT; + break; + case 2: + hw[2] |= ((sr & NV40_VP_SRC2_HIGH_MASK) >> + NV40_VP_SRC2_HIGH_SHIFT) << NV40_VP_INST_SRC2H_SHIFT; + hw[3] |= (sr & NV40_VP_SRC2_LOW_MASK) << + NV40_VP_INST_SRC2L_SHIFT; + break; + default: + assert(0); + } +} + +static void +emit_dst(struct nv40_vpc *vpc, uint32_t *hw, int slot, struct nv40_sreg dst) +{ + struct nv40_vertex_program *vp = vpc->vp; + + switch (dst.type) { + case NV40SR_TEMP: + hw[3] |= NV40_VP_INST_DEST_MASK; + if (slot == 0) { + hw[0] |= (dst.index << + NV40_VP_INST_VEC_DEST_TEMP_SHIFT); + } else { + hw[3] |= (dst.index << + NV40_VP_INST_SCA_DEST_TEMP_SHIFT); + } + break; + case NV40SR_OUTPUT: + switch (dst.index) { + case NV40_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break; + case NV40_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break; + case NV40_VP_INST_DEST_BFC0 : vp->or |= (1 << 2); break; + case NV40_VP_INST_DEST_BFC1 : vp->or |= (1 << 3); break; + case NV40_VP_INST_DEST_FOGC : vp->or |= (1 << 4); break; + case NV40_VP_INST_DEST_PSZ : vp->or |= (1 << 5); break; + case NV40_VP_INST_DEST_TC(0): vp->or |= (1 << 14); break; + case NV40_VP_INST_DEST_TC(1): vp->or |= (1 << 15); break; + case NV40_VP_INST_DEST_TC(2): vp->or |= (1 << 16); break; + case NV40_VP_INST_DEST_TC(3): vp->or |= (1 << 17); break; + case NV40_VP_INST_DEST_TC(4): vp->or |= (1 << 18); break; + case NV40_VP_INST_DEST_TC(5): vp->or |= (1 << 19); break; + case NV40_VP_INST_DEST_TC(6): vp->or |= (1 << 20); break; + case NV40_VP_INST_DEST_TC(7): vp->or |= (1 << 21); break; + case NV40_VP_INST_DEST_CLIP(0): + vp->or |= (1 << 6); + vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE0; + dst.index = NV40_VP_INST_DEST_FOGC; + break; + case NV40_VP_INST_DEST_CLIP(1): + vp->or |= (1 << 7); + vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE1; + dst.index = NV40_VP_INST_DEST_FOGC; + break; + case NV40_VP_INST_DEST_CLIP(2): + vp->or |= (1 << 8); + vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE2; + dst.index = NV40_VP_INST_DEST_FOGC; + break; + case NV40_VP_INST_DEST_CLIP(3): + vp->or |= (1 << 9); + vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE3; + dst.index = NV40_VP_INST_DEST_PSZ; + break; + case NV40_VP_INST_DEST_CLIP(4): + vp->or |= (1 << 10); + vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE4; + dst.index = NV40_VP_INST_DEST_PSZ; + break; + case NV40_VP_INST_DEST_CLIP(5): + vp->or |= (1 << 11); + vp->clip_ctrl |= 
NV40TCL_CLIP_PLANE_ENABLE_PLANE5; + dst.index = NV40_VP_INST_DEST_PSZ; + break; + default: + break; + } + + hw[3] |= (dst.index << NV40_VP_INST_DEST_SHIFT); + if (slot == 0) { + hw[0] |= NV40_VP_INST_VEC_RESULT; + hw[0] |= NV40_VP_INST_VEC_DEST_TEMP_MASK | (1<<20); + } else { + hw[3] |= NV40_VP_INST_SCA_RESULT; + hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK; + } + break; + default: + assert(0); + } +} + +static void +nv40_vp_arith(struct nv40_vpc *vpc, int slot, int op, + struct nv40_sreg dst, int mask, + struct nv40_sreg s0, struct nv40_sreg s1, + struct nv40_sreg s2) +{ + struct nv40_vertex_program *vp = vpc->vp; + uint32_t *hw; + + vp->insns = realloc(vp->insns, ++vp->nr_insns * sizeof(*vpc->vpi)); + vpc->vpi = &vp->insns[vp->nr_insns - 1]; + memset(vpc->vpi, 0, sizeof(*vpc->vpi)); + vpc->vpi->const_index = -1; + + hw = vpc->vpi->data; + + hw[0] |= (NV40_VP_INST_COND_TR << NV40_VP_INST_COND_SHIFT); + hw[0] |= ((0 << NV40_VP_INST_COND_SWZ_X_SHIFT) | + (1 << NV40_VP_INST_COND_SWZ_Y_SHIFT) | + (2 << NV40_VP_INST_COND_SWZ_Z_SHIFT) | + (3 << NV40_VP_INST_COND_SWZ_W_SHIFT)); + + if (slot == 0) { + hw[1] |= (op << NV40_VP_INST_VEC_OPCODE_SHIFT); + hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK; + hw[3] |= (mask << NV40_VP_INST_VEC_WRITEMASK_SHIFT); + } else { + hw[1] |= (op << NV40_VP_INST_SCA_OPCODE_SHIFT); + hw[0] |= (NV40_VP_INST_VEC_DEST_TEMP_MASK | (1 << 20)); + hw[3] |= (mask << NV40_VP_INST_SCA_WRITEMASK_SHIFT); + } + + emit_dst(vpc, hw, slot, dst); + emit_src(vpc, hw, 0, s0); + emit_src(vpc, hw, 1, s1); + emit_src(vpc, hw, 2, s2); +} + +static INLINE struct nv40_sreg +tgsi_src(struct nv40_vpc *vpc, const struct tgsi_full_src_register *fsrc) { + struct nv40_sreg src; + + switch (fsrc->SrcRegister.File) { + case TGSI_FILE_INPUT: + src = nv40_sr(NV40SR_INPUT, fsrc->SrcRegister.Index); + break; + case TGSI_FILE_CONSTANT: + src = constant(vpc, fsrc->SrcRegister.Index, 0, 0, 0, 0); + break; + case TGSI_FILE_IMMEDIATE: + src = vpc->imm[fsrc->SrcRegister.Index]; + break; + case TGSI_FILE_TEMPORARY: + src = vpc->r_temp[fsrc->SrcRegister.Index]; + break; + default: + NOUVEAU_ERR("bad src file\n"); + break; + } + + src.abs = fsrc->SrcRegisterExtMod.Absolute; + src.negate = fsrc->SrcRegister.Negate; + src.swz[0] = fsrc->SrcRegister.SwizzleX; + src.swz[1] = fsrc->SrcRegister.SwizzleY; + src.swz[2] = fsrc->SrcRegister.SwizzleZ; + src.swz[3] = fsrc->SrcRegister.SwizzleW; + return src; +} + +static INLINE struct nv40_sreg +tgsi_dst(struct nv40_vpc *vpc, const struct tgsi_full_dst_register *fdst) { + struct nv40_sreg dst; + + switch (fdst->DstRegister.File) { + case TGSI_FILE_OUTPUT: + dst = vpc->r_result[fdst->DstRegister.Index]; + break; + case TGSI_FILE_TEMPORARY: + dst = vpc->r_temp[fdst->DstRegister.Index]; + break; + case TGSI_FILE_ADDRESS: + dst = vpc->r_address[fdst->DstRegister.Index]; + break; + default: + NOUVEAU_ERR("bad dst file\n"); + break; + } + + return dst; +} + +static INLINE int +tgsi_mask(uint tgsi) +{ + int mask = 0; + + if (tgsi & TGSI_WRITEMASK_X) mask |= MASK_X; + if (tgsi & TGSI_WRITEMASK_Y) mask |= MASK_Y; + if (tgsi & TGSI_WRITEMASK_Z) mask |= MASK_Z; + if (tgsi & TGSI_WRITEMASK_W) mask |= MASK_W; + return mask; +} + +static boolean +src_native_swz(struct nv40_vpc *vpc, const struct tgsi_full_src_register *fsrc, + struct nv40_sreg *src) +{ + const struct nv40_sreg none = nv40_sr(NV40SR_NONE, 0); + struct nv40_sreg tgsi = tgsi_src(vpc, fsrc); + uint mask = 0, zero_mask = 0, one_mask = 0, neg_mask = 0; + uint neg[4] = { fsrc->SrcRegisterExtSwz.NegateX, + fsrc->SrcRegisterExtSwz.NegateY, 
+ fsrc->SrcRegisterExtSwz.NegateZ, + fsrc->SrcRegisterExtSwz.NegateW }; + uint c; + + for (c = 0; c < 4; c++) { + switch (tgsi_util_get_full_src_register_extswizzle(fsrc, c)) { + case TGSI_EXTSWIZZLE_X: + case TGSI_EXTSWIZZLE_Y: + case TGSI_EXTSWIZZLE_Z: + case TGSI_EXTSWIZZLE_W: + mask |= tgsi_mask(1 << c); + break; + case TGSI_EXTSWIZZLE_ZERO: + zero_mask |= tgsi_mask(1 << c); + tgsi.swz[c] = SWZ_X; + break; + case TGSI_EXTSWIZZLE_ONE: + one_mask |= tgsi_mask(1 << c); + tgsi.swz[c] = SWZ_X; + break; + default: + assert(0); + } + + if (!tgsi.negate && neg[c]) + neg_mask |= tgsi_mask(1 << c); + } + + if (mask == MASK_ALL && !neg_mask) + return TRUE; + + *src = temp(vpc); + + if (mask) + arith(vpc, 0, OP_MOV, *src, mask, tgsi, none, none); + + if (zero_mask) + arith(vpc, 0, OP_SFL, *src, zero_mask, *src, none, none); + + if (one_mask) + arith(vpc, 0, OP_STR, *src, one_mask, *src, none, none); + + if (neg_mask) { + struct nv40_sreg one = temp(vpc); + arith(vpc, 0, OP_STR, one, neg_mask, one, none, none); + arith(vpc, 0, OP_MUL, *src, neg_mask, *src, neg(one), none); + } + + return FALSE; +} + +static boolean +nv40_vertprog_parse_instruction(struct nv40_vpc *vpc, + const struct tgsi_full_instruction *finst) +{ + struct nv40_sreg src[3], dst, tmp; + struct nv40_sreg none = nv40_sr(NV40SR_NONE, 0); + int mask; + int ai = -1, ci = -1, ii = -1; + int i; + + if (finst->Instruction.Opcode == TGSI_OPCODE_END) + return TRUE; + + for (i = 0; i < finst->Instruction.NumSrcRegs; i++) { + const struct tgsi_full_src_register *fsrc; + + fsrc = &finst->FullSrcRegisters[i]; + if (fsrc->SrcRegister.File == TGSI_FILE_TEMPORARY) { + src[i] = tgsi_src(vpc, fsrc); + } + } + + for (i = 0; i < finst->Instruction.NumSrcRegs; i++) { + const struct tgsi_full_src_register *fsrc; + + fsrc = &finst->FullSrcRegisters[i]; + + switch (fsrc->SrcRegister.File) { + case TGSI_FILE_INPUT: + case TGSI_FILE_CONSTANT: + case TGSI_FILE_TEMPORARY: + if (!src_native_swz(vpc, fsrc, &src[i])) + continue; + break; + default: + break; + } + + switch (fsrc->SrcRegister.File) { + case TGSI_FILE_INPUT: + if (ai == -1 || ai == fsrc->SrcRegister.Index) { + ai = fsrc->SrcRegister.Index; + src[i] = tgsi_src(vpc, fsrc); + } else { + src[i] = temp(vpc); + arith(vpc, 0, OP_MOV, src[i], MASK_ALL, + tgsi_src(vpc, fsrc), none, none); + } + break; + case TGSI_FILE_CONSTANT: + if ((ci == -1 && ii == -1) || + ci == fsrc->SrcRegister.Index) { + ci = fsrc->SrcRegister.Index; + src[i] = tgsi_src(vpc, fsrc); + } else { + src[i] = temp(vpc); + arith(vpc, 0, OP_MOV, src[i], MASK_ALL, + tgsi_src(vpc, fsrc), none, none); + } + break; + case TGSI_FILE_IMMEDIATE: + if ((ci == -1 && ii == -1) || + ii == fsrc->SrcRegister.Index) { + ii = fsrc->SrcRegister.Index; + src[i] = tgsi_src(vpc, fsrc); + } else { + src[i] = temp(vpc); + arith(vpc, 0, OP_MOV, src[i], MASK_ALL, + tgsi_src(vpc, fsrc), none, none); + } + break; + case TGSI_FILE_TEMPORARY: + /* handled above */ + break; + default: + NOUVEAU_ERR("bad src file\n"); + return FALSE; + } + } + + dst = tgsi_dst(vpc, &finst->FullDstRegisters[0]); + mask = tgsi_mask(finst->FullDstRegisters[0].DstRegister.WriteMask); + + switch (finst->Instruction.Opcode) { + case TGSI_OPCODE_ABS: + arith(vpc, 0, OP_MOV, dst, mask, abs(src[0]), none, none); + break; + case TGSI_OPCODE_ADD: + arith(vpc, 0, OP_ADD, dst, mask, src[0], none, src[1]); + break; + case TGSI_OPCODE_ARL: + arith(vpc, 0, OP_ARL, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_DP3: + arith(vpc, 0, OP_DP3, dst, mask, src[0], src[1], none); + break; + case 
TGSI_OPCODE_DP4: + arith(vpc, 0, OP_DP4, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_DPH: + arith(vpc, 0, OP_DPH, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_DST: + arith(vpc, 0, OP_DST, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_EX2: + arith(vpc, 1, OP_EX2, dst, mask, none, none, src[0]); + break; + case TGSI_OPCODE_EXP: + arith(vpc, 1, OP_EXP, dst, mask, none, none, src[0]); + break; + case TGSI_OPCODE_FLR: + arith(vpc, 0, OP_FLR, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_FRC: + arith(vpc, 0, OP_FRC, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_LG2: + arith(vpc, 1, OP_LG2, dst, mask, none, none, src[0]); + break; + case TGSI_OPCODE_LIT: + arith(vpc, 1, OP_LIT, dst, mask, none, none, src[0]); + break; + case TGSI_OPCODE_LOG: + arith(vpc, 1, OP_LOG, dst, mask, none, none, src[0]); + break; + case TGSI_OPCODE_MAD: + arith(vpc, 0, OP_MAD, dst, mask, src[0], src[1], src[2]); + break; + case TGSI_OPCODE_MAX: + arith(vpc, 0, OP_MAX, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_MIN: + arith(vpc, 0, OP_MIN, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_MOV: + arith(vpc, 0, OP_MOV, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_MUL: + arith(vpc, 0, OP_MUL, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_POW: + tmp = temp(vpc); + arith(vpc, 1, OP_LG2, tmp, MASK_X, none, none, + swz(src[0], X, X, X, X)); + arith(vpc, 0, OP_MUL, tmp, MASK_X, swz(tmp, X, X, X, X), + swz(src[1], X, X, X, X), none); + arith(vpc, 1, OP_EX2, dst, mask, none, none, + swz(tmp, X, X, X, X)); + break; + case TGSI_OPCODE_RCP: + arith(vpc, 1, OP_RCP, dst, mask, none, none, src[0]); + break; + case TGSI_OPCODE_RET: + break; + case TGSI_OPCODE_RSQ: + arith(vpc, 1, OP_RSQ, dst, mask, none, none, abs(src[0])); + break; + case TGSI_OPCODE_SGE: + arith(vpc, 0, OP_SGE, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_SLT: + arith(vpc, 0, OP_SLT, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_SUB: + arith(vpc, 0, OP_ADD, dst, mask, src[0], none, neg(src[1])); + break; + case TGSI_OPCODE_XPD: + tmp = temp(vpc); + arith(vpc, 0, OP_MUL, tmp, mask, + swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none); + arith(vpc, 0, OP_MAD, dst, (mask & ~MASK_W), + swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y), + neg(tmp)); + break; + default: + NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode); + return FALSE; + } + + release_temps(vpc); + return TRUE; +} + +static boolean +nv40_vertprog_parse_decl_output(struct nv40_vpc *vpc, + const struct tgsi_full_declaration *fdec) +{ + unsigned idx = fdec->DeclarationRange.First; + int hw; + + switch (fdec->Semantic.SemanticName) { + case TGSI_SEMANTIC_POSITION: + hw = NV40_VP_INST_DEST_POS; + vpc->hpos_idx = idx; + break; + case TGSI_SEMANTIC_COLOR: + if (fdec->Semantic.SemanticIndex == 0) { + hw = NV40_VP_INST_DEST_COL0; + } else + if (fdec->Semantic.SemanticIndex == 1) { + hw = NV40_VP_INST_DEST_COL1; + } else { + NOUVEAU_ERR("bad colour semantic index\n"); + return FALSE; + } + break; + case TGSI_SEMANTIC_BCOLOR: + if (fdec->Semantic.SemanticIndex == 0) { + hw = NV40_VP_INST_DEST_BFC0; + } else + if (fdec->Semantic.SemanticIndex == 1) { + hw = NV40_VP_INST_DEST_BFC1; + } else { + NOUVEAU_ERR("bad bcolour semantic index\n"); + return FALSE; + } + break; + case TGSI_SEMANTIC_FOG: + hw = NV40_VP_INST_DEST_FOGC; + break; + case TGSI_SEMANTIC_PSIZE: + hw = NV40_VP_INST_DEST_PSZ; + break; + case TGSI_SEMANTIC_GENERIC: + if 
(fdec->Semantic.SemanticIndex <= 7) { + hw = NV40_VP_INST_DEST_TC(fdec->Semantic.SemanticIndex); + } else { + NOUVEAU_ERR("bad generic semantic index\n"); + return FALSE; + } + break; + default: + NOUVEAU_ERR("bad output semantic\n"); + return FALSE; + } + + vpc->r_result[idx] = nv40_sr(NV40SR_OUTPUT, hw); + return TRUE; +} + +static boolean +nv40_vertprog_prepare(struct nv40_vpc *vpc) +{ + struct tgsi_parse_context p; + int high_temp = -1, high_addr = -1, nr_imm = 0, i; + + tgsi_parse_init(&p, vpc->vp->pipe.tokens); + while (!tgsi_parse_end_of_tokens(&p)) { + const union tgsi_full_token *tok = &p.FullToken; + + tgsi_parse_token(&p); + switch(tok->Token.Type) { + case TGSI_TOKEN_TYPE_IMMEDIATE: + nr_imm++; + break; + case TGSI_TOKEN_TYPE_DECLARATION: + { + const struct tgsi_full_declaration *fdec; + + fdec = &p.FullToken.FullDeclaration; + switch (fdec->Declaration.File) { + case TGSI_FILE_TEMPORARY: + if (fdec->DeclarationRange.Last > high_temp) { + high_temp = + fdec->DeclarationRange.Last; + } + break; +#if 0 /* this would be nice.. except gallium doesn't track it */ + case TGSI_FILE_ADDRESS: + if (fdec->DeclarationRange.Last > high_addr) { + high_addr = + fdec->DeclarationRange.Last; + } + break; +#endif + case TGSI_FILE_OUTPUT: + if (!nv40_vertprog_parse_decl_output(vpc, fdec)) + return FALSE; + break; + default: + break; + } + } + break; +#if 1 /* yay, parse instructions looking for address regs instead */ + case TGSI_TOKEN_TYPE_INSTRUCTION: + { + const struct tgsi_full_instruction *finst; + const struct tgsi_full_dst_register *fdst; + + finst = &p.FullToken.FullInstruction; + fdst = &finst->FullDstRegisters[0]; + + if (fdst->DstRegister.File == TGSI_FILE_ADDRESS) { + if (fdst->DstRegister.Index > high_addr) + high_addr = fdst->DstRegister.Index; + } + + } + break; +#endif + default: + break; + } + } + tgsi_parse_free(&p); + + if (nr_imm) { + vpc->imm = CALLOC(nr_imm, sizeof(struct nv40_sreg)); + assert(vpc->imm); + } + + if (++high_temp) { + vpc->r_temp = CALLOC(high_temp, sizeof(struct nv40_sreg)); + for (i = 0; i < high_temp; i++) + vpc->r_temp[i] = temp(vpc); + } + + if (++high_addr) { + vpc->r_address = CALLOC(high_addr, sizeof(struct nv40_sreg)); + for (i = 0; i < high_addr; i++) + vpc->r_address[i] = temp(vpc); + } + + vpc->r_temps_discard = 0; + return TRUE; +} + +static void +nv40_vertprog_translate(struct nv40_context *nv40, + struct nv40_vertex_program *vp) +{ + struct tgsi_parse_context parse; + struct nv40_vpc *vpc = NULL; + struct nv40_sreg none = nv40_sr(NV40SR_NONE, 0); + int i; + + vpc = CALLOC(1, sizeof(struct nv40_vpc)); + if (!vpc) + return; + vpc->vp = vp; + + if (!nv40_vertprog_prepare(vpc)) { + FREE(vpc); + return; + } + + /* Redirect post-transform vertex position to a temp if user clip + * planes are enabled. We need to append code the the vtxprog + * to handle clip planes later. 
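A few TGSI opcodes in the translation above have no single NV40 instruction and are lowered in place; TGSI_OPCODE_POW, for example, becomes a scalar LG2, a MUL and a scalar EX2, relying on the identity pow(x, y) = 2^(y * log2(x)). A plain C rendering of that three-step expansion (compile with -lm):

```c
#include <math.h>
#include <stdio.h>

/* pow(x, y) computed the way the generated vertex program does it:
 * LG2 t, x;  MUL t, t, y;  EX2 dst, t */
static float
ex_pow_lowered(float x, float y)
{
	float t = log2f(x);   /* LG2: only meaningful for x > 0, as on hardware */
	t = t * y;            /* MUL */
	return exp2f(t);      /* EX2 */
}

int
main(void)
{
	printf("%f vs %f\n", ex_pow_lowered(2.0f, 10.0f), powf(2.0f, 10.0f));
	return 0;
}
```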
+ */ + if (vp->ucp.nr) { + vpc->r_result[vpc->hpos_idx] = temp(vpc); + vpc->r_temps_discard = 0; + } + + tgsi_parse_init(&parse, vp->pipe.tokens); + + while (!tgsi_parse_end_of_tokens(&parse)) { + tgsi_parse_token(&parse); + + switch (parse.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_IMMEDIATE: + { + const struct tgsi_full_immediate *imm; + + imm = &parse.FullToken.FullImmediate; + assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32); + assert(imm->Immediate.NrTokens == 4 + 1); + vpc->imm[vpc->nr_imm++] = + constant(vpc, -1, + imm->u.ImmediateFloat32[0].Float, + imm->u.ImmediateFloat32[1].Float, + imm->u.ImmediateFloat32[2].Float, + imm->u.ImmediateFloat32[3].Float); + } + break; + case TGSI_TOKEN_TYPE_INSTRUCTION: + { + const struct tgsi_full_instruction *finst; + finst = &parse.FullToken.FullInstruction; + if (!nv40_vertprog_parse_instruction(vpc, finst)) + goto out_err; + } + break; + default: + break; + } + } + + /* Write out HPOS if it was redirected to a temp earlier */ + if (vpc->r_result[vpc->hpos_idx].type != NV40SR_OUTPUT) { + struct nv40_sreg hpos = nv40_sr(NV40SR_OUTPUT, + NV40_VP_INST_DEST_POS); + struct nv40_sreg htmp = vpc->r_result[vpc->hpos_idx]; + + arith(vpc, 0, OP_MOV, hpos, MASK_ALL, htmp, none, none); + } + + /* Insert code to handle user clip planes */ + for (i = 0; i < vp->ucp.nr; i++) { + struct nv40_sreg cdst = nv40_sr(NV40SR_OUTPUT, + NV40_VP_INST_DEST_CLIP(i)); + struct nv40_sreg ceqn = constant(vpc, -1, + nv40->clip.ucp[i][0], + nv40->clip.ucp[i][1], + nv40->clip.ucp[i][2], + nv40->clip.ucp[i][3]); + struct nv40_sreg htmp = vpc->r_result[vpc->hpos_idx]; + unsigned mask; + + switch (i) { + case 0: case 3: mask = MASK_Y; break; + case 1: case 4: mask = MASK_Z; break; + case 2: case 5: mask = MASK_W; break; + default: + NOUVEAU_ERR("invalid clip dist #%d\n", i); + goto out_err; + } + + arith(vpc, 0, OP_DP4, cdst, mask, htmp, ceqn, none); + } + + vp->insns[vp->nr_insns - 1].data[3] |= NV40_VP_INST_LAST; + vp->translated = TRUE; +out_err: + tgsi_parse_free(&parse); + if (vpc->r_temp) + FREE(vpc->r_temp); + if (vpc->r_address) + FREE(vpc->r_address); + if (vpc->imm) + FREE(vpc->imm); + FREE(vpc); +} + +static boolean +nv40_vertprog_validate(struct nv40_context *nv40) +{ + struct nouveau_winsys *nvws = nv40->nvws; + struct pipe_winsys *ws = nv40->pipe.winsys; + struct nouveau_grobj *curie = nv40->screen->curie; + struct nv40_vertex_program *vp; + struct pipe_buffer *constbuf; + boolean upload_code = FALSE, upload_data = FALSE; + int i; + + if (nv40->render_mode == HW) { + vp = nv40->vertprog; + constbuf = nv40->constbuf[PIPE_SHADER_VERTEX]; + + if ((nv40->dirty & NV40_NEW_UCP) || + memcmp(&nv40->clip, &vp->ucp, sizeof(vp->ucp))) { + nv40_vertprog_destroy(nv40, vp); + memcpy(&vp->ucp, &nv40->clip, sizeof(vp->ucp)); + } + } else { + vp = nv40->swtnl.vertprog; + constbuf = NULL; + } + + /* Translate TGSI shader into hw bytecode */ + if (vp->translated) + goto check_gpu_resources; + + nv40->fallback_swtnl &= ~NV40_NEW_VERTPROG; + nv40_vertprog_translate(nv40, vp); + if (!vp->translated) { + nv40->fallback_swtnl |= NV40_NEW_VERTPROG; + return FALSE; + } + +check_gpu_resources: + /* Allocate hw vtxprog exec slots */ + if (!vp->exec) { + struct nouveau_resource *heap = nv40->screen->vp_exec_heap; + struct nouveau_stateobj *so; + uint vplen = vp->nr_insns; + + if (nvws->res_alloc(heap, vplen, vp, &vp->exec)) { + while (heap->next && heap->size < vplen) { + struct nv40_vertex_program *evict; + + evict = heap->next->priv; + nvws->res_free(&evict->exec); + } + + if 
(nvws->res_alloc(heap, vplen, vp, &vp->exec)) + assert(0); + } + + so = so_new(7, 0); + so_method(so, curie, NV40TCL_VP_START_FROM_ID, 1); + so_data (so, vp->exec->start); + so_method(so, curie, NV40TCL_VP_ATTRIB_EN, 2); + so_data (so, vp->ir); + so_data (so, vp->or); + so_method(so, curie, NV40TCL_CLIP_PLANE_ENABLE, 1); + so_data (so, vp->clip_ctrl); + so_ref(so, &vp->so); + + upload_code = TRUE; + } + + /* Allocate hw vtxprog const slots */ + if (vp->nr_consts && !vp->data) { + struct nouveau_resource *heap = nv40->screen->vp_data_heap; + + if (nvws->res_alloc(heap, vp->nr_consts, vp, &vp->data)) { + while (heap->next && heap->size < vp->nr_consts) { + struct nv40_vertex_program *evict; + + evict = heap->next->priv; + nvws->res_free(&evict->data); + } + + if (nvws->res_alloc(heap, vp->nr_consts, vp, &vp->data)) + assert(0); + } + + /*XXX: handle this some day */ + assert(vp->data->start >= vp->data_start_min); + + upload_data = TRUE; + if (vp->data_start != vp->data->start) + upload_code = TRUE; + } + + /* If exec or data segments moved we need to patch the program to + * fixup offsets and register IDs. + */ + if (vp->exec_start != vp->exec->start) { + for (i = 0; i < vp->nr_insns; i++) { + struct nv40_vertex_program_exec *vpi = &vp->insns[i]; + + if (vpi->has_branch_offset) { + assert(0); + } + } + + vp->exec_start = vp->exec->start; + } + + if (vp->nr_consts && vp->data_start != vp->data->start) { + for (i = 0; i < vp->nr_insns; i++) { + struct nv40_vertex_program_exec *vpi = &vp->insns[i]; + + if (vpi->const_index >= 0) { + vpi->data[1] &= ~NV40_VP_INST_CONST_SRC_MASK; + vpi->data[1] |= + (vpi->const_index + vp->data->start) << + NV40_VP_INST_CONST_SRC_SHIFT; + + } + } + + vp->data_start = vp->data->start; + } + + /* Update + Upload constant values */ + if (vp->nr_consts) { + float *map = NULL; + + if (constbuf) { + map = ws->buffer_map(ws, constbuf, + PIPE_BUFFER_USAGE_CPU_READ); + } + + for (i = 0; i < vp->nr_consts; i++) { + struct nv40_vertex_program_data *vpd = &vp->consts[i]; + + if (vpd->index >= 0) { + if (!upload_data && + !memcmp(vpd->value, &map[vpd->index * 4], + 4 * sizeof(float))) + continue; + memcpy(vpd->value, &map[vpd->index * 4], + 4 * sizeof(float)); + } + + BEGIN_RING(curie, NV40TCL_VP_UPLOAD_CONST_ID, 5); + OUT_RING (i + vp->data->start); + OUT_RINGp ((uint32_t *)vpd->value, 4); + } + + if (constbuf) + ws->buffer_unmap(ws, constbuf); + } + + /* Upload vtxprog */ + if (upload_code) { +#if 0 + for (i = 0; i < vp->nr_insns; i++) { + NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[0]); + NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[1]); + NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[2]); + NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[3]); + } +#endif + BEGIN_RING(curie, NV40TCL_VP_UPLOAD_FROM_ID, 1); + OUT_RING (vp->exec->start); + for (i = 0; i < vp->nr_insns; i++) { + BEGIN_RING(curie, NV40TCL_VP_UPLOAD_INST(0), 4); + OUT_RINGp (vp->insns[i].data, 4); + } + } + + if (vp->so != nv40->state.hw[NV40_STATE_VERTPROG]) { + so_ref(vp->so, &nv40->state.hw[NV40_STATE_VERTPROG]); + return TRUE; + } + + return FALSE; +} + +void +nv40_vertprog_destroy(struct nv40_context *nv40, struct nv40_vertex_program *vp) +{ + struct nouveau_winsys *nvws = nv40->screen->nvws; + + vp->translated = FALSE; + + if (vp->nr_insns) { + FREE(vp->insns); + vp->insns = NULL; + vp->nr_insns = 0; + } + + if (vp->nr_consts) { + FREE(vp->consts); + vp->consts = NULL; + vp->nr_consts = 0; + } + + nvws->res_free(&vp->exec); + vp->exec_start = 0; + nvws->res_free(&vp->data); + 
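When nv40_vertprog_validate() refreshes constants, it compares each slot's shadow copy against the mapped constant buffer and only pushes slots whose contents changed, or every slot when the data segment has just been reallocated. A reduced sketch of that redundancy check; the types and the ex_upload_const() placeholder are illustrative, standing in for the real push-buffer upload:

```c
#include <string.h>
#include <stdbool.h>

struct ex_const {
	int   index;     /* slot in the user constant buffer, or -1 for immediates */
	float value[4];  /* shadow copy of what the hardware currently holds       */
};

/* Placeholder for the real upload of one float[4] constant. */
static void ex_upload_const(unsigned hw_slot, const float v[4]) { (void)hw_slot; (void)v; }

static void
ex_update_consts(struct ex_const *consts, unsigned nr, const float *user_buf,
                 unsigned hw_base, bool force)
{
	unsigned i;

	for (i = 0; i < nr; i++) {
		struct ex_const *c = &consts[i];

		if (c->index >= 0) {
			const float *src = &user_buf[c->index * 4];

			/* unchanged value: nothing to send unless forced */
			if (!force && !memcmp(c->value, src, 4 * sizeof(float)))
				continue;
			memcpy(c->value, src, 4 * sizeof(float));
		}
		ex_upload_const(hw_base + i, c->value);
	}
}
```

Immediates (index -1) have no user-buffer source, so they are re-sent whenever the loop runs, which matches the driver's behaviour.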
vp->data_start = 0; + vp->data_start_min = 0; + + vp->ir = vp->or = vp->clip_ctrl = 0; + so_ref(NULL, &vp->so); +} + +struct nv40_state_entry nv40_state_vertprog = { + .validate = nv40_vertprog_validate, + .dirty = { + .pipe = NV40_NEW_VERTPROG | NV40_NEW_UCP, + .hw = NV40_STATE_VERTPROG, + } +}; + diff --git a/src/gallium/drivers/nv50/Makefile b/src/gallium/drivers/nv50/Makefile new file mode 100644 index 0000000000..612aea28a3 --- /dev/null +++ b/src/gallium/drivers/nv50/Makefile @@ -0,0 +1,21 @@ +TOP = ../../../.. +include $(TOP)/configs/current + +LIBNAME = nv50 + +C_SOURCES = \ + nv50_clear.c \ + nv50_context.c \ + nv50_draw.c \ + nv50_miptree.c \ + nv50_query.c \ + nv50_program.c \ + nv50_screen.c \ + nv50_state.c \ + nv50_state_validate.c \ + nv50_surface.c \ + nv50_tex.c \ + nv50_transfer.c \ + nv50_vbo.c + +include ../../Makefile.template diff --git a/src/gallium/drivers/nv50/nv50_clear.c b/src/gallium/drivers/nv50/nv50_clear.c new file mode 100644 index 0000000000..f9bc3b53ca --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_clear.c @@ -0,0 +1,92 @@ +/* + * Copyright 2008 Ben Skeggs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "nv50_context.h" + +void +nv50_clear(struct pipe_context *pipe, struct pipe_surface *ps, + unsigned clearValue) +{ + struct nv50_context *nv50 = nv50_context(pipe); + struct nouveau_channel *chan = nv50->screen->nvws->channel; + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct pipe_framebuffer_state fb, s_fb = nv50->framebuffer; + struct pipe_scissor_state sc, s_sc = nv50->scissor; + unsigned dirty = nv50->dirty; + + nv50->dirty = 0; + + if (ps->format == PIPE_FORMAT_Z24S8_UNORM || + ps->format == PIPE_FORMAT_Z16_UNORM) { + fb.nr_cbufs = 0; + fb.zsbuf = ps; + } else { + fb.nr_cbufs = 1; + fb.cbufs[0] = ps; + fb.zsbuf = NULL; + } + fb.width = ps->width; + fb.height = ps->height; + pipe->set_framebuffer_state(pipe, &fb); + + sc.minx = sc.miny = 0; + sc.maxx = fb.width; + sc.maxy = fb.height; + pipe->set_scissor_state(pipe, &sc); + + nv50_state_validate(nv50); + + switch (ps->format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + BEGIN_RING(chan, tesla, 0x0d80, 4); + OUT_RINGf (chan, ubyte_to_float((clearValue >> 16) & 0xff)); + OUT_RINGf (chan, ubyte_to_float((clearValue >> 8) & 0xff)); + OUT_RINGf (chan, ubyte_to_float((clearValue >> 0) & 0xff)); + OUT_RINGf (chan, ubyte_to_float((clearValue >> 24) & 0xff)); + BEGIN_RING(chan, tesla, 0x19d0, 1); + OUT_RING (chan, 0x3c); + break; + case PIPE_FORMAT_Z24S8_UNORM: + BEGIN_RING(chan, tesla, 0x0d90, 1); + OUT_RINGf (chan, (float)(clearValue >> 8) * (1.0 / 16777215.0)); + BEGIN_RING(chan, tesla, 0x0da0, 1); + OUT_RING (chan, clearValue & 0xff); + BEGIN_RING(chan, tesla, 0x19d0, 1); + OUT_RING (chan, 0x03); + break; + default: + pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, + clearValue); + break; + } + + pipe->set_framebuffer_state(pipe, &s_fb); + pipe->set_scissor_state(pipe, &s_sc); + nv50->dirty |= dirty; + + ps->status = PIPE_SURFACE_STATUS_CLEAR; +} + diff --git a/src/gallium/drivers/nv50/nv50_context.c b/src/gallium/drivers/nv50/nv50_context.c new file mode 100644 index 0000000000..565a5da668 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_context.c @@ -0,0 +1,90 @@ +/* + * Copyright 2008 Ben Skeggs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "draw/draw_context.h" +#include "pipe/p_defines.h" +#include "pipe/internal/p_winsys_screen.h" + +#include "nv50_context.h" +#include "nv50_screen.h" + +static void +nv50_flush(struct pipe_context *pipe, unsigned flags, + struct pipe_fence_handle **fence) +{ + struct nv50_context *nv50 = (struct nv50_context *)pipe; + + FIRE_RING(nv50->screen->nvws->channel); +} + +static void +nv50_destroy(struct pipe_context *pipe) +{ + struct nv50_context *nv50 = (struct nv50_context *)pipe; + + draw_destroy(nv50->draw); + FREE(nv50); +} + + +static void +nv50_set_edgeflags(struct pipe_context *pipe, const unsigned *bitfield) +{ +} + +struct pipe_context * +nv50_create(struct pipe_screen *pscreen, unsigned pctx_id) +{ + struct pipe_winsys *pipe_winsys = pscreen->winsys; + struct nv50_screen *screen = nv50_screen(pscreen); + struct nv50_context *nv50; + + nv50 = CALLOC_STRUCT(nv50_context); + if (!nv50) + return NULL; + nv50->screen = screen; + nv50->pctx_id = pctx_id; + + nv50->pipe.winsys = pipe_winsys; + nv50->pipe.screen = pscreen; + + nv50->pipe.destroy = nv50_destroy; + + nv50->pipe.set_edgeflags = nv50_set_edgeflags; + nv50->pipe.draw_arrays = nv50_draw_arrays; + nv50->pipe.draw_elements = nv50_draw_elements; + nv50->pipe.clear = nv50_clear; + + nv50->pipe.flush = nv50_flush; + + nv50_init_surface_functions(nv50); + nv50_init_state_functions(nv50); + nv50_init_query_functions(nv50); + + nv50->draw = draw_create(); + assert(nv50->draw); + draw_set_rasterize_stage(nv50->draw, nv50_draw_render_stage(nv50)); + + return &nv50->pipe; +} + + diff --git a/src/gallium/drivers/nv50/nv50_context.h b/src/gallium/drivers/nv50/nv50_context.h new file mode 100644 index 0000000000..313e435e7a --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_context.h @@ -0,0 +1,201 @@ +#ifndef __NV50_CONTEXT_H__ +#define __NV50_CONTEXT_H__ + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_compiler.h" + +#include "util/u_memory.h" +#include "util/u_math.h" + +#include "draw/draw_vertex.h" + +#include "nouveau/nouveau_winsys.h" +#include "nouveau/nouveau_gldefs.h" +#include "nouveau/nouveau_stateobj.h" + +#include "nv50_screen.h" +#include "nv50_program.h" + +#define NOUVEAU_ERR(fmt, args...) \ + fprintf(stderr, "%s:%d - "fmt, __func__, __LINE__, ##args); +#define NOUVEAU_MSG(fmt, args...) 
\ + fprintf(stderr, "nouveau: "fmt, ##args); + +/* Constant buffer assignment */ +#define NV50_CB_PMISC 0 +#define NV50_CB_PVP 1 +#define NV50_CB_PFP 2 +#define NV50_CB_PGP 3 +#define NV50_CB_TIC 4 +#define NV50_CB_TSC 5 +#define NV50_CB_PUPLOAD 6 + +#define NV50_NEW_BLEND (1 << 0) +#define NV50_NEW_ZSA (1 << 1) +#define NV50_NEW_BLEND_COLOUR (1 << 2) +#define NV50_NEW_STIPPLE (1 << 3) +#define NV50_NEW_SCISSOR (1 << 4) +#define NV50_NEW_VIEWPORT (1 << 5) +#define NV50_NEW_RASTERIZER (1 << 6) +#define NV50_NEW_FRAMEBUFFER (1 << 7) +#define NV50_NEW_VERTPROG (1 << 8) +#define NV50_NEW_VERTPROG_CB (1 << 9) +#define NV50_NEW_FRAGPROG (1 << 10) +#define NV50_NEW_FRAGPROG_CB (1 << 11) +#define NV50_NEW_ARRAYS (1 << 12) +#define NV50_NEW_SAMPLER (1 << 13) +#define NV50_NEW_TEXTURE (1 << 14) + +struct nv50_blend_stateobj { + struct pipe_blend_state pipe; + struct nouveau_stateobj *so; +}; + +struct nv50_zsa_stateobj { + struct pipe_depth_stencil_alpha_state pipe; + struct nouveau_stateobj *so; +}; + +struct nv50_rasterizer_stateobj { + struct pipe_rasterizer_state pipe; + struct nouveau_stateobj *so; +}; + +struct nv50_miptree_level { + int *image_offset; + unsigned pitch; +}; + +struct nv50_miptree { + struct pipe_texture base; + struct pipe_buffer *buffer; + + struct nv50_miptree_level level[PIPE_MAX_TEXTURE_LEVELS]; + int image_nr; + int total_size; +}; + +static INLINE struct nv50_miptree * +nv50_miptree(struct pipe_texture *pt) +{ + return (struct nv50_miptree *)pt; +} + +struct nv50_surface { + struct pipe_surface base; +}; + +static INLINE struct nv50_surface * +nv50_surface(struct pipe_surface *pt) +{ + return (struct nv50_surface *)pt; +} + +static INLINE struct pipe_buffer * +nv50_surface_buffer(struct pipe_surface *surface) +{ + struct nv50_miptree *mt = (struct nv50_miptree *)surface->texture; + return mt->buffer; +} + +struct nv50_state { + unsigned dirty; + + struct nouveau_stateobj *fb; + struct nouveau_stateobj *blend; + struct nouveau_stateobj *blend_colour; + struct nouveau_stateobj *zsa; + struct nouveau_stateobj *rast; + struct nouveau_stateobj *stipple; + struct nouveau_stateobj *scissor; + unsigned scissor_enabled; + struct nouveau_stateobj *viewport; + unsigned viewport_bypass; + struct nouveau_stateobj *tsc_upload; + struct nouveau_stateobj *tic_upload; + struct nouveau_stateobj *vertprog; + struct nouveau_stateobj *fragprog; + struct nouveau_stateobj *vtxfmt; + struct nouveau_stateobj *vtxbuf; +}; + +struct nv50_context { + struct pipe_context pipe; + + struct nv50_screen *screen; + unsigned pctx_id; + + struct draw_context *draw; + + struct nv50_state state; + + unsigned dirty; + struct nv50_blend_stateobj *blend; + struct nv50_zsa_stateobj *zsa; + struct nv50_rasterizer_stateobj *rasterizer; + struct pipe_blend_color blend_colour; + struct pipe_poly_stipple stipple; + struct pipe_scissor_state scissor; + struct pipe_viewport_state viewport; + struct pipe_framebuffer_state framebuffer; + struct nv50_program *vertprog; + struct nv50_program *fragprog; + struct pipe_buffer *constbuf[PIPE_SHADER_TYPES]; + struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS]; + unsigned vtxbuf_nr; + struct pipe_vertex_element vtxelt[PIPE_MAX_ATTRIBS]; + unsigned vtxelt_nr; + unsigned *sampler[PIPE_MAX_SAMPLERS]; + unsigned sampler_nr; + struct nv50_miptree *miptree[PIPE_MAX_SAMPLERS]; + unsigned miptree_nr; +}; + +static INLINE struct nv50_context * +nv50_context(struct pipe_context *pipe) +{ + return (struct nv50_context *)pipe; +} + +extern void nv50_init_surface_functions(struct 
nv50_context *nv50); +extern void nv50_init_state_functions(struct nv50_context *nv50); +extern void nv50_init_query_functions(struct nv50_context *nv50); + +extern void nv50_screen_init_miptree_functions(struct pipe_screen *pscreen); + +extern int +nv50_surface_do_copy(struct nv50_screen *screen, struct pipe_surface *dst, + int dx, int dy, struct pipe_surface *src, int sx, int sy, + int w, int h); + +/* nv50_draw.c */ +extern struct draw_stage *nv50_draw_render_stage(struct nv50_context *nv50); + +/* nv50_vbo.c */ +extern boolean nv50_draw_arrays(struct pipe_context *, unsigned mode, + unsigned start, unsigned count); +extern boolean nv50_draw_elements(struct pipe_context *pipe, + struct pipe_buffer *indexBuffer, + unsigned indexSize, + unsigned mode, unsigned start, + unsigned count); +extern void nv50_vbo_validate(struct nv50_context *nv50); + +/* nv50_clear.c */ +extern void nv50_clear(struct pipe_context *pipe, struct pipe_surface *ps, + unsigned clearValue); + +/* nv50_program.c */ +extern void nv50_vertprog_validate(struct nv50_context *nv50); +extern void nv50_fragprog_validate(struct nv50_context *nv50); +extern void nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p); + +/* nv50_state_validate.c */ +extern boolean nv50_state_validate(struct nv50_context *nv50); + +/* nv50_tex.c */ +extern void nv50_tex_validate(struct nv50_context *); + +#endif diff --git a/src/gallium/drivers/nv50/nv50_draw.c b/src/gallium/drivers/nv50/nv50_draw.c new file mode 100644 index 0000000000..2f6f607261 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_draw.c @@ -0,0 +1,89 @@ +/* + * Copyright 2008 Ben Skeggs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "draw/draw_pipe.h" + +#include "nv50_context.h" + +struct nv50_render_stage { + struct draw_stage stage; + struct nv50_context *nv50; +}; + +static INLINE struct nv50_render_stage * +nv50_render_stage(struct draw_stage *stage) +{ + return (struct nv50_render_stage *)stage; +} + +static void +nv50_render_point(struct draw_stage *stage, struct prim_header *prim) +{ + NOUVEAU_ERR("\n"); +} + +static void +nv50_render_line(struct draw_stage *stage, struct prim_header *prim) +{ + NOUVEAU_ERR("\n"); +} + +static void +nv50_render_tri(struct draw_stage *stage, struct prim_header *prim) +{ + NOUVEAU_ERR("\n"); +} + +static void +nv50_render_flush(struct draw_stage *stage, unsigned flags) +{ +} + +static void +nv50_render_reset_stipple_counter(struct draw_stage *stage) +{ + NOUVEAU_ERR("\n"); +} + +static void +nv50_render_destroy(struct draw_stage *stage) +{ + FREE(stage); +} + +struct draw_stage * +nv50_draw_render_stage(struct nv50_context *nv50) +{ + struct nv50_render_stage *rs = CALLOC_STRUCT(nv50_render_stage); + + rs->nv50 = nv50; + rs->stage.draw = nv50->draw; + rs->stage.destroy = nv50_render_destroy; + rs->stage.point = nv50_render_point; + rs->stage.line = nv50_render_line; + rs->stage.tri = nv50_render_tri; + rs->stage.flush = nv50_render_flush; + rs->stage.reset_stipple_counter = nv50_render_reset_stipple_counter; + + return &rs->stage; +} + diff --git a/src/gallium/drivers/nv50/nv50_miptree.c b/src/gallium/drivers/nv50/nv50_miptree.c new file mode 100644 index 0000000000..24a68b7235 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_miptree.c @@ -0,0 +1,207 @@ +/* + * Copyright 2008 Ben Skeggs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" + +#include "nv50_context.h" + +static struct pipe_texture * +nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *tmp) +{ + struct pipe_winsys *ws = pscreen->winsys; + struct nv50_miptree *mt = CALLOC_STRUCT(nv50_miptree); + struct pipe_texture *pt = &mt->base; + unsigned usage, width = tmp->width[0], height = tmp->height[0]; + unsigned depth = tmp->depth[0]; + int i, l; + + mt->base = *tmp; + mt->base.refcount = 1; + mt->base.screen = pscreen; + + usage = PIPE_BUFFER_USAGE_PIXEL; + switch (pt->format) { + case PIPE_FORMAT_Z24S8_UNORM: + case PIPE_FORMAT_Z16_UNORM: + usage |= NOUVEAU_BUFFER_USAGE_ZETA; + break; + default: + break; + } + + switch (pt->target) { + case PIPE_TEXTURE_3D: + mt->image_nr = pt->depth[0]; + break; + case PIPE_TEXTURE_CUBE: + mt->image_nr = 6; + break; + default: + mt->image_nr = 1; + break; + } + + for (l = 0; l <= pt->last_level; l++) { + struct nv50_miptree_level *lvl = &mt->level[l]; + + pt->width[l] = width; + pt->height[l] = height; + pt->depth[l] = depth; + pt->nblocksx[l] = pf_get_nblocksx(&pt->block, width); + pt->nblocksy[l] = pf_get_nblocksy(&pt->block, height); + + lvl->image_offset = CALLOC(mt->image_nr, sizeof(int)); + lvl->pitch = align(pt->width[l] * pt->block.size, 64); + + width = MAX2(1, width >> 1); + height = MAX2(1, height >> 1); + depth = MAX2(1, depth >> 1); + } + + for (i = 0; i < mt->image_nr; i++) { + for (l = 0; l <= pt->last_level; l++) { + struct nv50_miptree_level *lvl = &mt->level[l]; + int size; + + size = align(pt->width[l], 8) * pt->block.size; + size = align(size, 64); + size *= align(pt->height[l], 8) * pt->block.size; + + lvl->image_offset[i] = mt->total_size; + + mt->total_size += size; + } + } + + mt->buffer = ws->buffer_create(ws, 256, usage, mt->total_size); + if (!mt->buffer) { + FREE(mt); + return NULL; + } + + return &mt->base; +} + +static struct pipe_texture * +nv50_miptree_blanket(struct pipe_screen *pscreen, const struct pipe_texture *pt, + const unsigned *stride, struct pipe_buffer *pb) +{ + struct nv50_miptree *mt; + + /* Only supports 2D, non-mipmapped textures for the moment */ + if (pt->target != PIPE_TEXTURE_2D || pt->last_level != 0 || + pt->depth[0] != 1) + return NULL; + + mt = CALLOC_STRUCT(nv50_miptree); + if (!mt) + return NULL; + + mt->base = *pt; + mt->base.refcount = 1; + mt->base.screen = pscreen; + mt->image_nr = 1; + mt->level[0].pitch = *stride; + mt->level[0].image_offset = CALLOC(1, sizeof(unsigned)); + + pipe_buffer_reference(pscreen, &mt->buffer, pb); + return &mt->base; +} + +static void +nv50_miptree_release(struct pipe_screen *pscreen, struct pipe_texture **ppt) +{ + struct pipe_texture *pt = *ppt; + + *ppt = NULL; + + if (--pt->refcount <= 0) { + struct nv50_miptree *mt = nv50_miptree(pt); + + pipe_buffer_reference(pscreen, &mt->buffer, NULL); + FREE(mt); + } +} + +static struct pipe_surface * +nv50_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_texture *pt, + unsigned face, unsigned level, unsigned zslice, + unsigned flags) +{ + struct nv50_miptree *mt = nv50_miptree(pt); + struct nv50_miptree_level *lvl = &mt->level[level]; + struct pipe_surface *ps; + int img; + + if (pt->target == PIPE_TEXTURE_CUBE) + img = face; + else + if (pt->target == PIPE_TEXTURE_3D) + img = zslice; + else + img = 0; + + ps = CALLOC_STRUCT(pipe_surface); + if (!ps) + return NULL; + pipe_texture_reference(&ps->texture, pt); + ps->format = pt->format; + ps->width = pt->width[level]; + 
ps->height = pt->height[level]; + ps->usage = flags; + ps->status = PIPE_SURFACE_STATUS_DEFINED; + ps->refcount = 1; + ps->face = face; + ps->level = level; + ps->zslice = zslice; + ps->offset = lvl->image_offset[img]; + + return ps; +} + +static void +nv50_miptree_surface_del(struct pipe_screen *pscreen, + struct pipe_surface **psurface) +{ + struct pipe_surface *ps = *psurface; + struct nv50_surface *s = nv50_surface(ps); + + *psurface = NULL; + + if (--ps->refcount <= 0) { + pipe_texture_reference(&ps->texture, NULL); + FREE(s); + } +} + +void +nv50_screen_init_miptree_functions(struct pipe_screen *pscreen) +{ + pscreen->texture_create = nv50_miptree_create; + pscreen->texture_blanket = nv50_miptree_blanket; + pscreen->texture_release = nv50_miptree_release; + pscreen->get_tex_surface = nv50_miptree_surface_new; + pscreen->tex_surface_release = nv50_miptree_surface_del; +} + diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c new file mode 100644 index 0000000000..14c5d47e79 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_program.c @@ -0,0 +1,1784 @@ +/* + * Copyright 2008 Ben Skeggs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_inlines.h" + +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" + +#include "nv50_context.h" + +#define NV50_SU_MAX_TEMP 64 +//#define NV50_PROGRAM_DUMP + +/* ARL - gallium craps itself on progs/vp/arl.txt + * + * MSB - Like MAD, but MUL+SUB + * - Fuck it off, introduce a way to negate args for ops that + * support it. + * + * Look into inlining IMMD for ops other than MOV (make it general?) + * - Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD, + * but can emit to P_TEMP first - then MOV later. NVIDIA does this + * + * In ops such as ADD it's possible to construct a bad opcode in the !is_long() + * case, if the emit_src() causes the inst to suddenly become long. + * + * Verify half-insns work where expected - and force disable them where they + * don't work - MUL has it forcibly disabled atm as it fixes POW.. + * + * FUCK! watch dst==src vectors, can overwrite components that are needed. + * ie. SUB R0, R0.yzxw, R0 + * + * Things to check with renouveau: + * FP attr/result assignment - how? 
+ * attrib + * - 0x16bc maps vp output onto fp hpos + * - 0x16c0 maps vp output onto fp col0 + * result + * - colr always 0-3 + * - depr always 4 + * 0x16bc->0x16e8 --> some binding between vp/fp regs + * 0x16b8 --> VP output count + * + * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005 + * "MOV rcol.x, fcol.y" = 0x00000004 + * 0x19a8 --> as above but 0x00000100 and 0x00000000 + * - 0x00100000 used when KIL used + * 0x196c --> as above but 0x00000011 and 0x00000000 + * + * 0x1988 --> 0xXXNNNNNN + * - XX == FP high something + */ +struct nv50_reg { + enum { + P_TEMP, + P_ATTR, + P_RESULT, + P_CONST, + P_IMMD + } type; + int index; + + int hw; + int neg; +}; + +struct nv50_pc { + struct nv50_program *p; + + /* hw resources */ + struct nv50_reg *r_temp[NV50_SU_MAX_TEMP]; + + /* tgsi resources */ + struct nv50_reg *temp; + int temp_nr; + struct nv50_reg *attr; + int attr_nr; + struct nv50_reg *result; + int result_nr; + struct nv50_reg *param; + int param_nr; + struct nv50_reg *immd; + float *immd_buf; + int immd_nr; + + struct nv50_reg *temp_temp[16]; + unsigned temp_temp_nr; +}; + +static void +alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg) +{ + int i; + + if (reg->type == P_RESULT) { + if (pc->p->cfg.high_result < (reg->hw + 1)) + pc->p->cfg.high_result = reg->hw + 1; + } + + if (reg->type != P_TEMP) + return; + + if (reg->hw >= 0) { + /*XXX: do this here too to catch FP temp-as-attr usage.. + * not clean, but works */ + if (pc->p->cfg.high_temp < (reg->hw + 1)) + pc->p->cfg.high_temp = reg->hw + 1; + return; + } + + for (i = 0; i < NV50_SU_MAX_TEMP; i++) { + if (!(pc->r_temp[i])) { + pc->r_temp[i] = reg; + reg->hw = i; + if (pc->p->cfg.high_temp < (i + 1)) + pc->p->cfg.high_temp = i + 1; + return; + } + } + + assert(0); +} + +static struct nv50_reg * +alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst) +{ + struct nv50_reg *r; + int i; + + if (dst && dst->type == P_TEMP && dst->hw == -1) + return dst; + + for (i = 0; i < NV50_SU_MAX_TEMP; i++) { + if (!pc->r_temp[i]) { + r = CALLOC_STRUCT(nv50_reg); + r->type = P_TEMP; + r->index = -1; + r->hw = i; + pc->r_temp[i] = r; + return r; + } + } + + assert(0); + return NULL; +} + +static void +free_temp(struct nv50_pc *pc, struct nv50_reg *r) +{ + if (r->index == -1) { + unsigned hw = r->hw; + + FREE(pc->r_temp[hw]); + pc->r_temp[hw] = NULL; + } +} + +static int +alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx) +{ + int i; + + if ((idx + 4) >= NV50_SU_MAX_TEMP) + return 1; + + if (pc->r_temp[idx] || pc->r_temp[idx + 1] || + pc->r_temp[idx + 2] || pc->r_temp[idx + 3]) + return alloc_temp4(pc, dst, idx + 1); + + for (i = 0; i < 4; i++) { + dst[i] = CALLOC_STRUCT(nv50_reg); + dst[i]->type = P_TEMP; + dst[i]->index = -1; + dst[i]->hw = idx + i; + pc->r_temp[idx + i] = dst[i]; + } + + return 0; +} + +static void +free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4]) +{ + int i; + + for (i = 0; i < 4; i++) + free_temp(pc, reg[i]); +} + +static struct nv50_reg * +temp_temp(struct nv50_pc *pc) +{ + if (pc->temp_temp_nr >= 16) + assert(0); + + pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL); + return pc->temp_temp[pc->temp_temp_nr++]; +} + +static void +kill_temp_temp(struct nv50_pc *pc) +{ + int i; + + for (i = 0; i < pc->temp_temp_nr; i++) + free_temp(pc, pc->temp_temp[i]); + pc->temp_temp_nr = 0; +} + +static int +ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w) +{ + pc->immd_buf = REALLOC(pc->immd_buf, (pc->immd_nr * r * sizeof(float)), + (pc->immd_nr + 1) * 4 * sizeof(float)); + 
pc->immd_buf[(pc->immd_nr * 4) + 0] = x; + pc->immd_buf[(pc->immd_nr * 4) + 1] = y; + pc->immd_buf[(pc->immd_nr * 4) + 2] = z; + pc->immd_buf[(pc->immd_nr * 4) + 3] = w; + + return pc->immd_nr++; +} + +static struct nv50_reg * +alloc_immd(struct nv50_pc *pc, float f) +{ + struct nv50_reg *r = CALLOC_STRUCT(nv50_reg); + unsigned hw; + + hw = ctor_immd(pc, f, 0, 0, 0) * 4; + r->type = P_IMMD; + r->hw = hw; + r->index = -1; + return r; +} + +static struct nv50_program_exec * +exec(struct nv50_pc *pc) +{ + struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec); + + e->param.index = -1; + return e; +} + +static void +emit(struct nv50_pc *pc, struct nv50_program_exec *e) +{ + struct nv50_program *p = pc->p; + + if (p->exec_tail) + p->exec_tail->next = e; + if (!p->exec_head) + p->exec_head = e; + p->exec_tail = e; + p->exec_size += (e->inst[0] & 1) ? 2 : 1; +} + +static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *); + +static boolean +is_long(struct nv50_program_exec *e) +{ + if (e->inst[0] & 1) + return TRUE; + return FALSE; +} + +static boolean +is_immd(struct nv50_program_exec *e) +{ + if (is_long(e) && (e->inst[1] & 3) == 3) + return TRUE; + return FALSE; +} + +static INLINE void +set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx, + struct nv50_program_exec *e) +{ + set_long(pc, e); + e->inst[1] &= ~((0x1f << 7) | (0x3 << 12)); + e->inst[1] |= (pred << 7) | (idx << 12); +} + +static INLINE void +set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx, + struct nv50_program_exec *e) +{ + set_long(pc, e); + e->inst[1] &= ~((0x3 << 4) | (1 << 6)); + e->inst[1] |= (idx << 4) | (on << 6); +} + +static INLINE void +set_long(struct nv50_pc *pc, struct nv50_program_exec *e) +{ + if (is_long(e)) + return; + + e->inst[0] |= 1; + set_pred(pc, 0xf, 0, e); + set_pred_wr(pc, 0, 0, e); +} + +static INLINE void +set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e) +{ + if (dst->type == P_RESULT) { + set_long(pc, e); + e->inst[1] |= 0x00000008; + } + + alloc_reg(pc, dst); + e->inst[0] |= (dst->hw << 2); +} + +static INLINE void +set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e) +{ + unsigned val = fui(pc->immd_buf[imm->hw]); /* XXX */ + + set_long(pc, e); + /*XXX: can't be predicated - bits overlap.. catch cases where both + * are required and avoid them. 
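+ * (The upper bits of the 32-bit immediate get packed into inst[1] + * starting at bit 2 below, so they collide with the predicate select + * and write-enable fields at bits 4-13 that set_pred()/set_pred_wr() + * manage.)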
*/ + set_pred(pc, 0, 0, e); + set_pred_wr(pc, 0, 0, e); + + e->inst[1] |= 0x00000002 | 0x00000001; + e->inst[0] |= (val & 0x3f) << 16; + e->inst[1] |= (val >> 6) << 2; +} + +static void +emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, + struct nv50_reg *src, struct nv50_reg *iv) +{ + struct nv50_program_exec *e = exec(pc); + + e->inst[0] |= 0x80000000; + set_dst(pc, dst, e); + alloc_reg(pc, src); + e->inst[0] |= (src->hw << 16); + if (iv) { + e->inst[0] |= (1 << 25); + alloc_reg(pc, iv); + e->inst[0] |= (iv->hw << 9); + } + + emit(pc, e); +} + +static void +set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s, + struct nv50_program_exec *e) +{ + set_long(pc, e); +#if 1 + e->inst[1] |= (1 << 22); +#else + if (src->type == P_IMMD) { + e->inst[1] |= (NV50_CB_PMISC << 22); + } else { + if (pc->p->type == PIPE_SHADER_VERTEX) + e->inst[1] |= (NV50_CB_PVP << 22); + else + e->inst[1] |= (NV50_CB_PFP << 22); + } +#endif + + e->param.index = src->hw; + e->param.shift = s; + e->param.mask = m << (s % 32); +} + +static void +emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) +{ + struct nv50_program_exec *e = exec(pc); + + e->inst[0] |= 0x10000000; + + set_dst(pc, dst, e); + + if (0 && dst->type != P_RESULT && src->type == P_IMMD) { + set_immd(pc, src, e); + /*XXX: 32-bit, but steals part of "half" reg space - need to + * catch and handle this case if/when we do half-regs + */ + e->inst[0] |= 0x00008000; + } else + if (src->type == P_IMMD || src->type == P_CONST) { + set_long(pc, e); + set_data(pc, src, 0x7f, 9, e); + e->inst[1] |= 0x20000000; /* src0 const? */ + } else { + if (src->type == P_ATTR) { + set_long(pc, e); + e->inst[1] |= 0x00200000; + } + + alloc_reg(pc, src); + e->inst[0] |= (src->hw << 9); + } + + /* We really should support "half" instructions here at some point, + * but I don't feel confident enough about them yet. 
+ */ + set_long(pc, e); + if (is_long(e) && !is_immd(e)) { + e->inst[1] |= 0x04000000; /* 32-bit */ + e->inst[1] |= 0x0003c000; /* "subsubop" 0xf == mov */ + } + + emit(pc, e); +} + +static boolean +check_swap_src_0_1(struct nv50_pc *pc, + struct nv50_reg **s0, struct nv50_reg **s1) +{ + struct nv50_reg *src0 = *s0, *src1 = *s1; + + if (src0->type == P_CONST) { + if (src1->type != P_CONST) { + *s0 = src1; + *s1 = src0; + return TRUE; + } + } else + if (src1->type == P_ATTR) { + if (src0->type != P_ATTR) { + *s0 = src1; + *s1 = src0; + return TRUE; + } + } + + return FALSE; +} + +static void +set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) +{ + if (src->type == P_ATTR) { + set_long(pc, e); + e->inst[1] |= 0x00200000; + } else + if (src->type == P_CONST || src->type == P_IMMD) { + struct nv50_reg *temp = temp_temp(pc); + + emit_mov(pc, temp, src); + src = temp; + } + + alloc_reg(pc, src); + e->inst[0] |= (src->hw << 9); +} + +static void +set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) +{ + if (src->type == P_ATTR) { + struct nv50_reg *temp = temp_temp(pc); + + emit_mov(pc, temp, src); + src = temp; + } else + if (src->type == P_CONST || src->type == P_IMMD) { + assert(!(e->inst[0] & 0x00800000)); + if (e->inst[0] & 0x01000000) { + struct nv50_reg *temp = temp_temp(pc); + + emit_mov(pc, temp, src); + src = temp; + } else { + set_data(pc, src, 0x7f, 16, e); + e->inst[0] |= 0x00800000; + } + } + + alloc_reg(pc, src); + e->inst[0] |= (src->hw << 16); +} + +static void +set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) +{ + set_long(pc, e); + + if (src->type == P_ATTR) { + struct nv50_reg *temp = temp_temp(pc); + + emit_mov(pc, temp, src); + src = temp; + } else + if (src->type == P_CONST || src->type == P_IMMD) { + assert(!(e->inst[0] & 0x01000000)); + if (e->inst[0] & 0x00800000) { + struct nv50_reg *temp = temp_temp(pc); + + emit_mov(pc, temp, src); + src = temp; + } else { + set_data(pc, src, 0x7f, 32+14, e); + e->inst[0] |= 0x01000000; + } + } + + alloc_reg(pc, src); + e->inst[1] |= (src->hw << 14); +} + +static void +emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, + struct nv50_reg *src1) +{ + struct nv50_program_exec *e = exec(pc); + + e->inst[0] |= 0xc0000000; + set_long(pc, e); + + check_swap_src_0_1(pc, &src0, &src1); + set_dst(pc, dst, e); + set_src_0(pc, src0, e); + set_src_1(pc, src1, e); + + emit(pc, e); +} + +static void +emit_add(struct nv50_pc *pc, struct nv50_reg *dst, + struct nv50_reg *src0, struct nv50_reg *src1) +{ + struct nv50_program_exec *e = exec(pc); + + e->inst[0] |= 0xb0000000; + + check_swap_src_0_1(pc, &src0, &src1); + set_dst(pc, dst, e); + set_src_0(pc, src0, e); + if (is_long(e)) + set_src_2(pc, src1, e); + else + set_src_1(pc, src1, e); + + emit(pc, e); +} + +static void +emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst, + struct nv50_reg *src0, struct nv50_reg *src1) +{ + struct nv50_program_exec *e = exec(pc); + + set_long(pc, e); + e->inst[0] |= 0xb0000000; + e->inst[1] |= (sub << 29); + + check_swap_src_0_1(pc, &src0, &src1); + set_dst(pc, dst, e); + set_src_0(pc, src0, e); + set_src_1(pc, src1, e); + + emit(pc, e); +} + +static void +emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, + struct nv50_reg *src1) +{ + struct nv50_program_exec *e = exec(pc); + + e->inst[0] |= 0xb0000000; + + set_long(pc, e); + if (check_swap_src_0_1(pc, &src0, &src1)) + e->inst[1] |= 0x04000000; + else + e->inst[1] 
|= 0x08000000; + + set_dst(pc, dst, e); + set_src_0(pc, src0, e); + set_src_2(pc, src1, e); + + emit(pc, e); +} + +static void +emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, + struct nv50_reg *src1, struct nv50_reg *src2) +{ + struct nv50_program_exec *e = exec(pc); + + e->inst[0] |= 0xe0000000; + + check_swap_src_0_1(pc, &src0, &src1); + set_dst(pc, dst, e); + set_src_0(pc, src0, e); + set_src_1(pc, src1, e); + set_src_2(pc, src2, e); + + emit(pc, e); +} + +static void +emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, + struct nv50_reg *src1, struct nv50_reg *src2) +{ + struct nv50_program_exec *e = exec(pc); + + e->inst[0] |= 0xe0000000; + set_long(pc, e); + e->inst[1] |= 0x08000000; /* src0 * src1 - src2 */ + + check_swap_src_0_1(pc, &src0, &src1); + set_dst(pc, dst, e); + set_src_0(pc, src0, e); + set_src_1(pc, src1, e); + set_src_2(pc, src2, e); + + emit(pc, e); +} + +static void +emit_flop(struct nv50_pc *pc, unsigned sub, + struct nv50_reg *dst, struct nv50_reg *src) +{ + struct nv50_program_exec *e = exec(pc); + + e->inst[0] |= 0x90000000; + if (sub) { + set_long(pc, e); + e->inst[1] |= (sub << 29); + } + + set_dst(pc, dst, e); + set_src_0(pc, src, e); + + emit(pc, e); +} + +static void +emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) +{ + struct nv50_program_exec *e = exec(pc); + + e->inst[0] |= 0xb0000000; + + set_dst(pc, dst, e); + set_src_0(pc, src, e); + set_long(pc, e); + e->inst[1] |= (6 << 29) | 0x00004000; + + emit(pc, e); +} + +static void +emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) +{ + struct nv50_program_exec *e = exec(pc); + + e->inst[0] |= 0xb0000000; + + set_dst(pc, dst, e); + set_src_0(pc, src, e); + set_long(pc, e); + e->inst[1] |= (6 << 29); + + emit(pc, e); +} + +static void +emit_set(struct nv50_pc *pc, unsigned c_op, struct nv50_reg *dst, + struct nv50_reg *src0, struct nv50_reg *src1) +{ + struct nv50_program_exec *e = exec(pc); + unsigned inv_cop[8] = { 0, 4, 2, 6, 1, 5, 3, 7 }; + struct nv50_reg *rdst; + + assert(c_op <= 7); + if (check_swap_src_0_1(pc, &src0, &src1)) + c_op = inv_cop[c_op]; + + rdst = dst; + if (dst->type != P_TEMP) + dst = alloc_temp(pc, NULL); + + /* set.u32 */ + set_long(pc, e); + e->inst[0] |= 0xb0000000; + e->inst[1] |= (3 << 29); + e->inst[1] |= (c_op << 14); + /*XXX: breaks things, .u32 by default? + * decuda will disasm as .u16 and use .lo/.hi regs, but this + * doesn't seem to match what the hw actually does. + inst[1] |= 0x04000000; << breaks things.. .u32 by default? 
+ */ + set_dst(pc, dst, e); + set_src_0(pc, src0, e); + set_src_1(pc, src1, e); + emit(pc, e); + + /* cvt.f32.u32 */ + e = exec(pc); + e->inst[0] = 0xa0000001; + e->inst[1] = 0x64014780; + set_dst(pc, rdst, e); + set_src_0(pc, dst, e); + emit(pc, e); + + if (dst != rdst) + free_temp(pc, dst); +} + +static void +emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) +{ + struct nv50_program_exec *e = exec(pc); + + e->inst[0] = 0xa0000000; /* cvt */ + set_long(pc, e); + e->inst[1] |= (6 << 29); /* cvt */ + e->inst[1] |= 0x08000000; /* integer mode */ + e->inst[1] |= 0x04000000; /* 32 bit */ + e->inst[1] |= ((0x1 << 3)) << 14; /* .rn */ + e->inst[1] |= (1 << 14); /* src .f32 */ + set_dst(pc, dst, e); + set_src_0(pc, src, e); + + emit(pc, e); +} + +static void +emit_pow(struct nv50_pc *pc, struct nv50_reg *dst, + struct nv50_reg *v, struct nv50_reg *e) +{ + struct nv50_reg *temp = alloc_temp(pc, NULL); + + emit_flop(pc, 3, temp, v); + emit_mul(pc, temp, temp, e); + emit_preex2(pc, temp, temp); + emit_flop(pc, 6, dst, temp); + + free_temp(pc, temp); +} + +static void +emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) +{ + struct nv50_program_exec *e = exec(pc); + + e->inst[0] = 0xa0000000; /* cvt */ + set_long(pc, e); + e->inst[1] |= (6 << 29); /* cvt */ + e->inst[1] |= 0x04000000; /* 32 bit */ + e->inst[1] |= (1 << 14); /* src .f32 */ + e->inst[1] |= ((1 << 6) << 14); /* .abs */ + set_dst(pc, dst, e); + set_src_0(pc, src, e); + + emit(pc, e); +} + +static void +emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask, + struct nv50_reg **src) +{ + struct nv50_reg *one = alloc_immd(pc, 1.0); + struct nv50_reg *zero = alloc_immd(pc, 0.0); + struct nv50_reg *neg128 = alloc_immd(pc, -127.999999); + struct nv50_reg *pos128 = alloc_immd(pc, 127.999999); + struct nv50_reg *tmp[4]; + + if (mask & (1 << 0)) + emit_mov(pc, dst[0], one); + + if (mask & (1 << 3)) + emit_mov(pc, dst[3], one); + + if (mask & (3 << 1)) { + if (mask & (1 << 1)) + tmp[0] = dst[1]; + else + tmp[0] = temp_temp(pc); + emit_minmax(pc, 4, tmp[0], src[0], zero); + } + + if (mask & (1 << 2)) { + set_pred_wr(pc, 1, 0, pc->p->exec_tail); + + tmp[1] = temp_temp(pc); + emit_minmax(pc, 4, tmp[1], src[1], zero); + + tmp[3] = temp_temp(pc); + emit_minmax(pc, 4, tmp[3], src[3], neg128); + emit_minmax(pc, 5, tmp[3], tmp[3], pos128); + + emit_pow(pc, dst[2], tmp[1], tmp[3]); + emit_mov(pc, dst[2], zero); + set_pred(pc, 3, 0, pc->p->exec_tail); + } +} + +static void +emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) +{ + struct nv50_program_exec *e = exec(pc); + + set_long(pc, e); + e->inst[0] |= 0xa0000000; /* delta */ + e->inst[1] |= (7 << 29); /* delta */ + e->inst[1] |= 0x04000000; /* negate arg0? probably not */ + e->inst[1] |= (1 << 14); /* src .f32 */ + set_dst(pc, dst, e); + set_src_0(pc, src, e); + + emit(pc, e); +} + +static void +emit_kil(struct nv50_pc *pc, struct nv50_reg *src) +{ + struct nv50_program_exec *e; + const int r_pred = 1; + + /* Sets predicate reg ? */ + e = exec(pc); + e->inst[0] = 0xa00001fd; + e->inst[1] = 0xc4014788; + set_src_0(pc, src, e); + set_pred_wr(pc, 1, r_pred, e); + emit(pc, e); + + /* This is probably KILP */ + e = exec(pc); + e->inst[0] = 0x000001fe; + set_long(pc, e); + set_pred(pc, 1 /* LT? 
*/, r_pred, e); + emit(pc, e); +} + +static struct nv50_reg * +tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst) +{ + switch (dst->DstRegister.File) { + case TGSI_FILE_TEMPORARY: + return &pc->temp[dst->DstRegister.Index * 4 + c]; + case TGSI_FILE_OUTPUT: + return &pc->result[dst->DstRegister.Index * 4 + c]; + case TGSI_FILE_NULL: + return NULL; + default: + break; + } + + return NULL; +} + +static struct nv50_reg * +tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src) +{ + struct nv50_reg *r = NULL; + struct nv50_reg *temp; + unsigned c; + + c = tgsi_util_get_full_src_register_extswizzle(src, chan); + switch (c) { + case TGSI_EXTSWIZZLE_X: + case TGSI_EXTSWIZZLE_Y: + case TGSI_EXTSWIZZLE_Z: + case TGSI_EXTSWIZZLE_W: + switch (src->SrcRegister.File) { + case TGSI_FILE_INPUT: + r = &pc->attr[src->SrcRegister.Index * 4 + c]; + break; + case TGSI_FILE_TEMPORARY: + r = &pc->temp[src->SrcRegister.Index * 4 + c]; + break; + case TGSI_FILE_CONSTANT: + r = &pc->param[src->SrcRegister.Index * 4 + c]; + break; + case TGSI_FILE_IMMEDIATE: + r = &pc->immd[src->SrcRegister.Index * 4 + c]; + break; + case TGSI_FILE_SAMPLER: + break; + default: + assert(0); + break; + } + break; + case TGSI_EXTSWIZZLE_ZERO: + r = alloc_immd(pc, 0.0); + break; + case TGSI_EXTSWIZZLE_ONE: + r = alloc_immd(pc, 1.0); + break; + default: + assert(0); + break; + } + + switch (tgsi_util_get_full_src_register_sign_mode(src, chan)) { + case TGSI_UTIL_SIGN_KEEP: + break; + case TGSI_UTIL_SIGN_CLEAR: + temp = temp_temp(pc); + emit_abs(pc, temp, r); + r = temp; + break; + case TGSI_UTIL_SIGN_TOGGLE: + temp = temp_temp(pc); + emit_neg(pc, temp, r); + r = temp; + break; + case TGSI_UTIL_SIGN_SET: + temp = temp_temp(pc); + emit_abs(pc, temp, r); + emit_neg(pc, temp, r); + r = temp; + break; + default: + assert(0); + break; + } + + return r; +} + +static boolean +nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) +{ + const struct tgsi_full_instruction *inst = &tok->FullInstruction; + struct nv50_reg *rdst[4], *dst[4], *src[3][4], *temp; + unsigned mask, sat, unit; + int i, c; + + mask = inst->FullDstRegisters[0].DstRegister.WriteMask; + sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE; + + for (c = 0; c < 4; c++) { + if (mask & (1 << c)) + dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]); + else + dst[c] = NULL; + } + + for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { + const struct tgsi_full_src_register *fs = &inst->FullSrcRegisters[i]; + + if (fs->SrcRegister.File == TGSI_FILE_SAMPLER) + unit = fs->SrcRegister.Index; + + for (c = 0; c < 4; c++) + src[i][c] = tgsi_src(pc, c, fs); + } + + if (sat) { + for (c = 0; c < 4; c++) { + rdst[c] = dst[c]; + dst[c] = temp_temp(pc); + } + } + + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_ABS: + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_abs(pc, dst[c], src[0][c]); + } + break; + case TGSI_OPCODE_ADD: + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_add(pc, dst[c], src[0][c], src[1][c]); + } + break; + case TGSI_OPCODE_COS: + temp = alloc_temp(pc, NULL); + emit_precossin(pc, temp, src[0][0]); + emit_flop(pc, 5, temp, temp); + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_mov(pc, dst[c], temp); + } + break; + case TGSI_OPCODE_DP3: + temp = alloc_temp(pc, NULL); + emit_mul(pc, temp, src[0][0], src[1][0]); + emit_mad(pc, temp, src[0][1], src[1][1], temp); + emit_mad(pc, temp, src[0][2], src[1][2], temp); + for (c = 0; c < 
4; c++) { + if (!(mask & (1 << c))) + continue; + emit_mov(pc, dst[c], temp); + } + free_temp(pc, temp); + break; + case TGSI_OPCODE_DP4: + temp = alloc_temp(pc, NULL); + emit_mul(pc, temp, src[0][0], src[1][0]); + emit_mad(pc, temp, src[0][1], src[1][1], temp); + emit_mad(pc, temp, src[0][2], src[1][2], temp); + emit_mad(pc, temp, src[0][3], src[1][3], temp); + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_mov(pc, dst[c], temp); + } + free_temp(pc, temp); + break; + case TGSI_OPCODE_DPH: + temp = alloc_temp(pc, NULL); + emit_mul(pc, temp, src[0][0], src[1][0]); + emit_mad(pc, temp, src[0][1], src[1][1], temp); + emit_mad(pc, temp, src[0][2], src[1][2], temp); + emit_add(pc, temp, src[1][3], temp); + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_mov(pc, dst[c], temp); + } + free_temp(pc, temp); + break; + case TGSI_OPCODE_DST: + { + struct nv50_reg *one = alloc_immd(pc, 1.0); + if (mask & (1 << 0)) + emit_mov(pc, dst[0], one); + if (mask & (1 << 1)) + emit_mul(pc, dst[1], src[0][1], src[1][1]); + if (mask & (1 << 2)) + emit_mov(pc, dst[2], src[0][2]); + if (mask & (1 << 3)) + emit_mov(pc, dst[3], src[1][3]); + FREE(one); + } + break; + case TGSI_OPCODE_EX2: + temp = alloc_temp(pc, NULL); + emit_preex2(pc, temp, src[0][0]); + emit_flop(pc, 6, temp, temp); + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_mov(pc, dst[c], temp); + } + free_temp(pc, temp); + break; + case TGSI_OPCODE_FLR: + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_flr(pc, dst[c], src[0][c]); + } + break; + case TGSI_OPCODE_FRC: + temp = alloc_temp(pc, NULL); + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_flr(pc, temp, src[0][c]); + emit_sub(pc, dst[c], src[0][c], temp); + } + free_temp(pc, temp); + break; + case TGSI_OPCODE_KIL: + emit_kil(pc, src[0][0]); + emit_kil(pc, src[0][1]); + emit_kil(pc, src[0][2]); + emit_kil(pc, src[0][3]); + break; + case TGSI_OPCODE_LIT: + emit_lit(pc, &dst[0], mask, &src[0][0]); + break; + case TGSI_OPCODE_LG2: + temp = alloc_temp(pc, NULL); + emit_flop(pc, 3, temp, src[0][0]); + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_mov(pc, dst[c], temp); + } + break; + case TGSI_OPCODE_LRP: + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + /*XXX: we can do better than this */ + temp = alloc_temp(pc, NULL); + emit_neg(pc, temp, src[0][c]); + emit_mad(pc, temp, temp, src[2][c], src[2][c]); + emit_mad(pc, dst[c], src[0][c], src[1][c], temp); + free_temp(pc, temp); + } + break; + case TGSI_OPCODE_MAD: + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]); + } + break; + case TGSI_OPCODE_MAX: + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]); + } + break; + case TGSI_OPCODE_MIN: + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]); + } + break; + case TGSI_OPCODE_MOV: + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_mov(pc, dst[c], src[0][c]); + } + break; + case TGSI_OPCODE_MUL: + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_mul(pc, dst[c], src[0][c], src[1][c]); + } + break; + case TGSI_OPCODE_POW: + temp = alloc_temp(pc, NULL); + emit_pow(pc, temp, src[0][0], src[1][0]); + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_mov(pc, dst[c], temp); + } + free_temp(pc, temp); 
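+ /* POW above is lowered as exp2(src1 * log2(src0)): emit_pow() chains + * emit_flop(3) (LG2, as in the LG2 case), emit_mul(), emit_preex2() and + * emit_flop(6) (EX2, as in the EX2 case), and the scalar result is then + * broadcast to every channel enabled in the writemask. */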
+ break; + case TGSI_OPCODE_RCP: + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_flop(pc, 0, dst[c], src[0][0]); + } + break; + case TGSI_OPCODE_RSQ: + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_flop(pc, 2, dst[c], src[0][0]); + } + break; + case TGSI_OPCODE_SCS: + temp = alloc_temp(pc, NULL); + emit_precossin(pc, temp, src[0][0]); + if (mask & (1 << 0)) + emit_flop(pc, 5, dst[0], temp); + if (mask & (1 << 1)) + emit_flop(pc, 4, dst[1], temp); + break; + case TGSI_OPCODE_SGE: + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_set(pc, 6, dst[c], src[0][c], src[1][c]); + } + break; + case TGSI_OPCODE_SIN: + temp = alloc_temp(pc, NULL); + emit_precossin(pc, temp, src[0][0]); + emit_flop(pc, 4, temp, temp); + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_mov(pc, dst[c], temp); + } + break; + case TGSI_OPCODE_SLT: + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_set(pc, 1, dst[c], src[0][c], src[1][c]); + } + break; + case TGSI_OPCODE_SUB: + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_sub(pc, dst[c], src[0][c], src[1][c]); + } + break; + case TGSI_OPCODE_TEX: + case TGSI_OPCODE_TXP: + { + struct nv50_reg *t[4]; + struct nv50_program_exec *e; + + alloc_temp4(pc, t, 0); + emit_mov(pc, t[0], src[0][0]); + emit_mov(pc, t[1], src[0][1]); + + e = exec(pc); + e->inst[0] = 0xf6400000; + e->inst[0] |= (unit << 9); + set_long(pc, e); + e->inst[1] |= 0x0000c004; + set_dst(pc, t[0], e); + emit(pc, e); + + if (mask & (1 << 0)) emit_mov(pc, dst[0], t[0]); + if (mask & (1 << 1)) emit_mov(pc, dst[1], t[1]); + if (mask & (1 << 2)) emit_mov(pc, dst[2], t[2]); + if (mask & (1 << 3)) emit_mov(pc, dst[3], t[3]); + + free_temp4(pc, t); + } + break; + case TGSI_OPCODE_XPD: + temp = alloc_temp(pc, NULL); + if (mask & (1 << 0)) { + emit_mul(pc, temp, src[0][2], src[1][1]); + emit_msb(pc, dst[0], src[0][1], src[1][2], temp); + } + if (mask & (1 << 1)) { + emit_mul(pc, temp, src[0][0], src[1][2]); + emit_msb(pc, dst[1], src[0][2], src[1][0], temp); + } + if (mask & (1 << 2)) { + emit_mul(pc, temp, src[0][1], src[1][0]); + emit_msb(pc, dst[2], src[0][0], src[1][1], temp); + } + free_temp(pc, temp); + break; + case TGSI_OPCODE_END: + break; + default: + NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode); + return FALSE; + } + + if (sat) { + for (c = 0; c < 4; c++) { + struct nv50_program_exec *e; + + if (!(mask & (1 << c))) + continue; + e = exec(pc); + + e->inst[0] = 0xa0000000; /* cvt */ + set_long(pc, e); + e->inst[1] |= (6 << 29); /* cvt */ + e->inst[1] |= 0x04000000; /* 32 bit */ + e->inst[1] |= (1 << 14); /* src .f32 */ + e->inst[1] |= ((1 << 5) << 14); /* .sat */ + set_dst(pc, rdst[c], e); + set_src_0(pc, dst[c], e); + emit(pc, e); + } + } + + kill_temp_temp(pc); + return TRUE; +} + +static boolean +nv50_program_tx_prep(struct nv50_pc *pc) +{ + struct tgsi_parse_context p; + boolean ret = FALSE; + unsigned i, c; + + tgsi_parse_init(&p, pc->p->pipe.tokens); + while (!tgsi_parse_end_of_tokens(&p)) { + const union tgsi_full_token *tok = &p.FullToken; + + tgsi_parse_token(&p); + switch (tok->Token.Type) { + case TGSI_TOKEN_TYPE_IMMEDIATE: + { + const struct tgsi_full_immediate *imm = + &p.FullToken.FullImmediate; + + ctor_immd(pc, imm->u.ImmediateFloat32[0].Float, + imm->u.ImmediateFloat32[1].Float, + imm->u.ImmediateFloat32[2].Float, + imm->u.ImmediateFloat32[3].Float); + } + break; + case TGSI_TOKEN_TYPE_DECLARATION: + { + const struct tgsi_full_declaration 
*d; + unsigned last; + + d = &p.FullToken.FullDeclaration; + last = d->DeclarationRange.Last; + + switch (d->Declaration.File) { + case TGSI_FILE_TEMPORARY: + if (pc->temp_nr < (last + 1)) + pc->temp_nr = last + 1; + break; + case TGSI_FILE_OUTPUT: + if (pc->result_nr < (last + 1)) + pc->result_nr = last + 1; + break; + case TGSI_FILE_INPUT: + if (pc->attr_nr < (last + 1)) + pc->attr_nr = last + 1; + break; + case TGSI_FILE_CONSTANT: + if (pc->param_nr < (last + 1)) + pc->param_nr = last + 1; + break; + case TGSI_FILE_SAMPLER: + break; + default: + NOUVEAU_ERR("bad decl file %d\n", + d->Declaration.File); + goto out_err; + } + } + break; + case TGSI_TOKEN_TYPE_INSTRUCTION: + break; + default: + break; + } + } + + if (pc->temp_nr) { + pc->temp = CALLOC(pc->temp_nr * 4, sizeof(struct nv50_reg)); + if (!pc->temp) + goto out_err; + + for (i = 0; i < pc->temp_nr; i++) { + for (c = 0; c < 4; c++) { + pc->temp[i*4+c].type = P_TEMP; + pc->temp[i*4+c].hw = -1; + pc->temp[i*4+c].index = i; + } + } + } + + if (pc->attr_nr) { + struct nv50_reg *iv = NULL; + int aid = 0; + + pc->attr = CALLOC(pc->attr_nr * 4, sizeof(struct nv50_reg)); + if (!pc->attr) + goto out_err; + + if (pc->p->type == PIPE_SHADER_FRAGMENT) { + iv = alloc_temp(pc, NULL); + emit_interp(pc, iv, iv, NULL); + emit_flop(pc, 0, iv, iv); + aid++; + } + + for (i = 0; i < pc->attr_nr; i++) { + struct nv50_reg *a = &pc->attr[i*4]; + + for (c = 0; c < 4; c++) { + if (pc->p->type == PIPE_SHADER_FRAGMENT) { + struct nv50_reg *at = + alloc_temp(pc, NULL); + pc->attr[i*4+c].type = at->type; + pc->attr[i*4+c].hw = at->hw; + pc->attr[i*4+c].index = at->index; + } else { + pc->p->cfg.vp.attr[aid/32] |= + (1 << (aid % 32)); + pc->attr[i*4+c].type = P_ATTR; + pc->attr[i*4+c].hw = aid++; + pc->attr[i*4+c].index = i; + } + } + + if (pc->p->type != PIPE_SHADER_FRAGMENT) + continue; + + emit_interp(pc, &a[0], &a[0], iv); + emit_interp(pc, &a[1], &a[1], iv); + emit_interp(pc, &a[2], &a[2], iv); + emit_interp(pc, &a[3], &a[3], iv); + } + + if (iv) + free_temp(pc, iv); + } + + if (pc->result_nr) { + int rid = 0; + + pc->result = CALLOC(pc->result_nr * 4, sizeof(struct nv50_reg)); + if (!pc->result) + goto out_err; + + for (i = 0; i < pc->result_nr; i++) { + for (c = 0; c < 4; c++) { + if (pc->p->type == PIPE_SHADER_FRAGMENT) { + pc->result[i*4+c].type = P_TEMP; + pc->result[i*4+c].hw = -1; + } else { + pc->result[i*4+c].type = P_RESULT; + pc->result[i*4+c].hw = rid++; + } + pc->result[i*4+c].index = i; + } + } + } + + if (pc->param_nr) { + int rid = 0; + + pc->param = CALLOC(pc->param_nr * 4, sizeof(struct nv50_reg)); + if (!pc->param) + goto out_err; + + for (i = 0; i < pc->param_nr; i++) { + for (c = 0; c < 4; c++) { + pc->param[i*4+c].type = P_CONST; + pc->param[i*4+c].hw = rid++; + pc->param[i*4+c].index = i; + } + } + } + + if (pc->immd_nr) { + int rid = pc->param_nr * 4; + + pc->immd = CALLOC(pc->immd_nr * 4, sizeof(struct nv50_reg)); + if (!pc->immd) + goto out_err; + + for (i = 0; i < pc->immd_nr; i++) { + for (c = 0; c < 4; c++) { + pc->immd[i*4+c].type = P_IMMD; + pc->immd[i*4+c].hw = rid++; + pc->immd[i*4+c].index = i; + } + } + } + + ret = TRUE; +out_err: + tgsi_parse_free(&p); + return ret; +} + +static boolean +nv50_program_tx(struct nv50_program *p) +{ + struct tgsi_parse_context parse; + struct nv50_pc *pc; + boolean ret; + + pc = CALLOC_STRUCT(nv50_pc); + if (!pc) + return FALSE; + pc->p = p; + pc->p->cfg.high_temp = 4; + + ret = nv50_program_tx_prep(pc); + if (ret == FALSE) + goto out_cleanup; + + tgsi_parse_init(&parse, 
pc->p->pipe.tokens); + while (!tgsi_parse_end_of_tokens(&parse)) { + const union tgsi_full_token *tok = &parse.FullToken; + + tgsi_parse_token(&parse); + + switch (tok->Token.Type) { + case TGSI_TOKEN_TYPE_INSTRUCTION: + ret = nv50_program_tx_insn(pc, tok); + if (ret == FALSE) + goto out_err; + break; + default: + break; + } + } + + if (p->type == PIPE_SHADER_FRAGMENT) { + struct nv50_reg out; + + out.type = P_TEMP; + for (out.hw = 0; out.hw < pc->result_nr * 4; out.hw++) + emit_mov(pc, &out, &pc->result[out.hw]); + } + + assert(is_long(pc->p->exec_tail) && !is_immd(pc->p->exec_head)); + pc->p->exec_tail->inst[1] |= 0x00000001; + + p->param_nr = pc->param_nr * 4; + p->immd_nr = pc->immd_nr * 4; + p->immd = pc->immd_buf; + +out_err: + tgsi_parse_free(&parse); + +out_cleanup: + return ret; +} + +static void +nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p) +{ + if (nv50_program_tx(p) == FALSE) + assert(0); + p->translated = TRUE; +} + +static void +nv50_program_upload_data(struct nv50_context *nv50, float *map, + unsigned start, unsigned count) +{ + struct nouveau_channel *chan = nv50->screen->nvws->channel; + struct nouveau_grobj *tesla = nv50->screen->tesla; + + while (count) { + unsigned nr = count > 2047 ? 2047 : count; + + BEGIN_RING(chan, tesla, 0x00000f00, 1); + OUT_RING (chan, (NV50_CB_PMISC << 0) | (start << 8)); + BEGIN_RING(chan, tesla, 0x40000f04, nr); + OUT_RINGp (chan, map, nr); + + map += nr; + start += nr; + count -= nr; + } +} + +static void +nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p) +{ + struct nouveau_winsys *nvws = nv50->screen->nvws; + struct pipe_winsys *ws = nv50->pipe.winsys; + unsigned nr = p->param_nr + p->immd_nr; + + if (!p->data && nr) { + struct nouveau_resource *heap = nv50->screen->vp_data_heap; + + if (nvws->res_alloc(heap, nr, p, &p->data)) { + while (heap->next && heap->size < nr) { + struct nv50_program *evict = heap->next->priv; + nvws->res_free(&evict->data); + } + + if (nvws->res_alloc(heap, nr, p, &p->data)) + assert(0); + } + } + + if (p->param_nr) { + float *map = ws->buffer_map(ws, nv50->constbuf[p->type], + PIPE_BUFFER_USAGE_CPU_READ); + nv50_program_upload_data(nv50, map, p->data->start, + p->param_nr); + ws->buffer_unmap(ws, nv50->constbuf[p->type]); + } + + if (p->immd_nr) { + nv50_program_upload_data(nv50, p->immd, + p->data->start + p->param_nr, + p->immd_nr); + } +} + +static void +nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p) +{ + struct nouveau_channel *chan = nv50->screen->nvws->channel; + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct pipe_winsys *ws = nv50->pipe.winsys; + struct nv50_program_exec *e; + struct nouveau_stateobj *so; + const unsigned flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR; + unsigned start, count, *up, *ptr; + boolean upload = FALSE; + + if (!p->buffer) { + p->buffer = ws->buffer_create(ws, 0x100, 0, p->exec_size * 4); + upload = TRUE; + } + + if (p->data && p->data->start != p->data_start) { + for (e = p->exec_head; e; e = e->next) { + unsigned ei, ci; + + if (e->param.index < 0) + continue; + ei = e->param.shift >> 5; + ci = e->param.index + p->data->start; + + e->inst[ei] &= ~e->param.mask; + e->inst[ei] |= (ci << e->param.shift); + } + + p->data_start = p->data->start; + upload = TRUE; + } + + if (!upload) + return; + +#ifdef NV50_PROGRAM_DUMP + NOUVEAU_ERR("-------\n"); + up = ptr = MALLOC(p->exec_size * 4); + for (e = p->exec_head; e; e = e->next) { + NOUVEAU_ERR("0x%08x\n", e->inst[0]); + if (is_long(e)) + 
NOUVEAU_ERR("0x%08x\n", e->inst[1]); + } + +#endif + + up = ptr = MALLOC(p->exec_size * 4); + for (e = p->exec_head; e; e = e->next) { + *(ptr++) = e->inst[0]; + if (is_long(e)) + *(ptr++) = e->inst[1]; + } + + so = so_new(4,2); + so_method(so, nv50->screen->tesla, 0x1280, 3); + so_reloc (so, p->buffer, 0, flags | NOUVEAU_BO_HIGH, 0, 0); + so_reloc (so, p->buffer, 0, flags | NOUVEAU_BO_LOW, 0, 0); + so_data (so, (NV50_CB_PUPLOAD << 16) | 0x0800); //(p->exec_size * 4)); + + start = 0; count = p->exec_size; + while (count) { + struct nouveau_winsys *nvws = nv50->screen->nvws; + unsigned nr; + + so_emit(nvws, so); + + nr = MIN2(count, 2047); + nr = MIN2(nvws->channel->pushbuf->remaining, nr); + if (nvws->channel->pushbuf->remaining < (nr + 3)) { + FIRE_RING(chan); + continue; + } + + BEGIN_RING(chan, tesla, 0x0f00, 1); + OUT_RING (chan, (start << 8) | NV50_CB_PUPLOAD); + BEGIN_RING(chan, tesla, 0x40000f04, nr); + OUT_RINGp (chan, up + start, nr); + + start += nr; + count -= nr; + } + + FREE(up); + so_ref(NULL, &so); +} + +void +nv50_vertprog_validate(struct nv50_context *nv50) +{ + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nv50_program *p = nv50->vertprog; + struct nouveau_stateobj *so; + + if (!p->translated) { + nv50_program_validate(nv50, p); + if (!p->translated) + assert(0); + } + + nv50_program_validate_data(nv50, p); + nv50_program_validate_code(nv50, p); + + so = so_new(13, 2); + so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2); + so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | + NOUVEAU_BO_HIGH, 0, 0); + so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | + NOUVEAU_BO_LOW, 0, 0); + so_method(so, tesla, 0x1650, 2); + so_data (so, p->cfg.vp.attr[0]); + so_data (so, p->cfg.vp.attr[1]); + so_method(so, tesla, 0x16b8, 1); + so_data (so, p->cfg.high_result); + so_method(so, tesla, 0x16ac, 2); + so_data (so, p->cfg.high_result); //8); + so_data (so, p->cfg.high_temp); + so_method(so, tesla, 0x140c, 1); + so_data (so, 0); /* program start offset */ + so_ref(so, &nv50->state.vertprog); +} + +void +nv50_fragprog_validate(struct nv50_context *nv50) +{ + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nv50_program *p = nv50->fragprog; + struct nouveau_stateobj *so; + + if (!p->translated) { + nv50_program_validate(nv50, p); + if (!p->translated) + assert(0); + } + + nv50_program_validate_data(nv50, p); + nv50_program_validate_code(nv50, p); + + so = so_new(64, 2); + so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2); + so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | + NOUVEAU_BO_HIGH, 0, 0); + so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | + NOUVEAU_BO_LOW, 0, 0); + so_method(so, tesla, 0x1904, 4); + so_data (so, 0x00040404); /* p: 0x01000404 */ + so_data (so, 0x00000004); + so_data (so, 0x00000000); + so_data (so, 0x00000000); + so_method(so, tesla, 0x16bc, 3); /*XXX: fixme */ + so_data (so, 0x03020100); + so_data (so, 0x07060504); + so_data (so, 0x0b0a0908); + so_method(so, tesla, 0x1988, 2); + so_data (so, 0x08080408); //0x08040404); /* p: 0x0f000401 */ + so_data (so, p->cfg.high_temp); + so_method(so, tesla, 0x1414, 1); + so_data (so, 0); /* program start offset */ + so_ref(so, &nv50->state.fragprog); +} + +void +nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p) +{ + struct pipe_screen *pscreen = nv50->pipe.screen; + + while (p->exec_head) { + struct nv50_program_exec *e = p->exec_head; + + p->exec_head = e->next; + FREE(e); + } + p->exec_tail = NULL; + p->exec_size = 0; + + if (p->buffer) 
+ pipe_buffer_reference(pscreen, &p->buffer, NULL); + + nv50->screen->nvws->res_free(&p->data); + + p->translated = 0; +} + diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h new file mode 100644 index 0000000000..78deed6a38 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_program.h @@ -0,0 +1,45 @@ +#ifndef __NV50_PROGRAM_H__ +#define __NV50_PROGRAM_H__ + +#include "pipe/p_state.h" +#include "tgsi/tgsi_scan.h" + +struct nv50_program_exec { + struct nv50_program_exec *next; + + unsigned inst[2]; + struct { + int index; + unsigned mask; + unsigned shift; + } param; +}; + +struct nv50_program { + struct pipe_shader_state pipe; + struct tgsi_shader_info info; + boolean translated; + + unsigned type; + struct nv50_program_exec *exec_head; + struct nv50_program_exec *exec_tail; + unsigned exec_size; + struct nouveau_resource *data; + unsigned data_start; + + struct pipe_buffer *buffer; + + float *immd; + unsigned immd_nr; + unsigned param_nr; + + struct { + unsigned high_temp; + unsigned high_result; + struct { + unsigned attr[2]; + } vp; + } cfg; +}; + +#endif diff --git a/src/gallium/drivers/nv50/nv50_query.c b/src/gallium/drivers/nv50/nv50_query.c new file mode 100644 index 0000000000..7c8831a46d --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_query.c @@ -0,0 +1,135 @@ +/* + * Copyright 2008 Ben Skeggs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "pipe/p_context.h" +#include "pipe/p_inlines.h" + +#include "nv50_context.h" + +struct nv50_query { + struct pipe_buffer *buffer; + unsigned type; + boolean ready; + uint64_t result; +}; + +static INLINE struct nv50_query * +nv50_query(struct pipe_query *pipe) +{ + return (struct nv50_query *)pipe; +} + +static struct pipe_query * +nv50_query_create(struct pipe_context *pipe, unsigned type) +{ + struct pipe_winsys *ws = pipe->winsys; + struct nv50_query *q = CALLOC_STRUCT(nv50_query); + + assert (q->type == PIPE_QUERY_OCCLUSION_COUNTER); + q->type = type; + + q->buffer = ws->buffer_create(ws, 256, 0, 16); + if (!q->buffer) { + FREE(q); + return NULL; + } + + return (struct pipe_query *)q; +} + +static void +nv50_query_destroy(struct pipe_context *pipe, struct pipe_query *pq) +{ + struct nv50_query *q = nv50_query(pq); + + if (q) { + pipe_buffer_reference(pipe->screen, &q->buffer, NULL); + FREE(q); + } +} + +static void +nv50_query_begin(struct pipe_context *pipe, struct pipe_query *pq) +{ + struct nv50_context *nv50 = nv50_context(pipe); + struct nouveau_channel *chan = nv50->screen->nvws->channel; + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nv50_query *q = nv50_query(pq); + + BEGIN_RING(chan, tesla, 0x1530, 1); + OUT_RING (chan, 1); + BEGIN_RING(chan, tesla, 0x1514, 1); + OUT_RING (chan, 1); + + q->ready = FALSE; +} + +static void +nv50_query_end(struct pipe_context *pipe, struct pipe_query *pq) +{ + struct nv50_context *nv50 = nv50_context(pipe); + struct nouveau_channel *chan = nv50->screen->nvws->channel; + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nv50_query *q = nv50_query(pq); + struct nouveau_bo *bo = nv50->screen->nvws->get_bo(q->buffer); + + WAIT_RING (chan, 5); + BEGIN_RING(chan, tesla, 0x1b00, 4); + OUT_RELOCh(chan, bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + OUT_RELOCl(chan, bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + OUT_RING (chan, 0x00000000); + OUT_RING (chan, 0x0100f002); + FIRE_RING (chan); +} + +static boolean +nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq, + boolean wait, uint64_t *result) +{ + struct pipe_winsys *ws = pipe->winsys; + struct nv50_query *q = nv50_query(pq); + + /*XXX: Want to be able to return FALSE here instead of blocking + * until the result is available.. 
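+ * As written, the 'wait' argument is never consulted and the buffer_map()
+ * below blocks until the GPU has written the result.  A rough sketch of a
+ * non-blocking path, assuming the winsys grew a (hypothetical)
+ * buffer_busy() query:
+ *
+ *   if (!q->ready && !wait && ws->buffer_busy(ws, q->buffer))
+ *           return FALSE;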
+ */ + + if (!q->ready) { + uint32_t *map = ws->buffer_map(ws, q->buffer, + PIPE_BUFFER_USAGE_CPU_READ); + q->result = map[1]; + q->ready = TRUE; + ws->buffer_unmap(ws, q->buffer); + } + + *result = q->result; + return q->ready; +} + +void +nv50_init_query_functions(struct nv50_context *nv50) +{ + nv50->pipe.create_query = nv50_query_create; + nv50->pipe.destroy_query = nv50_query_destroy; + nv50->pipe.begin_query = nv50_query_begin; + nv50->pipe.end_query = nv50_query_end; + nv50->pipe.get_query_result = nv50_query_result; +} diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c new file mode 100644 index 0000000000..ee24405d36 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_screen.c @@ -0,0 +1,373 @@ +/* + * Copyright 2008 Ben Skeggs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "pipe/p_screen.h" + +#include "util/u_simple_screen.h" + +#include "nv50_context.h" +#include "nv50_screen.h" + +#include "nouveau/nouveau_stateobj.h" + +#define NV5X_GRCLASS5097_CHIPSETS 0x00000001 +#define NV8X_GRCLASS8297_CHIPSETS 0x00000050 +#define NV9X_GRCLASS8297_CHIPSETS 0x00000014 + +static boolean +nv50_screen_is_format_supported(struct pipe_screen *pscreen, + enum pipe_format format, + enum pipe_texture_target target, + unsigned tex_usage, unsigned geom_flags) +{ + if (tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET) { + switch (format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_R5G6B5_UNORM: + case PIPE_FORMAT_Z24S8_UNORM: + case PIPE_FORMAT_Z16_UNORM: + return TRUE; + default: + break; + } + } else { + switch (format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_A1R5G5B5_UNORM: + case PIPE_FORMAT_A4R4G4B4_UNORM: + case PIPE_FORMAT_R5G6B5_UNORM: + case PIPE_FORMAT_L8_UNORM: + case PIPE_FORMAT_A8_UNORM: + case PIPE_FORMAT_I8_UNORM: + case PIPE_FORMAT_A8L8_UNORM: + case PIPE_FORMAT_DXT1_RGB: + case PIPE_FORMAT_DXT1_RGBA: + case PIPE_FORMAT_DXT3_RGBA: + case PIPE_FORMAT_DXT5_RGBA: + return TRUE; + default: + break; + } + } + + return FALSE; +} + +static const char * +nv50_screen_get_name(struct pipe_screen *pscreen) +{ + struct nv50_screen *screen = nv50_screen(pscreen); + struct nouveau_device *dev = screen->nvws->channel->device; + static char buffer[128]; + + snprintf(buffer, sizeof(buffer), "NV%02X", dev->chipset); + return buffer; +} + +static const char * +nv50_screen_get_vendor(struct pipe_screen *pscreen) +{ + return "nouveau"; +} + +static int +nv50_screen_get_param(struct pipe_screen *pscreen, int param) +{ + switch (param) { + case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS: + return 32; + case PIPE_CAP_NPOT_TEXTURES: + return 1; + case PIPE_CAP_TWO_SIDED_STENCIL: + return 1; + case PIPE_CAP_GLSL: + return 0; + case PIPE_CAP_S3TC: + return 1; + case PIPE_CAP_ANISOTROPIC_FILTER: + return 1; + case PIPE_CAP_POINT_SPRITE: + return 0; + case PIPE_CAP_MAX_RENDER_TARGETS: + return 8; + case PIPE_CAP_OCCLUSION_QUERY: + return 1; + case PIPE_CAP_TEXTURE_SHADOW_MAP: + return 1; + case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: + return 13; + case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: + return 10; + case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: + return 13; + case PIPE_CAP_TEXTURE_MIRROR_CLAMP: + case PIPE_CAP_TEXTURE_MIRROR_REPEAT: + return 1; + case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS: + return 0; + case NOUVEAU_CAP_HW_VTXBUF: + return 1; + case NOUVEAU_CAP_HW_IDXBUF: + return 0; + default: + NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param); + return 0; + } +} + +static float +nv50_screen_get_paramf(struct pipe_screen *pscreen, int param) +{ + switch (param) { + case PIPE_CAP_MAX_LINE_WIDTH: + case PIPE_CAP_MAX_LINE_WIDTH_AA: + return 10.0; + case PIPE_CAP_MAX_POINT_WIDTH: + case PIPE_CAP_MAX_POINT_WIDTH_AA: + return 64.0; + case PIPE_CAP_MAX_TEXTURE_ANISOTROPY: + return 16.0; + case PIPE_CAP_MAX_TEXTURE_LOD_BIAS: + return 4.0; + default: + NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param); + return 0.0; + } +} + +static void +nv50_screen_destroy(struct pipe_screen *pscreen) +{ + FREE(pscreen); +} + +struct pipe_screen * +nv50_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws) +{ + struct nv50_screen *screen = CALLOC_STRUCT(nv50_screen); + struct nouveau_stateobj *so; + unsigned tesla_class = 0, ret; + unsigned chipset = nvws->channel->device->chipset; + int i; + + if (!screen) + return NULL; + screen->nvws = nvws; + + /* DMA engine object */ + ret = 
nvws->grobj_alloc(nvws, 0x5039, &screen->m2mf); + if (ret) { + NOUVEAU_ERR("Error creating M2MF object: %d\n", ret); + nv50_screen_destroy(&screen->pipe); + return NULL; + } + + /* 2D object */ + ret = nvws->grobj_alloc(nvws, NV50_2D, &screen->eng2d); + if (ret) { + NOUVEAU_ERR("Error creating 2D object: %d\n", ret); + nv50_screen_destroy(&screen->pipe); + return NULL; + } + + /* 3D object */ + if ((chipset & 0xf0) != 0x50 && (chipset & 0xf0) != 0x80) { + NOUVEAU_ERR("Not a G8x chipset\n"); + nv50_screen_destroy(&screen->pipe); + return NULL; + } + + switch (chipset & 0xf0) { + case 0x50: + if (NV5X_GRCLASS5097_CHIPSETS & (1 << (chipset & 0x0f))) + tesla_class = 0x5097; + break; + case 0x80: + if (NV8X_GRCLASS8297_CHIPSETS & (1 << (chipset & 0x0f))) + tesla_class = 0x8297; + break; + case 0x90: + if (NV9X_GRCLASS8297_CHIPSETS & (1 << (chipset & 0x0f))) + tesla_class = 0x8297; + break; + default: + break; + } + + if (tesla_class == 0) { + NOUVEAU_ERR("Unknown G8x chipset: NV%02x\n", chipset); + nv50_screen_destroy(&screen->pipe); + return NULL; + } + + ret = nvws->grobj_alloc(nvws, tesla_class, &screen->tesla); + if (ret) { + NOUVEAU_ERR("Error creating 3D object: %d\n", ret); + nv50_screen_destroy(&screen->pipe); + return NULL; + } + + /* Sync notifier */ + ret = nvws->notifier_alloc(nvws, 1, &screen->sync); + if (ret) { + NOUVEAU_ERR("Error creating notifier object: %d\n", ret); + nv50_screen_destroy(&screen->pipe); + return NULL; + } + + /* Static M2MF init */ + so = so_new(32, 0); + so_method(so, screen->m2mf, 0x0180, 3); + so_data (so, screen->sync->handle); + so_data (so, screen->nvws->channel->vram->handle); + so_data (so, screen->nvws->channel->vram->handle); + so_emit(nvws, so); + so_ref (NULL, &so); + + /* Static 2D init */ + so = so_new(64, 0); + so_method(so, screen->eng2d, NV50_2D_DMA_NOTIFY, 4); + so_data (so, screen->sync->handle); + so_data (so, screen->nvws->channel->vram->handle); + so_data (so, screen->nvws->channel->vram->handle); + so_data (so, screen->nvws->channel->vram->handle); + so_method(so, screen->eng2d, NV50_2D_OPERATION, 1); + so_data (so, NV50_2D_OPERATION_SRCCOPY); + so_method(so, screen->eng2d, 0x0290, 1); + so_data (so, 0); + so_method(so, screen->eng2d, 0x0888, 1); + so_data (so, 1); + so_emit(nvws, so); + so_ref(NULL, &so); + + /* Static tesla init */ + so = so_new(256, 20); + + so_method(so, screen->tesla, 0x1558, 1); + so_data (so, 1); + so_method(so, screen->tesla, NV50TCL_DMA_NOTIFY, 1); + so_data (so, screen->sync->handle); + so_method(so, screen->tesla, NV50TCL_DMA_UNK0(0), + NV50TCL_DMA_UNK0__SIZE); + for (i = 0; i < NV50TCL_DMA_UNK0__SIZE; i++) + so_data(so, nvws->channel->vram->handle); + so_method(so, screen->tesla, NV50TCL_DMA_UNK1(0), + NV50TCL_DMA_UNK1__SIZE); + for (i = 0; i < NV50TCL_DMA_UNK1__SIZE; i++) + so_data(so, nvws->channel->vram->handle); + so_method(so, screen->tesla, 0x121c, 1); + so_data (so, 1); + + so_method(so, screen->tesla, 0x13bc, 1); + so_data (so, 0x54); + so_method(so, screen->tesla, 0x13ac, 1); + so_data (so, 1); + so_method(so, screen->tesla, 0x16b8, 1); + so_data (so, 8); + + /* Shared constant buffer */ + screen->constbuf = ws->buffer_create(ws, 0, 0, 128 * 4 * 4); + if (nvws->res_init(&screen->vp_data_heap, 0, 128)) { + NOUVEAU_ERR("Error initialising constant buffer\n"); + nv50_screen_destroy(&screen->pipe); + return NULL; + } + + so_method(so, screen->tesla, 0x1280, 3); + so_reloc (so, screen->constbuf, 0, NOUVEAU_BO_VRAM | + NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0); + so_reloc (so, screen->constbuf, 0, 
NOUVEAU_BO_VRAM | + NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0); + so_data (so, (NV50_CB_PMISC << 16) | 0x00001000); + + /* Texture sampler/image unit setup - we abuse the constant buffer + * upload mechanism for the moment to upload data to the tex config + * blocks. At some point we *may* want to go the NVIDIA way of doing + * things? + */ + screen->tic = ws->buffer_create(ws, 0, 0, 32 * 8 * 4); + so_method(so, screen->tesla, 0x1280, 3); + so_reloc (so, screen->tic, 0, NOUVEAU_BO_VRAM | + NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0); + so_reloc (so, screen->tic, 0, NOUVEAU_BO_VRAM | + NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0); + so_data (so, (NV50_CB_TIC << 16) | 0x0800); + so_method(so, screen->tesla, 0x1574, 3); + so_reloc (so, screen->tic, 0, NOUVEAU_BO_VRAM | + NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0); + so_reloc (so, screen->tic, 0, NOUVEAU_BO_VRAM | + NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0); + so_data (so, 0x00000800); + + screen->tsc = ws->buffer_create(ws, 0, 0, 32 * 8 * 4); + so_method(so, screen->tesla, 0x1280, 3); + so_reloc (so, screen->tsc, 0, NOUVEAU_BO_VRAM | + NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0); + so_reloc (so, screen->tsc, 0, NOUVEAU_BO_VRAM | + NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0); + so_data (so, (NV50_CB_TSC << 16) | 0x0800); + so_method(so, screen->tesla, 0x155c, 3); + so_reloc (so, screen->tsc, 0, NOUVEAU_BO_VRAM | + NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0); + so_reloc (so, screen->tsc, 0, NOUVEAU_BO_VRAM | + NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0); + so_data (so, 0x00000800); + + + /* Vertex array limits - max them out */ + for (i = 0; i < 16; i++) { + so_method(so, screen->tesla, 0x1080 + (i * 8), 2); + so_data (so, 0x000000ff); + so_data (so, 0xffffffff); + } + + so_method(so, screen->tesla, NV50TCL_DEPTH_RANGE_NEAR, 2); + so_data (so, fui(0.0)); + so_data (so, fui(1.0)); + + so_method(so, screen->tesla, 0x1234, 1); + so_data (so, 1); + so_method(so, screen->tesla, 0x1458, 1); + so_data (so, 1); + + so_emit(nvws, so); + so_ref(so, &screen->static_init); + nvws->push_flush(nvws, 0, NULL); + + screen->pipe.winsys = ws; + + screen->pipe.destroy = nv50_screen_destroy; + + screen->pipe.get_name = nv50_screen_get_name; + screen->pipe.get_vendor = nv50_screen_get_vendor; + screen->pipe.get_param = nv50_screen_get_param; + screen->pipe.get_paramf = nv50_screen_get_paramf; + + screen->pipe.is_format_supported = nv50_screen_is_format_supported; + + nv50_screen_init_miptree_functions(&screen->pipe); + nv50_transfer_init_screen_functions(&screen->pipe); + u_simple_screen_init(&screen->pipe); + + return &screen->pipe; +} + diff --git a/src/gallium/drivers/nv50/nv50_screen.h b/src/gallium/drivers/nv50/nv50_screen.h new file mode 100644 index 0000000000..db567aaac8 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_screen.h @@ -0,0 +1,35 @@ +#ifndef __NV50_SCREEN_H__ +#define __NV50_SCREEN_H__ + +#include "pipe/p_screen.h" + +struct nv50_screen { + struct pipe_screen pipe; + + struct nouveau_winsys *nvws; + + unsigned cur_pctx; + + struct nouveau_grobj *tesla; + struct nouveau_grobj *eng2d; + struct nouveau_grobj *m2mf; + struct nouveau_notifier *sync; + + struct pipe_buffer *constbuf; + struct nouveau_resource *vp_data_heap; + + struct pipe_buffer *tic; + struct pipe_buffer *tsc; + + struct nouveau_stateobj *static_init; +}; + +static INLINE struct nv50_screen * +nv50_screen(struct pipe_screen *screen) +{ + return (struct nv50_screen *)screen; +} + +void nv50_transfer_init_screen_functions(struct pipe_screen *); + +#endif diff --git a/src/gallium/drivers/nv50/nv50_state.c 
b/src/gallium/drivers/nv50/nv50_state.c new file mode 100644 index 0000000000..787ff958ec --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_state.c @@ -0,0 +1,664 @@ +/* + * Copyright 2008 Ben Skeggs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" + +#include "tgsi/tgsi_parse.h" + +#include "nv50_context.h" +#include "nv50_texture.h" + +#include "nouveau/nouveau_stateobj.h" + +static void * +nv50_blend_state_create(struct pipe_context *pipe, + const struct pipe_blend_state *cso) +{ + struct nouveau_stateobj *so = so_new(64, 0); + struct nouveau_grobj *tesla = nv50_context(pipe)->screen->tesla; + struct nv50_blend_stateobj *bso = CALLOC_STRUCT(nv50_blend_stateobj); + unsigned cmask = 0, i; + + /*XXX ignored: + * - dither + */ + + if (cso->blend_enable == 0) { + so_method(so, tesla, NV50TCL_BLEND_ENABLE(0), 8); + for (i = 0; i < 8; i++) + so_data(so, 0); + } else { + so_method(so, tesla, NV50TCL_BLEND_ENABLE(0), 8); + for (i = 0; i < 8; i++) + so_data(so, 1); + so_method(so, tesla, NV50TCL_BLEND_EQUATION_RGB, 5); + so_data (so, nvgl_blend_eqn(cso->rgb_func)); + so_data (so, 0x4000 | nvgl_blend_func(cso->rgb_src_factor)); + so_data (so, 0x4000 | nvgl_blend_func(cso->rgb_dst_factor)); + so_data (so, nvgl_blend_eqn(cso->alpha_func)); + so_data (so, 0x4000 | nvgl_blend_func(cso->alpha_src_factor)); + so_method(so, tesla, NV50TCL_BLEND_FUNC_DST_ALPHA, 1); + so_data (so, 0x4000 | nvgl_blend_func(cso->alpha_dst_factor)); + } + + if (cso->logicop_enable == 0 ) { + so_method(so, tesla, NV50TCL_LOGIC_OP_ENABLE, 1); + so_data (so, 0); + } else { + so_method(so, tesla, NV50TCL_LOGIC_OP_ENABLE, 2); + so_data (so, 1); + so_data (so, nvgl_logicop_func(cso->logicop_func)); + } + + if (cso->colormask & PIPE_MASK_R) + cmask |= (1 << 0); + if (cso->colormask & PIPE_MASK_G) + cmask |= (1 << 4); + if (cso->colormask & PIPE_MASK_B) + cmask |= (1 << 8); + if (cso->colormask & PIPE_MASK_A) + cmask |= (1 << 12); + so_method(so, tesla, NV50TCL_COLOR_MASK(0), 8); + for (i = 0; i < 8; i++) + so_data(so, cmask); + + bso->pipe = *cso; + so_ref(so, &bso->so); + return (void *)bso; +} + +static void +nv50_blend_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nv50_context *nv50 = nv50_context(pipe); + + nv50->blend = hwcso; + nv50->dirty |= NV50_NEW_BLEND; +} + +static void +nv50_blend_state_delete(struct pipe_context *pipe, void *hwcso) +{ + struct nv50_blend_stateobj *bso = hwcso; + + so_ref(NULL, &bso->so); + 
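/* so_ref(NULL, ...) above dropped the hw stateobj reference; now free the CSO wrapper */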
FREE(bso); +} + +static INLINE unsigned +wrap_mode(unsigned wrap) +{ + switch (wrap) { + case PIPE_TEX_WRAP_REPEAT: + return NV50TSC_1_0_WRAPS_REPEAT; + case PIPE_TEX_WRAP_MIRROR_REPEAT: + return NV50TSC_1_0_WRAPS_MIRROR_REPEAT; + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + return NV50TSC_1_0_WRAPS_CLAMP_TO_EDGE; + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + return NV50TSC_1_0_WRAPS_CLAMP_TO_BORDER; + case PIPE_TEX_WRAP_CLAMP: + return NV50TSC_1_0_WRAPS_CLAMP; + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: + return NV50TSC_1_0_WRAPS_MIRROR_CLAMP_TO_EDGE; + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: + return NV50TSC_1_0_WRAPS_MIRROR_CLAMP_TO_BORDER; + case PIPE_TEX_WRAP_MIRROR_CLAMP: + return NV50TSC_1_0_WRAPS_MIRROR_CLAMP; + default: + NOUVEAU_ERR("unknown wrap mode: %d\n", wrap); + return NV50TSC_1_0_WRAPS_REPEAT; + } +} +static void * +nv50_sampler_state_create(struct pipe_context *pipe, + const struct pipe_sampler_state *cso) +{ + unsigned *tsc = CALLOC(8, sizeof(unsigned)); + + tsc[0] = (0x00024000 | + (wrap_mode(cso->wrap_s) << 0) | + (wrap_mode(cso->wrap_t) << 3) | + (wrap_mode(cso->wrap_r) << 6)); + + switch (cso->mag_img_filter) { + case PIPE_TEX_FILTER_LINEAR: + tsc[1] |= NV50TSC_1_1_MAGF_LINEAR; + break; + case PIPE_TEX_FILTER_NEAREST: + default: + tsc[1] |= NV50TSC_1_1_MAGF_NEAREST; + break; + } + + switch (cso->min_img_filter) { + case PIPE_TEX_FILTER_LINEAR: + tsc[1] |= NV50TSC_1_1_MINF_LINEAR; + break; + case PIPE_TEX_FILTER_NEAREST: + default: + tsc[1] |= NV50TSC_1_1_MINF_NEAREST; + break; + } + + switch (cso->min_mip_filter) { + case PIPE_TEX_MIPFILTER_LINEAR: + tsc[1] |= NV50TSC_1_1_MIPF_LINEAR; + break; + case PIPE_TEX_MIPFILTER_NEAREST: + tsc[1] |= NV50TSC_1_1_MIPF_NEAREST; + break; + case PIPE_TEX_MIPFILTER_NONE: + default: + tsc[1] |= NV50TSC_1_1_MIPF_NONE; + break; + } + + if (cso->max_anisotropy >= 16.0) + tsc[0] |= (7 << 20); + else + if (cso->max_anisotropy >= 12.0) + tsc[0] |= (6 << 20); + else + if (cso->max_anisotropy >= 10.0) + tsc[0] |= (5 << 20); + else + if (cso->max_anisotropy >= 8.0) + tsc[0] |= (4 << 20); + else + if (cso->max_anisotropy >= 6.0) + tsc[0] |= (3 << 20); + else + if (cso->max_anisotropy >= 4.0) + tsc[0] |= (2 << 20); + else + if (cso->max_anisotropy >= 2.0) + tsc[0] |= (1 << 20); + + if (cso->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) { + tsc[0] |= (1 << 8); + tsc[0] |= (nvgl_comparison_op(cso->compare_func) & 0x7); + } + + return (void *)tsc; +} + +static void +nv50_sampler_state_bind(struct pipe_context *pipe, unsigned nr, void **sampler) +{ + struct nv50_context *nv50 = nv50_context(pipe); + int i; + + nv50->sampler_nr = nr; + for (i = 0; i < nv50->sampler_nr; i++) + nv50->sampler[i] = sampler[i]; + + nv50->dirty |= NV50_NEW_SAMPLER; +} + +static void +nv50_sampler_state_delete(struct pipe_context *pipe, void *hwcso) +{ + FREE(hwcso); +} + +static void +nv50_set_sampler_texture(struct pipe_context *pipe, unsigned nr, + struct pipe_texture **pt) +{ + struct nv50_context *nv50 = nv50_context(pipe); + int i; + + for (i = 0; i < nr; i++) + pipe_texture_reference((void *)&nv50->miptree[i], pt[i]); + for (i = nr; i < nv50->miptree_nr; i++) + pipe_texture_reference((void *)&nv50->miptree[i], NULL); + + nv50->miptree_nr = nr; + nv50->dirty |= NV50_NEW_TEXTURE; +} + +static void * +nv50_rasterizer_state_create(struct pipe_context *pipe, + const struct pipe_rasterizer_state *cso) +{ + struct nouveau_stateobj *so = so_new(64, 0); + struct nouveau_grobj *tesla = nv50_context(pipe)->screen->tesla; + struct nv50_rasterizer_stateobj *rso = + 
CALLOC_STRUCT(nv50_rasterizer_stateobj); + + /*XXX: ignored + * - light_twosize + * - point_smooth + * - multisample + * - point_sprite / sprite_coord_mode + */ + + so_method(so, tesla, NV50TCL_SHADE_MODEL, 1); + so_data (so, cso->flatshade ? NV50TCL_SHADE_MODEL_FLAT : + NV50TCL_SHADE_MODEL_SMOOTH); + + so_method(so, tesla, NV50TCL_LINE_WIDTH, 1); + so_data (so, fui(cso->line_width)); + so_method(so, tesla, NV50TCL_LINE_SMOOTH_ENABLE, 1); + so_data (so, cso->line_smooth ? 1 : 0); + if (cso->line_stipple_enable) { + so_method(so, tesla, NV50TCL_LINE_STIPPLE_ENABLE, 1); + so_data (so, 1); + so_method(so, tesla, NV50TCL_LINE_STIPPLE_PATTERN, 1); + so_data (so, (cso->line_stipple_pattern << 8) | + cso->line_stipple_factor); + } else { + so_method(so, tesla, NV50TCL_LINE_STIPPLE_ENABLE, 1); + so_data (so, 0); + } + + so_method(so, tesla, NV50TCL_POINT_SIZE, 1); + so_data (so, fui(cso->point_size)); + + so_method(so, tesla, NV50TCL_POLYGON_MODE_FRONT, 3); + if (cso->front_winding == PIPE_WINDING_CCW) { + so_data(so, nvgl_polygon_mode(cso->fill_ccw)); + so_data(so, nvgl_polygon_mode(cso->fill_cw)); + } else { + so_data(so, nvgl_polygon_mode(cso->fill_cw)); + so_data(so, nvgl_polygon_mode(cso->fill_ccw)); + } + so_data(so, cso->poly_smooth ? 1 : 0); + + so_method(so, tesla, NV50TCL_CULL_FACE_ENABLE, 3); + so_data (so, cso->cull_mode != PIPE_WINDING_NONE); + if (cso->front_winding == PIPE_WINDING_CCW) { + so_data(so, NV50TCL_FRONT_FACE_CCW); + switch (cso->cull_mode) { + case PIPE_WINDING_CCW: + so_data(so, NV50TCL_CULL_FACE_FRONT); + break; + case PIPE_WINDING_CW: + so_data(so, NV50TCL_CULL_FACE_BACK); + break; + case PIPE_WINDING_BOTH: + so_data(so, NV50TCL_CULL_FACE_FRONT_AND_BACK); + break; + default: + so_data(so, NV50TCL_CULL_FACE_BACK); + break; + } + } else { + so_data(so, NV50TCL_FRONT_FACE_CW); + switch (cso->cull_mode) { + case PIPE_WINDING_CCW: + so_data(so, NV50TCL_CULL_FACE_BACK); + break; + case PIPE_WINDING_CW: + so_data(so, NV50TCL_CULL_FACE_FRONT); + break; + case PIPE_WINDING_BOTH: + so_data(so, NV50TCL_CULL_FACE_FRONT_AND_BACK); + break; + default: + so_data(so, NV50TCL_CULL_FACE_BACK); + break; + } + } + + so_method(so, tesla, NV50TCL_POLYGON_STIPPLE_ENABLE, 1); + so_data (so, cso->poly_stipple_enable ? 
1 : 0); + + so_method(so, tesla, NV50TCL_POLYGON_OFFSET_POINT_ENABLE, 3); + if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_POINT) || + (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_POINT)) + so_data(so, 1); + else + so_data(so, 0); + if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_LINE) || + (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_LINE)) + so_data(so, 1); + else + so_data(so, 0); + if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_FILL) || + (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_FILL)) + so_data(so, 1); + else + so_data(so, 0); + + if (cso->offset_cw || cso->offset_ccw) { + so_method(so, tesla, NV50TCL_POLYGON_OFFSET_FACTOR, 1); + so_data (so, fui(cso->offset_scale)); + so_method(so, tesla, NV50TCL_POLYGON_OFFSET_UNITS, 1); + so_data (so, fui(cso->offset_units)); + } + + rso->pipe = *cso; + so_ref(so, &rso->so); + return (void *)rso; +} + +static void +nv50_rasterizer_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nv50_context *nv50 = nv50_context(pipe); + + nv50->rasterizer = hwcso; + nv50->dirty |= NV50_NEW_RASTERIZER; +} + +static void +nv50_rasterizer_state_delete(struct pipe_context *pipe, void *hwcso) +{ + struct nv50_rasterizer_stateobj *rso = hwcso; + + so_ref(NULL, &rso->so); + FREE(rso); +} + +static void * +nv50_depth_stencil_alpha_state_create(struct pipe_context *pipe, + const struct pipe_depth_stencil_alpha_state *cso) +{ + struct nouveau_grobj *tesla = nv50_context(pipe)->screen->tesla; + struct nv50_zsa_stateobj *zsa = CALLOC_STRUCT(nv50_zsa_stateobj); + struct nouveau_stateobj *so = so_new(64, 0); + + so_method(so, tesla, NV50TCL_DEPTH_WRITE_ENABLE, 1); + so_data (so, cso->depth.writemask ? 1 : 0); + if (cso->depth.enabled) { + so_method(so, tesla, NV50TCL_DEPTH_TEST_ENABLE, 1); + so_data (so, 1); + so_method(so, tesla, NV50TCL_DEPTH_TEST_FUNC, 1); + so_data (so, nvgl_comparison_op(cso->depth.func)); + } else { + so_method(so, tesla, NV50TCL_DEPTH_TEST_ENABLE, 1); + so_data (so, 0); + } + + /*XXX: yes, I know they're backwards.. 
header needs fixing */ + if (cso->stencil[0].enabled) { + so_method(so, tesla, NV50TCL_STENCIL_BACK_ENABLE, 5); + so_data (so, 1); + so_data (so, nvgl_stencil_op(cso->stencil[0].fail_op)); + so_data (so, nvgl_stencil_op(cso->stencil[0].zfail_op)); + so_data (so, nvgl_stencil_op(cso->stencil[0].zpass_op)); + so_data (so, nvgl_comparison_op(cso->stencil[0].func)); + so_method(so, tesla, NV50TCL_STENCIL_BACK_FUNC_REF, 3); + so_data (so, cso->stencil[0].ref_value); + so_data (so, cso->stencil[0].writemask); + so_data (so, cso->stencil[0].valuemask); + } else { + so_method(so, tesla, NV50TCL_STENCIL_BACK_ENABLE, 1); + so_data (so, 0); + } + + if (cso->stencil[1].enabled) { + so_method(so, tesla, NV50TCL_STENCIL_FRONT_ENABLE, 8); + so_data (so, 1); + so_data (so, nvgl_stencil_op(cso->stencil[1].fail_op)); + so_data (so, nvgl_stencil_op(cso->stencil[1].zfail_op)); + so_data (so, nvgl_stencil_op(cso->stencil[1].zpass_op)); + so_data (so, nvgl_comparison_op(cso->stencil[1].func)); + so_data (so, cso->stencil[1].ref_value); + so_data (so, cso->stencil[1].writemask); + so_data (so, cso->stencil[1].valuemask); + } else { + so_method(so, tesla, NV50TCL_STENCIL_FRONT_ENABLE, 1); + so_data (so, 0); + } + + if (cso->alpha.enabled) { + so_method(so, tesla, NV50TCL_ALPHA_TEST_ENABLE, 1); + so_data (so, 1); + so_method(so, tesla, NV50TCL_ALPHA_TEST_REF, 2); + so_data (so, fui(cso->alpha.ref_value)); + so_data (so, nvgl_comparison_op(cso->alpha.func)); + } else { + so_method(so, tesla, NV50TCL_ALPHA_TEST_ENABLE, 1); + so_data (so, 0); + } + + zsa->pipe = *cso; + so_ref(so, &zsa->so); + return (void *)zsa; +} + +static void +nv50_depth_stencil_alpha_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nv50_context *nv50 = nv50_context(pipe); + + nv50->zsa = hwcso; + nv50->dirty |= NV50_NEW_ZSA; +} + +static void +nv50_depth_stencil_alpha_state_delete(struct pipe_context *pipe, void *hwcso) +{ + struct nv50_zsa_stateobj *zsa = hwcso; + + so_ref(NULL, &zsa->so); + FREE(zsa); +} + +static void * +nv50_vp_state_create(struct pipe_context *pipe, + const struct pipe_shader_state *cso) +{ + struct nv50_program *p = CALLOC_STRUCT(nv50_program); + + p->pipe.tokens = tgsi_dup_tokens(cso->tokens); + p->type = PIPE_SHADER_VERTEX; + tgsi_scan_shader(p->pipe.tokens, &p->info); + return (void *)p; +} + +static void +nv50_vp_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nv50_context *nv50 = nv50_context(pipe); + + nv50->vertprog = hwcso; + nv50->dirty |= NV50_NEW_VERTPROG; +} + +static void +nv50_vp_state_delete(struct pipe_context *pipe, void *hwcso) +{ + struct nv50_context *nv50 = nv50_context(pipe); + struct nv50_program *p = hwcso; + + nv50_program_destroy(nv50, p); + FREE((void*)p->pipe.tokens); + FREE(p); +} + +static void * +nv50_fp_state_create(struct pipe_context *pipe, + const struct pipe_shader_state *cso) +{ + struct nv50_program *p = CALLOC_STRUCT(nv50_program); + + p->pipe.tokens = tgsi_dup_tokens(cso->tokens); + p->type = PIPE_SHADER_FRAGMENT; + tgsi_scan_shader(p->pipe.tokens, &p->info); + return (void *)p; +} + +static void +nv50_fp_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nv50_context *nv50 = nv50_context(pipe); + + nv50->fragprog = hwcso; + nv50->dirty |= NV50_NEW_FRAGPROG; +} + +static void +nv50_fp_state_delete(struct pipe_context *pipe, void *hwcso) +{ + struct nv50_context *nv50 = nv50_context(pipe); + struct nv50_program *p = hwcso; + + nv50_program_destroy(nv50, p); + FREE((void*)p->pipe.tokens); + FREE(p); +} + +static void 
+nv50_set_blend_color(struct pipe_context *pipe, + const struct pipe_blend_color *bcol) +{ + struct nv50_context *nv50 = nv50_context(pipe); + + nv50->blend_colour = *bcol; + nv50->dirty |= NV50_NEW_BLEND_COLOUR; +} + +static void +nv50_set_clip_state(struct pipe_context *pipe, + const struct pipe_clip_state *clip) +{ +} + +static void +nv50_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index, + const struct pipe_constant_buffer *buf ) +{ + struct nv50_context *nv50 = nv50_context(pipe); + + if (shader == PIPE_SHADER_VERTEX) { + nv50->constbuf[PIPE_SHADER_VERTEX] = buf->buffer; + nv50->dirty |= NV50_NEW_VERTPROG_CB; + } else + if (shader == PIPE_SHADER_FRAGMENT) { + nv50->constbuf[PIPE_SHADER_FRAGMENT] = buf->buffer; + nv50->dirty |= NV50_NEW_FRAGPROG_CB; + } +} + +static void +nv50_set_framebuffer_state(struct pipe_context *pipe, + const struct pipe_framebuffer_state *fb) +{ + struct nv50_context *nv50 = nv50_context(pipe); + + nv50->framebuffer = *fb; + nv50->dirty |= NV50_NEW_FRAMEBUFFER; +} + +static void +nv50_set_polygon_stipple(struct pipe_context *pipe, + const struct pipe_poly_stipple *stipple) +{ + struct nv50_context *nv50 = nv50_context(pipe); + + nv50->stipple = *stipple; + nv50->dirty |= NV50_NEW_STIPPLE; +} + +static void +nv50_set_scissor_state(struct pipe_context *pipe, + const struct pipe_scissor_state *s) +{ + struct nv50_context *nv50 = nv50_context(pipe); + + nv50->scissor = *s; + nv50->dirty |= NV50_NEW_SCISSOR; +} + +static void +nv50_set_viewport_state(struct pipe_context *pipe, + const struct pipe_viewport_state *vpt) +{ + struct nv50_context *nv50 = nv50_context(pipe); + + nv50->viewport = *vpt; + nv50->dirty |= NV50_NEW_VIEWPORT; +} + +static void +nv50_set_vertex_buffers(struct pipe_context *pipe, unsigned count, + const struct pipe_vertex_buffer *vb) +{ + struct nv50_context *nv50 = nv50_context(pipe); + + memcpy(nv50->vtxbuf, vb, sizeof(*vb) * count); + nv50->vtxbuf_nr = count; + + nv50->dirty |= NV50_NEW_ARRAYS; +} + +static void +nv50_set_vertex_elements(struct pipe_context *pipe, unsigned count, + const struct pipe_vertex_element *ve) +{ + struct nv50_context *nv50 = nv50_context(pipe); + + memcpy(nv50->vtxelt, ve, sizeof(*ve) * count); + nv50->vtxelt_nr = count; + + nv50->dirty |= NV50_NEW_ARRAYS; +} + +void +nv50_init_state_functions(struct nv50_context *nv50) +{ + nv50->pipe.create_blend_state = nv50_blend_state_create; + nv50->pipe.bind_blend_state = nv50_blend_state_bind; + nv50->pipe.delete_blend_state = nv50_blend_state_delete; + + nv50->pipe.create_sampler_state = nv50_sampler_state_create; + nv50->pipe.bind_sampler_states = nv50_sampler_state_bind; + nv50->pipe.delete_sampler_state = nv50_sampler_state_delete; + nv50->pipe.set_sampler_textures = nv50_set_sampler_texture; + + nv50->pipe.create_rasterizer_state = nv50_rasterizer_state_create; + nv50->pipe.bind_rasterizer_state = nv50_rasterizer_state_bind; + nv50->pipe.delete_rasterizer_state = nv50_rasterizer_state_delete; + + nv50->pipe.create_depth_stencil_alpha_state = + nv50_depth_stencil_alpha_state_create; + nv50->pipe.bind_depth_stencil_alpha_state = + nv50_depth_stencil_alpha_state_bind; + nv50->pipe.delete_depth_stencil_alpha_state = + nv50_depth_stencil_alpha_state_delete; + + nv50->pipe.create_vs_state = nv50_vp_state_create; + nv50->pipe.bind_vs_state = nv50_vp_state_bind; + nv50->pipe.delete_vs_state = nv50_vp_state_delete; + + nv50->pipe.create_fs_state = nv50_fp_state_create; + nv50->pipe.bind_fs_state = nv50_fp_state_bind; + nv50->pipe.delete_fs_state = 
nv50_fp_state_delete; + + nv50->pipe.set_blend_color = nv50_set_blend_color; + nv50->pipe.set_clip_state = nv50_set_clip_state; + nv50->pipe.set_constant_buffer = nv50_set_constant_buffer; + nv50->pipe.set_framebuffer_state = nv50_set_framebuffer_state; + nv50->pipe.set_polygon_stipple = nv50_set_polygon_stipple; + nv50->pipe.set_scissor_state = nv50_set_scissor_state; + nv50->pipe.set_viewport_state = nv50_set_viewport_state; + + nv50->pipe.set_vertex_buffers = nv50_set_vertex_buffers; + nv50->pipe.set_vertex_elements = nv50_set_vertex_elements; +} + diff --git a/src/gallium/drivers/nv50/nv50_state_validate.c b/src/gallium/drivers/nv50/nv50_state_validate.c new file mode 100644 index 0000000000..948112ffa9 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_state_validate.c @@ -0,0 +1,313 @@ +/* + * Copyright 2008 Ben Skeggs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "nv50_context.h" +#include "nouveau/nouveau_stateobj.h" + +static void +nv50_state_validate_fb(struct nv50_context *nv50) +{ + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nouveau_stateobj *so = so_new(128, 18); + struct pipe_framebuffer_state *fb = &nv50->framebuffer; + unsigned i, w, h, gw = 0; + + for (i = 0; i < fb->nr_cbufs; i++) { + if (!gw) { + w = fb->cbufs[i]->width; + h = fb->cbufs[i]->height; + gw = 1; + } else { + assert(w == fb->cbufs[i]->width); + assert(h == fb->cbufs[i]->height); + } + + so_method(so, tesla, NV50TCL_RT_HORIZ(i), 2); + so_data (so, fb->cbufs[i]->width); + so_data (so, fb->cbufs[i]->height); + + so_method(so, tesla, NV50TCL_RT_ADDRESS_HIGH(i), 5); + so_reloc (so, nv50_surface_buffer(fb->cbufs[i]), fb->cbufs[i]->offset, + NOUVEAU_BO_VRAM | NOUVEAU_BO_HIGH | + NOUVEAU_BO_RDWR, 0, 0); + so_reloc (so, nv50_surface_buffer(fb->cbufs[i]), fb->cbufs[i]->offset, + NOUVEAU_BO_VRAM | NOUVEAU_BO_LOW | + NOUVEAU_BO_RDWR, 0, 0); + switch (fb->cbufs[i]->format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + so_data(so, 0xcf); + break; + case PIPE_FORMAT_R5G6B5_UNORM: + so_data(so, 0xe8); + break; + default: + NOUVEAU_ERR("AIIII unknown format %s\n", + pf_name(fb->cbufs[i]->format)); + so_data(so, 0xe6); + break; + } + so_data(so, 0x00000000); + so_data(so, 0x00000000); + + so_method(so, tesla, 0x1224, 1); + so_data (so, 1); + } + + if (fb->zsbuf) { + if (!gw) { + w = fb->zsbuf->width; + h = fb->zsbuf->height; + gw = 1; + } else { + assert(w == fb->zsbuf->width); + assert(h == fb->zsbuf->height); + } + + so_method(so, tesla, NV50TCL_ZETA_ADDRESS_HIGH, 5); + so_reloc (so, nv50_surface_buffer(fb->zsbuf), fb->zsbuf->offset, + NOUVEAU_BO_VRAM | NOUVEAU_BO_HIGH | + NOUVEAU_BO_RDWR, 0, 0); + so_reloc (so, nv50_surface_buffer(fb->zsbuf), fb->zsbuf->offset, + NOUVEAU_BO_VRAM | NOUVEAU_BO_LOW | + NOUVEAU_BO_RDWR, 0, 0); + switch (fb->zsbuf->format) { + case PIPE_FORMAT_Z24S8_UNORM: + so_data(so, 0x16); + break; + case PIPE_FORMAT_Z16_UNORM: + so_data(so, 0x15); + break; + default: + NOUVEAU_ERR("AIIII unknown format %s\n", + pf_name(fb->zsbuf->format)); + so_data(so, 0x16); + break; + } + so_data(so, 0x00000000); + so_data(so, 0x00000000); + + so_method(so, tesla, 0x1538, 1); + so_data (so, 1); + so_method(so, tesla, 0x1228, 3); + so_data (so, fb->zsbuf->width); + so_data (so, fb->zsbuf->height); + so_data (so, 0x00010001); + } + + so_method(so, tesla, NV50TCL_VIEWPORT_HORIZ, 2); + so_data (so, w << 16); + so_data (so, h << 16); + so_method(so, tesla, 0x0e04, 2); + so_data (so, w << 16); + so_data (so, h << 16); + so_method(so, tesla, 0xdf8, 2); + so_data (so, 0); + so_data (so, h); + + so_ref(so, &nv50->state.fb); +} + +static void +nv50_state_emit(struct nv50_context *nv50) +{ + struct nv50_screen *screen = nv50->screen; + struct nouveau_winsys *nvws = screen->nvws; + + if (nv50->pctx_id != screen->cur_pctx) { + nv50->state.dirty |= 0xffffffff; + screen->cur_pctx = nv50->pctx_id; + } + + if (nv50->state.dirty & NV50_NEW_FRAMEBUFFER) + so_emit(nvws, nv50->state.fb); + if (nv50->state.dirty & NV50_NEW_BLEND) + so_emit(nvws, nv50->state.blend); + if (nv50->state.dirty & NV50_NEW_ZSA) + so_emit(nvws, nv50->state.zsa); + if (nv50->state.dirty & NV50_NEW_VERTPROG) + so_emit(nvws, nv50->state.vertprog); + if (nv50->state.dirty & NV50_NEW_FRAGPROG) + so_emit(nvws, nv50->state.fragprog); + if (nv50->state.dirty & NV50_NEW_RASTERIZER) + so_emit(nvws, nv50->state.rast); + if (nv50->state.dirty & NV50_NEW_BLEND_COLOUR) + so_emit(nvws, nv50->state.blend_colour); + if 
(nv50->state.dirty & NV50_NEW_STIPPLE) + so_emit(nvws, nv50->state.stipple); + if (nv50->state.dirty & NV50_NEW_SCISSOR) + so_emit(nvws, nv50->state.scissor); + if (nv50->state.dirty & NV50_NEW_VIEWPORT) + so_emit(nvws, nv50->state.viewport); + if (nv50->state.dirty & NV50_NEW_SAMPLER) + so_emit(nvws, nv50->state.tsc_upload); + if (nv50->state.dirty & NV50_NEW_TEXTURE) + so_emit(nvws, nv50->state.tic_upload); + if (nv50->state.dirty & NV50_NEW_ARRAYS) { + so_emit(nvws, nv50->state.vtxfmt); + so_emit(nvws, nv50->state.vtxbuf); + } + nv50->state.dirty = 0; + + so_emit_reloc_markers(nvws, nv50->state.fb); + so_emit_reloc_markers(nvws, nv50->state.vertprog); + so_emit_reloc_markers(nvws, nv50->state.fragprog); + so_emit_reloc_markers(nvws, nv50->state.vtxbuf); + so_emit_reloc_markers(nvws, nv50->screen->static_init); +} + +boolean +nv50_state_validate(struct nv50_context *nv50) +{ + const struct pipe_framebuffer_state *fb = &nv50->framebuffer; + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nouveau_stateobj *so; + unsigned i; + + for (i = 0; i < fb->nr_cbufs; i++) + fb->cbufs[i]->status = PIPE_SURFACE_STATUS_DEFINED; + + if (fb->zsbuf) + fb->zsbuf->status = PIPE_SURFACE_STATUS_DEFINED; + + if (nv50->dirty & NV50_NEW_FRAMEBUFFER) + nv50_state_validate_fb(nv50); + + if (nv50->dirty & NV50_NEW_BLEND) + so_ref(nv50->blend->so, &nv50->state.blend); + + if (nv50->dirty & NV50_NEW_ZSA) + so_ref(nv50->zsa->so, &nv50->state.zsa); + + if (nv50->dirty & (NV50_NEW_VERTPROG | NV50_NEW_VERTPROG_CB)) + nv50_vertprog_validate(nv50); + + if (nv50->dirty & (NV50_NEW_FRAGPROG | NV50_NEW_FRAGPROG_CB)) + nv50_fragprog_validate(nv50); + + if (nv50->dirty & NV50_NEW_RASTERIZER) + so_ref(nv50->rasterizer->so, &nv50->state.rast); + + if (nv50->dirty & NV50_NEW_BLEND_COLOUR) { + so = so_new(5, 0); + so_method(so, tesla, NV50TCL_BLEND_COLOR(0), 4); + so_data (so, fui(nv50->blend_colour.color[0])); + so_data (so, fui(nv50->blend_colour.color[1])); + so_data (so, fui(nv50->blend_colour.color[2])); + so_data (so, fui(nv50->blend_colour.color[3])); + so_ref(so, &nv50->state.blend_colour); + } + + if (nv50->dirty & NV50_NEW_STIPPLE) { + so = so_new(33, 0); + so_method(so, tesla, NV50TCL_POLYGON_STIPPLE_PATTERN(0), 32); + for (i = 0; i < 32; i++) + so_data(so, nv50->stipple.stipple[i]); + so_ref(so, &nv50->state.stipple); + } + + if (nv50->dirty & (NV50_NEW_SCISSOR | NV50_NEW_RASTERIZER)) { + struct pipe_rasterizer_state *rast = &nv50->rasterizer->pipe; + struct pipe_scissor_state *s = &nv50->scissor; + + if (nv50->state.scissor && + (rast->scissor == 0 && nv50->state.scissor_enabled == 0)) + goto scissor_uptodate; + nv50->state.scissor_enabled = rast->scissor; + + so = so_new(3, 0); + so_method(so, tesla, 0x0ff4, 2); + if (nv50->state.scissor_enabled) { + so_data(so, ((s->maxx - s->minx) << 16) | s->minx); + so_data(so, ((s->maxy - s->miny) << 16) | s->miny); + } else { + so_data(so, (8192 << 16)); + so_data(so, (8192 << 16)); + } + so_ref(so, &nv50->state.scissor); + nv50->state.dirty |= NV50_NEW_SCISSOR; + } +scissor_uptodate: + + if (nv50->dirty & NV50_NEW_VIEWPORT) { + unsigned bypass; + + if (!nv50->rasterizer->pipe.bypass_clipping) + bypass = 0; + else + bypass = 1; + + if (nv50->state.viewport && + (bypass || !(nv50->dirty & NV50_NEW_VIEWPORT)) && + nv50->state.viewport_bypass == bypass) + goto viewport_uptodate; + nv50->state.viewport_bypass = bypass; + + so = so_new(12, 0); + if (!bypass) { + so_method(so, tesla, NV50TCL_VIEWPORT_UNK1(0), 3); + so_data (so, fui(nv50->viewport.translate[0])); + so_data 
(so, fui(nv50->viewport.translate[1])); + so_data (so, fui(nv50->viewport.translate[2])); + so_method(so, tesla, NV50TCL_VIEWPORT_UNK0(0), 3); + so_data (so, fui(nv50->viewport.scale[0])); + so_data (so, fui(-nv50->viewport.scale[1])); + so_data (so, fui(nv50->viewport.scale[2])); + so_method(so, tesla, 0x192c, 1); + so_data (so, 1); + so_method(so, tesla, 0x0f90, 1); + so_data (so, 0); + } else { + so_method(so, tesla, 0x192c, 1); + so_data (so, 0); + so_method(so, tesla, 0x0f90, 1); + so_data (so, 1); + } + + so_ref(so, &nv50->state.viewport); + } +viewport_uptodate: + + if (nv50->dirty & NV50_NEW_SAMPLER) { + int i; + + so = so_new(nv50->sampler_nr * 8 + 3, 0); + so_method(so, tesla, 0x0f00, 1); + so_data (so, NV50_CB_TSC); + so_method(so, tesla, 0x40000f04, nv50->sampler_nr * 8); + for (i = 0; i < nv50->sampler_nr; i++) + so_datap (so, nv50->sampler[i], 8); + so_ref(so, &nv50->state.tsc_upload); + } + + if (nv50->dirty & NV50_NEW_TEXTURE) + nv50_tex_validate(nv50); + + if (nv50->dirty & NV50_NEW_ARRAYS) + nv50_vbo_validate(nv50); + + nv50->state.dirty |= nv50->dirty; + nv50->dirty = 0; + nv50_state_emit(nv50); + + return TRUE; +} + diff --git a/src/gallium/drivers/nv50/nv50_surface.c b/src/gallium/drivers/nv50/nv50_surface.c new file mode 100644 index 0000000000..b0936518b0 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_surface.c @@ -0,0 +1,208 @@ +/* + * Copyright 2008 Ben Skeggs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#define __NOUVEAU_PUSH_H__ +#include <stdint.h> +#include "nouveau/nouveau_pushbuf.h" +#include "nv50_context.h" +#include "pipe/p_defines.h" +#include "pipe/internal/p_winsys_screen.h" +#include "pipe/p_inlines.h" + +#include "util/u_tile.h" + +static INLINE int +nv50_format(enum pipe_format format) +{ + switch (format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_Z24S8_UNORM: + return NV50_2D_DST_FORMAT_32BPP; + case PIPE_FORMAT_X8R8G8B8_UNORM: + return NV50_2D_DST_FORMAT_24BPP; + case PIPE_FORMAT_R5G6B5_UNORM: + return NV50_2D_DST_FORMAT_16BPP; + case PIPE_FORMAT_A8_UNORM: + return NV50_2D_DST_FORMAT_8BPP; + default: + return -1; + } +} + +static int +nv50_surface_set(struct nv50_screen *screen, struct pipe_surface *ps, int dst) +{ + struct nv50_miptree *mt = nv50_miptree(ps->texture); + struct nouveau_channel *chan = screen->nvws->channel; + struct nouveau_grobj *eng2d = screen->eng2d; + struct nouveau_bo *bo; + int format, mthd = dst ? NV50_2D_DST_FORMAT : NV50_2D_SRC_FORMAT; + int flags = NOUVEAU_BO_VRAM | (dst ? 
NOUVEAU_BO_WR : NOUVEAU_BO_RD); + + bo = screen->nvws->get_bo(nv50_miptree(ps->texture)->buffer); + if (!bo) + return 1; + + format = nv50_format(ps->format); + if (format < 0) + return 1; + + if (!bo->tiled) { + BEGIN_RING(chan, eng2d, mthd, 2); + OUT_RING (chan, format); + OUT_RING (chan, 1); + BEGIN_RING(chan, eng2d, mthd + 0x14, 5); + OUT_RING (chan, mt->level[0].pitch); + OUT_RING (chan, ps->width); + OUT_RING (chan, ps->height); + OUT_RELOCh(chan, bo, ps->offset, flags); + OUT_RELOCl(chan, bo, ps->offset, flags); + } else { + BEGIN_RING(chan, eng2d, mthd, 5); + OUT_RING (chan, format); + OUT_RING (chan, 0); + OUT_RING (chan, 0); + OUT_RING (chan, 1); + OUT_RING (chan, 0); + BEGIN_RING(chan, eng2d, mthd + 0x18, 4); + OUT_RING (chan, ps->width); + OUT_RING (chan, ps->height); + OUT_RELOCh(chan, bo, ps->offset, flags); + OUT_RELOCl(chan, bo, ps->offset, flags); + } + +#if 0 + if (dst) { + BEGIN_RING(chan, eng2d, NV50_2D_CLIP_X, 4); + OUT_RING (chan, 0); + OUT_RING (chan, 0); + OUT_RING (chan, surf->width); + OUT_RING (chan, surf->height); + } +#endif + + return 0; +} + +int +nv50_surface_do_copy(struct nv50_screen *screen, struct pipe_surface *dst, + int dx, int dy, struct pipe_surface *src, int sx, int sy, + int w, int h) +{ + struct nouveau_channel *chan = screen->nvws->channel; + struct nouveau_grobj *eng2d = screen->eng2d; + int ret; + + WAIT_RING (chan, 32); + + ret = nv50_surface_set(screen, dst, 1); + if (ret) + return ret; + + ret = nv50_surface_set(screen, src, 0); + if (ret) + return ret; + + BEGIN_RING(chan, eng2d, 0x088c, 1); + OUT_RING (chan, 0); + BEGIN_RING(chan, eng2d, NV50_2D_BLIT_DST_X, 4); + OUT_RING (chan, dx); + OUT_RING (chan, dy); + OUT_RING (chan, w); + OUT_RING (chan, h); + BEGIN_RING(chan, eng2d, 0x08c0, 4); + OUT_RING (chan, 0); + OUT_RING (chan, 1); + OUT_RING (chan, 0); + OUT_RING (chan, 1); + BEGIN_RING(chan, eng2d, 0x08d0, 4); + OUT_RING (chan, 0); + OUT_RING (chan, sx); + OUT_RING (chan, 0); + OUT_RING (chan, sy); + + return 0; +} + +static void +nv50_surface_copy(struct pipe_context *pipe, boolean flip, + struct pipe_surface *dest, unsigned destx, unsigned desty, + struct pipe_surface *src, unsigned srcx, unsigned srcy, + unsigned width, unsigned height) +{ + struct nv50_context *nv50 = (struct nv50_context *)pipe; + struct nv50_screen *screen = nv50->screen; + + assert(src->format == dest->format); + + if (flip) { + desty += height; + while (height--) { + nv50_surface_do_copy(screen, dest, destx, desty--, src, + srcx, srcy++, width, 1); + } + } else { + nv50_surface_do_copy(screen, dest, destx, desty, src, srcx, + srcy, width, height); + } +} + +static void +nv50_surface_fill(struct pipe_context *pipe, struct pipe_surface *dest, + unsigned destx, unsigned desty, unsigned width, + unsigned height, unsigned value) +{ + struct nv50_context *nv50 = (struct nv50_context *)pipe; + struct nv50_screen *screen = nv50->screen; + struct nouveau_channel *chan = screen->nvws->channel; + struct nouveau_grobj *eng2d = screen->eng2d; + int format, ret; + + format = nv50_format(dest->format); + if (format < 0) + return; + + WAIT_RING (chan, 32); + + ret = nv50_surface_set(screen, dest, 1); + if (ret) + return; + + BEGIN_RING(chan, eng2d, 0x0580, 3); + OUT_RING (chan, 4); + OUT_RING (chan, format); + OUT_RING (chan, value); + BEGIN_RING(chan, eng2d, NV50_2D_RECT_X1, 4); + OUT_RING (chan, destx); + OUT_RING (chan, desty); + OUT_RING (chan, width); + OUT_RING (chan, height); +} + +void +nv50_init_surface_functions(struct nv50_context *nv50) +{ + nv50->pipe.surface_copy = 
nv50_surface_copy; + nv50->pipe.surface_fill = nv50_surface_fill; +} + + diff --git a/src/gallium/drivers/nv50/nv50_tex.c b/src/gallium/drivers/nv50/nv50_tex.c new file mode 100644 index 0000000000..31bf59675e --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_tex.c @@ -0,0 +1,156 @@ +/* + * Copyright 2008 Ben Skeggs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "nv50_context.h" +#include "nv50_texture.h" + +#include "nouveau/nouveau_stateobj.h" + +static int +nv50_tex_construct(struct nouveau_stateobj *so, struct nv50_miptree *mt) +{ + switch (mt->base.format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + so_data(so, NV50TIC_0_0_MAPA_C3 | NV50TIC_0_0_TYPEA_UNORM | + NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | + NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM | + NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM | + NV50TIC_0_0_FMT_8_8_8_8); + break; + case PIPE_FORMAT_A1R5G5B5_UNORM: + so_data(so, NV50TIC_0_0_MAPA_C3 | NV50TIC_0_0_TYPEA_UNORM | + NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | + NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM | + NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM | + NV50TIC_0_0_FMT_1_5_5_5); + break; + case PIPE_FORMAT_A4R4G4B4_UNORM: + so_data(so, NV50TIC_0_0_MAPA_C3 | NV50TIC_0_0_TYPEA_UNORM | + NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | + NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM | + NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM | + NV50TIC_0_0_FMT_4_4_4_4); + break; + case PIPE_FORMAT_R5G6B5_UNORM: + so_data(so, NV50TIC_0_0_MAPA_ONE | NV50TIC_0_0_TYPEA_UNORM | + NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | + NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM | + NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM | + NV50TIC_0_0_FMT_5_6_5); + break; + case PIPE_FORMAT_L8_UNORM: + so_data(so, NV50TIC_0_0_MAPA_ONE | NV50TIC_0_0_TYPEA_UNORM | + NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | + NV50TIC_0_0_MAPG_C0 | NV50TIC_0_0_TYPEG_UNORM | + NV50TIC_0_0_MAPB_C0 | NV50TIC_0_0_TYPEB_UNORM | + NV50TIC_0_0_FMT_8); + break; + case PIPE_FORMAT_A8_UNORM: + so_data(so, NV50TIC_0_0_MAPA_C0 | NV50TIC_0_0_TYPEA_UNORM | + NV50TIC_0_0_MAPR_ZERO | NV50TIC_0_0_TYPER_UNORM | + NV50TIC_0_0_MAPG_ZERO | NV50TIC_0_0_TYPEG_UNORM | + NV50TIC_0_0_MAPB_ZERO | NV50TIC_0_0_TYPEB_UNORM | + NV50TIC_0_0_FMT_8); + break; + case PIPE_FORMAT_I8_UNORM: + so_data(so, NV50TIC_0_0_MAPA_C0 | NV50TIC_0_0_TYPEA_UNORM | + NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | + NV50TIC_0_0_MAPG_C0 | NV50TIC_0_0_TYPEG_UNORM | + NV50TIC_0_0_MAPB_C0 | NV50TIC_0_0_TYPEB_UNORM | + 
NV50TIC_0_0_FMT_8); + break; + case PIPE_FORMAT_A8L8_UNORM: + so_data(so, NV50TIC_0_0_MAPA_C1 | NV50TIC_0_0_TYPEA_UNORM | + NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | + NV50TIC_0_0_MAPG_C0 | NV50TIC_0_0_TYPEG_UNORM | + NV50TIC_0_0_MAPB_C0 | NV50TIC_0_0_TYPEB_UNORM | + NV50TIC_0_0_FMT_8_8); + break; + case PIPE_FORMAT_DXT1_RGB: + so_data(so, NV50TIC_0_0_MAPA_ONE | NV50TIC_0_0_TYPEA_UNORM | + NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | + NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM | + NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM | + NV50TIC_0_0_FMT_DXT1); + break; + case PIPE_FORMAT_DXT1_RGBA: + so_data(so, NV50TIC_0_0_MAPA_C3 | NV50TIC_0_0_TYPEA_UNORM | + NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | + NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM | + NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM | + NV50TIC_0_0_FMT_DXT1); + break; + case PIPE_FORMAT_DXT3_RGBA: + so_data(so, NV50TIC_0_0_MAPA_C3 | NV50TIC_0_0_TYPEA_UNORM | + NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | + NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM | + NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM | + NV50TIC_0_0_FMT_DXT3); + break; + case PIPE_FORMAT_DXT5_RGBA: + so_data(so, NV50TIC_0_0_MAPA_C3 | NV50TIC_0_0_TYPEA_UNORM | + NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | + NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM | + NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM | + NV50TIC_0_0_FMT_DXT5); + break; + default: + return 1; + } + + so_reloc(so, mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_LOW | + NOUVEAU_BO_RD, 0, 0); + so_data (so, 0xd0005000); + so_data (so, 0x00300000); + so_data (so, mt->base.width[0]); + so_data (so, (mt->base.depth[0] << 16) | mt->base.height[0]); + so_data (so, 0x03000000); + so_reloc(so, mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_HIGH | + NOUVEAU_BO_RD, 0, 0); + + return 0; +} + +void +nv50_tex_validate(struct nv50_context *nv50) +{ + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nouveau_stateobj *so; + int unit; + + so = so_new(nv50->miptree_nr * 8 + 3, nv50->miptree_nr * 2); + so_method(so, tesla, 0x0f00, 1); + so_data (so, NV50_CB_TIC); + so_method(so, tesla, 0x40000f04, nv50->miptree_nr * 8); + for (unit = 0; unit < nv50->miptree_nr; unit++) { + struct nv50_miptree *mt = nv50->miptree[unit]; + + if (nv50_tex_construct(so, mt)) { + NOUVEAU_ERR("failed tex validate\n"); + so_ref(NULL, &so); + return; + } + } + + so_ref(so, &nv50->state.tic_upload); +} + diff --git a/src/gallium/drivers/nv50/nv50_texture.h b/src/gallium/drivers/nv50/nv50_texture.h new file mode 100644 index 0000000000..aca622c73b --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_texture.h @@ -0,0 +1,129 @@ +#ifndef __NV50_TEXTURE_H__ +#define __NV50_TEXTURE_H__ + +/* It'd be really nice to have these in nouveau_class.h generated by + * renouveau like the rest of the object header - but not sure it can + * handle non-object stuff nicely - need to look into it. 
+ */ + +/* Texture image control block */ +#define NV50TIC_0_0_MAPA_MASK 0x38000000 +#define NV50TIC_0_0_MAPA_ZERO 0x00000000 +#define NV50TIC_0_0_MAPA_C0 0x10000000 +#define NV50TIC_0_0_MAPA_C1 0x18000000 +#define NV50TIC_0_0_MAPA_C2 0x20000000 +#define NV50TIC_0_0_MAPA_C3 0x28000000 +#define NV50TIC_0_0_MAPA_ONE 0x38000000 +#define NV50TIC_0_0_MAPR_MASK 0x07000000 +#define NV50TIC_0_0_MAPR_ZERO 0x00000000 +#define NV50TIC_0_0_MAPR_C0 0x02000000 +#define NV50TIC_0_0_MAPR_C1 0x03000000 +#define NV50TIC_0_0_MAPR_C2 0x04000000 +#define NV50TIC_0_0_MAPR_C3 0x05000000 +#define NV50TIC_0_0_MAPR_ONE 0x07000000 +#define NV50TIC_0_0_MAPG_MASK 0x00e00000 +#define NV50TIC_0_0_MAPG_ZERO 0x00000000 +#define NV50TIC_0_0_MAPG_C0 0x00400000 +#define NV50TIC_0_0_MAPG_C1 0x00600000 +#define NV50TIC_0_0_MAPG_C2 0x00800000 +#define NV50TIC_0_0_MAPG_C3 0x00a00000 +#define NV50TIC_0_0_MAPG_ONE 0x00e00000 +#define NV50TIC_0_0_MAPB_MASK 0x001c0000 +#define NV50TIC_0_0_MAPB_ZERO 0x00000000 +#define NV50TIC_0_0_MAPB_C0 0x00080000 +#define NV50TIC_0_0_MAPB_C1 0x000c0000 +#define NV50TIC_0_0_MAPB_C2 0x00100000 +#define NV50TIC_0_0_MAPB_C3 0x00140000 +#define NV50TIC_0_0_MAPB_ONE 0x001c0000 +#define NV50TIC_0_0_TYPEA_MASK 0x00038000 +#define NV50TIC_0_0_TYPEA_UNORM 0x00010000 +#define NV50TIC_0_0_TYPER_MASK 0x00007000 +#define NV50TIC_0_0_TYPER_UNORM 0x00002000 +#define NV50TIC_0_0_TYPEG_MASK 0x00000e00 +#define NV50TIC_0_0_TYPEG_UNORM 0x00000400 +#define NV50TIC_0_0_TYPEB_MASK 0x000001c0 +#define NV50TIC_0_0_TYPEB_UNORM 0x00000080 +#define NV50TIC_0_0_FMT_MASK 0x0000003c +#define NV50TIC_0_0_FMT_8_8_8_8 0x00000008 +#define NV50TIC_0_0_FMT_4_4_4_4 0x00000012 +#define NV50TIC_0_0_FMT_1_5_5_5 0x00000013 +#define NV50TIC_0_0_FMT_5_6_5 0x00000015 +#define NV50TIC_0_0_FMT_8_8 0x00000018 +#define NV50TIC_0_0_FMT_8 0x0000001d +#define NV50TIC_0_0_FMT_DXT1 0x00000024 +#define NV50TIC_0_0_FMT_DXT3 0x00000025 +#define NV50TIC_0_0_FMT_DXT5 0x00000026 + +#define NV50TIC_0_1_OFFSET_LOW_MASK 0xffffffff +#define NV50TIC_0_1_OFFSET_LOW_SHIFT 0 + +#define NV50TIC_0_2_UNKNOWN_MASK 0xffffffff + +#define NV50TIC_0_3_UNKNOWN_MASK 0xffffffff + +#define NV50TIC_0_4_WIDTH_MASK 0x0000ffff +#define NV50TIC_0_4_WIDTH_SHIFT 0 + +#define NV50TIC_0_5_DEPTH_MASK 0xffff0000 +#define NV50TIC_0_5_DEPTH_SHIFT 16 +#define NV50TIC_0_5_HEIGHT_MASK 0x0000ffff +#define NV50TIC_0_5_HEIGHT_SHIFT 0 + +#define NV50TIC_0_6_UNKNOWN_MASK 0xffffffff + +#define NV50TIC_0_7_OFFSET_HIGH_MASK 0xffffffff +#define NV50TIC_0_7_OFFSET_HIGH_SHIFT 0 + +/* Texture sampler control block */ +#define NV50TSC_1_0_WRAPS_MASK 0x00000007 +#define NV50TSC_1_0_WRAPS_REPEAT 0x00000000 +#define NV50TSC_1_0_WRAPS_MIRROR_REPEAT 0x00000001 +#define NV50TSC_1_0_WRAPS_CLAMP_TO_EDGE 0x00000002 +#define NV50TSC_1_0_WRAPS_CLAMP_TO_BORDER 0x00000003 +#define NV50TSC_1_0_WRAPS_CLAMP 0x00000004 +#define NV50TSC_1_0_WRAPS_MIRROR_CLAMP_TO_EDGE 0x00000005 +#define NV50TSC_1_0_WRAPS_MIRROR_CLAMP_TO_BORDER 0x00000006 +#define NV50TSC_1_0_WRAPS_MIRROR_CLAMP 0x00000007 +#define NV50TSC_1_0_WRAPT_MASK 0x00000038 +#define NV50TSC_1_0_WRAPT_REPEAT 0x00000000 +#define NV50TSC_1_0_WRAPT_MIRROR_REPEAT 0x00000008 +#define NV50TSC_1_0_WRAPT_CLAMP_TO_EDGE 0x00000010 +#define NV50TSC_1_0_WRAPT_CLAMP_TO_BORDER 0x00000018 +#define NV50TSC_1_0_WRAPT_CLAMP 0x00000020 +#define NV50TSC_1_0_WRAPT_MIRROR_CLAMP_TO_EDGE 0x00000028 +#define NV50TSC_1_0_WRAPT_MIRROR_CLAMP_TO_BORDER 0x00000030 +#define NV50TSC_1_0_WRAPT_MIRROR_CLAMP 0x00000038 +#define NV50TSC_1_0_WRAPR_MASK 0x000001c0 +#define NV50TSC_1_0_WRAPR_REPEAT 
0x00000000 +#define NV50TSC_1_0_WRAPR_MIRROR_REPEAT 0x00000040 +#define NV50TSC_1_0_WRAPR_CLAMP_TO_EDGE 0x00000080 +#define NV50TSC_1_0_WRAPR_CLAMP_TO_BORDER 0x000000c0 +#define NV50TSC_1_0_WRAPR_CLAMP 0x00000100 +#define NV50TSC_1_0_WRAPR_MIRROR_CLAMP_TO_EDGE 0x00000140 +#define NV50TSC_1_0_WRAPR_MIRROR_CLAMP_TO_BORDER 0x00000180 +#define NV50TSC_1_0_WRAPR_MIRROR_CLAMP 0x000001c0 + +#define NV50TSC_1_1_MAGF_MASK 0x00000003 +#define NV50TSC_1_1_MAGF_NEAREST 0x00000001 +#define NV50TSC_1_1_MAGF_LINEAR 0x00000002 +#define NV50TSC_1_1_MINF_MASK 0x00000030 +#define NV50TSC_1_1_MINF_NEAREST 0x00000010 +#define NV50TSC_1_1_MINF_LINEAR 0x00000020 +#define NV50TSC_1_1_MIPF_MASK 0x000000c0 +#define NV50TSC_1_1_MIPF_NONE 0x00000040 +#define NV50TSC_1_1_MIPF_NEAREST 0x00000080 +#define NV50TSC_1_1_MIPF_LINEAR 0x000000c0 + +#define NV50TSC_1_2_UNKNOWN_MASK 0xffffffff + +#define NV50TSC_1_3_UNKNOWN_MASK 0xffffffff + +#define NV50TSC_1_4_UNKNOWN_MASK 0xffffffff + +#define NV50TSC_1_5_UNKNOWN_MASK 0xffffffff + +#define NV50TSC_1_6_UNKNOWN_MASK 0xffffffff + +#define NV50TSC_1_7_UNKNOWN_MASK 0xffffffff + +#endif diff --git a/src/gallium/drivers/nv50/nv50_transfer.c b/src/gallium/drivers/nv50/nv50_transfer.c new file mode 100644 index 0000000000..a00c999510 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_transfer.c @@ -0,0 +1,216 @@ + +#include "pipe/p_context.h" +#include "pipe/p_inlines.h" + +#include "nv50_context.h" + +struct nv50_transfer { + struct pipe_transfer base; + struct pipe_buffer *buffer; + struct nv50_miptree_level *level; + int level_pitch; + int level_width; + int level_height; + int level_x; + int level_y; +}; + +static void +nv50_transfer_rect_m2mf(struct pipe_screen *pscreen, struct pipe_buffer *src, + int src_pitch, int sx, int sy, int sw, int sh, + struct pipe_buffer *dst, int dst_pitch, int dx, int dy, + int dw, int dh, int cpp, int width, int height, + unsigned src_reloc, unsigned dst_reloc) +{ + struct nv50_screen *screen = nv50_screen(pscreen); + struct nouveau_winsys *nvws = screen->nvws; + struct nouveau_channel *chan = nvws->channel; + struct nouveau_grobj *m2mf = screen->m2mf; + struct nouveau_bo *src_bo = nvws->get_bo(src); + struct nouveau_bo *dst_bo = nvws->get_bo(dst); + unsigned src_offset = 0, dst_offset = 0; + + src_reloc |= NOUVEAU_BO_RD; + dst_reloc |= NOUVEAU_BO_WR; + + WAIT_RING (chan, 14); + + if (!src_bo->tiled) { + BEGIN_RING(chan, m2mf, 0x0200, 1); + OUT_RING (chan, 1); + BEGIN_RING(chan, m2mf, 0x0314, 1); + OUT_RING (chan, src_pitch); + src_offset = (sy * src_pitch) + (sx * cpp); + } else { + BEGIN_RING(chan, m2mf, 0x0200, 6); + OUT_RING (chan, 0); + OUT_RING (chan, 0); + OUT_RING (chan, sw * cpp); + OUT_RING (chan, sh); + OUT_RING (chan, 1); + OUT_RING (chan, 0); + } + + if (!dst_bo->tiled) { + BEGIN_RING(chan, m2mf, 0x021c, 1); + OUT_RING (chan, 1); + BEGIN_RING(chan, m2mf, 0x0318, 1); + OUT_RING (chan, dst_pitch); + dst_offset = (dy * dst_pitch) + (dx * cpp); + } else { + BEGIN_RING(chan, m2mf, 0x021c, 6); + OUT_RING (chan, 0); + OUT_RING (chan, 0); + OUT_RING (chan, dw * cpp); + OUT_RING (chan, dh); + OUT_RING (chan, 1); + OUT_RING (chan, 0); + } + + while (height) { + int line_count = height > 2047 ? 
2047 : height; + + WAIT_RING (chan, 15); + BEGIN_RING(chan, m2mf, 0x0238, 2); + OUT_RELOCh(chan, src_bo, src_offset, src_reloc); + OUT_RELOCh(chan, dst_bo, dst_offset, dst_reloc); + BEGIN_RING(chan, m2mf, 0x030c, 2); + OUT_RELOCl(chan, src_bo, src_offset, src_reloc); + OUT_RELOCl(chan, dst_bo, dst_offset, dst_reloc); + if (src_bo->tiled) { + BEGIN_RING(chan, m2mf, 0x0218, 1); + OUT_RING (chan, (dy << 16) | sx); + } else { + src_offset += (line_count * src_pitch); + } + if (dst_bo->tiled) { + BEGIN_RING(chan, m2mf, 0x0234, 1); + OUT_RING (chan, (sy << 16) | dx); + } else { + dst_offset += (line_count * dst_pitch); + } + BEGIN_RING(chan, m2mf, 0x031c, 4); + OUT_RING (chan, width * cpp); + OUT_RING (chan, line_count); + OUT_RING (chan, 0x00000101); + OUT_RING (chan, 0); + FIRE_RING (chan); + + height -= line_count; + sy += line_count; + dy += line_count; + } +} + +static struct pipe_transfer * +nv50_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt, + unsigned face, unsigned level, unsigned zslice, + enum pipe_transfer_usage usage, + unsigned x, unsigned y, unsigned w, unsigned h) +{ + struct nv50_miptree *mt = nv50_miptree(pt); + struct nv50_miptree_level *lvl = &mt->level[level]; + struct nv50_transfer *tx; + unsigned image = 0; + + if (pt->target == PIPE_TEXTURE_CUBE) + image = face; + else + if (pt->target == PIPE_TEXTURE_3D) + image = zslice; + + tx = CALLOC_STRUCT(nv50_transfer); + if (!tx) + return NULL; + + tx->base.refcount = 1; + pipe_texture_reference(&tx->base.texture, pt); + tx->base.format = pt->format; + tx->base.width = w; + tx->base.height = h; + tx->base.block = pt->block; + tx->base.nblocksx = pt->nblocksx[level]; + tx->base.nblocksy = pt->nblocksy[level]; + tx->base.stride = (w * pt->block.size); + tx->base.usage = usage; + + tx->level = lvl; + tx->level_pitch = lvl->pitch; + tx->level_width = mt->base.width[level]; + tx->level_height = mt->base.height[level]; + tx->level_x = x; + tx->level_y = y; + tx->buffer = + pipe_buffer_create(pscreen, 0, NOUVEAU_BUFFER_USAGE_TRANSFER, + w * tx->base.block.size * h); + + if (usage != PIPE_TRANSFER_WRITE) { + nv50_transfer_rect_m2mf(pscreen, mt->buffer, tx->level_pitch, + x, y, tx->level_width, tx->level_height, + tx->buffer, tx->base.stride, 0, 0, + tx->base.width, tx->base.height, + tx->base.block.size, w, h, + NOUVEAU_BO_VRAM | NOUVEAU_BO_GART, + NOUVEAU_BO_GART); + } + + return &tx->base; +} + +static void +nv50_transfer_del(struct pipe_screen *pscreen, struct pipe_transfer **pptx) +{ + struct pipe_transfer *ptx = *pptx; + struct nv50_transfer *tx = (struct nv50_transfer *)ptx; + struct nv50_miptree *mt = nv50_miptree(ptx->texture); + + *pptx = NULL; + if (--ptx->refcount) + return; + + if (ptx->usage != PIPE_TRANSFER_READ) { + nv50_transfer_rect_m2mf(pscreen, tx->buffer, tx->base.stride, + 0, 0, tx->base.width, tx->base.height, + mt->buffer, tx->level_pitch, + tx->level_x, tx->level_y, + tx->level_width, tx->level_height, + tx->base.block.size, tx->base.width, + tx->base.height, NOUVEAU_BO_GART, + NOUVEAU_BO_VRAM | NOUVEAU_BO_GART); + } + + pipe_buffer_reference(pscreen, &tx->buffer, NULL); + pipe_texture_reference(&ptx->texture, NULL); + FREE(ptx); +} + +static void * +nv50_transfer_map(struct pipe_screen *pscreen, struct pipe_transfer *ptx) +{ + struct nv50_transfer *tx = (struct nv50_transfer *)ptx; + unsigned flags = 0; + + if (ptx->usage & PIPE_TRANSFER_WRITE) + flags |= PIPE_BUFFER_USAGE_CPU_WRITE; + if (ptx->usage & PIPE_TRANSFER_READ) + flags |= PIPE_BUFFER_USAGE_CPU_READ; + + return 
pipe_buffer_map(pscreen, tx->buffer, flags); +} + +static void +nv50_transfer_unmap(struct pipe_screen *pscreen, struct pipe_transfer *ptx) +{ + struct nv50_transfer *tx = (struct nv50_transfer *)ptx; + + pipe_buffer_unmap(pscreen, tx->buffer); +} + +void +nv50_transfer_init_screen_functions(struct pipe_screen *pscreen) +{ + pscreen->get_tex_transfer = nv50_transfer_new; + pscreen->tex_transfer_release = nv50_transfer_del; + pscreen->transfer_map = nv50_transfer_map; + pscreen->transfer_unmap = nv50_transfer_unmap; +} diff --git a/src/gallium/drivers/nv50/nv50_vbo.c b/src/gallium/drivers/nv50/nv50_vbo.c new file mode 100644 index 0000000000..08d751dddb --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_vbo.c @@ -0,0 +1,254 @@ +/* + * Copyright 2008 Ben Skeggs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "pipe/p_context.h" +#include "pipe/p_state.h" + +#include "nv50_context.h" + +static INLINE unsigned +nv50_prim(unsigned mode) +{ + switch (mode) { + case PIPE_PRIM_POINTS: return NV50TCL_VERTEX_BEGIN_POINTS; + case PIPE_PRIM_LINES: return NV50TCL_VERTEX_BEGIN_LINES; + case PIPE_PRIM_LINE_LOOP: return NV50TCL_VERTEX_BEGIN_LINE_LOOP; + case PIPE_PRIM_LINE_STRIP: return NV50TCL_VERTEX_BEGIN_LINE_STRIP; + case PIPE_PRIM_TRIANGLES: return NV50TCL_VERTEX_BEGIN_TRIANGLES; + case PIPE_PRIM_TRIANGLE_STRIP: + return NV50TCL_VERTEX_BEGIN_TRIANGLE_STRIP; + case PIPE_PRIM_TRIANGLE_FAN: return NV50TCL_VERTEX_BEGIN_TRIANGLE_FAN; + case PIPE_PRIM_QUADS: return NV50TCL_VERTEX_BEGIN_QUADS; + case PIPE_PRIM_QUAD_STRIP: return NV50TCL_VERTEX_BEGIN_QUAD_STRIP; + case PIPE_PRIM_POLYGON: return NV50TCL_VERTEX_BEGIN_POLYGON; + default: + break; + } + + NOUVEAU_ERR("invalid primitive type %d\n", mode); + return NV50TCL_VERTEX_BEGIN_POINTS; +} + +boolean +nv50_draw_arrays(struct pipe_context *pipe, unsigned mode, unsigned start, + unsigned count) +{ + struct nv50_context *nv50 = nv50_context(pipe); + struct nouveau_channel *chan = nv50->screen->nvws->channel; + struct nouveau_grobj *tesla = nv50->screen->tesla; + + nv50_state_validate(nv50); + + BEGIN_RING(chan, tesla, 0x142c, 1); + OUT_RING (chan, 0); + BEGIN_RING(chan, tesla, 0x142c, 1); + OUT_RING (chan, 0); + BEGIN_RING(chan, tesla, 0x1440, 1); + OUT_RING (chan, 0); + BEGIN_RING(chan, tesla, 0x1334, 1); + OUT_RING (chan, 0); + + BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BEGIN, 1); + OUT_RING (chan, nv50_prim(mode)); + BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BUFFER_FIRST, 2); + OUT_RING (chan, start); + OUT_RING (chan, count); + BEGIN_RING(chan, tesla, NV50TCL_VERTEX_END, 1); + OUT_RING (chan, 0); + + pipe->flush(pipe, 0, NULL); + return TRUE; +} + +static INLINE void +nv50_draw_elements_inline_u08(struct nv50_context *nv50, uint8_t *map, + unsigned start, unsigned count) +{ + struct nouveau_channel *chan = nv50->screen->nvws->channel; + struct nouveau_grobj *tesla = nv50->screen->tesla; + + map += start; + + if (count & 1) { + BEGIN_RING(chan, tesla, 0x15e8, 1); + OUT_RING (chan, map[0]); + map++; + count--; + } + + while (count) { + unsigned nr = count > 2046 ? 2046 : count; + int i; + + BEGIN_RING(chan, tesla, 0x400015f0, nr >> 1); + for (i = 0; i < nr; i += 2) + OUT_RING (chan, (map[1] << 16) | map[0]); + + count -= nr; + map += nr; + } +} + +static INLINE void +nv50_draw_elements_inline_u16(struct nv50_context *nv50, uint16_t *map, + unsigned start, unsigned count) +{ + struct nouveau_channel *chan = nv50->screen->nvws->channel; + struct nouveau_grobj *tesla = nv50->screen->tesla; + + map += start; + + if (count & 1) { + BEGIN_RING(chan, tesla, 0x15e8, 1); + OUT_RING (chan, map[0]); + map++; + count--; + } + + while (count) { + unsigned nr = count > 2046 ? 2046 : count; + int i; + + BEGIN_RING(chan, tesla, 0x400015f0, nr >> 1); + for (i = 0; i < nr; i += 2) + OUT_RING (chan, (map[1] << 16) | map[0]); + + count -= nr; + map += nr; + } +} + +static INLINE void +nv50_draw_elements_inline_u32(struct nv50_context *nv50, uint8_t *map, + unsigned start, unsigned count) +{ + struct nouveau_channel *chan = nv50->screen->nvws->channel; + struct nouveau_grobj *tesla = nv50->screen->tesla; + + map += start; + + while (count) { + unsigned nr = count > 2047 ? 
2047 : count; + + BEGIN_RING(chan, tesla, 0x400015e8, nr); + OUT_RINGp (chan, map, nr); + + count -= nr; + map += nr; + } +} + +boolean +nv50_draw_elements(struct pipe_context *pipe, + struct pipe_buffer *indexBuffer, unsigned indexSize, + unsigned mode, unsigned start, unsigned count) +{ + struct nv50_context *nv50 = nv50_context(pipe); + struct nouveau_channel *chan = nv50->screen->nvws->channel; + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct pipe_winsys *ws = pipe->winsys; + void *map = ws->buffer_map(ws, indexBuffer, PIPE_BUFFER_USAGE_CPU_READ); + + nv50_state_validate(nv50); + + BEGIN_RING(chan, tesla, 0x142c, 1); + OUT_RING (chan, 0); + BEGIN_RING(chan, tesla, 0x142c, 1); + OUT_RING (chan, 0); + + BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BEGIN, 1); + OUT_RING (chan, nv50_prim(mode)); + switch (indexSize) { + case 1: + nv50_draw_elements_inline_u08(nv50, map, start, count); + break; + case 2: + nv50_draw_elements_inline_u16(nv50, map, start, count); + break; + case 4: + nv50_draw_elements_inline_u32(nv50, map, start, count); + break; + default: + assert(0); + } + BEGIN_RING(chan, tesla, NV50TCL_VERTEX_END, 1); + OUT_RING (chan, 0); + + pipe->flush(pipe, 0, NULL); + return TRUE; +} + +void +nv50_vbo_validate(struct nv50_context *nv50) +{ + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nouveau_stateobj *vtxbuf, *vtxfmt; + int i; + + vtxbuf = so_new(nv50->vtxelt_nr * 4, nv50->vtxelt_nr * 2); + vtxfmt = so_new(nv50->vtxelt_nr + 1, 0); + so_method(vtxfmt, tesla, 0x1ac0, nv50->vtxelt_nr); + + for (i = 0; i < nv50->vtxelt_nr; i++) { + struct pipe_vertex_element *ve = &nv50->vtxelt[i]; + struct pipe_vertex_buffer *vb = + &nv50->vtxbuf[ve->vertex_buffer_index]; + + switch (ve->src_format) { + case PIPE_FORMAT_R32G32B32A32_FLOAT: + so_data(vtxfmt, 0x7e080000 | i); + break; + case PIPE_FORMAT_R32G32B32_FLOAT: + so_data(vtxfmt, 0x7e100000 | i); + break; + case PIPE_FORMAT_R32G32_FLOAT: + so_data(vtxfmt, 0x7e200000 | i); + break; + case PIPE_FORMAT_R32_FLOAT: + so_data(vtxfmt, 0x7e900000 | i); + break; + case PIPE_FORMAT_R8G8B8A8_UNORM: + so_data(vtxfmt, 0x24500000 | i); + break; + default: + { + NOUVEAU_ERR("invalid vbo format %s\n", + pf_name(ve->src_format)); + assert(0); + return; + } + } + + so_method(vtxbuf, tesla, 0x900 + (i * 16), 3); + so_data (vtxbuf, 0x20000000 | vb->stride); + so_reloc (vtxbuf, vb->buffer, vb->buffer_offset + + ve->src_offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | + NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0); + so_reloc (vtxbuf, vb->buffer, vb->buffer_offset + + ve->src_offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | + NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0); + } + + so_ref (vtxfmt, &nv50->state.vtxfmt); + so_ref (vtxbuf, &nv50->state.vtxbuf); +} + diff --git a/src/gallium/drivers/r300/Makefile b/src/gallium/drivers/r300/Makefile new file mode 100644 index 0000000000..9b7524b523 --- /dev/null +++ b/src/gallium/drivers/r300/Makefile @@ -0,0 +1,20 @@ +TOP = ../../../.. 
+include $(TOP)/configs/current + +LIBNAME = r300 + +C_SOURCES = \ + r300_chipset.c \ + r300_clear.c \ + r300_context.c \ + r300_emit.c \ + r300_flush.c \ + r300_screen.c \ + r300_state.c \ + r300_state_derived.c \ + r300_state_shader.c \ + r300_surface.c \ + r300_swtcl_emit.c \ + r300_texture.c + +include ../../Makefile.template diff --git a/src/gallium/drivers/r300/SConscript b/src/gallium/drivers/r300/SConscript new file mode 100644 index 0000000000..18684c3e7f --- /dev/null +++ b/src/gallium/drivers/r300/SConscript @@ -0,0 +1,17 @@ +Import('*') + +env = env.Clone() + +r300 = env.ConvenienceLibrary( + target = 'r300', + source = [ + 'r300_blit.c', + 'r300_clear.c', + 'r300_context.c', + 'r300_screen.c', + 'r300_state.c', + 'r300_surface.c', + ]) + +Export('r300') + diff --git a/src/gallium/drivers/r300/r300_chipset.c b/src/gallium/drivers/r300/r300_chipset.c new file mode 100644 index 0000000000..196537a432 --- /dev/null +++ b/src/gallium/drivers/r300/r300_chipset.c @@ -0,0 +1,348 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +#include "r300_chipset.h" +#include "util/u_debug.h" + +/* r300_chipset: A file all to itself for deducing the various properties of + * Radeons. */ + +/* Parse a PCI ID and fill an r300_capabilities struct with information. */ +void r300_parse_chipset(struct r300_capabilities* caps) +{ + /* Reasonable defaults */ + caps->has_tcl = TRUE; + caps->is_r500 = FALSE; + caps->num_vert_fpus = 4; + + + /* Note: These are not ordered by PCI ID. I leave that task to GCC, + * which will perform the ordering while collating jump tables. Instead, + * I've tried to group them according to capabilities and age. 
*/ + switch (caps->pci_id) { + case 0x4144: + caps->family = CHIP_FAMILY_R300; + break; + + case 0x4145: + case 0x4146: + case 0x4147: + case 0x4E44: + case 0x4E45: + case 0x4E46: + case 0x4E47: + caps->family = CHIP_FAMILY_R300; + break; + + case 0x4150: + case 0x4151: + case 0x4152: + case 0x4153: + case 0x4154: + case 0x4155: + case 0x4156: + case 0x4E50: + case 0x4E51: + case 0x4E52: + case 0x4E53: + case 0x4E54: + case 0x4E56: + caps->family = CHIP_FAMILY_RV350; + break; + + case 0x4148: + case 0x4149: + case 0x414A: + case 0x414B: + case 0x4E48: + case 0x4E49: + case 0x4E4B: + caps->family = CHIP_FAMILY_R350; + break; + + case 0x4E4A: + caps->family = CHIP_FAMILY_R360; + break; + + case 0x5460: + case 0x5462: + case 0x5464: + case 0x5B60: + case 0x5B62: + case 0x5B63: + case 0x5B64: + case 0x5B65: + caps->family = CHIP_FAMILY_RV370; + break; + + case 0x3150: + case 0x3152: + case 0x3154: + case 0x3E50: + case 0x3E54: + caps->family = CHIP_FAMILY_RV380; + break; + + case 0x4A48: + case 0x4A49: + case 0x4A4A: + case 0x4A4B: + case 0x4A4C: + case 0x4A4D: + case 0x4A4E: + case 0x4A4F: + case 0x4A50: + case 0x4A54: + caps->family = CHIP_FAMILY_R420; + caps->num_vert_fpus = 6; + break; + + case 0x5548: + case 0x5549: + case 0x554A: + case 0x554B: + case 0x5550: + case 0x5551: + case 0x5552: + case 0x5554: + case 0x5D57: + caps->family = CHIP_FAMILY_R423; + caps->num_vert_fpus = 6; + break; + + case 0x554C: + case 0x554D: + case 0x554E: + case 0x554F: + case 0x5D48: + case 0x5D49: + case 0x5D4A: + caps->family = CHIP_FAMILY_R430; + caps->num_vert_fpus = 6; + break; + + case 0x5D4C: + case 0x5D4D: + case 0x5D4E: + case 0x5D4F: + case 0x5D50: + case 0x5D52: + caps->family = CHIP_FAMILY_R480; + caps->num_vert_fpus = 6; + break; + + case 0x4B49: + case 0x4B4A: + case 0x4B4B: + case 0x4B4C: + caps->family = CHIP_FAMILY_R481; + caps->num_vert_fpus = 6; + break; + + case 0x5E4C: + case 0x5E4F: + case 0x564A: + case 0x564B: + case 0x564F: + case 0x5652: + case 0x5653: + case 0x5657: + case 0x5E48: + case 0x5E4A: + case 0x5E4B: + case 0x5E4D: + caps->family = CHIP_FAMILY_RV410; + caps->num_vert_fpus = 6; + break; + + case 0x5954: + case 0x5955: + caps->family = CHIP_FAMILY_RS480; + caps->has_tcl = FALSE; + break; + + case 0x5974: + case 0x5975: + caps->family = CHIP_FAMILY_RS482; + caps->has_tcl = FALSE; + break; + + case 0x5A41: + case 0x5A42: + caps->family = CHIP_FAMILY_RS400; + caps->has_tcl = FALSE; + break; + + case 0x5A61: + case 0x5A62: + caps->family = CHIP_FAMILY_RC410; + caps->has_tcl = FALSE; + break; + + case 0x791E: + case 0x791F: + caps->family = CHIP_FAMILY_RS690; + caps->has_tcl = FALSE; + break; + + case 0x796C: + case 0x796D: + case 0x796E: + case 0x796F: + caps->family = CHIP_FAMILY_RS740; + caps->has_tcl = FALSE; + break; + + case 0x7100: + case 0x7101: + case 0x7102: + case 0x7103: + case 0x7104: + case 0x7105: + case 0x7106: + case 0x7108: + case 0x7109: + case 0x710A: + case 0x710B: + case 0x710C: + case 0x710E: + case 0x710F: + caps->family = CHIP_FAMILY_R520; + caps->num_vert_fpus = 8; + caps->is_r500 = TRUE; + break; + + case 0x7140: + case 0x7141: + case 0x7142: + case 0x7143: + case 0x7144: + case 0x7145: + case 0x7146: + case 0x7147: + case 0x7149: + case 0x714A: + case 0x714B: + case 0x714C: + case 0x714D: + case 0x714E: + case 0x714F: + case 0x7151: + case 0x7152: + case 0x7153: + case 0x715E: + case 0x715F: + case 0x7180: + case 0x7181: + case 0x7183: + case 0x7186: + case 0x7187: + case 0x7188: + case 0x718A: + case 0x718B: + case 0x718C: + case 0x718D: + case 
0x718F: + case 0x7193: + case 0x7196: + case 0x719B: + case 0x719F: + case 0x7200: + case 0x7210: + case 0x7211: + caps->family = CHIP_FAMILY_RV515; + caps->num_vert_fpus = 2; + caps->is_r500 = TRUE; + break; + + case 0x71C0: + case 0x71C1: + case 0x71C2: + case 0x71C3: + case 0x71C4: + case 0x71C5: + case 0x71C6: + case 0x71C7: + case 0x71CD: + case 0x71CE: + case 0x71D2: + case 0x71D4: + case 0x71D5: + case 0x71D6: + case 0x71DA: + case 0x71DE: + caps->family = CHIP_FAMILY_RV530; + caps->num_vert_fpus = 5; + caps->is_r500 = TRUE; + break; + + case 0x7240: + case 0x7243: + case 0x7244: + case 0x7245: + case 0x7246: + case 0x7247: + case 0x7248: + case 0x7249: + case 0x724A: + case 0x724B: + case 0x724C: + case 0x724D: + case 0x724E: + case 0x724F: + case 0x7284: + caps->family = CHIP_FAMILY_R580; + caps->num_vert_fpus = 8; + caps->is_r500 = TRUE; + break; + + case 0x7280: + caps->family = CHIP_FAMILY_RV570; + caps->num_vert_fpus = 5; + caps->is_r500 = TRUE; + break; + + case 0x7281: + case 0x7283: + case 0x7287: + case 0x7288: + case 0x7289: + case 0x728B: + case 0x728C: + case 0x7290: + case 0x7291: + case 0x7293: + case 0x7297: + caps->family = CHIP_FAMILY_RV560; + caps->num_vert_fpus = 5; + caps->is_r500 = TRUE; + break; + + default: + debug_printf("r300: Warning: Unknown chipset 0x%x\n", + caps->pci_id); + break; + } + + /* XXX SW TCL is broken so no forcing it off right now + caps->has_tcl = FALSE; */ +} diff --git a/src/gallium/drivers/r300/r300_chipset.h b/src/gallium/drivers/r300/r300_chipset.h new file mode 100644 index 0000000000..a9cd372ec5 --- /dev/null +++ b/src/gallium/drivers/r300/r300_chipset.h @@ -0,0 +1,79 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +#ifndef R300_CHIPSET_H +#define R300_CHIPSET_H + +#include "pipe/p_compiler.h" + +/* Structure containing all the possible information about a specific Radeon + * in the R3xx, R4xx, and R5xx families. 
*/ +struct r300_capabilities { + /* PCI ID */ + uint32_t pci_id; + /* Chipset family */ + int family; + /* The number of vertex floating-point units */ + int num_vert_fpus; + /* The number of fragment pipes */ + int num_frag_pipes; + /* Whether or not TCL is physically present */ + boolean has_tcl; + /* Whether or not this is an RV515 or newer; R500s have many differences + * that require extra consideration, compared to their R3xx cousins: + * - Extra bit of width and height on texture sizes + * - Blend color is split across two registers + * - Universal Shader (US) block used for fragment shaders */ + boolean is_r500; +}; + +/* Enumerations for legibility and telling which card we're running on. */ +enum { + CHIP_FAMILY_R300 = 0, + CHIP_FAMILY_R350, + CHIP_FAMILY_R360, + CHIP_FAMILY_RV350, + CHIP_FAMILY_RV370, + CHIP_FAMILY_RV380, + CHIP_FAMILY_R420, + CHIP_FAMILY_R423, + CHIP_FAMILY_R430, + CHIP_FAMILY_R480, + CHIP_FAMILY_R481, + CHIP_FAMILY_RV410, + CHIP_FAMILY_RS400, + CHIP_FAMILY_RC410, + CHIP_FAMILY_RS480, + CHIP_FAMILY_RS482, + CHIP_FAMILY_RS690, + CHIP_FAMILY_RS740, + CHIP_FAMILY_RV515, + CHIP_FAMILY_R520, + CHIP_FAMILY_RV530, + CHIP_FAMILY_R580, + CHIP_FAMILY_RV560, + CHIP_FAMILY_RV570 +}; + +void r300_parse_chipset(struct r300_capabilities* caps); + +#endif /* R300_CHIPSET_H */ diff --git a/src/gallium/drivers/r300/r300_clear.c b/src/gallium/drivers/r300/r300_clear.c new file mode 100644 index 0000000000..fd28437aaa --- /dev/null +++ b/src/gallium/drivers/r300/r300_clear.c @@ -0,0 +1,33 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +#include "r300_clear.h" + +/* This gets its own file because Intel's is in its own file. + * I assume there's a good reason. */ +void r300_clear(struct pipe_context* pipe, + struct pipe_surface* ps, + unsigned color) +{ + pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, color); + ps->status = PIPE_SURFACE_STATUS_DEFINED; +}
\ No newline at end of file diff --git a/src/gallium/drivers/r300/r300_clear.h b/src/gallium/drivers/r300/r300_clear.h new file mode 100644 index 0000000000..e24a0690c9 --- /dev/null +++ b/src/gallium/drivers/r300/r300_clear.h @@ -0,0 +1,27 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +#include "pipe/p_context.h" + +void r300_clear(struct pipe_context* pipe, + struct pipe_surface* ps, + unsigned color); diff --git a/src/gallium/drivers/r300/r300_context.c b/src/gallium/drivers/r300/r300_context.c new file mode 100644 index 0000000000..15a8751549 --- /dev/null +++ b/src/gallium/drivers/r300/r300_context.c @@ -0,0 +1,141 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ + +#include "r300_context.h" + +static boolean r300_draw_range_elements(struct pipe_context* pipe, + struct pipe_buffer* indexBuffer, + unsigned indexSize, + unsigned minIndex, + unsigned maxIndex, + unsigned mode, + unsigned start, + unsigned count) +{ + struct r300_context* r300 = r300_context(pipe); + int i; + + if (r300->dirty_state) { + r300_emit_dirty_state(r300); + } + + for (i = 0; i < r300->vertex_buffer_count; i++) { + void* buf = pipe_buffer_map(pipe->screen, + r300->vertex_buffers[i].buffer, + PIPE_BUFFER_USAGE_CPU_READ); + draw_set_mapped_vertex_buffer(r300->draw, i, buf); + } + + if (indexBuffer) { + void* indices = pipe_buffer_map(pipe->screen, indexBuffer, + PIPE_BUFFER_USAGE_CPU_READ); + draw_set_mapped_element_buffer_range(r300->draw, indexSize, + minIndex, maxIndex, indices); + } else { + draw_set_mapped_element_buffer(r300->draw, 0, NULL); + } + + draw_set_mapped_constant_buffer(r300->draw, + r300->shader_constants[PIPE_SHADER_VERTEX].constants, + r300->shader_constants[PIPE_SHADER_VERTEX].user_count * + (sizeof(float) * 4)); + + /* Abandon all hope, ye who enter here. */ + draw_arrays(r300->draw, mode, start, count); + + for (i = 0; i < r300->vertex_buffer_count; i++) { + pipe_buffer_unmap(pipe->screen, r300->vertex_buffers[i].buffer); + draw_set_mapped_vertex_buffer(r300->draw, i, NULL); + } + + if (indexBuffer) { + pipe_buffer_unmap(pipe->screen, indexBuffer); + draw_set_mapped_element_buffer_range(r300->draw, 0, start, + start + count - 1, NULL); + } + + return true; +} + +static boolean r300_draw_elements(struct pipe_context* pipe, + struct pipe_buffer* indexBuffer, + unsigned indexSize, unsigned mode, + unsigned start, unsigned count) +{ + return r300_draw_range_elements(pipe, indexBuffer, indexSize, 0, ~0, + mode, start, count); +} + +static boolean r300_draw_arrays(struct pipe_context* pipe, unsigned mode, + unsigned start, unsigned count) +{ + return r300_draw_elements(pipe, NULL, 0, mode, start, count); +} + +static void r300_destroy_context(struct pipe_context* context) { + struct r300_context* r300 = r300_context(context); + + draw_destroy(r300->draw); + + FREE(r300->blend_color_state); + FREE(r300->scissor_state); + FREE(r300); +} + +struct pipe_context* r300_create_context(struct pipe_screen* screen, + struct pipe_winsys* winsys, + struct r300_winsys* r300_winsys) +{ + struct r300_context* r300 = CALLOC_STRUCT(r300_context); + + if (!r300) + return NULL; + + r300->winsys = r300_winsys; + r300->context.winsys = winsys; + r300->context.screen = r300_create_screen(winsys, r300_winsys); + + r300->context.destroy = r300_destroy_context; + + r300->context.clear = r300_clear; + + r300->context.draw_arrays = r300_draw_arrays; + r300->context.draw_elements = r300_draw_elements; + r300->context.draw_range_elements = r300_draw_range_elements; + + r300->draw = draw_create(); + draw_set_rasterize_stage(r300->draw, r300_draw_swtcl_stage(r300)); + + r300->blend_color_state = CALLOC_STRUCT(r300_blend_color_state); + r300->scissor_state = CALLOC_STRUCT(r300_scissor_state); + + r300_init_flush_functions(r300); + + r300_init_surface_functions(r300); + + r300_init_state_functions(r300); + + r300->dirty_state = R300_NEW_KITCHEN_SINK; + r300->dirty_hw++; + + return &r300->context; +} diff --git a/src/gallium/drivers/r300/r300_context.h b/src/gallium/drivers/r300/r300_context.h new file mode 100644 index 0000000000..aaab1dd2bc --- /dev/null +++ b/src/gallium/drivers/r300/r300_context.h @@ -0,0 +1,265 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * 
Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +#ifndef R300_CONTEXT_H +#define R300_CONTEXT_H + +#include "draw/draw_context.h" +#include "draw/draw_vertex.h" +#include "pipe/p_context.h" +#include "tgsi/tgsi_scan.h" +#include "util/u_memory.h" + +#include "r300_clear.h" +#include "r300_screen.h" +#include "r300_winsys.h" + +struct r300_blend_state { + uint32_t blend_control; /* R300_RB3D_CBLEND: 0x4e04 */ + uint32_t alpha_blend_control; /* R300_RB3D_ABLEND: 0x4e08 */ + uint32_t rop; /* R300_RB3D_ROPCNTL: 0x4e18 */ + uint32_t dither; /* R300_RB3D_DITHER_CTL: 0x4e50 */ +}; + +struct r300_blend_color_state { + /* RV515 and earlier */ + uint32_t blend_color; /* R300_RB3D_BLEND_COLOR: 0x4e10 */ + /* R520 and newer */ + uint32_t blend_color_red_alpha; /* R500_RB3D_CONSTANT_COLOR_AR: 0x4ef8 */ + uint32_t blend_color_green_blue; /* R500_RB3D_CONSTANT_COLOR_GB: 0x4efc */ +}; + +struct r300_dsa_state { + uint32_t alpha_function; /* R300_FG_ALPHA_FUNC: 0x4bd4 */ + uint32_t alpha_reference; /* R500_FG_ALPHA_VALUE: 0x4be0 */ + uint32_t z_buffer_control; /* R300_ZB_CNTL: 0x4f00 */ + uint32_t z_stencil_control; /* R300_ZB_ZSTENCILCNTL: 0x4f04 */ + uint32_t stencil_ref_mask; /* R300_ZB_STENCILREFMASK: 0x4f08 */ + uint32_t z_buffer_top; /* R300_ZB_ZTOP: 0x4f14 */ + uint32_t stencil_ref_bf; /* R500_ZB_STENCILREFMASK_BF: 0x4fd4 */ +}; + +struct r300_rs_state { + /* XXX icky as fucking hell */ + struct pipe_rasterizer_state rs; + + uint32_t vap_control_status; /* R300_VAP_CNTL_STATUS: 0x2140 */ + uint32_t point_size; /* R300_GA_POINT_SIZE: 0x421c */ + uint32_t line_control; /* R300_GA_LINE_CNTL: 0x4234 */ + uint32_t depth_scale_front; /* R300_SU_POLY_OFFSET_FRONT_SCALE: 0x42a4 */ + uint32_t depth_offset_front;/* R300_SU_POLY_OFFSET_FRONT_OFFSET: 0x42a8 */ + uint32_t depth_scale_back; /* R300_SU_POLY_OFFSET_BACK_SCALE: 0x42ac */ + uint32_t depth_offset_back; /* R300_SU_POLY_OFFSET_BACK_OFFSET: 0x42b0 */ + uint32_t polygon_offset_enable; /* R300_SU_POLY_OFFSET_ENABLE: 0x42b4 */ + uint32_t cull_mode; /* R300_SU_CULL_MODE: 0x42b8 */ + uint32_t line_stipple_config; /* R300_GA_LINE_STIPPLE_CONFIG: 0x4328 */ + uint32_t line_stipple_value; /* R300_GA_LINE_STIPPLE_VALUE: 0x4260 */ +}; + +struct r300_sampler_state { + uint32_t filter0; /* R300_TX_FILTER0: 0x4400 */ + uint32_t filter1; /* R300_TX_FILTER1: 0x4440 */ + uint32_t border_color; /* R300_TX_BORDER_COLOR: 0x45c0 */ +}; + +struct r300_scissor_state { + uint32_t scissor_top_left; /* 
R300_SC_SCISSORS_TL: 0x43e0 */ + uint32_t scissor_bottom_right; /* R300_SC_SCISSORS_BR: 0x43e4 */ +}; + +struct r300_texture_state { +}; + +#define R300_NEW_BLEND 0x0000001 +#define R300_NEW_BLEND_COLOR 0x0000002 +#define R300_NEW_CONSTANTS 0x0000004 +#define R300_NEW_DSA 0x0000008 +#define R300_NEW_FRAMEBUFFERS 0x0000010 +#define R300_NEW_FRAGMENT_SHADER 0x0000020 +#define R300_NEW_RASTERIZER 0x0000040 +#define R300_NEW_SAMPLER 0x0000080 +#define R300_NEW_SCISSOR 0x0008000 +#define R300_NEW_TEXTURE 0x0010000 +#define R300_NEW_VERTEX_FORMAT 0x1000000 +#define R300_NEW_VERTEX_SHADER 0x2000000 +#define R300_NEW_KITCHEN_SINK 0x3ffffff + +/* The next several objects are not pure Radeon state; they inherit from + * various Gallium classes. */ + +struct r300_constant_buffer { + /* Buffer of constants */ + /* XXX first number should be raised */ + float constants[8][4]; + /* Number of user-defined constants */ + int user_count; + /* Total number of constants */ + int count; +}; + +struct r3xx_fragment_shader { + /* Parent class */ + struct pipe_shader_state state; + struct tgsi_shader_info info; + + /* Has this shader been translated yet? */ + boolean translated; + + /* Pixel stack size */ + int stack_size; +}; + +struct r300_fragment_shader { + /* Parent class */ + struct r3xx_fragment_shader shader; + + /* Number of ALU instructions */ + int alu_instruction_count; + + /* Number of texture instructions */ + int tex_instruction_count; + + /* Number of texture indirections */ + int indirections; + + /* Indirection node offsets */ + int offset0; + int offset1; + int offset2; + int offset3; + + /* Machine instructions */ + struct { + uint32_t alu_rgb_inst; + uint32_t alu_rgb_addr; + uint32_t alu_alpha_inst; + uint32_t alu_alpha_addr; + } instructions[64]; /* XXX magic num */ +}; + +struct r500_fragment_shader { + /* Parent class */ + struct r3xx_fragment_shader shader; + + /* Number of used instructions */ + int instruction_count; + + /* Machine instructions */ + struct { + uint32_t inst0; + uint32_t inst1; + uint32_t inst2; + uint32_t inst3; + uint32_t inst4; + uint32_t inst5; + } instructions[256]; /*< XXX magic number */ +}; + +struct r300_texture { + /* Parent class */ + struct pipe_texture tex; + + /* Offsets into the buffer. */ + unsigned offset[PIPE_MAX_TEXTURE_LEVELS]; + + /* Stride (pitch?) of this texture in bytes */ + unsigned stride; + + /* Total size of this texture, in bytes. */ + unsigned size; + + /* Pipe buffer backing this texture. */ + struct pipe_buffer* buffer; +}; + +struct r300_vertex_format { + /* Parent class */ + struct vertex_info vinfo; + /* R300_VAP_PROG_STREAK_CNTL_[0-7] */ + uint32_t vap_prog_stream_cntl[8]; + /* R300_VAP_PROG_STREAK_CNTL_EXT_[0-7] */ + uint32_t vap_prog_stream_cntl_ext[8]; +}; + +struct r300_context { + /* Parent class */ + struct pipe_context context; + + /* The interface to the windowing system, etc. */ + struct r300_winsys* winsys; + /* Draw module. Used mostly for SW TCL. */ + struct draw_context* draw; + + /* Various CSO state objects. */ + /* Blend state. */ + struct r300_blend_state* blend_state; + /* Blend color state. */ + struct r300_blend_color_state* blend_color_state; + /* Shader constants. */ + struct r300_constant_buffer shader_constants[PIPE_SHADER_TYPES]; + /* Depth, stencil, and alpha state. */ + struct r300_dsa_state* dsa_state; + /* Fragment shader. */ + struct r3xx_fragment_shader* fs; + /* Framebuffer state. We currently don't need our own version of this. 
*/ + struct pipe_framebuffer_state framebuffer_state; + /* Rasterizer state. */ + struct r300_rs_state* rs_state; + /* Sampler states. */ + struct r300_sampler_state* sampler_states[8]; + int sampler_count; + /* Scissor state. */ + struct r300_scissor_state* scissor_state; + /* Texture states. */ + struct r300_texture* textures[8]; + struct r300_texture_state* texture_states[8]; + int texture_count; + /* Vertex buffers. */ + struct pipe_vertex_buffer vertex_buffers[PIPE_MAX_ATTRIBS]; + int vertex_buffer_count; + /* Vertex information. */ + struct r300_vertex_format vertex_info; + /* Bitmask of dirty state objects. */ + uint32_t dirty_state; + /* Flag indicating whether or not the HW is dirty. */ + uint32_t dirty_hw; +}; + +/* Convenience cast wrapper. */ +static struct r300_context* r300_context(struct pipe_context* context) { + return (struct r300_context*)context; +} + +/* Context initialization. */ +struct draw_stage* r300_draw_swtcl_stage(struct r300_context* r300); +void r300_init_state_functions(struct r300_context* r300); +void r300_init_surface_functions(struct r300_context* r300); + +/* Fun with includes: r300_winsys also declares this prototype. + * We'll just step out in that case... */ +#ifndef R300_WINSYS_H +struct pipe_context* r300_create_context(struct pipe_screen* screen, + struct pipe_winsys* winsys, + struct r300_winsys* r300_winsys); +#endif + +#endif /* R300_CONTEXT_H */ diff --git a/src/gallium/drivers/r300/r300_cs.h b/src/gallium/drivers/r300/r300_cs.h new file mode 100644 index 0000000000..d8038ff1e1 --- /dev/null +++ b/src/gallium/drivers/r300/r300_cs.h @@ -0,0 +1,123 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +#ifndef R300_CS_H +#define R300_CS_H + +#include "util/u_math.h" + +#include "r300_reg.h" +#include "r300_winsys.h" + +/* Yes, I know macros are ugly. However, they are much prettier than the code + * that they neatly hide away, and don't have the cost of function setup,so + * we're going to use them. 
*/ + +#define MAX_CS_SIZE 64 * 1024 / 4 + +#define VERY_VERBOSE_REGISTERS 0 + +/* XXX stolen from radeon_drm.h */ +#define RADEON_GEM_DOMAIN_CPU 0x1 +#define RADEON_GEM_DOMAIN_GTT 0x2 +#define RADEON_GEM_DOMAIN_VRAM 0x4 + +/* XXX stolen from radeon_reg.h */ +#define RADEON_CP_PACKET0 0x0 + +#define CP_PACKET0(register, count) \ + (RADEON_CP_PACKET0 | ((count) << 16) | ((register) >> 2)) + +#define CP_PACKET3(op, count) \ + (RADEON_CP_PACKET3 | (op) | ((count) << 16)) + +#define CS_LOCALS(context) \ + struct r300_winsys* cs_winsys = context->winsys; \ + struct radeon_cs* cs = cs_winsys->cs; \ + int cs_count = 0; + +#define CHECK_CS(size) \ + cs_winsys->check_cs(cs, (size)) + +#define BEGIN_CS(size) do { \ + CHECK_CS(size); \ + debug_printf("r300: BEGIN_CS, count %d, in %s (%s:%d)\n", \ + size, __FUNCTION__, __FILE__, __LINE__); \ + cs_winsys->begin_cs(cs, (size), __FILE__, __FUNCTION__, __LINE__); \ + cs_count = size; \ +} while (0) + +#define OUT_CS(value) do { \ + cs_winsys->write_cs_dword(cs, (value)); \ + cs_count--; \ +} while (0) + +#define OUT_CS_32F(value) do { \ + cs_winsys->write_cs_dword(cs, fui(value)); \ + cs_count--; \ +} while (0) + +#define OUT_CS_REG(register, value) do { \ + if (VERY_VERBOSE_REGISTERS) \ + debug_printf("r300: writing 0x%08X to register 0x%04X\n", \ + value, register); \ + assert(register); \ + OUT_CS(CP_PACKET0(register, 0)); \ + OUT_CS(value); \ +} while (0) + +/* Note: This expects count to be the number of registers, + * not the actual packet0 count! */ +#define OUT_CS_REG_SEQ(register, count) do { \ + if (VERY_VERBOSE_REGISTERS) \ + debug_printf("r300: writing register sequence of %d to 0x%04X\n", \ + count, register); \ + assert(register); \ + OUT_CS(CP_PACKET0(register, ((count) - 1))); \ +} while (0) + +#define OUT_CS_RELOC(bo, offset, rd, wd, flags) do { \ + debug_printf("r300: writing relocation for buffer %p, offset %d\n", \ + bo, offset); \ + assert(bo); \ + OUT_CS(offset); \ + cs_winsys->write_cs_reloc(cs, bo, rd, wd, flags); \ + cs_count -= 2; \ +} while (0) + +#define END_CS do { \ + debug_printf("r300: END_CS in %s (%s:%d)\n", __FUNCTION__, __FILE__, \ + __LINE__); \ + if (cs_count != 0) \ + debug_printf("r300: Warning: cs_count off by %d\n", cs_count); \ + cs_winsys->end_cs(cs, __FILE__, __FUNCTION__, __LINE__); \ +} while (0) + +#define FLUSH_CS do { \ + debug_printf("r300: FLUSH_CS in %s (%s:%d)\n\n", __FUNCTION__, __FILE__, \ + __LINE__); \ + cs_winsys->flush_cs(cs); \ +} while (0) + +#include "r300_cs_inlines.h" + +#endif /* R300_CS_H */ diff --git a/src/gallium/drivers/r300/r300_cs_inlines.h b/src/gallium/drivers/r300/r300_cs_inlines.h new file mode 100644 index 0000000000..03bb608eb9 --- /dev/null +++ b/src/gallium/drivers/r300/r300_cs_inlines.h @@ -0,0 +1,50 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +/* r300_cs_inlines: This is just a handful of useful inlines for sending + * (very) common instructions to the CS buffer. Should only be included from + * r300_cs.h, probably. */ + +#ifdef R300_CS_H + +#define RADEON_ONE_REG_WR (1 << 15) + +#define OUT_CS_ONE_REG(register, count) do { \ + if (VERY_VERBOSE_REGISTERS) \ + debug_printf("r300: writing data sequence of %d to 0x%04X\n", \ + count, register); \ + assert(register); \ + OUT_CS(CP_PACKET0(register, ((count) - 1)) | RADEON_ONE_REG_WR); \ +} while (0) + +#define R300_PACIFY do { \ + OUT_CS_REG(RADEON_WAIT_UNTIL, (1 << 14) | (1 << 15) | (1 << 16) | (1 << 17) | \ + (1 << 18)); \ +} while (0) + +#define R300_SCREENDOOR do { \ + OUT_CS_REG(R300_SC_SCREENDOOR, 0x0); \ + R300_PACIFY; \ + OUT_CS_REG(R300_SC_SCREENDOOR, 0xffffff); \ +} while (0) + +#endif /* R300_CS_H */ diff --git a/src/gallium/drivers/r300/r300_emit.c b/src/gallium/drivers/r300/r300_emit.c new file mode 100644 index 0000000000..960f45f651 --- /dev/null +++ b/src/gallium/drivers/r300/r300_emit.c @@ -0,0 +1,303 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +/* r300_emit: Functions for emitting state. 
*/ + +#include "r300_emit.h" + +void r300_emit_blend_state(struct r300_context* r300, + struct r300_blend_state* blend) +{ + CS_LOCALS(r300); + BEGIN_CS(7); + OUT_CS_REG_SEQ(R300_RB3D_CBLEND, 2); + OUT_CS(blend->blend_control); + OUT_CS(blend->alpha_blend_control); + OUT_CS_REG(R300_RB3D_ROPCNTL, blend->rop); + OUT_CS_REG(R300_RB3D_DITHER_CTL, blend->dither); + END_CS; +} + +void r300_emit_blend_color_state(struct r300_context* r300, + struct r300_blend_color_state* bc) +{ + struct r300_screen* r300screen = + (struct r300_screen*)r300->context.screen; + CS_LOCALS(r300); + if (r300screen->caps->is_r500) { + BEGIN_CS(3); + OUT_CS_REG_SEQ(R500_RB3D_CONSTANT_COLOR_AR, 2); + OUT_CS(bc->blend_color_red_alpha); + OUT_CS(bc->blend_color_green_blue); + END_CS; + } else { + BEGIN_CS(2); + OUT_CS_REG(R300_RB3D_BLEND_COLOR, bc->blend_color); + END_CS; + } +} + +void r300_emit_dsa_state(struct r300_context* r300, + struct r300_dsa_state* dsa) +{ + struct r300_screen* r300screen = + (struct r300_screen*)r300->context.screen; + CS_LOCALS(r300); + BEGIN_CS(r300screen->caps->is_r500 ? 8 : 8); + OUT_CS_REG(R300_FG_ALPHA_FUNC, dsa->alpha_function); + /* XXX figure out the r300 counterpart for this */ + if (r300screen->caps->is_r500) { + /* OUT_CS_REG(R500_FG_ALPHA_VALUE, dsa->alpha_reference); */ + } + OUT_CS_REG_SEQ(R300_ZB_CNTL, 3); + OUT_CS(dsa->z_buffer_control); + OUT_CS(dsa->z_stencil_control); + OUT_CS(dsa->stencil_ref_mask); + OUT_CS_REG(R300_ZB_ZTOP, dsa->z_buffer_top); + if (r300screen->caps->is_r500) { + /* OUT_CS_REG(R500_ZB_STENCILREFMASK_BF, dsa->stencil_ref_bf); */ + } + END_CS; +} + +void r300_emit_fragment_shader(struct r300_context* r300, + struct r300_fragment_shader* fs) +{ + CS_LOCALS(r300); + int i; + BEGIN_CS(22); + + OUT_CS_REG(R300_US_CONFIG, MAX2(fs->indirections - 1, 0)); + OUT_CS_REG(R300_US_PIXSIZE, fs->shader.stack_size); + /* XXX figure out exactly how big the sizes are on this reg */ + OUT_CS_REG(R300_US_CODE_OFFSET, 0x0); + /* XXX figure these ones out a bit better kthnx */ + OUT_CS_REG(R300_US_CODE_ADDR_0, 0x0); + OUT_CS_REG(R300_US_CODE_ADDR_1, 0x0); + OUT_CS_REG(R300_US_CODE_ADDR_2, 0x0); + OUT_CS_REG(R300_US_CODE_ADDR_3, R300_RGBA_OUT); + + for (i = 0; i < fs->alu_instruction_count; i++) { + OUT_CS_REG(R300_US_ALU_RGB_INST_0 + (4 * i), + fs->instructions[i].alu_rgb_inst); + OUT_CS_REG(R300_US_ALU_RGB_ADDR_0 + (4 * i), + fs->instructions[i].alu_rgb_addr); + OUT_CS_REG(R300_US_ALU_ALPHA_INST_0 + (4 * i), + fs->instructions[i].alu_alpha_inst); + OUT_CS_REG(R300_US_ALU_ALPHA_ADDR_0 + (4 * i), + fs->instructions[i].alu_alpha_addr); + } + + END_CS; +} + +void r500_emit_fragment_shader(struct r300_context* r300, + struct r500_fragment_shader* fs) +{ + CS_LOCALS(r300); + int i = 0; + BEGIN_CS(9 + (fs->instruction_count * 6)); + OUT_CS_REG(R500_US_CONFIG, R500_ZERO_TIMES_ANYTHING_EQUALS_ZERO); + OUT_CS_REG(R500_US_PIXSIZE, fs->shader.stack_size); + OUT_CS_REG(R500_US_CODE_ADDR, R500_US_CODE_START_ADDR(0) | + R500_US_CODE_END_ADDR(fs->instruction_count)); + + OUT_CS_REG(R500_GA_US_VECTOR_INDEX, R500_GA_US_VECTOR_INDEX_TYPE_INSTR); + OUT_CS_ONE_REG(R500_GA_US_VECTOR_DATA, + fs->instruction_count * 6); + for (i = 0; i < fs->instruction_count; i++) { + OUT_CS(fs->instructions[i].inst0); + OUT_CS(fs->instructions[i].inst1); + OUT_CS(fs->instructions[i].inst2); + OUT_CS(fs->instructions[i].inst3); + OUT_CS(fs->instructions[i].inst4); + OUT_CS(fs->instructions[i].inst5); + } + END_CS; +} + +/* Translate pipe_format into US_OUT_FMT. Note that formats are stored from + * C3 to C0. 
*/ +uint32_t translate_out_fmt(enum pipe_format format) +{ + switch (format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + return R300_US_OUT_FMT_C4_8 | + R300_C0_SEL_B | R300_C1_SEL_G | + R300_C2_SEL_R | R300_C3_SEL_A; + default: + return R300_US_OUT_FMT_UNUSED; + } + return 0; +} + +/* XXX add pitch, stride */ +void r300_emit_fb_state(struct r300_context* r300, + struct pipe_framebuffer_state* fb) +{ + CS_LOCALS(r300); + struct r300_texture* tex; + int i; + + BEGIN_CS((5 * fb->nr_cbufs) + (fb->zsbuf ? 5 : 0) + 4); + for (i = 0; i < fb->nr_cbufs; i++) { + tex = (struct r300_texture*)fb->cbufs[i]->texture; + OUT_CS_REG_SEQ(R300_RB3D_COLOROFFSET0 + (4 * i), 1); + OUT_CS_RELOC(tex->buffer, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0); + + OUT_CS_REG(R300_US_OUT_FMT_0 + (4 * i), + translate_out_fmt(fb->cbufs[i]->format)); + } + + if (fb->zsbuf) { + tex = (struct r300_texture*)fb->zsbuf->texture; + OUT_CS_REG_SEQ(R300_ZB_DEPTHOFFSET, 1); + OUT_CS_RELOC(tex->buffer, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0); + if (fb->zsbuf->format == PIPE_FORMAT_Z24S8_UNORM) { + OUT_CS_REG(R300_ZB_FORMAT, + R300_DEPTHFORMAT_24BIT_INT_Z_8BIT_STENCIL); + } else { + OUT_CS_REG(R300_ZB_FORMAT, 0x0); + } + } + + OUT_CS_REG(R300_RB3D_DSTCACHE_CTLSTAT, + R300_RB3D_DSTCACHE_CTLSTAT_DC_FREE_FREE_3D_TAGS | + R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_FLUSH_DIRTY_3D); + OUT_CS_REG(R300_ZB_ZCACHE_CTLSTAT, + R300_ZB_ZCACHE_CTLSTAT_ZC_FLUSH_FLUSH_AND_FREE | + R300_ZB_ZCACHE_CTLSTAT_ZC_FREE_FREE); + END_CS; +} + +void r300_emit_rs_state(struct r300_context* r300, struct r300_rs_state* rs) +{ + struct r300_screen* r300screen = + (struct r300_screen*)r300->context.screen; + CS_LOCALS(r300); + BEGIN_CS(13); + OUT_CS_REG(R300_VAP_CNTL_STATUS, rs->vap_control_status); + OUT_CS_REG_SEQ(R300_SU_POLY_OFFSET_FRONT_SCALE, 6); + OUT_CS(rs->depth_scale_front); + OUT_CS(rs->depth_offset_front); + OUT_CS(rs->depth_scale_back); + OUT_CS(rs->depth_offset_back); + OUT_CS(rs->polygon_offset_enable); + OUT_CS(rs->cull_mode); + OUT_CS_REG(R300_GA_LINE_STIPPLE_CONFIG, rs->line_stipple_config); + OUT_CS_REG(R300_GA_LINE_STIPPLE_VALUE, rs->line_stipple_value); + END_CS; +} + +void r300_emit_scissor_state(struct r300_context* r300, + struct r300_scissor_state* scissor) +{ + CS_LOCALS(r300); + BEGIN_CS(3); + OUT_CS_REG_SEQ(R300_SC_SCISSORS_TL, 2); + OUT_CS(scissor->scissor_top_left); + OUT_CS(scissor->scissor_bottom_right); + END_CS; +} + +void r300_emit_vertex_format_state(struct r300_context* r300) +{ + CS_LOCALS(r300); + int i; + + BEGIN_CS(6); + OUT_CS_REG_SEQ(R300_VAP_VTX_STATE_CNTL, 2); + OUT_CS(r300->vertex_info.vinfo.hwfmt[0]); + OUT_CS(r300->vertex_info.vinfo.hwfmt[1]); + OUT_CS_REG_SEQ(R300_VAP_OUTPUT_VTX_FMT_0, 2); + OUT_CS(r300->vertex_info.vinfo.hwfmt[2]); + OUT_CS(r300->vertex_info.vinfo.hwfmt[3]); + END_CS; + + BEGIN_CS(18); + OUT_CS_REG_SEQ(R300_VAP_PROG_STREAM_CNTL_0, 8); + for (i = 0; i < 8; i++) { + OUT_CS(r300->vertex_info.vap_prog_stream_cntl[i]); + } + OUT_CS_REG_SEQ(R300_VAP_PROG_STREAM_CNTL_EXT_0, 8); + for (i = 0; i < 8; i++) { + OUT_CS(r300->vertex_info.vap_prog_stream_cntl_ext[i]); + } + END_CS; +} + +/* Emit all dirty state. 
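+ *
+ * A hypothetical caller would set the matching dirty bit when binding state
+ * and let the draw path flush it here, e.g.:
+ *
+ *   r300->dirty_state |= R300_NEW_BLEND;
+ *   ...
+ *   r300_emit_dirty_state(r300);
+ *
+ * which emits the blend state and clears R300_NEW_BLEND again.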
*/ +void r300_emit_dirty_state(struct r300_context* r300) +{ + struct r300_screen* r300screen = + (struct r300_screen*)r300->context.screen; + CS_LOCALS(r300); + + if (!(r300->dirty_state) && !(r300->dirty_hw)) { + return; + } + + r300_update_derived_state(r300); + + /* XXX check size */ + + if (r300->dirty_state & R300_NEW_BLEND) { + r300_emit_blend_state(r300, r300->blend_state); + r300->dirty_state &= ~R300_NEW_BLEND; + } + + if (r300->dirty_state & R300_NEW_BLEND_COLOR) { + r300_emit_blend_color_state(r300, r300->blend_color_state); + r300->dirty_state &= ~R300_NEW_BLEND_COLOR; + } + + if (r300->dirty_state & R300_NEW_DSA) { + r300_emit_dsa_state(r300, r300->dsa_state); + r300->dirty_state &= ~R300_NEW_DSA; + } + + if (r300->dirty_state & R300_NEW_FRAGMENT_SHADER) { + if (r300screen->caps->is_r500) { + r500_emit_fragment_shader(r300, + (struct r500_fragment_shader*)r300->fs); + } else { + r300_emit_fragment_shader(r300, + (struct r300_fragment_shader*)r300->fs); + } + r300->dirty_state &= ~R300_NEW_FRAGMENT_SHADER; + } + + if (r300->dirty_state & R300_NEW_RASTERIZER) { + r300_emit_rs_state(r300, r300->rs_state); + r300->dirty_state &= ~R300_NEW_RASTERIZER; + } + + if (r300->dirty_state & R300_NEW_SCISSOR) { + r300_emit_scissor_state(r300, r300->scissor_state); + r300->dirty_state &= ~R300_NEW_SCISSOR; + } + + if (r300->dirty_state & R300_NEW_VERTEX_FORMAT) { + r300_emit_vertex_format_state(r300); + r300->dirty_state &= ~R300_NEW_VERTEX_FORMAT; + } +} diff --git a/src/gallium/drivers/r300/r300_emit.h b/src/gallium/drivers/r300/r300_emit.h new file mode 100644 index 0000000000..f21ca33171 --- /dev/null +++ b/src/gallium/drivers/r300/r300_emit.h @@ -0,0 +1,59 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ + +#ifndef R300_EMIT_H +#define R300_EMIT_H + +#include "util/u_math.h" + +#include "r300_context.h" +#include "r300_cs.h" +#include "r300_screen.h" + +void r300_emit_blend_state(struct r300_context* r300, + struct r300_blend_state* blend); + +void r300_emit_blend_color_state(struct r300_context* r300, + struct r300_blend_color_state* bc); + +void r300_emit_dsa_state(struct r300_context* r300, + struct r300_dsa_state* dsa); + +void r300_emit_fragment_shader(struct r300_context* r300, + struct r300_fragment_shader* fs); + +void r500_emit_fragment_shader(struct r300_context* r300, + struct r500_fragment_shader* fs); + +void r300_emit_fb_state(struct r300_context* r300, + struct pipe_framebuffer_state* fb); + +void r300_emit_rs_state(struct r300_context* r300, struct r300_rs_state* rs); + +void r300_emit_scissor_state(struct r300_context* r300, + struct r300_scissor_state* scissor); + + +/* Emit all dirty state. */ +void r300_emit_dirty_state(struct r300_context* r300); + +#endif /* R300_EMIT_H */ diff --git a/src/gallium/drivers/r300/r300_flush.c b/src/gallium/drivers/r300/r300_flush.c new file mode 100644 index 0000000000..3766f0a0a7 --- /dev/null +++ b/src/gallium/drivers/r300/r300_flush.c @@ -0,0 +1,42 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ + +#include "r300_flush.h" + +static void r300_flush(struct pipe_context* pipe, + unsigned flags, + struct pipe_fence_handle** fence) +{ + struct r300_context* r300 = r300_context(pipe); + CS_LOCALS(r300); + + if (r300->dirty_hw) { + FLUSH_CS; + r300->dirty_state = R300_NEW_KITCHEN_SINK; + r300->dirty_hw = 0; + } +} + +void r300_init_flush_functions(struct r300_context* r300) +{ + r300->context.flush = r300_flush; +} diff --git a/src/gallium/drivers/r300/r300_flush.h b/src/gallium/drivers/r300/r300_flush.h new file mode 100644 index 0000000000..a1b224b39c --- /dev/null +++ b/src/gallium/drivers/r300/r300_flush.h @@ -0,0 +1,33 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +#ifndef R300_FLUSH_H +#define R300_FLUSH_H + +#include "pipe/p_context.h" + +#include "r300_context.h" +#include "r300_cs.h" + +void r300_init_flush_functions(struct r300_context* r300); + +#endif /* R300_FLUSH_H */ diff --git a/src/gallium/drivers/r300/r300_reg.h b/src/gallium/drivers/r300/r300_reg.h new file mode 100644 index 0000000000..8888b39a2f --- /dev/null +++ b/src/gallium/drivers/r300/r300_reg.h @@ -0,0 +1,3263 @@ +/************************************************************************** + +Copyright (C) 2004-2005 Nicolai Haehnle et al. + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the "Software"), +to deal in the Software without restriction, including without limitation +on the rights to use, copy, modify, merge, publish, distribute, sub +license, and/or sell copies of the Software, and to permit persons to whom +the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice (including the next +paragraph) shall be included in all copies or substantial portions of the +Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL +THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, +DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR +OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE +USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +**************************************************************************/ + +/* *INDENT-OFF* */ + +#ifndef _R300_REG_H +#define _R300_REG_H + +#define R300_MC_INIT_MISC_LAT_TIMER 0x180 +# define R300_MC_MISC__MC_CPR_INIT_LAT_SHIFT 0 +# define R300_MC_MISC__MC_VF_INIT_LAT_SHIFT 4 +# define R300_MC_MISC__MC_DISP0R_INIT_LAT_SHIFT 8 +# define R300_MC_MISC__MC_DISP1R_INIT_LAT_SHIFT 12 +# define R300_MC_MISC__MC_FIXED_INIT_LAT_SHIFT 16 +# define R300_MC_MISC__MC_E2R_INIT_LAT_SHIFT 20 +# define R300_MC_MISC__MC_SAME_PAGE_PRIO_SHIFT 24 +# define R300_MC_MISC__MC_GLOBW_INIT_LAT_SHIFT 28 + + +#define R300_MC_INIT_GFX_LAT_TIMER 0x154 +# define R300_MC_MISC__MC_G3D0R_INIT_LAT_SHIFT 0 +# define R300_MC_MISC__MC_G3D1R_INIT_LAT_SHIFT 4 +# define R300_MC_MISC__MC_G3D2R_INIT_LAT_SHIFT 8 +# define R300_MC_MISC__MC_G3D3R_INIT_LAT_SHIFT 12 +# define R300_MC_MISC__MC_TX0R_INIT_LAT_SHIFT 16 +# define R300_MC_MISC__MC_TX1R_INIT_LAT_SHIFT 20 +# define R300_MC_MISC__MC_GLOBR_INIT_LAT_SHIFT 24 +# define R300_MC_MISC__MC_GLOBW_FULL_LAT_SHIFT 28 + +/* + * This file contains registers and constants for the R300. They have been + * found mostly by examining command buffers captured using glxtest, as well + * as by extrapolating some known registers and constants from the R200. + * I am fairly certain that they are correct unless stated otherwise + * in comments. + */ + +#define R300_SE_VPORT_XSCALE 0x1D98 +#define R300_SE_VPORT_XOFFSET 0x1D9C +#define R300_SE_VPORT_YSCALE 0x1DA0 +#define R300_SE_VPORT_YOFFSET 0x1DA4 +#define R300_SE_VPORT_ZSCALE 0x1DA8 +#define R300_SE_VPORT_ZOFFSET 0x1DAC + +#define R300_VAP_PORT_IDX0 0x2040 +/* + * Vertex Array Processing (VAP) Control + */ +#define R300_VAP_CNTL 0x2080 +# define R300_PVS_NUM_SLOTS_SHIFT 0 +# define R300_PVS_NUM_CNTLRS_SHIFT 4 +# define R300_PVS_NUM_FPUS_SHIFT 8 +# define R300_VF_MAX_VTX_NUM_SHIFT 18 +# define R300_GL_CLIP_SPACE_DEF (0 << 22) +# define R300_DX_CLIP_SPACE_DEF (1 << 22) +# define R500_TCL_STATE_OPTIMIZATION (1 << 23) + +/* This register is written directly and also starts data section + * in many 3d CP_PACKET3's + */ +#define R300_VAP_VF_CNTL 0x2084 +# define R300_VAP_VF_CNTL__PRIM_TYPE__SHIFT 0 +# define R300_VAP_VF_CNTL__PRIM_NONE (0<<0) +# define R300_VAP_VF_CNTL__PRIM_POINTS (1<<0) +# define R300_VAP_VF_CNTL__PRIM_LINES (2<<0) +# define R300_VAP_VF_CNTL__PRIM_LINE_STRIP (3<<0) +# define R300_VAP_VF_CNTL__PRIM_TRIANGLES (4<<0) +# define R300_VAP_VF_CNTL__PRIM_TRIANGLE_FAN (5<<0) +# define R300_VAP_VF_CNTL__PRIM_TRIANGLE_STRIP (6<<0) +# define R300_VAP_VF_CNTL__PRIM_LINE_LOOP (12<<0) +# define R300_VAP_VF_CNTL__PRIM_QUADS (13<<0) +# define R300_VAP_VF_CNTL__PRIM_QUAD_STRIP (14<<0) +# define R300_VAP_VF_CNTL__PRIM_POLYGON (15<<0) + +# define R300_VAP_VF_CNTL__PRIM_WALK__SHIFT 4 + /* State based - direct writes to registers trigger vertex + generation */ +# define R300_VAP_VF_CNTL__PRIM_WALK_STATE_BASED (0<<4) +# define R300_VAP_VF_CNTL__PRIM_WALK_INDICES (1<<4) +# define R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_LIST (2<<4) +# define R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_EMBEDDED (3<<4) + + /* I don't think I saw these three used.. 
*/ +# define R300_VAP_VF_CNTL__COLOR_ORDER__SHIFT 6 +# define R300_VAP_VF_CNTL__TCL_OUTPUT_CTL_ENA__SHIFT 9 +# define R300_VAP_VF_CNTL__PROG_STREAM_ENA__SHIFT 10 + + /* index size - when not set the indices are assumed to be 16 bit */ +# define R300_VAP_VF_CNTL__INDEX_SIZE_32bit (1<<11) + /* number of vertices */ +# define R300_VAP_VF_CNTL__NUM_VERTICES__SHIFT 16 + +#define R500_VAP_INDEX_OFFSET 0x208c + +#define R300_VAP_OUTPUT_VTX_FMT_0 0x2090 +# define R300_VAP_OUTPUT_VTX_FMT_0__POS_PRESENT (1<<0) +# define R300_VAP_OUTPUT_VTX_FMT_0__COLOR_0_PRESENT (1<<1) +# define R300_VAP_OUTPUT_VTX_FMT_0__COLOR_1_PRESENT (1<<2) +# define R300_VAP_OUTPUT_VTX_FMT_0__COLOR_2_PRESENT (1<<3) +# define R300_VAP_OUTPUT_VTX_FMT_0__COLOR_3_PRESENT (1<<4) +# define R300_VAP_OUTPUT_VTX_FMT_0__PT_SIZE_PRESENT (1<<16) + +#define R300_VAP_OUTPUT_VTX_FMT_1 0x2094 + /* each of the following is 3 bits wide, specifies number + of components */ +# define R300_VAP_OUTPUT_VTX_FMT_1__TEX_0_COMP_CNT_SHIFT 0 +# define R300_VAP_OUTPUT_VTX_FMT_1__TEX_1_COMP_CNT_SHIFT 3 +# define R300_VAP_OUTPUT_VTX_FMT_1__TEX_2_COMP_CNT_SHIFT 6 +# define R300_VAP_OUTPUT_VTX_FMT_1__TEX_3_COMP_CNT_SHIFT 9 +# define R300_VAP_OUTPUT_VTX_FMT_1__TEX_4_COMP_CNT_SHIFT 12 +# define R300_VAP_OUTPUT_VTX_FMT_1__TEX_5_COMP_CNT_SHIFT 15 +# define R300_VAP_OUTPUT_VTX_FMT_1__TEX_6_COMP_CNT_SHIFT 18 +# define R300_VAP_OUTPUT_VTX_FMT_1__TEX_7_COMP_CNT_SHIFT 21 +# define R300_VAP_OUTPUT_VTX_FMT_1__NOT_PRESENT 0 +# define R300_VAP_OUTPUT_VTX_FMT_1__1_COMPONENT 1 +# define R300_VAP_OUTPUT_VTX_FMT_1__2_COMPONENTS 2 +# define R300_VAP_OUTPUT_VTX_FMT_1__3_COMPONENTS 3 +# define R300_VAP_OUTPUT_VTX_FMT_1__4_COMPONENTS 4 + +#define R300_SE_VTE_CNTL 0x20b0 +# define R300_VPORT_X_SCALE_ENA (1 << 0) +# define R300_VPORT_X_OFFSET_ENA (1 << 1) +# define R300_VPORT_Y_SCALE_ENA (1 << 2) +# define R300_VPORT_Y_OFFSET_ENA (1 << 3) +# define R300_VPORT_Z_SCALE_ENA (1 << 4) +# define R300_VPORT_Z_OFFSET_ENA (1 << 5) +# define R300_VTX_XY_FMT (1 << 8) +# define R300_VTX_Z_FMT (1 << 9) +# define R300_VTX_W0_FMT (1 << 10) +# define R300_SERIAL_PROC_ENA (1 << 11) + +#define R300_VAP_VTX_SIZE 0x20b4 + +/* BEGIN: Vertex data assembly - lots of uncertainties */ + +/* gap */ + +/* Maximum Vertex Indx Clamp */ +#define R300_VAP_VF_MAX_VTX_INDX 0x2134 +/* Minimum Vertex Indx Clamp */ +#define R300_VAP_VF_MIN_VTX_INDX 0x2138 + +/** Vertex assembler/processor control status */ +#define R300_VAP_CNTL_STATUS 0x2140 +/* No swap at all (default) */ +# define R300_VC_NO_SWAP (0 << 0) +/* 16-bit swap: 0xAABBCCDD becomes 0xBBAADDCC */ +# define R300_VC_16BIT_SWAP (1 << 0) +/* 32-bit swap: 0xAABBCCDD becomes 0xDDCCBBAA */ +# define R300_VC_32BIT_SWAP (2 << 0) +/* Half-dword swap: 0xAABBCCDD becomes 0xCCDDAABB */ +# define R300_VC_HALF_DWORD_SWAP (3 << 0) +/* The TCL engine will not be used (as it is logically or even physically removed) */ +# define R300_VAP_TCL_BYPASS (1 << 8) +/* Read only flag if TCL engine is busy. */ +# define R300_VAP_PVS_BUSY (1 << 11) +/* TODO: gap for MAX_MPS */ +/* Read only flag if the vertex store is busy. */ +# define R300_VAP_VS_BUSY (1 << 24) +/* Read only flag if the reciprocal engine is busy. */ +# define R300_VAP_RCP_BUSY (1 << 25) +/* Read only flag if the viewport transform engine is busy. */ +# define R300_VAP_VTE_BUSY (1 << 26) +/* Read only flag if the memory interface unit is busy. */ +# define R300_VAP_MUI_BUSY (1 << 27) +/* Read only flag if the vertex cache is busy. */ +# define R300_VAP_VC_BUSY (1 << 28) +/* Read only flag if the vertex fetcher is busy. 
*/ +# define R300_VAP_VF_BUSY (1 << 29) +/* Read only flag if the register pipeline is busy. */ +# define R300_VAP_REGPIPE_BUSY (1 << 30) +/* Read only flag if the VAP engine is busy. */ +# define R300_VAP_VAP_BUSY (1 << 31) + +/* gap */ + +/* Where do we get our vertex data? + * + * Vertex data either comes either from immediate mode registers or from + * vertex arrays. + * There appears to be no mixed mode (though we can force the pitch of + * vertex arrays to 0, effectively reusing the same element over and over + * again). + * + * Immediate mode is controlled by the INPUT_CNTL registers. I am not sure + * if these registers influence vertex array processing. + * + * Vertex arrays are controlled via the 3D_LOAD_VBPNTR packet3. + * + * In both cases, vertex attributes are then passed through INPUT_ROUTE. + * + * Beginning with INPUT_ROUTE_0_0 is a list of WORDs that route vertex data + * into the vertex processor's input registers. + * The first word routes the first input, the second word the second, etc. + * The corresponding input is routed into the register with the given index. + * The list is ended by a word with INPUT_ROUTE_END set. + * + * Always set COMPONENTS_4 in immediate mode. + */ + +#define R300_VAP_PROG_STREAM_CNTL_0 0x2150 +# define R300_DATA_TYPE_0_SHIFT 0 +# define R300_DATA_TYPE_FLOAT_1 0 +# define R300_DATA_TYPE_FLOAT_2 1 +# define R300_DATA_TYPE_FLOAT_3 2 +# define R300_DATA_TYPE_FLOAT_4 3 +# define R300_DATA_TYPE_BYTE 4 +# define R300_DATA_TYPE_D3DCOLOR 5 +# define R300_DATA_TYPE_SHORT_2 6 +# define R300_DATA_TYPE_SHORT_4 7 +# define R300_DATA_TYPE_VECTOR_3_TTT 8 +# define R300_DATA_TYPE_VECTOR_3_EET 9 +# define R300_SKIP_DWORDS_SHIFT 4 +# define R300_DST_VEC_LOC_SHIFT 8 +# define R300_LAST_VEC (1 << 13) +# define R300_SIGNED (1 << 14) +# define R300_NORMALIZE (1 << 15) +# define R300_DATA_TYPE_1_SHIFT 16 +#define R300_VAP_PROG_STREAM_CNTL_1 0x2154 +#define R300_VAP_PROG_STREAM_CNTL_2 0x2158 +#define R300_VAP_PROG_STREAM_CNTL_3 0x215C +#define R300_VAP_PROG_STREAM_CNTL_4 0x2160 +#define R300_VAP_PROG_STREAM_CNTL_5 0x2164 +#define R300_VAP_PROG_STREAM_CNTL_6 0x2168 +#define R300_VAP_PROG_STREAM_CNTL_7 0x216C +/* gap */ + +/* Notes: + * - always set up to produce at least two attributes: + * if vertex program uses only position, fglrx will set normal, too + * - INPUT_CNTL_0_COLOR and INPUT_CNTL_COLOR bits are always equal. + */ +#define R300_VAP_VTX_STATE_CNTL 0x2180 +# define R300_COLOR_0_ASSEMBLY_SHIFT 0 +# define R300_SEL_COLOR 0 +# define R300_SEL_USER_COLOR_0 1 +# define R300_SEL_USER_COLOR_1 2 +# define R300_COLOR_1_ASSEMBLY_SHIFT 2 +# define R300_COLOR_2_ASSEMBLY_SHIFT 4 +# define R300_COLOR_3_ASSEMBLY_SHIFT 6 +# define R300_COLOR_4_ASSEMBLY_SHIFT 8 +# define R300_COLOR_5_ASSEMBLY_SHIFT 10 +# define R300_COLOR_6_ASSEMBLY_SHIFT 12 +# define R300_COLOR_7_ASSEMBLY_SHIFT 14 +# define R300_UPDATE_USER_COLOR_0_ENA (1 << 16) + +/* + * Each bit in this field applies to the corresponding vector in the VSM + * memory (i.e. Bit 0 applies to VECTOR_0 (POSITION), etc.). If the bit + * is set, then the corresponding 4-Dword Vector is output into the Vertex Stream. 
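+ *
+ * For example, a vertex stream carrying only position and the primary color
+ * would presumably set R300_INPUT_CNTL_POS | R300_INPUT_CNTL_COLOR.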
+ */ +#define R300_VAP_VSM_VTX_ASSM 0x2184 +# define R300_INPUT_CNTL_POS 0x00000001 +# define R300_INPUT_CNTL_NORMAL 0x00000002 +# define R300_INPUT_CNTL_COLOR 0x00000004 +# define R300_INPUT_CNTL_TC0 0x00000400 +# define R300_INPUT_CNTL_TC1 0x00000800 +# define R300_INPUT_CNTL_TC2 0x00001000 /* GUESS */ +# define R300_INPUT_CNTL_TC3 0x00002000 /* GUESS */ +# define R300_INPUT_CNTL_TC4 0x00004000 /* GUESS */ +# define R300_INPUT_CNTL_TC5 0x00008000 /* GUESS */ +# define R300_INPUT_CNTL_TC6 0x00010000 /* GUESS */ +# define R300_INPUT_CNTL_TC7 0x00020000 /* GUESS */ + +/* Programmable Stream Control Signed Normalize Control */ +#define R300_VAP_PSC_SGN_NORM_CNTL 0x21dc +# define SGN_NORM_ZERO 0 +# define SGN_NORM_ZERO_CLAMP_MINUS_ONE 1 +# define SGN_NORM_NO_ZERO 2 + +/* gap */ + +/* Words parallel to INPUT_ROUTE_0; All words that are active in INPUT_ROUTE_0 + * are set to a swizzling bit pattern, other words are 0. + * + * In immediate mode, the pattern is always set to xyzw. In vertex array + * mode, the swizzling pattern is e.g. used to set zw components in texture + * coordinates with only tweo components. + */ +#define R300_VAP_PROG_STREAM_CNTL_EXT_0 0x21e0 +# define R300_SWIZZLE0_SHIFT 0 +# define R300_SWIZZLE_SELECT_X_SHIFT 0 +# define R300_SWIZZLE_SELECT_Y_SHIFT 3 +# define R300_SWIZZLE_SELECT_Z_SHIFT 6 +# define R300_SWIZZLE_SELECT_W_SHIFT 9 + +# define R300_SWIZZLE_SELECT_X 0 +# define R300_SWIZZLE_SELECT_Y 1 +# define R300_SWIZZLE_SELECT_Z 2 +# define R300_SWIZZLE_SELECT_W 3 +# define R300_SWIZZLE_SELECT_FP_ZERO 4 +# define R300_SWIZZLE_SELECT_FP_ONE 5 +/* alternate forms for r300_emit.c */ +# define R300_INPUT_ROUTE_SELECT_X 0 +# define R300_INPUT_ROUTE_SELECT_Y 1 +# define R300_INPUT_ROUTE_SELECT_Z 2 +# define R300_INPUT_ROUTE_SELECT_W 3 +# define R300_INPUT_ROUTE_SELECT_ZERO 4 +# define R300_INPUT_ROUTE_SELECT_ONE 5 + +# define R300_WRITE_ENA_SHIFT 12 +# define R300_WRITE_ENA_X 1 +# define R300_WRITE_ENA_Y 2 +# define R300_WRITE_ENA_Z 4 +# define R300_WRITE_ENA_W 8 +# define R300_SWIZZLE1_SHIFT 16 +#define R300_VAP_PROG_STREAM_CNTL_EXT_1 0x21e4 +#define R300_VAP_PROG_STREAM_CNTL_EXT_2 0x21e8 +#define R300_VAP_PROG_STREAM_CNTL_EXT_3 0x21ec +#define R300_VAP_PROG_STREAM_CNTL_EXT_4 0x21f0 +#define R300_VAP_PROG_STREAM_CNTL_EXT_5 0x21f4 +#define R300_VAP_PROG_STREAM_CNTL_EXT_6 0x21f8 +#define R300_VAP_PROG_STREAM_CNTL_EXT_7 0x21fc + +/* END: Vertex data assembly */ + +/* gap */ + +/* BEGIN: Upload vertex program and data */ + +/* + * The programmable vertex shader unit has a memory bank of unknown size + * that can be written to in 16 byte units by writing the address into + * UPLOAD_ADDRESS, followed by data in UPLOAD_DATA (multiples of 4 DWORDs). + * + * Pointers into the memory bank are always in multiples of 16 bytes. + * + * The memory bank is divided into areas with fixed meaning. + * + * Starting at address UPLOAD_PROGRAM: Vertex program instructions. + * Native limits reported by drivers from ATI suggest size 256 (i.e. 4KB), + * whereas the difference between known addresses suggests size 512. + * + * Starting at address UPLOAD_PARAMETERS: Vertex program parameters. + * Native reported limits and the VPI layout suggest size 256, whereas + * difference between known addresses suggests size 512. + * + * At address UPLOAD_POINTSIZE is a vector (0, 0, ps, 0), where ps is the + * floating point pointsize. The exact purpose of this state is uncertain, + * as there is also the R300_RE_POINTSIZE register. 
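+ *
+ * A plausible upload sequence (sketch only, exact packet emission is up to
+ * the driver) is:
+ *
+ *   write R300_VAP_PVS_VECTOR_INDX_REG = <start address>
+ *   write R300_VAP_PVS_UPLOAD_DATA     = <vector DWORDs, 4 per 16-byte slot>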
+ * + * Multiple vertex programs and parameter sets can be loaded at once, + * which could explain the size discrepancy. + */ +#define R300_VAP_PVS_VECTOR_INDX_REG 0x2200 +# define R300_PVS_CODE_START 0 +# define R300_MAX_PVS_CODE_LINES 256 +# define R500_MAX_PVS_CODE_LINES 1024 +# define R300_PVS_CONST_START 512 +# define R500_PVS_CONST_START 1024 +# define R300_MAX_PVS_CONST_VECS 256 +# define R500_MAX_PVS_CONST_VECS 1024 +# define R300_PVS_UCP_START 1024 +# define R500_PVS_UCP_START 1536 +# define R300_POINT_VPORT_SCALE_OFFSET 1030 +# define R500_POINT_VPORT_SCALE_OFFSET 1542 +# define R300_POINT_GEN_TEX_OFFSET 1031 +# define R500_POINT_GEN_TEX_OFFSET 1543 + +/* + * These are obsolete defines form r300_context.h, but they might give some + * clues when investigating the addresses further... + */ +#if 0 +#define VSF_DEST_PROGRAM 0x0 +#define VSF_DEST_MATRIX0 0x200 +#define VSF_DEST_MATRIX1 0x204 +#define VSF_DEST_MATRIX2 0x208 +#define VSF_DEST_VECTOR0 0x20c +#define VSF_DEST_VECTOR1 0x20d +#define VSF_DEST_UNKNOWN1 0x400 +#define VSF_DEST_UNKNOWN2 0x406 +#endif + +/* gap */ + +#define R300_VAP_PVS_UPLOAD_DATA 0x2208 + +/* END: Upload vertex program and data */ + +/* gap */ + +/* I do not know the purpose of this register. However, I do know that + * it is set to 221C_CLEAR for clear operations and to 221C_NORMAL + * for normal rendering. + * + * 2007-11-05: This register is the user clip plane control register, but there + * also seems to be a rendering mode control; the NORMAL/CLEAR defines. + * + * See bug #9871. http://bugs.freedesktop.org/attachment.cgi?id=10672&action=view + */ +#define R300_VAP_CLIP_CNTL 0x221C +# define R300_VAP_UCP_ENABLE_0 (1 << 0) +# define R300_VAP_UCP_ENABLE_1 (1 << 1) +# define R300_VAP_UCP_ENABLE_2 (1 << 2) +# define R300_VAP_UCP_ENABLE_3 (1 << 3) +# define R300_VAP_UCP_ENABLE_4 (1 << 4) +# define R300_VAP_UCP_ENABLE_5 (1 << 5) +# define R300_PS_UCP_MODE_DIST_COP (0 << 14) +# define R300_PS_UCP_MODE_RADIUS_COP (1 << 14) +# define R300_PS_UCP_MODE_RADIUS_COP_CLIP (2 << 14) +# define R300_PS_UCP_MODE_CLIP_AS_TRIFAN (3 << 14) +# define R300_CLIP_DISABLE (1 << 16) +# define R300_UCP_CULL_ONLY_ENABLE (1 << 17) +# define R300_BOUNDARY_EDGE_FLAG_ENABLE (1 << 18) +# define R500_COLOR2_IS_TEXTURE (1 << 20) +# define R500_COLOR3_IS_TEXTURE (1 << 21) + +/* These seem to be per-pixel and per-vertex X and Y clipping planes. The first + * plane is per-pixel and the second plane is per-vertex. + * + * This was determined by experimentation alone but I believe it is correct. + * + * These registers are called X_QUAD0_1_FL to X_QUAD0_4_FL by glxtest. + */ +#define R300_VAP_GB_VERT_CLIP_ADJ 0x2220 +#define R300_VAP_GB_VERT_DISC_ADJ 0x2224 +#define R300_VAP_GB_HORZ_CLIP_ADJ 0x2228 +#define R300_VAP_GB_HORZ_DISC_ADJ 0x222c + +/* gap */ + +/* Sometimes, END_OF_PKT and 0x2284=0 are the only commands sent between + * rendering commands and overwriting vertex program parameters. + * Therefore, I suspect writing zero to 0x2284 synchronizes the engine and + * avoids bugs caused by still running shaders reading bad data from memory. + */ +#define R300_VAP_PVS_STATE_FLUSH_REG 0x2284 + +/* This register is used to define the number of core clocks to wait for a + * vertex to be received by the VAP input controller (while the primitive + * path is backed up) before forcing any accumulated vertices to be submitted + * to the vertex processing path. 
+ */
+#define VAP_PVS_VTX_TIMEOUT_REG 0x2288
+# define R300_2288_R300 0x00750000 /* -- nh */
+# define R300_2288_RV350 0x0000FFFF /* -- Vladimir */
+
+/* gap */
+
+/* Addresses are relative to the vertex program instruction area of the
+ * memory bank. PROGRAM_END points to the last instruction of the active
+ * program.
+ *
+ * The meaning of the two UNKNOWN fields is obviously not known. However,
+ * experiments so far have shown that both *must* point to an instruction
+ * inside the vertex program, otherwise the GPU locks up.
+ *
+ * fglrx usually sets CNTL_3_UNKNOWN to the end of the program and
+ * R300_PVS_CNTL_1_POS_END_SHIFT points to the instruction where the last
+ * write to position takes place.
+ *
+ * Most likely this is used to ignore the rest of the program in cases
+ * where a group of verts isn't visible. For some reason this "section"
+ * sometimes accepts other instructions that have no relationship with
+ * position calculations.
+ */
+#define R300_VAP_PVS_CODE_CNTL_0 0x22D0
+# define R300_PVS_FIRST_INST_SHIFT 0
+# define R300_PVS_XYZW_VALID_INST_SHIFT 10
+# define R300_PVS_LAST_INST_SHIFT 20
+/* Addresses are relative to the vertex program parameters area. */
+#define R300_VAP_PVS_CONST_CNTL 0x22D4
+# define R300_PVS_CONST_BASE_OFFSET_SHIFT 0
+# define R300_PVS_MAX_CONST_ADDR_SHIFT 16
+#define R300_VAP_PVS_CODE_CNTL_1 0x22D8
+# define R300_PVS_LAST_VTX_SRC_INST_SHIFT 0
+#define R300_VAP_PVS_FLOW_CNTL_OPC 0x22DC
+
+/* The entire range from 0x2300 to 0x2AC inclusive seems to be used for
+ * immediate vertices
+ */
+#define R300_VAP_VTX_COLOR_R 0x2464
+#define R300_VAP_VTX_COLOR_G 0x2468
+#define R300_VAP_VTX_COLOR_B 0x246C
+#define R300_VAP_VTX_POS_0_X_1 0x2490 /* used for glVertex2*() */
+#define R300_VAP_VTX_POS_0_Y_1 0x2494
+#define R300_VAP_VTX_COLOR_PKD 0x249C /* RGBA */
+#define R300_VAP_VTX_POS_0_X_2 0x24A0 /* used for glVertex3*() */
+#define R300_VAP_VTX_POS_0_Y_2 0x24A4
+#define R300_VAP_VTX_POS_0_Z_2 0x24A8
+/* write 0 to indicate end of packet? */
+#define R300_VAP_VTX_END_OF_PKT 0x24AC
+
+/* gap */
+
+/* These are values from r300_reg/r300_reg.h - they are known to be correct
+ * and are here so we can use one register file instead of several
+ * - Vladimir
+ */
+#define R300_GB_VAP_RASTER_VTX_FMT_0 0x4000
+# define R300_GB_VAP_RASTER_VTX_FMT_0__POS_PRESENT (1<<0)
+# define R300_GB_VAP_RASTER_VTX_FMT_0__COLOR_0_PRESENT (1<<1)
+# define R300_GB_VAP_RASTER_VTX_FMT_0__COLOR_1_PRESENT (1<<2)
+# define R300_GB_VAP_RASTER_VTX_FMT_0__COLOR_2_PRESENT (1<<3)
+# define R300_GB_VAP_RASTER_VTX_FMT_0__COLOR_3_PRESENT (1<<4)
+# define R300_GB_VAP_RASTER_VTX_FMT_0__COLOR_SPACE (0xf<<5)
+# define R300_GB_VAP_RASTER_VTX_FMT_0__PT_SIZE_PRESENT (0x1<<16)
+
+#define R300_GB_VAP_RASTER_VTX_FMT_1 0x4004
+	/* each of the following is 3 bits wide, specifies number
+	   of components */
+# define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_0_COMP_CNT_SHIFT 0
+# define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_1_COMP_CNT_SHIFT 3
+# define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_2_COMP_CNT_SHIFT 6
+# define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_3_COMP_CNT_SHIFT 9
+# define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_4_COMP_CNT_SHIFT 12
+# define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_5_COMP_CNT_SHIFT 15
+# define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_6_COMP_CNT_SHIFT 18
+# define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_7_COMP_CNT_SHIFT 21
+
+/* UNK30 seems to enable point to quad transformation on textures
+ * (or something closely related to that).
+ * This bit is rather fatal at the moment due to missing support on the
+ * pixel shader side.
+ * Specifies top of Raster pipe specific enable controls.
+ */
+#define R300_GB_ENABLE 0x4008
+# define R300_GB_POINT_STUFF_DISABLE (0 << 0)
+# define R300_GB_POINT_STUFF_ENABLE (1 << 0) /* Specifies if points will have stuffed texture coordinates. */
+# define R300_GB_LINE_STUFF_DISABLE (0 << 1)
+# define R300_GB_LINE_STUFF_ENABLE (1 << 1) /* Specifies if lines will have stuffed texture coordinates. */
+# define R300_GB_TRIANGLE_STUFF_DISABLE (0 << 2)
+# define R300_GB_TRIANGLE_STUFF_ENABLE (1 << 2) /* Specifies if triangles will have stuffed texture coordinates. */
+# define R300_GB_STENCIL_AUTO_DISABLE (0 << 4)
+# define R300_GB_STENCIL_AUTO_ENABLE (1 << 4) /* Enable stencil auto inc/dec based on triangle cw/ccw, force into dzy low bit. */
+# define R300_GB_STENCIL_AUTO_FORCE (2 << 4) /* Force 0 into dzy low bit. */
+
+	/* each of the following is 2 bits wide */
+#define R300_GB_TEX_REPLICATE 0 /* Replicate VAP source texture coordinates (S,T,[R,Q]). */
+#define R300_GB_TEX_ST 1 /* Stuff with source texture coordinates (S,T). */
+#define R300_GB_TEX_STR 2 /* Stuff with source texture coordinates (S,T,R). */
+# define R300_GB_TEX0_SOURCE_SHIFT 16
+# define R300_GB_TEX1_SOURCE_SHIFT 18
+# define R300_GB_TEX2_SOURCE_SHIFT 20
+# define R300_GB_TEX3_SOURCE_SHIFT 22
+# define R300_GB_TEX4_SOURCE_SHIFT 24
+# define R300_GB_TEX5_SOURCE_SHIFT 26
+# define R300_GB_TEX6_SOURCE_SHIFT 28
+# define R300_GB_TEX7_SOURCE_SHIFT 30
+
+/* MSPOS - positions for multisample antialiasing (?) */
+#define R300_GB_MSPOS0 0x4010
+	/* shifts - each of the fields is 4 bits */
+# define R300_GB_MSPOS0__MS_X0_SHIFT 0
+# define R300_GB_MSPOS0__MS_Y0_SHIFT 4
+# define R300_GB_MSPOS0__MS_X1_SHIFT 8
+# define R300_GB_MSPOS0__MS_Y1_SHIFT 12
+# define R300_GB_MSPOS0__MS_X2_SHIFT 16
+# define R300_GB_MSPOS0__MS_Y2_SHIFT 20
+# define R300_GB_MSPOS0__MSBD0_Y 24
+# define R300_GB_MSPOS0__MSBD0_X 28
+
+#define R300_GB_MSPOS1 0x4014
+# define R300_GB_MSPOS1__MS_X3_SHIFT 0
+# define R300_GB_MSPOS1__MS_Y3_SHIFT 4
+# define R300_GB_MSPOS1__MS_X4_SHIFT 8
+# define R300_GB_MSPOS1__MS_Y4_SHIFT 12
+# define R300_GB_MSPOS1__MS_X5_SHIFT 16
+# define R300_GB_MSPOS1__MS_Y5_SHIFT 20
+# define R300_GB_MSPOS1__MSBD1 24
+
+/* Specifies the graphics pipeline configuration for rasterization.
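+ *
+ * For instance, a plain R300 (two pipes) might be programmed with something
+ * like R300_GB_TILE_ENABLE | R300_GB_TILE_PIPE_COUNT_R300 |
+ * R300_GB_TILE_SIZE_16 (an illustrative combination only).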
*/
+#define R300_GB_TILE_CONFIG 0x4018
+# define R300_GB_TILE_DISABLE (0 << 0)
+# define R300_GB_TILE_ENABLE (1 << 0)
+# define R300_GB_TILE_PIPE_COUNT_RV300 (0 << 1) /* RV350 (1 pipe, 1 ctx) */
+# define R300_GB_TILE_PIPE_COUNT_R300 (3 << 1) /* R300 (2 pipes, 1 ctx) */
+# define R300_GB_TILE_PIPE_COUNT_R420_3P (6 << 1) /* R420-3P (3 pipes, 1 ctx) */
+# define R300_GB_TILE_PIPE_COUNT_R420 (7 << 1) /* R420 (4 pipes, 1 ctx) */
+# define R300_GB_TILE_SIZE_8 (0 << 4)
+# define R300_GB_TILE_SIZE_16 (1 << 4)
+# define R300_GB_TILE_SIZE_32 (2 << 4)
+# define R300_GB_SUPER_SIZE_1 (0 << 6)
+# define R300_GB_SUPER_SIZE_2 (1 << 6)
+# define R300_GB_SUPER_SIZE_4 (2 << 6)
+# define R300_GB_SUPER_SIZE_8 (3 << 6)
+# define R300_GB_SUPER_SIZE_16 (4 << 6)
+# define R300_GB_SUPER_SIZE_32 (5 << 6)
+# define R300_GB_SUPER_SIZE_64 (6 << 6)
+# define R300_GB_SUPER_SIZE_128 (7 << 6)
+# define R300_GB_SUPER_X_SHIFT 9 /* 3 bits wide */
+# define R300_GB_SUPER_Y_SHIFT 12 /* 3 bits wide */
+# define R300_GB_SUPER_TILE_A (0 << 15)
+# define R300_GB_SUPER_TILE_B (1 << 15)
+# define R300_GB_SUBPIXEL_1_12 (0 << 16)
+# define R300_GB_SUBPIXEL_1_16 (1 << 16)
+# define GB_TILE_CONFIG_QUADS_PER_RAS_4 (0 << 17)
+# define GB_TILE_CONFIG_QUADS_PER_RAS_8 (1 << 17)
+# define GB_TILE_CONFIG_QUADS_PER_RAS_16 (2 << 17)
+# define GB_TILE_CONFIG_QUADS_PER_RAS_32 (3 << 17)
+# define GB_TILE_CONFIG_BB_SCAN_INTERCEPT (0 << 19)
+# define GB_TILE_CONFIG_BB_SCAN_BOUND_BOX (1 << 19)
+# define GB_TILE_CONFIG_ALT_SCAN_EN_LR (0 << 20)
+# define GB_TILE_CONFIG_ALT_SCAN_EN_LRL (1 << 20)
+# define GB_TILE_CONFIG_ALT_OFFSET (0 << 21)
+# define GB_TILE_CONFIG_SUBPRECISION (0 << 22)
+# define GB_TILE_CONFIG_ALT_TILING_DEF (0 << 23)
+# define GB_TILE_CONFIG_ALT_TILING_3_2 (1 << 23)
+# define GB_TILE_CONFIG_Z_EXTENDED_24_1 (0 << 24)
+# define GB_TILE_CONFIG_Z_EXTENDED_S25_1 (1 << 24)
+
+/* Specifies the sizes of the various FIFOs in the sc/rs/us. This register must be the first one written */
+#define R300_GB_FIFO_SIZE 0x4024
+	/* each of the following is 2 bits wide */
+#define R300_GB_FIFO_SIZE_32 0
+#define R300_GB_FIFO_SIZE_64 1
+#define R300_GB_FIFO_SIZE_128 2
+#define R300_GB_FIFO_SIZE_256 3
+# define R300_SC_IFIFO_SIZE_SHIFT 0
+# define R300_SC_TZFIFO_SIZE_SHIFT 2
+# define R300_SC_BFIFO_SIZE_SHIFT 4
+
+# define R300_US_OFIFO_SIZE_SHIFT 12
+# define R300_US_WFIFO_SIZE_SHIFT 14
+	/* the following use the same constants as above, but the meaning
+	   is times 2 (i.e. instead of 32 words it means 64) */
+# define R300_RS_TFIFO_SIZE_SHIFT 6
+# define R300_RS_CFIFO_SIZE_SHIFT 8
+# define R300_US_RAM_SIZE_SHIFT 10
+	/* watermarks, 3 bits wide */
+# define R300_RS_HIGHWATER_COL_SHIFT 16
+# define R300_RS_HIGHWATER_TEX_SHIFT 19
+# define R300_OFIFO_HIGHWATER_SHIFT 22 /* two bits only */
+# define R300_CUBE_FIFO_HIGHWATER_COL_SHIFT 24
+
+#define GB_Z_PEQ_CONFIG 0x4028
+# define GB_Z_PEQ_CONFIG_Z_PEQ_SIZE_4_4 (0 << 0)
+# define GB_Z_PEQ_CONFIG_Z_PEQ_SIZE_8_8 (1 << 0)
+
+/* Specifies various polygon specific selects (fog, depth, perspective).
*/ +#define R300_GB_SELECT 0x401c +# define R300_GB_FOG_SELECT_C0A (0 << 0) +# define R300_GB_FOG_SELECT_C1A (1 << 0) +# define R300_GB_FOG_SELECT_C2A (2 << 0) +# define R300_GB_FOG_SELECT_C3A (3 << 0) +# define R300_GB_FOG_SELECT_1_1_W (4 << 0) +# define R300_GB_FOG_SELECT_Z (5 << 0) +# define R300_GB_DEPTH_SELECT_Z (0 << 3) +# define R300_GB_DEPTH_SELECT_1_1_W (1 << 3) +# define R300_GB_W_SELECT_1_W (0 << 4) +# define R300_GB_W_SELECT_1 (1 << 4) +# define R300_GB_FOG_STUFF_DISABLE (0 << 5) +# define R300_GB_FOG_STUFF_ENABLE (1 << 5) +# define R300_GB_FOG_STUFF_TEX_SHIFT 6 +# define R300_GB_FOG_STUFF_TEX_MASK 0x000003c0 +# define R300_GB_FOG_STUFF_COMP_SHIFT 10 +# define R300_GB_FOG_STUFF_COMP_MASK 0x00000c00 + +/* Specifies the graphics pipeline configuration for antialiasing. */ +#define R300_GB_AA_CONFIG 0x4020 +# define GB_AA_CONFIG_AA_DISABLE (0 << 0) +# define GB_AA_CONFIG_AA_ENABLE (1 << 0) +# define GB_AA_CONFIG_NUM_AA_SUBSAMPLES_2 (0 << 1) +# define GB_AA_CONFIG_NUM_AA_SUBSAMPLES_3 (1 << 1) +# define GB_AA_CONFIG_NUM_AA_SUBSAMPLES_4 (2 << 1) +# define GB_AA_CONFIG_NUM_AA_SUBSAMPLES_6 (3 << 1) + +/* Selects which of 4 pipes are active. */ +#define GB_PIPE_SELECT 0x402c +# define GB_PIPE_SELECT_PIPE0_ID_SHIFT 0 +# define GB_PIPE_SELECT_PIPE1_ID_SHIFT 2 +# define GB_PIPE_SELECT_PIPE2_ID_SHIFT 4 +# define GB_PIPE_SELECT_PIPE3_ID_SHIFT 6 +# define GB_PIPE_SELECT_PIPE_MASK_SHIFT 8 +# define GB_PIPE_SELECT_MAX_PIPE 12 +# define GB_PIPE_SELECT_BAD_PIPES 14 +# define GB_PIPE_SELECT_CONFIG_PIPES 18 + + +/* Specifies the sizes of the various FIFO`s in the sc/rs. */ +#define GB_FIFO_SIZE1 0x4070 +/* High water mark for SC input fifo */ +# define GB_FIFO_SIZE1_SC_HIGHWATER_IFIFO_SHIFT 0 +# define GB_FIFO_SIZE1_SC_HIGHWATER_IFIFO_MASK 0x0000003f +/* High water mark for SC input fifo (B) */ +# define GB_FIFO_SIZE1_SC_HIGHWATER_BFIFO_SHIFT 6 +# define GB_FIFO_SIZE1_SC_HIGHWATER_BFIFO_MASK 0x00000fc0 +/* High water mark for RS colors' fifo */ +# define GB_FIFO_SIZE1_SC_HIGHWATER_COL_SHIFT 12 +# define GB_FIFO_SIZE1_SC_HIGHWATER_COL_MASK 0x0003f000 +/* High water mark for RS textures' fifo */ +# define GB_FIFO_SIZE1_SC_HIGHWATER_TEX_SHIFT 18 +# define GB_FIFO_SIZE1_SC_HIGHWATER_TEX_MASK 0x00fc0000 + +/* This table specifies the source location and format for up to 16 texture + * addresses (i[0]:i[15]) and four colors (c[0]:c[3]) + */ +#define R500_RS_IP_0 0x4074 +#define R500_RS_IP_1 0x4078 +#define R500_RS_IP_2 0x407C +#define R500_RS_IP_3 0x4080 +#define R500_RS_IP_4 0x4084 +#define R500_RS_IP_5 0x4088 +#define R500_RS_IP_6 0x408C +#define R500_RS_IP_7 0x4090 +#define R500_RS_IP_8 0x4094 +#define R500_RS_IP_9 0x4098 +#define R500_RS_IP_10 0x409C +#define R500_RS_IP_11 0x40A0 +#define R500_RS_IP_12 0x40A4 +#define R500_RS_IP_13 0x40A8 +#define R500_RS_IP_14 0x40AC +#define R500_RS_IP_15 0x40B0 +#define R500_RS_IP_PTR_K0 62 +#define R500_RS_IP_PTR_K1 63 +#define R500_RS_IP_TEX_PTR_S_SHIFT 0 +#define R500_RS_IP_TEX_PTR_T_SHIFT 6 +#define R500_RS_IP_TEX_PTR_R_SHIFT 12 +#define R500_RS_IP_TEX_PTR_Q_SHIFT 18 +#define R500_RS_IP_COL_PTR_SHIFT 24 +#define R500_RS_IP_COL_FMT_SHIFT 27 +# define R500_RS_COL_PTR(x) ((x) << 24) +# define R500_RS_COL_FMT(x) ((x) << 27) +/* gap */ +#define R500_RS_IP_OFFSET_DIS (0 << 31) +#define R500_RS_IP_OFFSET_EN (1 << 31) + +/* gap */ + +/* Zero to flush caches. */ +#define R300_TX_INVALTAGS 0x4100 +#define R300_TX_FLUSH 0x0 + +/* The upper enable bits are guessed, based on fglrx reported limits. 
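+ *
+ * For example, enabling only the first two texture units would presumably be
+ * R300_TX_ENABLE_0 | R300_TX_ENABLE_1.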
*/
+#define R300_TX_ENABLE 0x4104
+# define R300_TX_ENABLE_0 (1 << 0)
+# define R300_TX_ENABLE_1 (1 << 1)
+# define R300_TX_ENABLE_2 (1 << 2)
+# define R300_TX_ENABLE_3 (1 << 3)
+# define R300_TX_ENABLE_4 (1 << 4)
+# define R300_TX_ENABLE_5 (1 << 5)
+# define R300_TX_ENABLE_6 (1 << 6)
+# define R300_TX_ENABLE_7 (1 << 7)
+# define R300_TX_ENABLE_8 (1 << 8)
+# define R300_TX_ENABLE_9 (1 << 9)
+# define R300_TX_ENABLE_10 (1 << 10)
+# define R300_TX_ENABLE_11 (1 << 11)
+# define R300_TX_ENABLE_12 (1 << 12)
+# define R300_TX_ENABLE_13 (1 << 13)
+# define R300_TX_ENABLE_14 (1 << 14)
+# define R300_TX_ENABLE_15 (1 << 15)
+
+#define R500_TX_FILTER_4 0x4110
+# define R500_TX_WEIGHT_1_SHIFT (0)
+# define R500_TX_WEIGHT_0_SHIFT (11)
+# define R500_TX_WEIGHT_PAIR (1<<22)
+# define R500_TX_PHASE_SHIFT (23)
+# define R500_TX_DIRECTION_HORIZONTAL (0<<27)
+# define R500_TX_DIRECTION_VERITCAL (1<<27)
+
+/* S Texture Coordinate of Vertex 0 for Point texture stuffing (LLC) */
+#define R300_GA_POINT_S0 0x4200
+
+/* T Texture Coordinate of Vertex 0 for Point texture stuffing (LLC) */
+#define R300_GA_POINT_T0 0x4204
+
+/* S Texture Coordinate of Vertex 2 for Point texture stuffing (URC) */
+#define R300_GA_POINT_S1 0x4208
+
+/* T Texture Coordinate of Vertex 2 for Point texture stuffing (URC) */
+#define R300_GA_POINT_T1 0x420c
+
+/* Specifies amount to shift integer position of vertex (screen space) before
+ * converting to float for triangle stipple.
+ */
+#define R300_GA_TRIANGLE_STIPPLE 0x4214
+# define R300_GA_TRIANGLE_STIPPLE_X_SHIFT_SHIFT 0
+# define R300_GA_TRIANGLE_STIPPLE_X_SHIFT_MASK 0x0000000f
+# define R300_GA_TRIANGLE_STIPPLE_Y_SHIFT_SHIFT 16
+# define R300_GA_TRIANGLE_STIPPLE_Y_SHIFT_MASK 0x000f0000
+
+/* The pointsize is given in multiples of 6. The pointsize can be enormous:
+ * Clear() renders a single point that fills the entire framebuffer.
+ * 1/2 Height of point; fixed (16.0), subpixel format (1/12 or 1/16, even if in
+ * 8b precision).
+ */
+#define R300_GA_POINT_SIZE 0x421C
+# define R300_POINTSIZE_Y_SHIFT 0
+# define R300_POINTSIZE_Y_MASK 0x0000ffff
+# define R300_POINTSIZE_X_SHIFT 16
+# define R300_POINTSIZE_X_MASK 0xffff0000
+# define R300_POINTSIZE_MAX (R300_POINTSIZE_Y_MASK / 6)
+
+/* Red fill color */
+#define R500_GA_FILL_R 0x4220
+
+/* Green fill color */
+#define R500_GA_FILL_G 0x4224
+
+/* Blue fill color */
+#define R500_GA_FILL_B 0x4228
+
+/* Alpha fill color */
+#define R500_GA_FILL_A 0x422c
+
+
+/* Specifies maximum and minimum point & sprite sizes for per vertex size
+ * specification. The lower part (15:0) is MIN and (31:16) is MAX.
+ */
+#define R300_GA_POINT_MINMAX 0x4230
+# define R300_GA_POINT_MINMAX_MIN_SHIFT 0
+# define R300_GA_POINT_MINMAX_MIN_MASK (0xFFFF << 0)
+# define R300_GA_POINT_MINMAX_MAX_SHIFT 16
+# define R300_GA_POINT_MINMAX_MAX_MASK (0xFFFF << 16)
+
+/* 1/2 width of line, in subpixels (1/12 or 1/16 only, even in 8b
+ * subprecision); (16.0) fixed format.
+ *
+ * The line width is given in multiples of 6.
+ * In default mode lines are classified as vertical lines.
+ * HO: horizontal + * VE: vertical or horizontal + * HO & VE: no classification + */ +#define R300_GA_LINE_CNTL 0x4234 +# define R300_GA_LINE_CNTL_WIDTH_SHIFT 0 +# define R300_GA_LINE_CNTL_WIDTH_MASK 0x0000ffff +# define R300_GA_LINE_CNTL_END_TYPE_HOR (0 << 16) +# define R300_GA_LINE_CNTL_END_TYPE_VER (1 << 16) +# define R300_GA_LINE_CNTL_END_TYPE_SQR (2 << 16) /* horizontal or vertical depending upon slope */ +# define R300_GA_LINE_CNTL_END_TYPE_COMP (3 << 16) /* Computed (perpendicular to slope) */ +# define R500_GA_LINE_CNTL_SORT_NO (0 << 18) +# define R500_GA_LINE_CNTL_SORT_MINX_MINY (1 << 18) +/** TODO: looks wrong */ +# define R300_LINESIZE_MAX (R300_GA_LINE_CNTL_WIDTH_MASK / 6) +/** TODO: looks wrong */ +# define R300_LINE_CNT_HO (1 << 16) +/** TODO: looks wrong */ +# define R300_LINE_CNT_VE (1 << 17) + +/* Line Stipple configuration information. */ +#define R300_GA_LINE_STIPPLE_CONFIG 0x4238 +# define R300_GA_LINE_STIPPLE_CONFIG_LINE_RESET_NO (0 << 0) +# define R300_GA_LINE_STIPPLE_CONFIG_LINE_RESET_LINE (1 << 0) +# define R300_GA_LINE_STIPPLE_CONFIG_LINE_RESET_PACKET (2 << 0) +# define R300_GA_LINE_STIPPLE_CONFIG_STIPPLE_SCALE_SHIFT 2 +# define R300_GA_LINE_STIPPLE_CONFIG_STIPPLE_SCALE_MASK 0xfffffffc + +/* Used to load US instructions and constants */ +#define R500_GA_US_VECTOR_INDEX 0x4250 +# define R500_GA_US_VECTOR_INDEX_SHIFT 0 +# define R500_GA_US_VECTOR_INDEX_MASK 0x000000ff +# define R500_GA_US_VECTOR_INDEX_TYPE_INSTR (0 << 16) +# define R500_GA_US_VECTOR_INDEX_TYPE_CONST (1 << 16) +# define R500_GA_US_VECTOR_INDEX_CLAMP_NO (0 << 17) +# define R500_GA_US_VECTOR_INDEX_CLAMP_CONST (1 << 17) + +/* Data register for loading US instructions and constants */ +#define R500_GA_US_VECTOR_DATA 0x4254 + +/* Specifies color properties and mappings of textures. 
*/ +#define R500_GA_COLOR_CONTROL_PS3 0x4258 +# define R500_TEX0_SHADING_PS3_SOLID (0 << 0) +# define R500_TEX0_SHADING_PS3_FLAT (1 << 0) +# define R500_TEX0_SHADING_PS3_GOURAUD (2 << 0) +# define R500_TEX1_SHADING_PS3_SOLID (0 << 2) +# define R500_TEX1_SHADING_PS3_FLAT (1 << 2) +# define R500_TEX1_SHADING_PS3_GOURAUD (2 << 2) +# define R500_TEX2_SHADING_PS3_SOLID (0 << 4) +# define R500_TEX2_SHADING_PS3_FLAT (1 << 4) +# define R500_TEX2_SHADING_PS3_GOURAUD (2 << 4) +# define R500_TEX3_SHADING_PS3_SOLID (0 << 6) +# define R500_TEX3_SHADING_PS3_FLAT (1 << 6) +# define R500_TEX3_SHADING_PS3_GOURAUD (2 << 6) +# define R500_TEX4_SHADING_PS3_SOLID (0 << 8) +# define R500_TEX4_SHADING_PS3_FLAT (1 << 8) +# define R500_TEX4_SHADING_PS3_GOURAUD (2 << 8) +# define R500_TEX5_SHADING_PS3_SOLID (0 << 10) +# define R500_TEX5_SHADING_PS3_FLAT (1 << 10) +# define R500_TEX5_SHADING_PS3_GOURAUD (2 << 10) +# define R500_TEX6_SHADING_PS3_SOLID (0 << 12) +# define R500_TEX6_SHADING_PS3_FLAT (1 << 12) +# define R500_TEX6_SHADING_PS3_GOURAUD (2 << 12) +# define R500_TEX7_SHADING_PS3_SOLID (0 << 14) +# define R500_TEX7_SHADING_PS3_FLAT (1 << 14) +# define R500_TEX7_SHADING_PS3_GOURAUD (2 << 14) +# define R500_TEX8_SHADING_PS3_SOLID (0 << 16) +# define R500_TEX8_SHADING_PS3_FLAT (1 << 16) +# define R500_TEX8_SHADING_PS3_GOURAUD (2 << 16) +# define R500_TEX9_SHADING_PS3_SOLID (0 << 18) +# define R500_TEX9_SHADING_PS3_FLAT (1 << 18) +# define R500_TEX9_SHADING_PS3_GOURAUD (2 << 18) +# define R500_TEX10_SHADING_PS3_SOLID (0 << 20) +# define R500_TEX10_SHADING_PS3_FLAT (1 << 20) +# define R500_TEX10_SHADING_PS3_GOURAUD (2 << 20) +# define R500_COLOR0_TEX_OVERRIDE_NO (0 << 22) +# define R500_COLOR0_TEX_OVERRIDE_TEX_0 (1 << 22) +# define R500_COLOR0_TEX_OVERRIDE_TEX_1 (2 << 22) +# define R500_COLOR0_TEX_OVERRIDE_TEX_2 (3 << 22) +# define R500_COLOR0_TEX_OVERRIDE_TEX_3 (4 << 22) +# define R500_COLOR0_TEX_OVERRIDE_TEX_4 (5 << 22) +# define R500_COLOR0_TEX_OVERRIDE_TEX_5 (6 << 22) +# define R500_COLOR0_TEX_OVERRIDE_TEX_6 (7 << 22) +# define R500_COLOR0_TEX_OVERRIDE_TEX_7 (8 << 22) +# define R500_COLOR0_TEX_OVERRIDE_TEX_8_C2 (9 << 22) +# define R500_COLOR0_TEX_OVERRIDE_TEX_9_C3 (10 << 22) +# define R500_COLOR1_TEX_OVERRIDE_NO (0 << 26) +# define R500_COLOR1_TEX_OVERRIDE_TEX_0 (1 << 26) +# define R500_COLOR1_TEX_OVERRIDE_TEX_1 (2 << 26) +# define R500_COLOR1_TEX_OVERRIDE_TEX_2 (3 << 26) +# define R500_COLOR1_TEX_OVERRIDE_TEX_3 (4 << 26) +# define R500_COLOR1_TEX_OVERRIDE_TEX_4 (5 << 26) +# define R500_COLOR1_TEX_OVERRIDE_TEX_5 (6 << 26) +# define R500_COLOR1_TEX_OVERRIDE_TEX_6 (7 << 26) +# define R500_COLOR1_TEX_OVERRIDE_TEX_7 (8 << 26) +# define R500_COLOR1_TEX_OVERRIDE_TEX_8_C2 (9 << 26) +# define R500_COLOR1_TEX_OVERRIDE_TEX_9_C3 (10 << 26) + +/* Returns idle status of various G3D block, captured when GA_IDLE written or + * when hard or soft reset asserted. 
+ */ +#define R500_GA_IDLE 0x425c +# define R500_GA_IDLE_PIPE3_Z_IDLE (0 << 0) +# define R500_GA_IDLE_PIPE2_Z_IDLE (0 << 1) +# define R500_GA_IDLE_PIPE3_CD_IDLE (0 << 2) +# define R500_GA_IDLE_PIPE2_CD_IDLE (0 << 3) +# define R500_GA_IDLE_PIPE3_FG_IDLE (0 << 4) +# define R500_GA_IDLE_PIPE2_FG_IDLE (0 << 5) +# define R500_GA_IDLE_PIPE3_US_IDLE (0 << 6) +# define R500_GA_IDLE_PIPE2_US_IDLE (0 << 7) +# define R500_GA_IDLE_PIPE3_SC_IDLE (0 << 8) +# define R500_GA_IDLE_PIPE2_SC_IDLE (0 << 9) +# define R500_GA_IDLE_PIPE3_RS_IDLE (0 << 10) +# define R500_GA_IDLE_PIPE2_RS_IDLE (0 << 11) +# define R500_GA_IDLE_PIPE1_Z_IDLE (0 << 12) +# define R500_GA_IDLE_PIPE0_Z_IDLE (0 << 13) +# define R500_GA_IDLE_PIPE1_CD_IDLE (0 << 14) +# define R500_GA_IDLE_PIPE0_CD_IDLE (0 << 15) +# define R500_GA_IDLE_PIPE1_FG_IDLE (0 << 16) +# define R500_GA_IDLE_PIPE0_FG_IDLE (0 << 17) +# define R500_GA_IDLE_PIPE1_US_IDLE (0 << 18) +# define R500_GA_IDLE_PIPE0_US_IDLE (0 << 19) +# define R500_GA_IDLE_PIPE1_SC_IDLE (0 << 20) +# define R500_GA_IDLE_PIPE0_SC_IDLE (0 << 21) +# define R500_GA_IDLE_PIPE1_RS_IDLE (0 << 22) +# define R500_GA_IDLE_PIPE0_RS_IDLE (0 << 23) +# define R500_GA_IDLE_SU_IDLE (0 << 24) +# define R500_GA_IDLE_GA_IDLE (0 << 25) +# define R500_GA_IDLE_GA_UNIT2_IDLE (0 << 26) + +/* Current value of stipple accumulator. */ +#define R300_GA_LINE_STIPPLE_VALUE 0x4260 + +/* S Texture Coordinate Value for Vertex 0 of Line (stuff textures -- i.e. AA) */ +#define R300_GA_LINE_S0 0x4264 +/* S Texture Coordinate Value for Vertex 1 of Lines (V2 of parallelogram -- stuff textures -- i.e. AA) */ +#define R300_GA_LINE_S1 0x4268 + +/* GA Input fifo high water marks */ +#define R500_GA_FIFO_CNTL 0x4270 +# define R500_GA_FIFO_CNTL_VERTEX_FIFO_MASK 0x00000007 +# define R500_GA_FIFO_CNTL_VERTEX_FIFO_SHIFT 0 +# define R500_GA_FIFO_CNTL_VERTEX_INDEX_MASK 0x00000038 +# define R500_GA_FIFO_CNTL_VERTEX_INDEX_SHIFT 3 +# define R500_GA_FIFO_CNTL_VERTEX_REG_MASK 0x00003fc0 +# define R500_GA_FIFO_CNTL_VERTEX_REG_SHIFT 6 + +/* GA enhance/tweaks */ +#define R300_GA_ENHANCE 0x4274 +# define R300_GA_ENHANCE_DEADLOCK_CNTL_NO_EFFECT (0 << 0) +# define R300_GA_ENHANCE_DEADLOCK_CNTL_PREVENT_TCL (1 << 0) /* Prevents TCL interface from deadlocking on GA side. */ +# define R300_GA_ENHANCE_FASTSYNC_CNTL_NO_EFFECT (0 << 1) +# define R300_GA_ENHANCE_FASTSYNC_CNTL_ENABLE (1 << 1) /* Enables high-performance register/primitive switching. */ +# define R500_GA_ENHANCE_REG_READWRITE_NO_EFFECT (0 << 2) /* R520+ only */ +# define R500_GA_ENHANCE_REG_READWRITE_ENABLE (1 << 2) /* R520+ only, Enables GA support of simultaneous register reads and writes. */ +# define R500_GA_ENHANCE_REG_NOSTALL_NO_EFFECT (0 << 3) +# define R500_GA_ENHANCE_REG_NOSTALL_ENABLE (1 << 3) /* Enables GA support of no-stall reads for register read back. 
*/ + +#define R300_GA_COLOR_CONTROL 0x4278 +# define R300_GA_COLOR_CONTROL_RGB0_SHADING_SOLID (0 << 0) +# define R300_GA_COLOR_CONTROL_RGB0_SHADING_FLAT (1 << 0) +# define R300_GA_COLOR_CONTROL_RGB0_SHADING_GOURAUD (2 << 0) +# define R300_GA_COLOR_CONTROL_ALPHA0_SHADING_SOLID (0 << 2) +# define R300_GA_COLOR_CONTROL_ALPHA0_SHADING_FLAT (1 << 2) +# define R300_GA_COLOR_CONTROL_ALPHA0_SHADING_GOURAUD (2 << 2) +# define R300_GA_COLOR_CONTROL_RGB1_SHADING_SOLID (0 << 4) +# define R300_GA_COLOR_CONTROL_RGB1_SHADING_FLAT (1 << 4) +# define R300_GA_COLOR_CONTROL_RGB1_SHADING_GOURAUD (2 << 4) +# define R300_GA_COLOR_CONTROL_ALPHA1_SHADING_SOLID (0 << 6) +# define R300_GA_COLOR_CONTROL_ALPHA1_SHADING_FLAT (1 << 6) +# define R300_GA_COLOR_CONTROL_ALPHA1_SHADING_GOURAUD (2 << 6) +# define R300_GA_COLOR_CONTROL_RGB2_SHADING_SOLID (0 << 8) +# define R300_GA_COLOR_CONTROL_RGB2_SHADING_FLAT (1 << 8) +# define R300_GA_COLOR_CONTROL_RGB2_SHADING_GOURAUD (2 << 8) +# define R300_GA_COLOR_CONTROL_ALPHA2_SHADING_SOLID (0 << 10) +# define R300_GA_COLOR_CONTROL_ALPHA2_SHADING_FLAT (1 << 10) +# define R300_GA_COLOR_CONTROL_ALPHA2_SHADING_GOURAUD (2 << 10) +# define R300_GA_COLOR_CONTROL_RGB3_SHADING_SOLID (0 << 12) +# define R300_GA_COLOR_CONTROL_RGB3_SHADING_FLAT (1 << 12) +# define R300_GA_COLOR_CONTROL_RGB3_SHADING_GOURAUD (2 << 12) +# define R300_GA_COLOR_CONTROL_ALPHA3_SHADING_SOLID (0 << 14) +# define R300_GA_COLOR_CONTROL_ALPHA3_SHADING_FLAT (1 << 14) +# define R300_GA_COLOR_CONTROL_ALPHA3_SHADING_GOURAUD (2 << 14) +# define R300_GA_COLOR_CONTROL_PROVOKING_VERTEX_FIRST (0 << 16) +# define R300_GA_COLOR_CONTROL_PROVOKING_VERTEX_SECOND (1 << 16) +# define R300_GA_COLOR_CONTROL_PROVOKING_VERTEX_THIRD (2 << 16) +# define R300_GA_COLOR_CONTROL_PROVOKING_VERTEX_LAST (3 << 16) + +/** TODO: might be candidate for removal */ +# define R300_RE_SHADE_MODEL_SMOOTH ( \ + R300_GA_COLOR_CONTROL_RGB0_SHADING_GOURAUD | R300_GA_COLOR_CONTROL_ALPHA0_SHADING_GOURAUD | \ + R300_GA_COLOR_CONTROL_RGB1_SHADING_GOURAUD | R300_GA_COLOR_CONTROL_ALPHA1_SHADING_GOURAUD | \ + R300_GA_COLOR_CONTROL_RGB2_SHADING_GOURAUD | R300_GA_COLOR_CONTROL_ALPHA2_SHADING_GOURAUD | \ + R300_GA_COLOR_CONTROL_RGB3_SHADING_GOURAUD | R300_GA_COLOR_CONTROL_ALPHA3_SHADING_GOURAUD | \ + R300_GA_COLOR_CONTROL_PROVOKING_VERTEX_LAST ) +/** TODO: might be candidate for removal, the GOURAUD stuff also looks buggy to me */ +# define R300_RE_SHADE_MODEL_FLAT ( \ + R300_GA_COLOR_CONTROL_RGB0_SHADING_FLAT | R300_GA_COLOR_CONTROL_ALPHA0_SHADING_FLAT | \ + R300_GA_COLOR_CONTROL_RGB1_SHADING_FLAT | R300_GA_COLOR_CONTROL_ALPHA1_SHADING_GOURAUD | \ + R300_GA_COLOR_CONTROL_RGB2_SHADING_FLAT | R300_GA_COLOR_CONTROL_ALPHA2_SHADING_FLAT | \ + R300_GA_COLOR_CONTROL_RGB3_SHADING_FLAT | R300_GA_COLOR_CONTROL_ALPHA3_SHADING_GOURAUD | \ + R300_GA_COLOR_CONTROL_PROVOKING_VERTEX_LAST ) + +/* Specifies red & green components of fill color -- S312 format -- Backwards comp. */ +#define R300_GA_SOLID_RG 0x427c +# define GA_SOLID_RG_COLOR_GREEN_SHIFT 0 +# define GA_SOLID_RG_COLOR_GREEN_MASK 0x0000ffff +# define GA_SOLID_RG_COLOR_RED_SHIFT 16 +# define GA_SOLID_RG_COLOR_RED_MASK 0xffff0000 +/* Specifies blue & alpha components of fill color -- S312 format -- Backwards comp. 
*/
+#define R300_GA_SOLID_BA 0x4280
+# define GA_SOLID_BA_COLOR_ALPHA_SHIFT 0
+# define GA_SOLID_BA_COLOR_ALPHA_MASK 0x0000ffff
+# define GA_SOLID_BA_COLOR_BLUE_SHIFT 16
+# define GA_SOLID_BA_COLOR_BLUE_MASK 0xffff0000
+
+/* Polygon Mode
+ * Dangerous
+ */
+#define R300_GA_POLY_MODE 0x4288
+# define R300_GA_POLY_MODE_DISABLE (0 << 0)
+# define R300_GA_POLY_MODE_DUAL (1 << 0) /* send 2 sets of 3 polys with specified poly type */
+/* reserved */
+# define R300_GA_POLY_MODE_FRONT_PTYPE_POINT (0 << 4)
+# define R300_GA_POLY_MODE_FRONT_PTYPE_LINE (1 << 4)
+# define R300_GA_POLY_MODE_FRONT_PTYPE_TRI (2 << 4)
+/* reserved */
+# define R300_GA_POLY_MODE_BACK_PTYPE_POINT (0 << 7)
+# define R300_GA_POLY_MODE_BACK_PTYPE_LINE (1 << 7)
+# define R300_GA_POLY_MODE_BACK_PTYPE_TRI (2 << 7)
+/* reserved */
+
+/* Specifies the rounding mode for geometry & color SPFP to FP conversions. */
+#define R300_GA_ROUND_MODE 0x428c
+# define R300_GA_ROUND_MODE_GEOMETRY_ROUND_TRUNC (0 << 0)
+# define R300_GA_ROUND_MODE_GEOMETRY_ROUND_NEAREST (1 << 0)
+# define R300_GA_ROUND_MODE_COLOR_ROUND_TRUNC (0 << 2)
+# define R300_GA_ROUND_MODE_COLOR_ROUND_NEAREST (1 << 2)
+# define R300_GA_ROUND_MODE_RGB_CLAMP_RGB (0 << 4)
+# define R300_GA_ROUND_MODE_RGB_CLAMP_FP20 (1 << 4)
+# define R300_GA_ROUND_MODE_ALPHA_CLAMP_RGB (0 << 5)
+# define R300_GA_ROUND_MODE_ALPHA_CLAMP_FP20 (1 << 5)
+# define R500_GA_ROUND_MODE_GEOMETRY_MASK_SHIFT 6
+# define R500_GA_ROUND_MODE_GEOMETRY_MASK_MASK 0x000003c0
+
+/* Specifies x & y offsets for vertex data after conversion to FP.
+ * Offsets are in S15 format (subpixels -- 1/12 or 1/16, even in 8b
+ * subprecision).
+ */
+#define R300_GA_OFFSET 0x4290
+# define R300_GA_OFFSET_X_OFFSET_SHIFT 0
+# define R300_GA_OFFSET_X_OFFSET_MASK 0x0000ffff
+# define R300_GA_OFFSET_Y_OFFSET_SHIFT 16
+# define R300_GA_OFFSET_Y_OFFSET_MASK 0xffff0000
+
+/* Specifies the scale to apply to fog. */
+#define R300_GA_FOG_SCALE 0x4294
+/* Specifies the offset to apply to fog. */
+#define R300_GA_FOG_OFFSET 0x4298
+/* Specifies number of cycles to assert reset, and also causes RB3D soft reset to assert. */
+#define R300_GA_SOFT_RESET 0x429c
+
+/* Not sure why there are duplicates of the factor and constant values.
+ * My best guess so far is that there are separate zbiases for test and write.
+ * Ordering might be wrong.
+ * Some of the tests indicate that fgl has a fallback implementation of zbias
+ * via pixel shaders.
+ */
+#define R300_SU_TEX_WRAP 0x42A0
+#define R300_SU_POLY_OFFSET_FRONT_SCALE 0x42A4
+#define R300_SU_POLY_OFFSET_FRONT_OFFSET 0x42A8
+#define R300_SU_POLY_OFFSET_BACK_SCALE 0x42AC
+#define R300_SU_POLY_OFFSET_BACK_OFFSET 0x42B0
+
+/* This register needs to be set to (1<<1) for RV350 to correctly
+ * perform depth test (see --vb-triangles in r300_demo)
+ * Don't know about other chips. - Vladimir
+ * This is set to 3 when GL_POLYGON_OFFSET_FILL is on.
+ * My guess is that there are two bits for each zbias primitive
+ * (FILL, LINE, POINT).
+ * One to enable depth test and one for depth write.
+ * Yet this doesn't explain why depth writes work ...
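+ *
+ * For example, polygon offset for filled triangles on both faces would
+ * presumably be R300_FRONT_ENABLE | R300_BACK_ENABLE, i.e. the value 3
+ * mentioned above.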
+ */ +#define R300_SU_POLY_OFFSET_ENABLE 0x42B4 +# define R300_FRONT_ENABLE (1 << 0) +# define R300_BACK_ENABLE (1 << 1) +# define R300_PARA_ENABLE (1 << 2) + +#define R300_SU_CULL_MODE 0x42B8 +# define R300_CULL_FRONT (1 << 0) +# define R300_CULL_BACK (1 << 1) +# define R300_FRONT_FACE_CCW (0 << 2) +# define R300_FRONT_FACE_CW (1 << 2) + +/* SU Depth Scale value */ +#define R300_SU_DEPTH_SCALE 0x42c0 +/* SU Depth Offset value */ +#define R300_SU_DEPTH_OFFSET 0x42c4 + + +/* BEGIN: Rasterization / Interpolators - many guesses */ + +/* + * TC_CNT is the number of incoming texture coordinate sets (i.e. it depends + * on the vertex program, *not* the fragment program) + */ +#define R300_RS_COUNT 0x4300 +# define R300_IT_COUNT_SHIFT 0 +# define R300_IT_COUNT_MASK 0x0000007f +# define R300_IC_COUNT_SHIFT 7 +# define R300_IC_COUNT_MASK 0x00000780 +# define R300_W_ADDR_SHIFT 12 +# define R300_W_ADDR_MASK 0x0003f000 +# define R300_HIRES_DIS (0 << 18) +# define R300_HIRES_EN (1 << 18) + +#define R300_RS_INST_COUNT 0x4304 +# define R300_RS_INST_COUNT_SHIFT 0 +# define R300_RS_INST_COUNT_MASK 0x0000000f +# define R300_RS_TX_OFFSET_SHIFT 5 +# define R300_RS_TX_OFFSET_MASK 0x000000e0 + +/* gap */ + +/* Only used for texture coordinates. + * Use the source field to route texture coordinate input from the + * vertex program to the desired interpolator. Note that the source + * field is relative to the outputs the vertex program *actually* + * writes. If a vertex program only writes texcoord[1], this will + * be source index 0. + * Set INTERP_USED on all interpolators that produce data used by + * the fragment program. INTERP_USED looks like a swizzling mask, + * but I haven't seen it used that way. + * + * Note: The _UNKNOWN constants are always set in their respective + * register. I don't know if this is necessary. 
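+ *
+ * As a sketch (hedged, since several of these fields are still guesses),
+ * routing the first texcoord set written by the vertex program to
+ * interpolator 0 with an identity S/T/R/Q component order could use the
+ * macros defined below, with the result written to R300_RS_IP_0:
+ *
+ *   uint32_t ip0 = R300_RS_SEL_S(R300_RS_SEL_C0) |
+ *                  R300_RS_SEL_T(R300_RS_SEL_C1) |
+ *                  R300_RS_SEL_R(R300_RS_SEL_C2) |
+ *                  R300_RS_SEL_Q(R300_RS_SEL_C3);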
+ */ +#define R300_RS_IP_0 0x4310 +#define R300_RS_IP_1 0x4314 +#define R300_RS_IP_2 0x4318 +#define R300_RS_IP_3 0x431C +# define R300_RS_INTERP_SRC_SHIFT 2 /* TODO: check for removal */ +# define R300_RS_INTERP_SRC_MASK (7 << 2) /* TODO: check for removal */ +# define R300_RS_TEX_PTR(x) (x << 0) +# define R300_RS_COL_PTR(x) ((x) << 6) +# define R300_RS_COL_FMT(x) ((x) << 9) +# define R300_RS_COL_FMT_RGBA 0 +# define R300_RS_COL_FMT_RGB0 1 +# define R300_RS_COL_FMT_RGB1 2 +# define R300_RS_COL_FMT_000A 4 +# define R300_RS_COL_FMT_0000 5 +# define R300_RS_COL_FMT_0001 6 +# define R300_RS_COL_FMT_111A 8 +# define R300_RS_COL_FMT_1110 9 +# define R300_RS_COL_FMT_1111 10 +# define R300_RS_SEL_S(x) ((x) << 13) +# define R300_RS_SEL_T(x) ((x) << 16) +# define R300_RS_SEL_R(x) ((x) << 19) +# define R300_RS_SEL_Q(x) ((x) << 22) +# define R300_RS_SEL_C0 0 +# define R300_RS_SEL_C1 1 +# define R300_RS_SEL_C2 2 +# define R300_RS_SEL_C3 3 +# define R300_RS_SEL_K0 4 +# define R300_RS_SEL_K1 5 + + +/* */ +#define R500_RS_INST_0 0x4320 +#define R500_RS_INST_1 0x4324 +#define R500_RS_INST_2 0x4328 +#define R500_RS_INST_3 0x432c +#define R500_RS_INST_4 0x4330 +#define R500_RS_INST_5 0x4334 +#define R500_RS_INST_6 0x4338 +#define R500_RS_INST_7 0x433c +#define R500_RS_INST_8 0x4340 +#define R500_RS_INST_9 0x4344 +#define R500_RS_INST_10 0x4348 +#define R500_RS_INST_11 0x434c +#define R500_RS_INST_12 0x4350 +#define R500_RS_INST_13 0x4354 +#define R500_RS_INST_14 0x4358 +#define R500_RS_INST_15 0x435c +#define R500_RS_INST_TEX_ID_SHIFT 0 +#define R500_RS_INST_TEX_CN_WRITE (1 << 4) +#define R500_RS_INST_TEX_ADDR_SHIFT 5 +#define R500_RS_INST_COL_ID_SHIFT 12 +#define R500_RS_INST_COL_CN_NO_WRITE (0 << 16) +#define R500_RS_INST_COL_CN_WRITE (1 << 16) +#define R500_RS_INST_COL_CN_WRITE_FBUFFER (2 << 16) +#define R500_RS_INST_COL_CN_WRITE_BACKFACE (3 << 16) +#define R500_RS_INST_COL_ADDR_SHIFT 18 +#define R500_RS_INST_TEX_ADJ (1 << 25) +#define R500_RS_INST_W_CN (1 << 26) + +/* These DWORDs control how vertex data is routed into fragment program + * registers, after interpolators. 
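+ *
+ * For example (a sketch using the R300 variants defined below), an RS_INST
+ * dword that feeds texcoord 0 into fragment program input address 0 and
+ * color 0 into input address 1 could be assembled as:
+ *
+ *   uint32_t inst = R300_RS_INST_TEX_ID(0) | R300_RS_INST_TEX_CN_WRITE |
+ *                   (0 << R300_RS_INST_TEX_ADDR_SHIFT) |
+ *                   R300_RS_INST_COL_ID(0) | R300_RS_INST_COL_CN_WRITE |
+ *                   (1 << R300_RS_INST_COL_ADDR_SHIFT);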
+ */
+#define R300_RS_INST_0 0x4330
+#define R300_RS_INST_1 0x4334
+#define R300_RS_INST_2 0x4338
+#define R300_RS_INST_3 0x433C
+#define R300_RS_INST_4 0x4340
+#define R300_RS_INST_5 0x4344
+#define R300_RS_INST_6 0x4348
+#define R300_RS_INST_7 0x434C
+# define R300_RS_INST_TEX_ID(x) ((x) << 0)
+# define R300_RS_INST_TEX_CN_WRITE (1 << 3)
+# define R300_RS_INST_TEX_ADDR_SHIFT 6
+# define R300_RS_INST_COL_ID(x) ((x) << 11)
+# define R300_RS_INST_COL_CN_WRITE (1 << 14)
+# define R300_RS_INST_COL_ADDR_SHIFT 17
+# define R300_RS_INST_TEX_ADJ (1 << 22)
+# define R300_RS_COL_BIAS_UNUSED_SHIFT 23
+
+/* END: Rasterization / Interpolators - many guesses */
+
+/* Hierarchical Z Enable */
+#define R300_SC_HYPERZ 0x43a4
+# define R300_SC_HYPERZ_DISABLE (0 << 0)
+# define R300_SC_HYPERZ_ENABLE (1 << 0)
+# define R300_SC_HYPERZ_MIN (0 << 1)
+# define R300_SC_HYPERZ_MAX (1 << 1)
+# define R300_SC_HYPERZ_ADJ_256 (0 << 2)
+# define R300_SC_HYPERZ_ADJ_128 (1 << 2)
+# define R300_SC_HYPERZ_ADJ_64 (2 << 2)
+# define R300_SC_HYPERZ_ADJ_32 (3 << 2)
+# define R300_SC_HYPERZ_ADJ_16 (4 << 2)
+# define R300_SC_HYPERZ_ADJ_8 (5 << 2)
+# define R300_SC_HYPERZ_ADJ_4 (6 << 2)
+# define R300_SC_HYPERZ_ADJ_2 (7 << 2)
+# define R300_SC_HYPERZ_HZ_Z0MIN_NO (0 << 5)
+# define R300_SC_HYPERZ_HZ_Z0MIN (1 << 5)
+# define R300_SC_HYPERZ_HZ_Z0MAX_NO (0 << 6)
+# define R300_SC_HYPERZ_HZ_Z0MAX (1 << 6)
+
+#define R300_SC_EDGERULE 0x43a8
+
+/* BEGIN: Scissors and cliprects */
+
+/* There are four clipping rectangles. Their corner coordinates are inclusive.
+ * Every pixel is assigned a number from 0 to 15 by setting bits 0-3 depending
+ * on whether the pixel is inside cliprects 0-3, respectively. For example,
+ * if a pixel is inside cliprects 0 and 1, but outside 2 and 3, it is assigned
+ * the number 3 (binary 0011).
+ * Iff the bit corresponding to the pixel's number in RE_CLIPRECT_CNTL is set,
+ * the pixel is rasterized.
+ *
+ * In addition to this, there is a scissors rectangle. Only pixels inside the
+ * scissors rectangle are drawn. (coordinates are inclusive)
+ *
+ * For some reason, the top-left corner of the framebuffer is at (1440, 1440)
+ * for the purpose of clipping and scissors.
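+ *
+ * For example, a framebuffer-space scissor box with inclusive corners
+ * (x1,y1)-(x2,y2) would be encoded, after adding that offset, roughly as
+ * (a sketch; the values go into R300_SC_SCISSORS_TL and _BR below):
+ *
+ *   uint32_t tl = ((x1 + R300_SCISSORS_OFFSET) << R300_SCISSORS_X_SHIFT) |
+ *                 ((y1 + R300_SCISSORS_OFFSET) << R300_SCISSORS_Y_SHIFT);
+ *   uint32_t br = ((x2 + R300_SCISSORS_OFFSET) << R300_SCISSORS_X_SHIFT) |
+ *                 ((y2 + R300_SCISSORS_OFFSET) << R300_SCISSORS_Y_SHIFT);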
+ */ +#define R300_SC_CLIPRECT_TL_0 0x43B0 +#define R300_SC_CLIPRECT_BR_0 0x43B4 +#define R300_SC_CLIPRECT_TL_1 0x43B8 +#define R300_SC_CLIPRECT_BR_1 0x43BC +#define R300_SC_CLIPRECT_TL_2 0x43C0 +#define R300_SC_CLIPRECT_BR_2 0x43C4 +#define R300_SC_CLIPRECT_TL_3 0x43C8 +#define R300_SC_CLIPRECT_BR_3 0x43CC +# define R300_CLIPRECT_OFFSET 1440 +# define R300_CLIPRECT_MASK 0x1FFF +# define R300_CLIPRECT_X_SHIFT 0 +# define R300_CLIPRECT_X_MASK (0x1FFF << 0) +# define R300_CLIPRECT_Y_SHIFT 13 +# define R300_CLIPRECT_Y_MASK (0x1FFF << 13) +#define R300_SC_CLIP_RULE 0x43D0 +# define R300_CLIP_OUT (1 << 0) +# define R300_CLIP_0 (1 << 1) +# define R300_CLIP_1 (1 << 2) +# define R300_CLIP_10 (1 << 3) +# define R300_CLIP_2 (1 << 4) +# define R300_CLIP_20 (1 << 5) +# define R300_CLIP_21 (1 << 6) +# define R300_CLIP_210 (1 << 7) +# define R300_CLIP_3 (1 << 8) +# define R300_CLIP_30 (1 << 9) +# define R300_CLIP_31 (1 << 10) +# define R300_CLIP_310 (1 << 11) +# define R300_CLIP_32 (1 << 12) +# define R300_CLIP_320 (1 << 13) +# define R300_CLIP_321 (1 << 14) +# define R300_CLIP_3210 (1 << 15) + +/* gap */ + +#define R300_SC_SCISSORS_TL 0x43E0 +#define R300_SC_SCISSORS_BR 0x43E4 +# define R300_SCISSORS_OFFSET 1440 +# define R300_SCISSORS_X_SHIFT 0 +# define R300_SCISSORS_X_MASK (0x1FFF << 0) +# define R300_SCISSORS_Y_SHIFT 13 +# define R300_SCISSORS_Y_MASK (0x1FFF << 13) + +/* Screen door sample mask */ +#define R300_SC_SCREENDOOR 0x43e8 + +/* END: Scissors and cliprects */ + +/* BEGIN: Texture specification */ + +/* + * The texture specification dwords are grouped by meaning and not by texture + * unit. This means that e.g. the offset for texture image unit N is found in + * register TX_OFFSET_0 + (4*N) + */ +#define R300_TX_FILTER0_0 0x4400 +#define R300_TX_FILTER0_1 0x4404 +#define R300_TX_FILTER0_2 0x4408 +#define R300_TX_FILTER0_3 0x440c +#define R300_TX_FILTER0_4 0x4410 +#define R300_TX_FILTER0_5 0x4414 +#define R300_TX_FILTER0_6 0x4418 +#define R300_TX_FILTER0_7 0x441c +#define R300_TX_FILTER0_8 0x4420 +#define R300_TX_FILTER0_9 0x4424 +#define R300_TX_FILTER0_10 0x4428 +#define R300_TX_FILTER0_11 0x442c +#define R300_TX_FILTER0_12 0x4430 +#define R300_TX_FILTER0_13 0x4434 +#define R300_TX_FILTER0_14 0x4438 +#define R300_TX_FILTER0_15 0x443c +# define R300_TX_REPEAT 0 +# define R300_TX_MIRRORED 1 +# define R300_TX_CLAMP_TO_EDGE 2 +# define R300_TX_MIRROR_ONCE_TO_EDGE 3 +# define R300_TX_CLAMP 4 +# define R300_TX_MIRROR_ONCE 5 +# define R300_TX_CLAMP_TO_BORDER 6 +# define R300_TX_MIRROR_ONCE_TO_BORDER 7 +# define R300_TX_WRAP_S_SHIFT 0 +# define R300_TX_WRAP_S_MASK (7 << 0) +# define R300_TX_WRAP_T_SHIFT 3 +# define R300_TX_WRAP_T_MASK (7 << 3) +# define R300_TX_WRAP_R_SHIFT 6 +# define R300_TX_WRAP_R_MASK (7 << 6) +# define R300_TX_MAG_FILTER_4 (0 << 9) +# define R300_TX_MAG_FILTER_NEAREST (1 << 9) +# define R300_TX_MAG_FILTER_LINEAR (2 << 9) +# define R300_TX_MAG_FILTER_ANISO (3 << 9) +# define R300_TX_MAG_FILTER_MASK (3 << 9) +# define R300_TX_MIN_FILTER_NEAREST (1 << 11) +# define R300_TX_MIN_FILTER_LINEAR (2 << 11) +# define R300_TX_MIN_FILTER_ANISO (3 << 11) +# define R300_TX_MIN_FILTER_MASK (3 << 11) +# define R300_TX_MIN_FILTER_MIP_NONE (0 << 13) +# define R300_TX_MIN_FILTER_MIP_NEAREST (1 << 13) +# define R300_TX_MIN_FILTER_MIP_LINEAR (2 << 13) +# define R300_TX_MIN_FILTER_MIP_MASK (3 << 13) +# define R300_TX_MAX_ANISO_1_TO_1 (0 << 21) +# define R300_TX_MAX_ANISO_2_TO_1 (1 << 21) +# define R300_TX_MAX_ANISO_4_TO_1 (2 << 21) +# define R300_TX_MAX_ANISO_8_TO_1 (3 << 21) +# define 
R300_TX_MAX_ANISO_16_TO_1 (4 << 21) +# define R300_TX_MAX_ANISO_MASK (7 << 21) + +#define R300_TX_FILTER1_0 0x4440 +# define R300_CHROMA_KEY_MODE_DISABLE 0 +# define R300_CHROMA_KEY_FORCE 1 +# define R300_CHROMA_KEY_BLEND 2 +# define R300_MC_ROUND_NORMAL (0<<2) +# define R300_MC_ROUND_MPEG4 (1<<2) +# define R300_LOD_BIAS_SHIFT 3 +# define R300_LOD_BIAS_MASK 0x1ff8 +# define R300_EDGE_ANISO_EDGE_DIAG (0<<13) +# define R300_EDGE_ANISO_EDGE_ONLY (1<<13) +# define R300_MC_COORD_TRUNCATE_DISABLE (0<<14) +# define R300_MC_COORD_TRUNCATE_MPEG (1<<14) +# define R300_TX_TRI_PERF_0_8 (0<<15) +# define R300_TX_TRI_PERF_1_8 (1<<15) +# define R300_TX_TRI_PERF_1_4 (2<<15) +# define R300_TX_TRI_PERF_3_8 (3<<15) +# define R300_ANISO_THRESHOLD_MASK (7<<17) + +# define R500_MACRO_SWITCH (1<<22) +# define R500_BORDER_FIX (1<<31) + +#define R300_TX_SIZE_0 0x4480 +# define R300_TX_WIDTHMASK_SHIFT 0 +# define R300_TX_WIDTHMASK_MASK (2047 << 0) +# define R300_TX_HEIGHTMASK_SHIFT 11 +# define R300_TX_HEIGHTMASK_MASK (2047 << 11) +# define R300_TX_DEPTHMASK_SHIFT 22 +# define R300_TX_DEPTHMASK_MASK (0xf << 22) +# define R300_TX_MAX_MIP_LEVEL_SHIFT 26 +# define R300_TX_MAX_MIP_LEVEL_MASK (0xf << 26) +# define R300_TX_SIZE_PROJECTED (1<<30) +# define R300_TX_SIZE_TXPITCH_EN (1<<31) +#define R300_TX_FORMAT_0 0x44C0 + /* The interpretation of the format word by Wladimir van der Laan */ + /* The X, Y, Z and W refer to the layout of the components. + They are given meanings as R, G, B and Alpha by the swizzle + specification */ +# define R300_TX_FORMAT_X8 0x0 +# define R500_TX_FORMAT_X1 0x0 // bit set in format 2 +# define R300_TX_FORMAT_X16 0x1 +# define R500_TX_FORMAT_X1_REV 0x0 // bit set in format 2 +# define R300_TX_FORMAT_Y4X4 0x2 +# define R300_TX_FORMAT_Y8X8 0x3 +# define R300_TX_FORMAT_Y16X16 0x4 +# define R300_TX_FORMAT_Z3Y3X2 0x5 +# define R300_TX_FORMAT_Z5Y6X5 0x6 +# define R300_TX_FORMAT_Z6Y5X5 0x7 +# define R300_TX_FORMAT_Z11Y11X10 0x8 +# define R300_TX_FORMAT_Z10Y11X11 0x9 +# define R300_TX_FORMAT_W4Z4Y4X4 0xA +# define R300_TX_FORMAT_W1Z5Y5X5 0xB +# define R300_TX_FORMAT_W8Z8Y8X8 0xC +# define R300_TX_FORMAT_W2Z10Y10X10 0xD +# define R300_TX_FORMAT_W16Z16Y16X16 0xE +# define R300_TX_FORMAT_DXT1 0xF +# define R300_TX_FORMAT_DXT3 0x10 +# define R300_TX_FORMAT_DXT5 0x11 +# define R300_TX_FORMAT_D3DMFT_CxV8U8 0x12 /* no swizzle */ +# define R300_TX_FORMAT_A8R8G8B8 0x13 /* no swizzle */ +# define R300_TX_FORMAT_B8G8_B8G8 0x14 /* no swizzle */ +# define R300_TX_FORMAT_G8R8_G8B8 0x15 /* no swizzle */ + + /* These two values are wrong, but they're the only values that + * produce any even vaguely correct results. Can r300 only do 16-bit + * depth textures? + */ +# define R300_TX_FORMAT_X24_Y8 0x1e +# define R300_TX_FORMAT_X32 0x1e + + /* 0x16 - some 16 bit green format.. ?? 
*/ +# define R300_TX_FORMAT_3D (1 << 25) +# define R300_TX_FORMAT_CUBIC_MAP (2 << 25) + + /* gap */ + /* Floating point formats */ + /* Note - hardware supports both 16 and 32 bit floating point */ +# define R300_TX_FORMAT_FL_I16 0x18 +# define R300_TX_FORMAT_FL_I16A16 0x19 +# define R300_TX_FORMAT_FL_R16G16B16A16 0x1A +# define R300_TX_FORMAT_FL_I32 0x1B +# define R300_TX_FORMAT_FL_I32A32 0x1C +# define R300_TX_FORMAT_FL_R32G32B32A32 0x1D + /* alpha modes, convenience mostly */ + /* if you have alpha, pick constant appropriate to the + number of channels (1 for I8, 2 for I8A8, 4 for R8G8B8A8, etc */ +# define R300_TX_FORMAT_ALPHA_1CH 0x000 +# define R300_TX_FORMAT_ALPHA_2CH 0x200 +# define R300_TX_FORMAT_ALPHA_4CH 0x600 +# define R300_TX_FORMAT_ALPHA_NONE 0xA00 + /* Swizzling */ + /* constants */ +# define R300_TX_FORMAT_X 0 +# define R300_TX_FORMAT_Y 1 +# define R300_TX_FORMAT_Z 2 +# define R300_TX_FORMAT_W 3 +# define R300_TX_FORMAT_ZERO 4 +# define R300_TX_FORMAT_ONE 5 + /* 2.0*Z, everything above 1.0 is set to 0.0 */ +# define R300_TX_FORMAT_CUT_Z 6 + /* 2.0*W, everything above 1.0 is set to 0.0 */ +# define R300_TX_FORMAT_CUT_W 7 + +# define R300_TX_FORMAT_B_SHIFT 18 +# define R300_TX_FORMAT_G_SHIFT 15 +# define R300_TX_FORMAT_R_SHIFT 12 +# define R300_TX_FORMAT_A_SHIFT 9 + /* Convenience macro to take care of layout and swizzling */ +# define R300_EASY_TX_FORMAT(B, G, R, A, FMT) ( \ + ((R300_TX_FORMAT_##B)<<R300_TX_FORMAT_B_SHIFT) \ + | ((R300_TX_FORMAT_##G)<<R300_TX_FORMAT_G_SHIFT) \ + | ((R300_TX_FORMAT_##R)<<R300_TX_FORMAT_R_SHIFT) \ + | ((R300_TX_FORMAT_##A)<<R300_TX_FORMAT_A_SHIFT) \ + | (R300_TX_FORMAT_##FMT) \ + ) + /* These can be ORed with result of R300_EASY_TX_FORMAT() + We don't really know what they do. Take values from a + constant color ? 
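+
+     For reference, R300_EASY_TX_FORMAT() above simply places its arguments
+     into the B/G/R/A swizzle fields; e.g. (illustrative only, the per-format
+     swizzles real drivers use may differ):
+
+       uint32_t txformat = R300_EASY_TX_FORMAT(Z, Y, X, W, W8Z8Y8X8)
+                         | R300_TX_FORMAT_ALPHA_4CH;
+
+     selects the Z component of a W8Z8Y8X8 texel as blue, Y as green, X as
+     red and W as alpha, with the 4-channel alpha mode ORed in.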
*/ +# define R300_TX_FORMAT_CONST_X (1<<5) +# define R300_TX_FORMAT_CONST_Y (2<<5) +# define R300_TX_FORMAT_CONST_Z (4<<5) +# define R300_TX_FORMAT_CONST_W (8<<5) + +# define R300_TX_FORMAT_YUV_MODE 0x00800000 + +#define R300_TX_FORMAT2_0 0x4500 /* obvious missing in gap */ +# define R300_TX_PITCHMASK_SHIFT 0 +# define R300_TX_PITCHMASK_MASK (2047 << 0) +# define R500_TXFORMAT_MSB (1 << 14) +# define R500_TXWIDTH_BIT11 (1 << 15) +# define R500_TXHEIGHT_BIT11 (1 << 16) +# define R500_POW2FIX2FLT (1 << 17) +# define R500_SEL_FILTER4_TC0 (0 << 18) +# define R500_SEL_FILTER4_TC1 (1 << 18) +# define R500_SEL_FILTER4_TC2 (2 << 18) +# define R500_SEL_FILTER4_TC3 (3 << 18) + +#define R300_TX_OFFSET_0 0x4540 +#define R300_TX_OFFSET_1 0x4544 +#define R300_TX_OFFSET_2 0x4548 +#define R300_TX_OFFSET_3 0x454C +#define R300_TX_OFFSET_4 0x4550 +#define R300_TX_OFFSET_5 0x4554 +#define R300_TX_OFFSET_6 0x4558 +#define R300_TX_OFFSET_7 0x455C + /* BEGIN: Guess from R200 */ +# define R300_TXO_ENDIAN_NO_SWAP (0 << 0) +# define R300_TXO_ENDIAN_BYTE_SWAP (1 << 0) +# define R300_TXO_ENDIAN_WORD_SWAP (2 << 0) +# define R300_TXO_ENDIAN_HALFDW_SWAP (3 << 0) +# define R300_TXO_MACRO_TILE (1 << 2) +# define R300_TXO_MICRO_TILE_LINEAR (0 << 3) +# define R300_TXO_MICRO_TILE (1 << 3) +# define R300_TXO_MICRO_TILE_SQUARE (2 << 3) +# define R300_TXO_OFFSET_MASK 0xffffffe0 +# define R300_TXO_OFFSET_SHIFT 5 + /* END: Guess from R200 */ + +/* 32 bit chroma key */ +#define R300_TX_CHROMA_KEY_0 0x4580 +#define R300_TX_CHROMA_KEY_1 0x4584 +#define R300_TX_CHROMA_KEY_2 0x4588 +#define R300_TX_CHROMA_KEY_3 0x458c +#define R300_TX_CHROMA_KEY_4 0x4590 +#define R300_TX_CHROMA_KEY_5 0x4594 +#define R300_TX_CHROMA_KEY_6 0x4598 +#define R300_TX_CHROMA_KEY_7 0x459c +#define R300_TX_CHROMA_KEY_8 0x45a0 +#define R300_TX_CHROMA_KEY_9 0x45a4 +#define R300_TX_CHROMA_KEY_10 0x45a8 +#define R300_TX_CHROMA_KEY_11 0x45ac +#define R300_TX_CHROMA_KEY_12 0x45b0 +#define R300_TX_CHROMA_KEY_13 0x45b4 +#define R300_TX_CHROMA_KEY_14 0x45b8 +#define R300_TX_CHROMA_KEY_15 0x45bc +/* ff00ff00 == { 0, 1.0, 0, 1.0 } */ + +/* Border Color */ +#define R300_TX_BORDER_COLOR_0 0x45c0 +#define R300_TX_BORDER_COLOR_1 0x45c4 +#define R300_TX_BORDER_COLOR_2 0x45c8 +#define R300_TX_BORDER_COLOR_3 0x45cc +#define R300_TX_BORDER_COLOR_4 0x45d0 +#define R300_TX_BORDER_COLOR_5 0x45d4 +#define R300_TX_BORDER_COLOR_6 0x45d8 +#define R300_TX_BORDER_COLOR_7 0x45dc +#define R300_TX_BORDER_COLOR_8 0x45e0 +#define R300_TX_BORDER_COLOR_9 0x45e4 +#define R300_TX_BORDER_COLOR_10 0x45e8 +#define R300_TX_BORDER_COLOR_11 0x45ec +#define R300_TX_BORDER_COLOR_12 0x45f0 +#define R300_TX_BORDER_COLOR_13 0x45f4 +#define R300_TX_BORDER_COLOR_14 0x45f8 +#define R300_TX_BORDER_COLOR_15 0x45fc + + +/* END: Texture specification */ + +/* BEGIN: Fragment program instruction set */ + +/* Fragment programs are written directly into register space. + * There are separate instruction streams for texture instructions and ALU + * instructions. + * In order to synchronize these streams, the program is divided into up + * to 4 nodes. Each node begins with a number of TEX operations, followed + * by a number of ALU operations. + * The first node can have zero TEX ops, all subsequent nodes must have at + * least + * one TEX ops. + * All nodes must have at least one ALU op. + * + * The index of the last node is stored in PFS_CNTL_0: A value of 0 means + * 1 node, a value of 3 means 4 nodes. + * The total amount of instructions is defined in PFS_CNTL_2. 
The offsets are + * offsets into the respective instruction streams, while *_END points to the + * last instruction relative to this offset. + */ +#define R300_US_CONFIG 0x4600 +# define R300_PFS_CNTL_LAST_NODES_SHIFT 0 +# define R300_PFS_CNTL_LAST_NODES_MASK (3 << 0) +# define R300_PFS_CNTL_FIRST_NODE_HAS_TEX (1 << 3) +#define R300_US_PIXSIZE 0x4604 +/* There is an unshifted value here which has so far always been equal to the + * index of the highest used temporary register. + */ +#define R300_US_CODE_OFFSET 0x4608 +# define R300_PFS_CNTL_ALU_OFFSET_SHIFT 0 +# define R300_PFS_CNTL_ALU_OFFSET_MASK (63 << 0) +# define R300_PFS_CNTL_ALU_END_SHIFT 6 +# define R300_PFS_CNTL_ALU_END_MASK (63 << 6) +# define R300_PFS_CNTL_TEX_OFFSET_SHIFT 13 +# define R300_PFS_CNTL_TEX_OFFSET_MASK (31 << 13) +# define R300_PFS_CNTL_TEX_END_SHIFT 18 +# define R300_PFS_CNTL_TEX_END_MASK (31 << 18) + +/* gap */ + +/* Nodes are stored backwards. The last active node is always stored in + * PFS_NODE_3. + * Example: In a 2-node program, NODE_0 and NODE_1 are set to 0. The + * first node is stored in NODE_2, the second node is stored in NODE_3. + * + * Offsets are relative to the master offset from PFS_CNTL_2. + */ +#define R300_US_CODE_ADDR_0 0x4610 +#define R300_US_CODE_ADDR_1 0x4614 +#define R300_US_CODE_ADDR_2 0x4618 +#define R300_US_CODE_ADDR_3 0x461C +# define R300_ALU_START_SHIFT 0 +# define R300_ALU_START_MASK (63 << 0) +# define R300_ALU_SIZE_SHIFT 6 +# define R300_ALU_SIZE_MASK (63 << 6) +# define R300_TEX_START_SHIFT 12 +# define R300_TEX_START_MASK (31 << 12) +# define R300_TEX_SIZE_SHIFT 17 +# define R300_TEX_SIZE_MASK (31 << 17) +# define R300_RGBA_OUT (1 << 22) +# define R300_W_OUT (1 << 23) + +/* TEX + * As far as I can tell, texture instructions cannot write into output + * registers directly. 
A subsequent ALU instruction is always necessary, + * even if it's just MAD o0, r0, 1, 0 + */ +#define R300_US_TEX_INST_0 0x4620 +# define R300_SRC_ADDR_SHIFT 0 +# define R300_SRC_ADDR_MASK (31 << 0) +# define R300_DST_ADDR_SHIFT 6 +# define R300_DST_ADDR_MASK (31 << 6) +# define R300_TEX_ID_SHIFT 11 +# define R300_TEX_ID_MASK (15 << 11) +# define R300_TEX_INST_SHIFT 15 +# define R300_TEX_OP_NOP 0 +# define R300_TEX_OP_LD 1 +# define R300_TEX_OP_KIL 2 +# define R300_TEX_OP_TXP 3 +# define R300_TEX_OP_TXB 4 +# define R300_TEX_INST_MASK (7 << 15) + +/* Output format from the unfied shader */ +#define R300_US_OUT_FMT_0 0x46A4 +# define R300_US_OUT_FMT_C4_8 (0 << 0) +# define R300_US_OUT_FMT_C4_10 (1 << 0) +# define R300_US_OUT_FMT_C4_10_GAMMA (2 << 0) +# define R300_US_OUT_FMT_C_16 (3 << 0) +# define R300_US_OUT_FMT_C2_16 (4 << 0) +# define R300_US_OUT_FMT_C4_16 (5 << 0) +# define R300_US_OUT_FMT_C_16_MPEG (6 << 0) +# define R300_US_OUT_FMT_C2_16_MPEG (7 << 0) +# define R300_US_OUT_FMT_C2_4 (8 << 0) +# define R300_US_OUT_FMT_C_3_3_2 (9 << 0) +# define R300_US_OUT_FMT_C_6_5_6 (10 << 0) +# define R300_US_OUT_FMT_C_11_11_10 (11 << 0) +# define R300_US_OUT_FMT_C_10_11_11 (12 << 0) +# define R300_US_OUT_FMT_C_2_10_10_10 (13 << 0) +/* reserved */ +# define R300_US_OUT_FMT_UNUSED (15 << 0) +# define R300_US_OUT_FMT_C_16_FP (16 << 0) +# define R300_US_OUT_FMT_C2_16_FP (17 << 0) +# define R300_US_OUT_FMT_C4_16_FP (18 << 0) +# define R300_US_OUT_FMT_C_32_FP (19 << 0) +# define R300_US_OUT_FMT_C2_32_FP (20 << 0) +# define R300_US_OUT_FMT_C4_32_FP (21 << 0) +# define R300_C0_SEL_A (0 << 8) +# define R300_C0_SEL_R (1 << 8) +# define R300_C0_SEL_G (2 << 8) +# define R300_C0_SEL_B (3 << 8) +# define R300_C1_SEL_A (0 << 10) +# define R300_C1_SEL_R (1 << 10) +# define R300_C1_SEL_G (2 << 10) +# define R300_C1_SEL_B (3 << 10) +# define R300_C2_SEL_A (0 << 12) +# define R300_C2_SEL_R (1 << 12) +# define R300_C2_SEL_G (2 << 12) +# define R300_C2_SEL_B (3 << 12) +# define R300_C3_SEL_A (0 << 14) +# define R300_C3_SEL_R (1 << 14) +# define R300_C3_SEL_G (2 << 14) +# define R300_C3_SEL_B (3 << 14) +# define R300_OUT_SIGN(x) ((x) << 16) +# define R500_ROUND_ADJ (1 << 20) + +/* ALU + * The ALU instructions register blocks are enumerated according to the order + * in which fglrx. I assume there is space for 64 instructions, since + * each block has space for a maximum of 64 DWORDs, and this matches reported + * native limits. + * + * The basic functional block seems to be one MAD for each color and alpha, + * and an adder that adds all components after the MUL. + * - ADD, MUL, MAD etc.: use MAD with appropriate neutral operands + * - DP4: Use OUTC_DP4, OUTA_DP4 + * - DP3: Use OUTC_DP3, OUTA_DP4, appropriate alpha operands + * - DPH: Use OUTC_DP4, OUTA_DP4, appropriate alpha operands + * - CMPH: If ARG2 > 0.5, return ARG0, else return ARG1 + * - CMP: If ARG2 < 0, return ARG1, else return ARG0 + * - FLR: use FRC+MAD + * - XPD: use MAD+MAD + * - SGE, SLT: use MAD+CMP + * - RSQ: use ABS modifier for argument + * - Use OUTC_REPL_ALPHA to write results of an alpha-only operation + * (e.g. RCP) into color register + * - apparently, there's no quick DST operation + * - fglrx set FPI2_UNKNOWN_31 on a "MAD fragment.color, tmp0, tmp1, tmp2" + * - fglrx set FPI2_UNKNOWN_31 on a "MAX r2, r1, c0" + * - fglrx once set FPI0_UNKNOWN_31 on a "FRC r1, r1" + * + * Operand selection + * First stage selects three sources from the available registers and + * constant parameters. This is defined in INSTR1 (color) and INSTR3 (alpha). 
+ *
+ * fglrx sorts the three source fields: Registers before constants,
+ * lower indices before higher indices; I do not know whether this is
+ * necessary.
+ *
+ * fglrx fills unused sources with "read constant 0"
+ * According to specs, you cannot select more than two different constants.
+ *
+ * Second stage selects the operands from the sources. This is defined in
+ * INSTR0 (color) and INSTR2 (alpha). You can also select the special constants
+ * zero and one.
+ * Swizzling and negation happen in this stage, as well.
+ *
+ * Important: Color and alpha seem to be mostly separate, i.e. their source
+ * selection appears to be fully independent (the register storage is probably
+ * physically split into a color and an alpha section).
+ * However (because of the apparent physical split), there is some interaction
+ * WRT swizzling. If, for example, you want to load an R component into an
+ * Alpha operand, this R component is taken from a *color* source, not from
+ * an alpha source. The corresponding register doesn't even have to appear in
+ * the alpha sources list. (I hope this all makes sense to you)
+ *
+ * Destination selection
+ * The destination register index is in FPI1 (color) and FPI3 (alpha)
+ * together with enable bits.
+ * There are separate enable bits for writing into temporary registers
+ * (DSTC_REG_* /DSTA_REG) and program output registers (DSTC_OUTPUT_*
+ * /DSTA_OUTPUT). You can write to both at once, or not write at all (the
+ * same index must be used for both).
+ *
+ * Note: There is a special form for LRP
+ * - Argument order is the same as in ARB_fragment_program.
+ * - Operation is MAD
+ * - ARG1 is set to ARGC_SRC1C_LRP/ARGC_SRC1A_LRP
+ * - Set FPI0/FPI2_SPECIAL_LRP
+ * Arbitrary LRP (including support for swizzling) requires vanilla MAD+MAD
+ */
+#define R300_US_ALU_RGB_ADDR_0 0x46C0
+# define R300_ALU_SRC0C_SHIFT 0
+# define R300_ALU_SRC0C_MASK (31 << 0)
+# define R300_ALU_SRC0C_CONST (1 << 5)
+# define R300_ALU_SRC1C_SHIFT 6
+# define R300_ALU_SRC1C_MASK (31 << 6)
+# define R300_ALU_SRC1C_CONST (1 << 11)
+# define R300_ALU_SRC2C_SHIFT 12
+# define R300_ALU_SRC2C_MASK (31 << 12)
+# define R300_ALU_SRC2C_CONST (1 << 17)
+# define R300_ALU_SRC_MASK 0x0003ffff
+# define R300_ALU_DSTC_SHIFT 18
+# define R300_ALU_DSTC_MASK (31 << 18)
+# define R300_ALU_DSTC_REG_MASK_SHIFT 23
+# define R300_ALU_DSTC_REG_X (1 << 23)
+# define R300_ALU_DSTC_REG_Y (1 << 24)
+# define R300_ALU_DSTC_REG_Z (1 << 25)
+# define R300_ALU_DSTC_OUTPUT_MASK_SHIFT 26
+# define R300_ALU_DSTC_OUTPUT_X (1 << 26)
+# define R300_ALU_DSTC_OUTPUT_Y (1 << 27)
+# define R300_ALU_DSTC_OUTPUT_Z (1 << 28)
+# define R300_ALU_DSTC_OUTPUT_XYZ (7 << 26)
+# define R300_RGB_ADDR0(x) ((x) << 0)
+# define R300_RGB_ADDR1(x) ((x) << 6)
+# define R300_RGB_ADDR2(x) ((x) << 12)
+
+#define R300_US_ALU_ALPHA_ADDR_0 0x47C0
+# define R300_ALU_SRC0A_SHIFT 0
+# define R300_ALU_SRC0A_MASK (31 << 0)
+# define R300_ALU_SRC0A_CONST (1 << 5)
+# define R300_ALU_SRC1A_SHIFT 6
+# define R300_ALU_SRC1A_MASK (31 << 6)
+# define R300_ALU_SRC1A_CONST (1 << 11)
+# define R300_ALU_SRC2A_SHIFT 12
+# define R300_ALU_SRC2A_MASK (31 << 12)
+# define R300_ALU_SRC2A_CONST (1 << 17)
+# define R300_ALU_SRC_MASK 0x0003ffff
+# define R300_ALU_DSTA_SHIFT 18
+# define R300_ALU_DSTA_MASK (31 << 18)
+# define R300_ALU_DSTA_REG (1 << 23)
+# define R300_ALU_DSTA_OUTPUT (1 << 24)
+# define R300_ALU_DSTA_DEPTH (1 << 27)
+# define R300_ALPHA_ADDR0(x) ((x) << 0)
+# define R300_ALPHA_ADDR1(x) ((x) << 6)
+# define R300_ALPHA_ADDR2(x) ((x) << 12)
+
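+/* As an illustrative sketch of how the source/destination address dword is
+ * typically assembled (sources 0 and 1 from temporaries 0 and 1, source 2
+ * from constant 0, writing XYZ of temporary 2; names are examples only):
+ *
+ *   uint32_t rgb_addr = R300_RGB_ADDR0(0) |
+ *                       R300_RGB_ADDR1(1) |
+ *                       R300_RGB_ADDR2(0) | R300_ALU_SRC2C_CONST |
+ *                       (2 << R300_ALU_DSTC_SHIFT) |
+ *                       R300_ALU_DSTC_REG_X | R300_ALU_DSTC_REG_Y |
+ *                       R300_ALU_DSTC_REG_Z;
+ */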
+#define R300_US_ALU_RGB_INST_0 0x48C0 +# define R300_ALU_ARGC_SRC0C_XYZ 0 +# define R300_ALU_ARGC_SRC0C_XXX 1 +# define R300_ALU_ARGC_SRC0C_YYY 2 +# define R300_ALU_ARGC_SRC0C_ZZZ 3 +# define R300_ALU_ARGC_SRC1C_XYZ 4 +# define R300_ALU_ARGC_SRC1C_XXX 5 +# define R300_ALU_ARGC_SRC1C_YYY 6 +# define R300_ALU_ARGC_SRC1C_ZZZ 7 +# define R300_ALU_ARGC_SRC2C_XYZ 8 +# define R300_ALU_ARGC_SRC2C_XXX 9 +# define R300_ALU_ARGC_SRC2C_YYY 10 +# define R300_ALU_ARGC_SRC2C_ZZZ 11 +# define R300_ALU_ARGC_SRC0A 12 +# define R300_ALU_ARGC_SRC1A 13 +# define R300_ALU_ARGC_SRC2A 14 +# define R300_ALU_ARGC_SRCP_XYZ 15 +# define R300_ALU_ARGC_SRCP_XXX 16 +# define R300_ALU_ARGC_SRCP_YYY 17 +# define R300_ALU_ARGC_SRCP_ZZZ 18 +# define R300_ALU_ARGC_SRCP_WWW 19 +# define R300_ALU_ARGC_ZERO 20 +# define R300_ALU_ARGC_ONE 21 +# define R300_ALU_ARGC_HALF 22 +# define R300_ALU_ARGC_SRC0C_YZX 23 +# define R300_ALU_ARGC_SRC1C_YZX 24 +# define R300_ALU_ARGC_SRC2C_YZX 25 +# define R300_ALU_ARGC_SRC0C_ZXY 26 +# define R300_ALU_ARGC_SRC1C_ZXY 27 +# define R300_ALU_ARGC_SRC2C_ZXY 28 +# define R300_ALU_ARGC_SRC0CA_WZY 29 +# define R300_ALU_ARGC_SRC1CA_WZY 30 +# define R300_ALU_ARGC_SRC2CA_WZY 31 +# define R300_RGB_SWIZA(x) ((x) << 0) +# define R300_RGB_SWIZB(x) ((x) << 7) +# define R300_RGB_SWIZC(x) ((x) << 14) + +# define R300_ALU_ARG0C_SHIFT 0 +# define R300_ALU_ARG0C_MASK (31 << 0) +# define R300_ALU_ARG0C_NOP (0 << 5) +# define R300_ALU_ARG0C_NEG (1 << 5) +# define R300_ALU_ARG0C_ABS (2 << 5) +# define R300_ALU_ARG0C_NAB (3 << 5) +# define R300_ALU_ARG1C_SHIFT 7 +# define R300_ALU_ARG1C_MASK (31 << 7) +# define R300_ALU_ARG1C_NOP (0 << 12) +# define R300_ALU_ARG1C_NEG (1 << 12) +# define R300_ALU_ARG1C_ABS (2 << 12) +# define R300_ALU_ARG1C_NAB (3 << 12) +# define R300_ALU_ARG2C_SHIFT 14 +# define R300_ALU_ARG2C_MASK (31 << 14) +# define R300_ALU_ARG2C_NOP (0 << 19) +# define R300_ALU_ARG2C_NEG (1 << 19) +# define R300_ALU_ARG2C_ABS (2 << 19) +# define R300_ALU_ARG2C_NAB (3 << 19) +# define R300_ALU_SRCP_1_MINUS_2_SRC0 (0 << 21) +# define R300_ALU_SRCP_SRC1_MINUS_SRC0 (1 << 21) +# define R300_ALU_SRCP_SRC1_PLUS_SRC0 (2 << 21) +# define R300_ALU_SRCP_1_MINUS_SRC0 (3 << 21) + +# define R300_ALU_OUTC_MAD (0 << 23) +# define R300_ALU_OUTC_DP3 (1 << 23) +# define R300_ALU_OUTC_DP4 (2 << 23) +# define R300_ALU_OUTC_D2A (3 << 23) +# define R300_ALU_OUTC_MIN (4 << 23) +# define R300_ALU_OUTC_MAX (5 << 23) +# define R300_ALU_OUTC_CMPH (7 << 23) +# define R300_ALU_OUTC_CMP (8 << 23) +# define R300_ALU_OUTC_FRC (9 << 23) +# define R300_ALU_OUTC_REPL_ALPHA (10 << 23) + +# define R300_ALU_OUTC_MOD_NOP (0 << 27) +# define R300_ALU_OUTC_MOD_MUL2 (1 << 27) +# define R300_ALU_OUTC_MOD_MUL4 (2 << 27) +# define R300_ALU_OUTC_MOD_MUL8 (3 << 27) +# define R300_ALU_OUTC_MOD_DIV2 (4 << 27) +# define R300_ALU_OUTC_MOD_DIV4 (5 << 27) +# define R300_ALU_OUTC_MOD_DIV8 (6 << 27) + +# define R300_ALU_OUTC_CLAMP (1 << 30) +# define R300_ALU_INSERT_NOP (1 << 31) + +#define R300_US_ALU_ALPHA_INST_0 0x49C0 +# define R300_ALU_ARGA_SRC0C_X 0 +# define R300_ALU_ARGA_SRC0C_Y 1 +# define R300_ALU_ARGA_SRC0C_Z 2 +# define R300_ALU_ARGA_SRC1C_X 3 +# define R300_ALU_ARGA_SRC1C_Y 4 +# define R300_ALU_ARGA_SRC1C_Z 5 +# define R300_ALU_ARGA_SRC2C_X 6 +# define R300_ALU_ARGA_SRC2C_Y 7 +# define R300_ALU_ARGA_SRC2C_Z 8 +# define R300_ALU_ARGA_SRC0A 9 +# define R300_ALU_ARGA_SRC1A 10 +# define R300_ALU_ARGA_SRC2A 11 +# define R300_ALU_ARGA_SRCP_X 12 +# define R300_ALU_ARGA_SRCP_Y 13 +# define R300_ALU_ARGA_SRCP_Z 14 +# define R300_ALU_ARGA_SRCP_W 15 +# define 
R300_ALU_ARGA_ZERO 16 +# define R300_ALU_ARGA_ONE 17 +# define R300_ALU_ARGA_HALF 18 +# define R300_ALPHA_SWIZA(x) ((x) << 0) +# define R300_ALPHA_SWIZB(x) ((x) << 7) +# define R300_ALPHA_SWIZC(x) ((x) << 14) + +# define R300_ALU_ARG0A_SHIFT 0 +# define R300_ALU_ARG0A_MASK (31 << 0) +# define R300_ALU_ARG0A_NOP (0 << 5) +# define R300_ALU_ARG0A_NEG (1 << 5) +# define R300_ALU_ARG0A_ABS (2 << 5) +# define R300_ALU_ARG0A_NAB (3 << 5) +# define R300_ALU_ARG1A_SHIFT 7 +# define R300_ALU_ARG1A_MASK (31 << 7) +# define R300_ALU_ARG1A_NOP (0 << 12) +# define R300_ALU_ARG1A_NEG (1 << 12) +# define R300_ALU_ARG1A_ABS (2 << 12) +# define R300_ALU_ARG1A_NAB (3 << 12) +# define R300_ALU_ARG2A_SHIFT 14 +# define R300_ALU_ARG2A_MASK (31 << 14) +# define R300_ALU_ARG2A_NOP (0 << 19) +# define R300_ALU_ARG2A_NEG (1 << 19) +# define R300_ALU_ARG2A_ABS (2 << 19) +# define R300_ALU_ARG2A_NAB (3 << 19) +# define R300_ALU_SRCP_1_MINUS_2_SRC0 (0 << 21) +# define R300_ALU_SRCP_SRC1_MINUS_SRC0 (1 << 21) +# define R300_ALU_SRCP_SRC1_PLUS_SRC0 (2 << 21) +# define R300_ALU_SRCP_1_MINUS_SRC0 (3 << 21) + +# define R300_ALU_OUTA_MAD (0 << 23) +# define R300_ALU_OUTA_DP4 (1 << 23) +# define R300_ALU_OUTA_MIN (2 << 23) +# define R300_ALU_OUTA_MAX (3 << 23) +# define R300_ALU_OUTA_CND (5 << 23) +# define R300_ALU_OUTA_CMP (6 << 23) +# define R300_ALU_OUTA_FRC (7 << 23) +# define R300_ALU_OUTA_EX2 (8 << 23) +# define R300_ALU_OUTA_LG2 (9 << 23) +# define R300_ALU_OUTA_RCP (10 << 23) +# define R300_ALU_OUTA_RSQ (11 << 23) + +# define R300_ALU_OUTA_MOD_NOP (0 << 27) +# define R300_ALU_OUTA_MOD_MUL2 (1 << 27) +# define R300_ALU_OUTA_MOD_MUL4 (2 << 27) +# define R300_ALU_OUTA_MOD_MUL8 (3 << 27) +# define R300_ALU_OUTA_MOD_DIV2 (4 << 27) +# define R300_ALU_OUTA_MOD_DIV4 (5 << 27) +# define R300_ALU_OUTA_MOD_DIV8 (6 << 27) + +# define R300_ALU_OUTA_CLAMP (1 << 30) +/* END: Fragment program instruction set */ + +/* Fog: Fog Blending Enable */ +#define R300_FG_FOG_BLEND 0x4bc0 +# define R300_FG_FOG_BLEND_DISABLE (0 << 0) +# define R300_FG_FOG_BLEND_ENABLE (1 << 0) +# define R300_FG_FOG_BLEND_FN_LINEAR (0 << 1) +# define R300_FG_FOG_BLEND_FN_EXP (1 << 1) +# define R300_FG_FOG_BLEND_FN_EXP2 (2 << 1) +# define R300_FG_FOG_BLEND_FN_CONSTANT (3 << 1) +# define R300_FG_FOG_BLEND_FN_MASK (3 << 1) + +/* Fog: Red Component of Fog Color */ +#define R300_FG_FOG_COLOR_R 0x4bc8 +/* Fog: Green Component of Fog Color */ +#define R300_FG_FOG_COLOR_G 0x4bcc +/* Fog: Blue Component of Fog Color */ +#define R300_FG_FOG_COLOR_B 0x4bd0 +# define R300_FG_FOG_COLOR_MASK 0x000003ff + +/* Fog: Constant Factor for Fog Blending */ +#define R300_FG_FOG_FACTOR 0x4bc4 +# define FG_FOG_FACTOR_MASK 0x000003ff + +/* Fog: Alpha function */ +#define R300_FG_ALPHA_FUNC 0x4bd4 +# define R300_FG_ALPHA_FUNC_VAL_MASK 0x000000ff +# define R300_FG_ALPHA_FUNC_NEVER (0 << 8) +# define R300_FG_ALPHA_FUNC_LESS (1 << 8) +# define R300_FG_ALPHA_FUNC_EQUAL (2 << 8) +# define R300_FG_ALPHA_FUNC_LE (3 << 8) +# define R300_FG_ALPHA_FUNC_GREATER (4 << 8) +# define R300_FG_ALPHA_FUNC_NOTEQUAL (5 << 8) +# define R300_FG_ALPHA_FUNC_GE (6 << 8) +# define R300_FG_ALPHA_FUNC_ALWAYS (7 << 8) +# define R300_ALPHA_TEST_OP_MASK (7 << 8) +# define R300_FG_ALPHA_FUNC_DISABLE (0 << 11) +# define R300_FG_ALPHA_FUNC_ENABLE (1 << 11) + +# define R500_FG_ALPHA_FUNC_10BIT (0 << 12) +# define R500_FG_ALPHA_FUNC_8BIT (1 << 12) + +# define R300_FG_ALPHA_FUNC_MASK_DISABLE (0 << 16) +# define R300_FG_ALPHA_FUNC_MASK_ENABLE (1 << 16) +# define R300_FG_ALPHA_FUNC_CFG_2_OF_4 (0 << 17) +# define 
R300_FG_ALPHA_FUNC_CFG_3_OF_6 (1 << 17) + +# define R300_FG_ALPHA_FUNC_DITH_DISABLE (0 << 20) +# define R300_FG_ALPHA_FUNC_DITH_ENABLE (1 << 20) + +# define R500_FG_ALPHA_FUNC_OFFSET_DISABLE (0 << 24) +# define R500_FG_ALPHA_FUNC_OFFSET_ENABLE (1 << 24) /* Not supported in R520 */ +# define R500_FG_ALPHA_FUNC_DISC_ZERO_MASK_DISABLE (0 << 25) +# define R500_FG_ALPHA_FUNC_DISC_ZERO_MASK_ENABLE (1 << 25) + +# define R500_FG_ALPHA_FUNC_FP16_DISABLE (0 << 28) +# define R500_FG_ALPHA_FUNC_FP16_ENABLE (1 << 28) + + +/* Fog: Where does the depth come from? */ +#define R300_FG_DEPTH_SRC 0x4bd8 +# define R300_FG_DEPTH_SRC_SCAN (0 << 0) +# define R300_FG_DEPTH_SRC_SHADER (1 << 0) + +/* Fog: Alpha Compare Value */ +#define R500_FG_ALPHA_VALUE 0x4be0 +# define R500_FG_ALPHA_VALUE_MASK 0x0000ffff + +/* gap */ + +/* Fragment program parameters in 7.16 floating point */ +#define R300_PFS_PARAM_0_X 0x4C00 +#define R300_PFS_PARAM_0_Y 0x4C04 +#define R300_PFS_PARAM_0_Z 0x4C08 +#define R300_PFS_PARAM_0_W 0x4C0C +/* last consts */ +#define R300_PFS_PARAM_31_X 0x4DF0 +#define R300_PFS_PARAM_31_Y 0x4DF4 +#define R300_PFS_PARAM_31_Z 0x4DF8 +#define R300_PFS_PARAM_31_W 0x4DFC + +/* Unpipelined. */ +#define R300_RB3D_CCTL 0x4e00 +# define R300_RB3D_CCTL_NUM_MULTIWRITES_1_BUFFER (0 << 5) +# define R300_RB3D_CCTL_NUM_MULTIWRITES_2_BUFFERS (1 << 5) +# define R300_RB3D_CCTL_NUM_MULTIWRITES_3_BUFFERS (2 << 5) +# define R300_RB3D_CCTL_NUM_MULTIWRITES_4_BUFFERS (3 << 5) +# define R300_RB3D_CCTL_CLRCMP_FLIPE_DISABLE (0 << 7) +# define R300_RB3D_CCTL_CLRCMP_FLIPE_ENABLE (1 << 7) +# define R300_RB3D_CCTL_AA_COMPRESSION_DISABLE (0 << 9) +# define R300_RB3D_CCTL_AA_COMPRESSION_ENABLE (1 << 9) +# define R300_RB3D_CCTL_CMASK_DISABLE (0 << 10) +# define R300_RB3D_CCTL_CMASK_ENABLE (1 << 10) +/* reserved */ +# define R300_RB3D_CCTL_INDEPENDENT_COLOR_CHANNEL_MASK_DISABLE (0 << 12) +# define R300_RB3D_CCTL_INDEPENDENT_COLOR_CHANNEL_MASK_ENABLE (1 << 12) +# define R300_RB3D_CCTL_WRITE_COMPRESSION_ENABLE (0 << 13) +# define R300_RB3D_CCTL_WRITE_COMPRESSION_DISABLE (1 << 13) +# define R300_RB3D_CCTL_INDEPENDENT_COLORFORMAT_ENABLE_DISABLE (0 << 14) +# define R300_RB3D_CCTL_INDEPENDENT_COLORFORMAT_ENABLE_ENABLE (1 << 14) + + +/* Notes: + * - AFAIK fglrx always sets BLEND_UNKNOWN when blending is used in + * the application + * - AFAIK fglrx always sets BLEND_NO_SEPARATE when CBLEND and ABLEND + * are set to the same + * function (both registers are always set up completely in any case) + * - Most blend flags are simply copied from R200 and not tested yet + */ +#define R300_RB3D_CBLEND 0x4E04 +#define R300_RB3D_ABLEND 0x4E08 +/* the following only appear in CBLEND */ +# define R300_ALPHA_BLEND_ENABLE (1 << 0) +# define R300_SEPARATE_ALPHA_ENABLE (1 << 1) +# define R300_READ_ENABLE (1 << 2) +# define R300_DISCARD_SRC_PIXELS_DIS (0 << 3) +# define R300_DISCARD_SRC_PIXELS_SRC_ALPHA_0 (1 << 3) +# define R300_DISCARD_SRC_PIXELS_SRC_COLOR_0 (2 << 3) +# define R300_DISCARD_SRC_PIXELS_SRC_ALPHA_COLOR_0 (3 << 3) +# define R300_DISCARD_SRC_PIXELS_SRC_ALPHA_1 (4 << 3) +# define R300_DISCARD_SRC_PIXELS_SRC_COLOR_1 (5 << 3) +# define R300_DISCARD_SRC_PIXELS_SRC_ALPHA_COLOR_1 (6 << 3) + +/* the following are shared between CBLEND and ABLEND */ +# define R300_FCN_MASK (3 << 12) +# define R300_COMB_FCN_ADD_CLAMP (0 << 12) +# define R300_COMB_FCN_ADD_NOCLAMP (1 << 12) +# define R300_COMB_FCN_SUB_CLAMP (2 << 12) +# define R300_COMB_FCN_SUB_NOCLAMP (3 << 12) +# define R300_COMB_FCN_MIN (4 << 12) +# define R300_COMB_FCN_MAX (5 << 12) +# define 
R300_COMB_FCN_RSUB_CLAMP (6 << 12) +# define R300_COMB_FCN_RSUB_NOCLAMP (7 << 12) +# define R300_BLEND_GL_ZERO (32) +# define R300_BLEND_GL_ONE (33) +# define R300_BLEND_GL_SRC_COLOR (34) +# define R300_BLEND_GL_ONE_MINUS_SRC_COLOR (35) +# define R300_BLEND_GL_DST_COLOR (36) +# define R300_BLEND_GL_ONE_MINUS_DST_COLOR (37) +# define R300_BLEND_GL_SRC_ALPHA (38) +# define R300_BLEND_GL_ONE_MINUS_SRC_ALPHA (39) +# define R300_BLEND_GL_DST_ALPHA (40) +# define R300_BLEND_GL_ONE_MINUS_DST_ALPHA (41) +# define R300_BLEND_GL_SRC_ALPHA_SATURATE (42) +# define R300_BLEND_GL_CONST_COLOR (43) +# define R300_BLEND_GL_ONE_MINUS_CONST_COLOR (44) +# define R300_BLEND_GL_CONST_ALPHA (45) +# define R300_BLEND_GL_ONE_MINUS_CONST_ALPHA (46) +# define R300_BLEND_MASK (63) +# define R300_SRC_BLEND_SHIFT (16) +# define R300_DST_BLEND_SHIFT (24) + +/* Constant color used by the blender. Pipelined through the blender. + * Note: For R520, this field is ignored, use RB3D_CONSTANT_COLOR_GB__BLUE, + * RB3D_CONSTANT_COLOR_GB__GREEN, etc. instead. + */ +#define R300_RB3D_BLEND_COLOR 0x4E10 + + +/* 3D Color Channel Mask. If all the channels used in the current color format + * are disabled, then the cb will discard all the incoming quads. Pipelined + * through the blender. + */ +#define RB3D_COLOR_CHANNEL_MASK 0x4E0C +# define RB3D_COLOR_CHANNEL_MASK_BLUE_MASK0 (1 << 0) +# define RB3D_COLOR_CHANNEL_MASK_GREEN_MASK0 (1 << 1) +# define RB3D_COLOR_CHANNEL_MASK_RED_MASK0 (1 << 2) +# define RB3D_COLOR_CHANNEL_MASK_ALPHA_MASK0 (1 << 3) +# define RB3D_COLOR_CHANNEL_MASK_BLUE_MASK1 (1 << 4) +# define RB3D_COLOR_CHANNEL_MASK_GREEN_MASK1 (1 << 5) +# define RB3D_COLOR_CHANNEL_MASK_RED_MASK1 (1 << 6) +# define RB3D_COLOR_CHANNEL_MASK_ALPHA_MASK1 (1 << 7) +# define RB3D_COLOR_CHANNEL_MASK_BLUE_MASK2 (1 << 8) +# define RB3D_COLOR_CHANNEL_MASK_GREEN_MASK2 (1 << 9) +# define RB3D_COLOR_CHANNEL_MASK_RED_MASK2 (1 << 10) +# define RB3D_COLOR_CHANNEL_MASK_ALPHA_MASK2 (1 << 11) +# define RB3D_COLOR_CHANNEL_MASK_BLUE_MASK3 (1 << 12) +# define RB3D_COLOR_CHANNEL_MASK_GREEN_MASK3 (1 << 13) +# define RB3D_COLOR_CHANNEL_MASK_RED_MASK3 (1 << 14) +# define RB3D_COLOR_CHANNEL_MASK_ALPHA_MASK3 (1 << 15) + +/* Clear color that is used when the color mask is set to 00. Unpipelined. + * Program this register with a 32-bit value in ARGB8888 or ARGB2101010 + * formats, ignoring the fields. + */ +#define RB3D_COLOR_CLEAR_VALUE 0x4e14 + +/* gap */ + +/* Color Compare Color. Stalls the 2d/3d datapath until it is idle. */ +#define RB3D_CLRCMP_CLR 0x4e20 + +/* Color Compare Mask. Stalls the 2d/3d datapath until it is idle. */ +#define RB3D_CLRCMP_MSK 0x4e24 + +/* Color Buffer Address Offset of multibuffer 0. Unpipelined. */ +#define R300_RB3D_COLOROFFSET0 0x4E28 +# define R300_COLOROFFSET_MASK 0xFFFFFFE0 +/* Color Buffer Address Offset of multibuffer 1. Unpipelined. */ +#define R300_RB3D_COLOROFFSET1 0x4E2C +/* Color Buffer Address Offset of multibuffer 2. Unpipelined. */ +#define R300_RB3D_COLOROFFSET2 0x4E30 +/* Color Buffer Address Offset of multibuffer 3. Unpipelined. */ +#define R300_RB3D_COLOROFFSET3 0x4E34 + +/* Color buffer format and tiling control for all the multibuffers and the + * pitch of multibuffer 0 to 3. Unpipelined. The cache must be empty before any + * of the registers are changed. + * + * Bit 16: Larger tiles + * Bit 17: 4x2 tiles + * Bit 18: Extremely weird tile like, but some pixels duplicated? 
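+ *
+ * As a rough sketch (assuming a linear, non-tiled ARGB8888 surface whose
+ * pitch is given in pixels; the format and tiling bits are defined below):
+ *
+ *   uint32_t pitch0 = (pitch_in_pixels & R300_COLORPITCH_MASK) |
+ *                     R300_COLOR_TILE_DISABLE |
+ *                     R300_COLOR_MICROTILE_DISABLE |
+ *                     R300_COLOR_FORMAT_ARGB8888;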
+ */ +#define R300_RB3D_COLORPITCH0 0x4E38 +# define R300_COLORPITCH_MASK 0x00003FFE +# define R300_COLOR_TILE_DISABLE (0 << 16) +# define R300_COLOR_TILE_ENABLE (1 << 16) +# define R300_COLOR_MICROTILE_DISABLE (0 << 17) +# define R300_COLOR_MICROTILE_ENABLE (1 << 17) +# define R300_COLOR_MICROTILE_ENABLE_SQUARE (2 << 17) /* Only available in 16-bit */ +# define R300_COLOR_ENDIAN_NO_SWAP (0 << 19) +# define R300_COLOR_ENDIAN_WORD_SWAP (1 << 19) +# define R300_COLOR_ENDIAN_DWORD_SWAP (2 << 19) +# define R300_COLOR_ENDIAN_HALF_DWORD_SWAP (3 << 19) +# define R500_COLOR_FORMAT_ARGB10101010 (0 << 21) +# define R500_COLOR_FORMAT_UV1010 (1 << 21) +# define R500_COLOR_FORMAT_CI8 (2 << 21) /* 2D only */ +# define R300_COLOR_FORMAT_ARGB1555 (3 << 21) +# define R300_COLOR_FORMAT_RGB565 (4 << 21) +# define R500_COLOR_FORMAT_ARGB2101010 (5 << 21) +# define R300_COLOR_FORMAT_ARGB8888 (6 << 21) +# define R300_COLOR_FORMAT_ARGB32323232 (7 << 21) +/* reserved */ +# define R300_COLOR_FORMAT_I8 (9 << 21) +# define R300_COLOR_FORMAT_ARGB16161616 (10 << 21) +# define R300_COLOR_FORMAT_VYUY (11 << 21) +# define R300_COLOR_FORMAT_YVYU (12 << 21) +# define R300_COLOR_FORMAT_UV88 (13 << 21) +# define R500_COLOR_FORMAT_I10 (14 << 21) +# define R300_COLOR_FORMAT_ARGB4444 (15 << 21) +#define R300_RB3D_COLORPITCH1 0x4E3C +#define R300_RB3D_COLORPITCH2 0x4E40 +#define R300_RB3D_COLORPITCH3 0x4E44 + +/* gap */ + +/* Destination Color Buffer Cache Control/Status. If the cb is in e2 mode, then + * a flush or free will not occur upon a write to this register, but a sync + * will be immediately sent if one is requested. If both DC_FLUSH and DC_FREE + * are zero but DC_FINISH is one, then a sync will be sent immediately -- the + * cb will not wait for all the previous operations to complete before sending + * the sync. Unpipelined except when DC_FINISH and DC_FREE are both set to + * zero. + * + * Set to 0A before 3D operations, set to 02 afterwards. + */ +#define R300_RB3D_DSTCACHE_CTLSTAT 0x4e4c +# define R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_NO_EFFECT (0 << 0) +# define R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_NO_EFFECT_1 (1 << 0) +# define R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_FLUSH_DIRTY_3D (2 << 0) +# define R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_FLUSH_DIRTY_3D_1 (3 << 0) +# define R300_RB3D_DSTCACHE_CTLSTAT_DC_FREE_NO_EFFECT (0 << 2) +# define R300_RB3D_DSTCACHE_CTLSTAT_DC_FREE_NO_EFFECT_1 (1 << 2) +# define R300_RB3D_DSTCACHE_CTLSTAT_DC_FREE_FREE_3D_TAGS (2 << 2) +# define R300_RB3D_DSTCACHE_CTLSTAT_DC_FREE_FREE_3D_TAGS_1 (3 << 2) +# define R300_RB3D_DSTCACHE_CTLSTAT_DC_FINISH_NO_SIGNAL (0 << 4) +# define R300_RB3D_DSTCACHE_CTLSTAT_DC_FINISH_SIGNAL (1 << 4) + +#define R300_RB3D_DITHER_CTL 0x4E50 +# define R300_RB3D_DITHER_CTL_DITHER_MODE_TRUNCATE (0 << 0) +# define R300_RB3D_DITHER_CTL_DITHER_MODE_ROUND (1 << 0) +# define R300_RB3D_DITHER_CTL_DITHER_MODE_LUT (2 << 0) +/* reserved */ +# define R300_RB3D_DITHER_CTL_ALPHA_DITHER_MODE_TRUNCATE (0 << 2) +# define R300_RB3D_DITHER_CTL_ALPHA_DITHER_MODE_ROUND (1 << 2) +# define R300_RB3D_DITHER_CTL_ALPHA_DITHER_MODE_LUT (2 << 2) +/* reserved */ + +/* Resolve buffer destination address. The cache must be empty before changing + * this register if the cb is in resolve mode. Unpipelined + */ +#define R300_RB3D_AARESOLVE_OFFSET 0x4e80 +# define R300_RB3D_AARESOLVE_OFFSET_SHIFT 5 +# define R300_RB3D_AARESOLVE_OFFSET_MASK 0xffffffe0 /* At least according to the calculations of Christoph Brill */ + +/* Resolve Buffer Pitch and Tiling Control. 
The cache must be empty before + * changing this register if the cb is in resolve mode. Unpipelined + */ +#define R300_RB3D_AARESOLVE_PITCH 0x4e84 +# define R300_RB3D_AARESOLVE_PITCH_SHIFT 1 +# define R300_RB3D_AARESOLVE_PITCH_MASK 0x00003ffe /* At least according to the calculations of Christoph Brill */ + +/* Resolve Buffer Control. Unpipelined */ +#define R300_RB3D_AARESOLVE_CTL 0x4e88 +# define R300_RB3D_AARESOLVE_CTL_AARESOLVE_MODE_NORMAL (0 << 0) +# define R300_RB3D_AARESOLVE_CTL_AARESOLVE_MODE_RESOLVE (1 << 0) +# define R300_RB3D_AARESOLVE_CTL_AARESOLVE_GAMMA_10 (0 << 1) +# define R300_RB3D_AARESOLVE_CTL_AARESOLVE_GAMMA_22 (1 << 1) +# define R300_RB3D_AARESOLVE_CTL_AARESOLVE_ALPHA_SAMPLE0 (0 << 2) +# define R300_RB3D_AARESOLVE_CTL_AARESOLVE_ALPHA_AVERAGE (1 << 2) + + +/* Discard src pixels less than or equal to threshold. */ +#define R500_RB3D_DISCARD_SRC_PIXEL_LTE_THRESHOLD 0x4ea0 +/* Discard src pixels greater than or equal to threshold. */ +#define R500_RB3D_DISCARD_SRC_PIXEL_GTE_THRESHOLD 0x4ea4 +# define R500_RB3D_DISCARD_SRC_PIXEL_THRESHOLD_BLUE_SHIFT 0 +# define R500_RB3D_DISCARD_SRC_PIXEL_THRESHOLD_BLUE_MASK 0x000000ff +# define R500_RB3D_DISCARD_SRC_PIXEL_THRESHOLD_GREEN_SHIFT 8 +# define R500_RB3D_DISCARD_SRC_PIXEL_THRESHOLD_GREEN_MASK 0x0000ff00 +# define R500_RB3D_DISCARD_SRC_PIXEL_THRESHOLD_RED_SHIFT 16 +# define R500_RB3D_DISCARD_SRC_PIXEL_THRESHOLD_RED_MASK 0x00ff0000 +# define R500_RB3D_DISCARD_SRC_PIXEL_THRESHOLD_ALPHA_SHIFT 24 +# define R500_RB3D_DISCARD_SRC_PIXEL_THRESHOLD_ALPHA_MASK 0xff000000 + +/* 3D ROP Control. Stalls the 2d/3d datapath until it is idle. */ +#define R300_RB3D_ROPCNTL 0x4e18 +# define R300_RB3D_ROPCNTL_ROP_ENABLE 0x00000004 +# define R300_RB3D_ROPCNTL_ROP_MASK (15 << 8) +# define R300_RB3D_ROPCNTL_ROP_SHIFT 8 + +/* Color Compare Flip. Stalls the 2d/3d datapath until it is idle. */ +#define R300_RB3D_CLRCMP_FLIPE 0x4e1c + +/* Sets the fifo sizes */ +#define R500_RB3D_FIFO_SIZE 0x4ef4 +# define R500_RB3D_FIFO_SIZE_OP_FIFO_SIZE_FULL (0 << 0) +# define R500_RB3D_FIFO_SIZE_OP_FIFO_SIZE_HALF (1 << 0) +# define R500_RB3D_FIFO_SIZE_OP_FIFO_SIZE_QUATER (2 << 0) +# define R500_RB3D_FIFO_SIZE_OP_FIFO_SIZE_EIGTHS (3 << 0) + +/* Constant color used by the blender. Pipelined through the blender. */ +#define R500_RB3D_CONSTANT_COLOR_AR 0x4ef8 +# define R500_RB3D_CONSTANT_COLOR_AR_RED_MASK 0x0000ffff +# define R500_RB3D_CONSTANT_COLOR_AR_RED_SHIFT 0 +# define R500_RB3D_CONSTANT_COLOR_AR_ALPHA_MASK 0xffff0000 +# define R500_RB3D_CONSTANT_COLOR_AR_ALPHA_SHIFT 16 + +/* Constant color used by the blender. Pipelined through the blender. */ +#define R500_RB3D_CONSTANT_COLOR_GB 0x4efc +# define R500_RB3D_CONSTANT_COLOR_AR_BLUE_MASK 0x0000ffff +# define R500_RB3D_CONSTANT_COLOR_AR_BLUE_SHIFT 0 +# define R500_RB3D_CONSTANT_COLOR_AR_GREEN_MASK 0xffff0000 +# define R500_RB3D_CONSTANT_COLOR_AR_GREEN_SHIFT 16 + +/* gap */ +/* There seems to be no "write only" setting, so use Z-test = ALWAYS + * for this. + * Bit (1<<8) is the "test" bit. 
so plain write is 6 - vd + */ +#define R300_ZB_CNTL 0x4F00 +# define R300_STENCIL_ENABLE (1 << 0) +# define R300_Z_ENABLE (1 << 1) +# define R300_Z_WRITE_ENABLE (1 << 2) +# define R300_Z_SIGNED_COMPARE (1 << 3) +# define R300_STENCIL_FRONT_BACK (1 << 4) + +#define R300_ZB_ZSTENCILCNTL 0x4f04 + /* functions */ +# define R300_ZS_NEVER 0 +# define R300_ZS_LESS 1 +# define R300_ZS_LEQUAL 2 +# define R300_ZS_EQUAL 3 +# define R300_ZS_GEQUAL 4 +# define R300_ZS_GREATER 5 +# define R300_ZS_NOTEQUAL 6 +# define R300_ZS_ALWAYS 7 +# define R300_ZS_MASK 7 + /* operations */ +# define R300_ZS_KEEP 0 +# define R300_ZS_ZERO 1 +# define R300_ZS_REPLACE 2 +# define R300_ZS_INCR 3 +# define R300_ZS_DECR 4 +# define R300_ZS_INVERT 5 +# define R300_ZS_INCR_WRAP 6 +# define R300_ZS_DECR_WRAP 7 +# define R300_Z_FUNC_SHIFT 0 + /* front and back refer to operations done for front + and back faces, i.e. separate stencil function support */ +# define R300_S_FRONT_FUNC_SHIFT 3 +# define R300_S_FRONT_SFAIL_OP_SHIFT 6 +# define R300_S_FRONT_ZPASS_OP_SHIFT 9 +# define R300_S_FRONT_ZFAIL_OP_SHIFT 12 +# define R300_S_BACK_FUNC_SHIFT 15 +# define R300_S_BACK_SFAIL_OP_SHIFT 18 +# define R300_S_BACK_ZPASS_OP_SHIFT 21 +# define R300_S_BACK_ZFAIL_OP_SHIFT 24 + +#define R300_ZB_STENCILREFMASK 0x4f08 +# define R300_STENCILREF_SHIFT 0 +# define R300_STENCILREF_MASK 0x000000ff +# define R300_STENCILMASK_SHIFT 8 +# define R300_STENCILMASK_MASK 0x0000ff00 +# define R300_STENCILWRITEMASK_SHIFT 16 +# define R300_STENCILWRITEMASK_MASK 0x00ff0000 + +/* gap */ + +#define R300_ZB_FORMAT 0x4f10 +# define R300_DEPTHFORMAT_16BIT_INT_Z (0 << 0) +# define R300_DEPTHFORMAT_16BIT_13E3 (1 << 0) +# define R300_DEPTHFORMAT_24BIT_INT_Z_8BIT_STENCIL (2 << 0) +/* reserved up to (15 << 0) */ +# define R300_INVERT_13E3_LEADING_ONES (0 << 4) +# define R300_INVERT_13E3_LEADING_ZEROS (1 << 4) + +#define R300_ZB_ZTOP 0x4F14 +# define R300_ZTOP_DISABLE (0 << 0) +# define R300_ZTOP_ENABLE (1 << 0) + +/* gap */ + +#define R300_ZB_ZCACHE_CTLSTAT 0x4f18 +# define R300_ZB_ZCACHE_CTLSTAT_ZC_FLUSH_NO_EFFECT (0 << 0) +# define R300_ZB_ZCACHE_CTLSTAT_ZC_FLUSH_FLUSH_AND_FREE (1 << 0) +# define R300_ZB_ZCACHE_CTLSTAT_ZC_FREE_NO_EFFECT (0 << 1) +# define R300_ZB_ZCACHE_CTLSTAT_ZC_FREE_FREE (1 << 1) +# define R300_ZB_ZCACHE_CTLSTAT_ZC_BUSY_IDLE (0 << 31) +# define R300_ZB_ZCACHE_CTLSTAT_ZC_BUSY_BUSY (1 << 31) + +#define R300_ZB_BW_CNTL 0x4f1c +# define R300_HIZ_DISABLE (0 << 0) +# define R300_HIZ_ENABLE (1 << 0) +# define R300_HIZ_MIN (0 << 1) +# define R300_HIZ_MAX (1 << 1) +# define R300_FAST_FILL_DISABLE (0 << 2) +# define R300_FAST_FILL_ENABLE (1 << 2) +# define R300_RD_COMP_DISABLE (0 << 3) +# define R300_RD_COMP_ENABLE (1 << 3) +# define R300_WR_COMP_DISABLE (0 << 4) +# define R300_WR_COMP_ENABLE (1 << 4) +# define R300_ZB_CB_CLEAR_RMW (0 << 5) +# define R300_ZB_CB_CLEAR_CACHE_LINEAR (1 << 5) +# define R300_FORCE_COMPRESSED_STENCIL_VALUE_DISABLE (0 << 6) +# define R300_FORCE_COMPRESSED_STENCIL_VALUE_ENABLE (1 << 6) + +# define R500_ZEQUAL_OPTIMIZE_ENABLE (0 << 7) +# define R500_ZEQUAL_OPTIMIZE_DISABLE (1 << 7) +# define R500_SEQUAL_OPTIMIZE_ENABLE (0 << 8) +# define R500_SEQUAL_OPTIMIZE_DISABLE (1 << 8) + +# define R500_BMASK_ENABLE (0 << 10) +# define R500_BMASK_DISABLE (1 << 10) +# define R500_HIZ_EQUAL_REJECT_DISABLE (0 << 11) +# define R500_HIZ_EQUAL_REJECT_ENABLE (1 << 11) +# define R500_HIZ_FP_EXP_BITS_DISABLE (0 << 12) +# define R500_HIZ_FP_EXP_BITS_1 (1 << 12) +# define R500_HIZ_FP_EXP_BITS_2 (2 << 12) +# define R500_HIZ_FP_EXP_BITS_3 (3 << 12) +# define 
R500_HIZ_FP_EXP_BITS_4 (4 << 12) +# define R500_HIZ_FP_EXP_BITS_5 (5 << 12) +# define R500_HIZ_FP_INVERT_LEADING_ONES (0 << 15) +# define R500_HIZ_FP_INVERT_LEADING_ZEROS (1 << 15) +# define R500_TILE_OVERWRITE_RECOMPRESSION_ENABLE (0 << 16) +# define R500_TILE_OVERWRITE_RECOMPRESSION_DISABLE (1 << 16) +# define R500_CONTIGUOUS_6XAA_SAMPLES_ENABLE (0 << 17) +# define R500_CONTIGUOUS_6XAA_SAMPLES_DISABLE (1 << 17) +# define R500_PEQ_PACKING_DISABLE (0 << 18) +# define R500_PEQ_PACKING_ENABLE (1 << 18) +# define R500_COVERED_PTR_MASKING_DISABLE (0 << 18) +# define R500_COVERED_PTR_MASKING_ENABLE (1 << 18) + + +/* gap */ + +/* Z Buffer Address Offset. + * Bits 31 to 5 are used for aligned Z buffer address offset for macro tiles. + */ +#define R300_ZB_DEPTHOFFSET 0x4f20 + +/* Z Buffer Pitch and Endian Control */ +#define R300_ZB_DEPTHPITCH 0x4f24 +# define R300_DEPTHPITCH_MASK 0x00003FFC +# define R300_DEPTHMACROTILE_DISABLE (0 << 16) +# define R300_DEPTHMACROTILE_ENABLE (1 << 16) +# define R300_DEPTHMICROTILE_LINEAR (0 << 17) +# define R300_DEPTHMICROTILE_TILED (1 << 17) +# define R300_DEPTHMICROTILE_TILED_SQUARE (2 << 17) +# define R300_DEPTHENDIAN_NO_SWAP (0 << 18) +# define R300_DEPTHENDIAN_WORD_SWAP (1 << 18) +# define R300_DEPTHENDIAN_DWORD_SWAP (2 << 18) +# define R300_DEPTHENDIAN_HALF_DWORD_SWAP (3 << 18) + +/* Z Buffer Clear Value */ +#define R300_ZB_DEPTHCLEARVALUE 0x4f28 + +/* Hierarchical Z Memory Offset */ +#define R300_ZB_HIZ_OFFSET 0x4f44 + +/* Hierarchical Z Write Index */ +#define R300_ZB_HIZ_WRINDEX 0x4f48 + +/* Hierarchical Z Data */ +#define R300_ZB_HIZ_DWORD 0x4f4c + +/* Hierarchical Z Read Index */ +#define R300_ZB_HIZ_RDINDEX 0x4f50 + +/* Hierarchical Z Pitch */ +#define R300_ZB_HIZ_PITCH 0x4f54 + +/* Z Buffer Z Pass Counter Data */ +#define R300_ZB_ZPASS_DATA 0x4f58 + +/* Z Buffer Z Pass Counter Address */ +#define R300_ZB_ZPASS_ADDR 0x4f5c + +/* Depth buffer X and Y coordinate offset */ +#define R300_ZB_DEPTHXY_OFFSET 0x4f60 +# define R300_DEPTHX_OFFSET_SHIFT 1 +# define R300_DEPTHX_OFFSET_MASK 0x000007FE +# define R300_DEPTHY_OFFSET_SHIFT 17 +# define R300_DEPTHY_OFFSET_MASK 0x07FE0000 + +/* Sets the fifo sizes */ +#define R500_ZB_FIFO_SIZE 0x4fd0 +# define R500_OP_FIFO_SIZE_FULL (0 << 0) +# define R500_OP_FIFO_SIZE_HALF (1 << 0) +# define R500_OP_FIFO_SIZE_QUATER (2 << 0) +# define R500_OP_FIFO_SIZE_EIGTHS (4 << 0) + +/* Stencil Reference Value and Mask for backfacing quads */ +/* R300_ZB_STENCILREFMASK handles front face */ +#define R500_ZB_STENCILREFMASK_BF 0x4fd4 +# define R500_STENCILREF_SHIFT 0 +# define R500_STENCILREF_MASK 0x000000ff +# define R500_STENCILMASK_SHIFT 8 +# define R500_STENCILMASK_MASK 0x0000ff00 +# define R500_STENCILWRITEMASK_SHIFT 16 +# define R500_STENCILWRITEMASK_MASK 0x00ff0000 + +/** + * \defgroup R3XX_R5XX_PROGRAMMABLE_VERTEX_SHADER_DESCRIPTION R3XX-R5XX PROGRAMMABLE VERTEX SHADER DESCRIPTION + * + * The PVS_DST_MATH_INST is used to identify whether the instruction is a Vector + * Engine instruction or a Math Engine instruction. 
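+ *
+ * As a sketch of how the masks and shifts below combine (purely
+ * illustrative), the destination dword of a Vector Engine multiply-add
+ * writing all four channels of temporary register 1 would look roughly
+ * like:
+ *
+ *   uint32_t dst = (VE_MULTIPLY_ADD << PVS_DST_OPCODE_SHIFT)
+ *                | (PVS_DST_REG_TEMPORARY << PVS_DST_REG_TYPE_SHIFT)
+ *                | (1 << PVS_DST_OFFSET_SHIFT)
+ *                | (1 << PVS_DST_WE_X_SHIFT) | (1 << PVS_DST_WE_Y_SHIFT)
+ *                | (1 << PVS_DST_WE_Z_SHIFT) | (1 << PVS_DST_WE_W_SHIFT);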
+ */ + +/*\{*/ + +enum { + /* R3XX */ + VECTOR_NO_OP = 0, + VE_DOT_PRODUCT = 1, + VE_MULTIPLY = 2, + VE_ADD = 3, + VE_MULTIPLY_ADD = 4, + VE_DISTANCE_VECTOR = 5, + VE_FRACTION = 6, + VE_MAXIMUM = 7, + VE_MINIMUM = 8, + VE_SET_GREATER_THAN_EQUAL = 9, + VE_SET_LESS_THAN = 10, + VE_MULTIPLYX2_ADD = 11, + VE_MULTIPLY_CLAMP = 12, + VE_FLT2FIX_DX = 13, + VE_FLT2FIX_DX_RND = 14, + /* R5XX */ + VE_PRED_SET_EQ_PUSH = 15, + VE_PRED_SET_GT_PUSH = 16, + VE_PRED_SET_GTE_PUSH = 17, + VE_PRED_SET_NEQ_PUSH = 18, + VE_COND_WRITE_EQ = 19, + VE_COND_WRITE_GT = 20, + VE_COND_WRITE_GTE = 21, + VE_COND_WRITE_NEQ = 22, + VE_COND_MUX_EQ = 23, + VE_COND_MUX_GT = 24, + VE_COND_MUX_GTE = 25, + VE_SET_GREATER_THAN = 26, + VE_SET_EQUAL = 27, + VE_SET_NOT_EQUAL = 28, +}; + +enum { + /* R3XX */ + MATH_NO_OP = 0, + ME_EXP_BASE2_DX = 1, + ME_LOG_BASE2_DX = 2, + ME_EXP_BASEE_FF = 3, + ME_LIGHT_COEFF_DX = 4, + ME_POWER_FUNC_FF = 5, + ME_RECIP_DX = 6, + ME_RECIP_FF = 7, + ME_RECIP_SQRT_DX = 8, + ME_RECIP_SQRT_FF = 9, + ME_MULTIPLY = 10, + ME_EXP_BASE2_FULL_DX = 11, + ME_LOG_BASE2_FULL_DX = 12, + ME_POWER_FUNC_FF_CLAMP_B = 13, + ME_POWER_FUNC_FF_CLAMP_B1 = 14, + ME_POWER_FUNC_FF_CLAMP_01 = 15, + ME_SIN = 16, + ME_COS = 17, + /* R5XX */ + ME_LOG_BASE2_IEEE = 18, + ME_RECIP_IEEE = 19, + ME_RECIP_SQRT_IEEE = 20, + ME_PRED_SET_EQ = 21, + ME_PRED_SET_GT = 22, + ME_PRED_SET_GTE = 23, + ME_PRED_SET_NEQ = 24, + ME_PRED_SET_CLR = 25, + ME_PRED_SET_INV = 26, + ME_PRED_SET_POP = 27, + ME_PRED_SET_RESTORE = 28, +}; + +enum { + /* R3XX */ + PVS_MACRO_OP_2CLK_MADD = 0, + PVS_MACRO_OP_2CLK_M2X_ADD = 1, +}; + +enum { + PVS_SRC_REG_TEMPORARY = 0, /* Intermediate Storage */ + PVS_SRC_REG_INPUT = 1, /* Input Vertex Storage */ + PVS_SRC_REG_CONSTANT = 2, /* Constant State Storage */ + PVS_SRC_REG_ALT_TEMPORARY = 3, /* Alternate Intermediate Storage */ +}; + +enum { + PVS_DST_REG_TEMPORARY = 0, /* Intermediate Storage */ + PVS_DST_REG_A0 = 1, /* Address Register Storage */ + PVS_DST_REG_OUT = 2, /* Output Memory. 
Used for all outputs */ + PVS_DST_REG_OUT_REPL_X = 3, /* Output Memory & Replicate X to all channels */ + PVS_DST_REG_ALT_TEMPORARY = 4, /* Alternate Intermediate Storage */ + PVS_DST_REG_INPUT = 5, /* Output Memory & Replicate X to all channels */ +}; + +enum { + PVS_SRC_SELECT_X = 0, /* Select X Component */ + PVS_SRC_SELECT_Y = 1, /* Select Y Component */ + PVS_SRC_SELECT_Z = 2, /* Select Z Component */ + PVS_SRC_SELECT_W = 3, /* Select W Component */ + PVS_SRC_SELECT_FORCE_0 = 4, /* Force Component to 0.0 */ + PVS_SRC_SELECT_FORCE_1 = 5, /* Force Component to 1.0 */ +}; + +/* PVS Opcode & Destination Operand Description */ + +enum { + PVS_DST_OPCODE_MASK = 0x3f, + PVS_DST_OPCODE_SHIFT = 0, + PVS_DST_MATH_INST_MASK = 0x1, + PVS_DST_MATH_INST_SHIFT = 6, + PVS_DST_MACRO_INST_MASK = 0x1, + PVS_DST_MACRO_INST_SHIFT = 7, + PVS_DST_REG_TYPE_MASK = 0xf, + PVS_DST_REG_TYPE_SHIFT = 8, + PVS_DST_ADDR_MODE_1_MASK = 0x1, + PVS_DST_ADDR_MODE_1_SHIFT = 12, + PVS_DST_OFFSET_MASK = 0x7f, + PVS_DST_OFFSET_SHIFT = 13, + PVS_DST_WE_X_MASK = 0x1, + PVS_DST_WE_X_SHIFT = 20, + PVS_DST_WE_Y_MASK = 0x1, + PVS_DST_WE_Y_SHIFT = 21, + PVS_DST_WE_Z_MASK = 0x1, + PVS_DST_WE_Z_SHIFT = 22, + PVS_DST_WE_W_MASK = 0x1, + PVS_DST_WE_W_SHIFT = 23, + PVS_DST_VE_SAT_MASK = 0x1, + PVS_DST_VE_SAT_SHIFT = 24, + PVS_DST_ME_SAT_MASK = 0x1, + PVS_DST_ME_SAT_SHIFT = 25, + PVS_DST_PRED_ENABLE_MASK = 0x1, + PVS_DST_PRED_ENABLE_SHIFT = 26, + PVS_DST_PRED_SENSE_MASK = 0x1, + PVS_DST_PRED_SENSE_SHIFT = 27, + PVS_DST_DUAL_MATH_OP_MASK = 0x3, + PVS_DST_DUAL_MATH_OP_SHIFT = 27, + PVS_DST_ADDR_SEL_MASK = 0x3, + PVS_DST_ADDR_SEL_SHIFT = 29, + PVS_DST_ADDR_MODE_0_MASK = 0x1, + PVS_DST_ADDR_MODE_0_SHIFT = 31, +}; + +/* PVS Source Operand Description */ + +enum { + PVS_SRC_REG_TYPE_MASK = 0x3, + PVS_SRC_REG_TYPE_SHIFT = 0, + SPARE_0_MASK = 0x1, + SPARE_0_SHIFT = 2, + PVS_SRC_ABS_XYZW_MASK = 0x1, + PVS_SRC_ABS_XYZW_SHIFT = 3, + PVS_SRC_ADDR_MODE_0_MASK = 0x1, + PVS_SRC_ADDR_MODE_0_SHIFT = 4, + PVS_SRC_OFFSET_MASK = 0xff, + PVS_SRC_OFFSET_SHIFT = 5, + PVS_SRC_SWIZZLE_X_MASK = 0x7, + PVS_SRC_SWIZZLE_X_SHIFT = 13, + PVS_SRC_SWIZZLE_Y_MASK = 0x7, + PVS_SRC_SWIZZLE_Y_SHIFT = 16, + PVS_SRC_SWIZZLE_Z_MASK = 0x7, + PVS_SRC_SWIZZLE_Z_SHIFT = 19, + PVS_SRC_SWIZZLE_W_MASK = 0x7, + PVS_SRC_SWIZZLE_W_SHIFT = 22, + PVS_SRC_MODIFIER_X_MASK = 0x1, + PVS_SRC_MODIFIER_X_SHIFT = 25, + PVS_SRC_MODIFIER_Y_MASK = 0x1, + PVS_SRC_MODIFIER_Y_SHIFT = 26, + PVS_SRC_MODIFIER_Z_MASK = 0x1, + PVS_SRC_MODIFIER_Z_SHIFT = 27, + PVS_SRC_MODIFIER_W_MASK = 0x1, + PVS_SRC_MODIFIER_W_SHIFT = 28, + PVS_SRC_ADDR_SEL_MASK = 0x3, + PVS_SRC_ADDR_SEL_SHIFT = 29, + PVS_SRC_ADDR_MODE_1_MASK = 0x0, + PVS_SRC_ADDR_MODE_1_SHIFT = 32, +}; + +/*\}*/ + +/* BEGIN: Packet 3 commands */ + +/* A primitive emission dword. 
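+ * For example (a sketch using the defines below), an indexed triangle list
+ * draw of vertex_count vertices would be encoded as:
+ *
+ *   uint32_t vf_cntl = R300_PRIM_TYPE_TRI_LIST | R300_PRIM_WALK_IND |
+ *                      (vertex_count << R300_PRIM_NUM_VERTICES_SHIFT);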
*/ +#define R300_PRIM_TYPE_NONE (0 << 0) +#define R300_PRIM_TYPE_POINT (1 << 0) +#define R300_PRIM_TYPE_LINE (2 << 0) +#define R300_PRIM_TYPE_LINE_STRIP (3 << 0) +#define R300_PRIM_TYPE_TRI_LIST (4 << 0) +#define R300_PRIM_TYPE_TRI_FAN (5 << 0) +#define R300_PRIM_TYPE_TRI_STRIP (6 << 0) +#define R300_PRIM_TYPE_TRI_TYPE2 (7 << 0) +#define R300_PRIM_TYPE_RECT_LIST (8 << 0) +#define R300_PRIM_TYPE_3VRT_POINT_LIST (9 << 0) +#define R300_PRIM_TYPE_3VRT_LINE_LIST (10 << 0) + /* GUESS (based on r200) */ +#define R300_PRIM_TYPE_POINT_SPRITES (11 << 0) +#define R300_PRIM_TYPE_LINE_LOOP (12 << 0) +#define R300_PRIM_TYPE_QUADS (13 << 0) +#define R300_PRIM_TYPE_QUAD_STRIP (14 << 0) +#define R300_PRIM_TYPE_POLYGON (15 << 0) +#define R300_PRIM_TYPE_MASK 0xF +#define R300_PRIM_WALK_IND (1 << 4) +#define R300_PRIM_WALK_LIST (2 << 4) +#define R300_PRIM_WALK_RING (3 << 4) +#define R300_PRIM_WALK_MASK (3 << 4) + /* GUESS (based on r200) */ +#define R300_PRIM_COLOR_ORDER_BGRA (0 << 6) +#define R300_PRIM_COLOR_ORDER_RGBA (1 << 6) +#define R300_PRIM_NUM_VERTICES_SHIFT 16 +#define R300_PRIM_NUM_VERTICES_MASK 0xffff + + + +/* + * The R500 unified shader (US) registers come in banks of 512 each, one + * for each instruction slot in the shader. You can't touch them directly. + * R500_US_VECTOR_INDEX() sets the base instruction to modify; successive + * writes to R500_GA_US_VECTOR_DATA autoincrement the index after the + * instruction is fully specified. + */ +#define R500_US_ALU_ALPHA_INST_0 0xa800 +# define R500_ALPHA_OP_MAD 0 +# define R500_ALPHA_OP_DP 1 +# define R500_ALPHA_OP_MIN 2 +# define R500_ALPHA_OP_MAX 3 +/* #define R500_ALPHA_OP_RESERVED 4 */ +# define R500_ALPHA_OP_CND 5 +# define R500_ALPHA_OP_CMP 6 +# define R500_ALPHA_OP_FRC 7 +# define R500_ALPHA_OP_EX2 8 +# define R500_ALPHA_OP_LN2 9 +# define R500_ALPHA_OP_RCP 10 +# define R500_ALPHA_OP_RSQ 11 +# define R500_ALPHA_OP_SIN 12 +# define R500_ALPHA_OP_COS 13 +# define R500_ALPHA_OP_MDH 14 +# define R500_ALPHA_OP_MDV 15 +# define R500_ALPHA_ADDRD(x) ((x) << 4) +# define R500_ALPHA_ADDRD_REL (1 << 11) +# define R500_ALPHA_SEL_A_SHIFT 12 +# define R500_ALPHA_SEL_A_SRC0 (0 << 12) +# define R500_ALPHA_SEL_A_SRC1 (1 << 12) +# define R500_ALPHA_SEL_A_SRC2 (2 << 12) +# define R500_ALPHA_SEL_A_SRCP (3 << 12) +# define R500_ALPHA_SWIZ_A_R (0 << 14) +# define R500_ALPHA_SWIZ_A_G (1 << 14) +# define R500_ALPHA_SWIZ_A_B (2 << 14) +# define R500_ALPHA_SWIZ_A_A (3 << 14) +# define R500_ALPHA_SWIZ_A_0 (4 << 14) +# define R500_ALPHA_SWIZ_A_HALF (5 << 14) +# define R500_ALPHA_SWIZ_A_1 (6 << 14) +/* #define R500_ALPHA_SWIZ_A_UNUSED (7 << 14) */ +# define R500_ALPHA_MOD_A_NOP (0 << 17) +# define R500_ALPHA_MOD_A_NEG (1 << 17) +# define R500_ALPHA_MOD_A_ABS (2 << 17) +# define R500_ALPHA_MOD_A_NAB (3 << 17) +# define R500_ALPHA_SEL_B_SHIFT 19 +# define R500_ALPHA_SEL_B_SRC0 (0 << 19) +# define R500_ALPHA_SEL_B_SRC1 (1 << 19) +# define R500_ALPHA_SEL_B_SRC2 (2 << 19) +# define R500_ALPHA_SEL_B_SRCP (3 << 19) +# define R500_ALPHA_SWIZ_B_R (0 << 21) +# define R500_ALPHA_SWIZ_B_G (1 << 21) +# define R500_ALPHA_SWIZ_B_B (2 << 21) +# define R500_ALPHA_SWIZ_B_A (3 << 21) +# define R500_ALPHA_SWIZ_B_0 (4 << 21) +# define R500_ALPHA_SWIZ_B_HALF (5 << 21) +# define R500_ALPHA_SWIZ_B_1 (6 << 21) +/* #define R500_ALPHA_SWIZ_B_UNUSED (7 << 21) */ +# define R500_ALPHA_MOD_B_NOP (0 << 24) +# define R500_ALPHA_MOD_B_NEG (1 << 24) +# define R500_ALPHA_MOD_B_ABS (2 << 24) +# define R500_ALPHA_MOD_B_NAB (3 << 24) +# define R500_ALPHA_OMOD_IDENTITY (0 << 26) +# define R500_ALPHA_OMOD_MUL_2 
(1 << 26) +# define R500_ALPHA_OMOD_MUL_4 (2 << 26) +# define R500_ALPHA_OMOD_MUL_8 (3 << 26) +# define R500_ALPHA_OMOD_DIV_2 (4 << 26) +# define R500_ALPHA_OMOD_DIV_4 (5 << 26) +# define R500_ALPHA_OMOD_DIV_8 (6 << 26) +# define R500_ALPHA_OMOD_DISABLE (7 << 26) +# define R500_ALPHA_TARGET(x) ((x) << 29) +# define R500_ALPHA_W_OMASK (1 << 31) +#define R500_US_ALU_ALPHA_ADDR_0 0x9800 +# define R500_ALPHA_ADDR0(x) ((x) << 0) +# define R500_ALPHA_ADDR0_CONST (1 << 8) +# define R500_ALPHA_ADDR0_REL (1 << 9) +# define R500_ALPHA_ADDR1(x) ((x) << 10) +# define R500_ALPHA_ADDR1_CONST (1 << 18) +# define R500_ALPHA_ADDR1_REL (1 << 19) +# define R500_ALPHA_ADDR2(x) ((x) << 20) +# define R500_ALPHA_ADDR2_CONST (1 << 28) +# define R500_ALPHA_ADDR2_REL (1 << 29) +# define R500_ALPHA_SRCP_OP_1_MINUS_2A0 (0 << 30) +# define R500_ALPHA_SRCP_OP_A1_MINUS_A0 (1 << 30) +# define R500_ALPHA_SRCP_OP_A1_PLUS_A0 (2 << 30) +# define R500_ALPHA_SRCP_OP_1_MINUS_A0 (3 << 30) +#define R500_US_ALU_RGBA_INST_0 0xb000 +# define R500_ALU_RGBA_OP_MAD (0 << 0) +# define R500_ALU_RGBA_OP_DP3 (1 << 0) +# define R500_ALU_RGBA_OP_DP4 (2 << 0) +# define R500_ALU_RGBA_OP_D2A (3 << 0) +# define R500_ALU_RGBA_OP_MIN (4 << 0) +# define R500_ALU_RGBA_OP_MAX (5 << 0) +/* #define R500_ALU_RGBA_OP_RESERVED (6 << 0) */ +# define R500_ALU_RGBA_OP_CND (7 << 0) +# define R500_ALU_RGBA_OP_CMP (8 << 0) +# define R500_ALU_RGBA_OP_FRC (9 << 0) +# define R500_ALU_RGBA_OP_SOP (10 << 0) +# define R500_ALU_RGBA_OP_MDH (11 << 0) +# define R500_ALU_RGBA_OP_MDV (12 << 0) +# define R500_ALU_RGBA_ADDRD(x) ((x) << 4) +# define R500_ALU_RGBA_ADDRD_REL (1 << 11) +# define R500_ALU_RGBA_SEL_C_SHIFT 12 +# define R500_ALU_RGBA_SEL_C_SRC0 (0 << 12) +# define R500_ALU_RGBA_SEL_C_SRC1 (1 << 12) +# define R500_ALU_RGBA_SEL_C_SRC2 (2 << 12) +# define R500_ALU_RGBA_SEL_C_SRCP (3 << 12) +# define R500_ALU_RGBA_R_SWIZ_R (0 << 14) +# define R500_ALU_RGBA_R_SWIZ_G (1 << 14) +# define R500_ALU_RGBA_R_SWIZ_B (2 << 14) +# define R500_ALU_RGBA_R_SWIZ_A (3 << 14) +# define R500_ALU_RGBA_R_SWIZ_0 (4 << 14) +# define R500_ALU_RGBA_R_SWIZ_HALF (5 << 14) +# define R500_ALU_RGBA_R_SWIZ_1 (6 << 14) +/* #define R500_ALU_RGBA_R_SWIZ_UNUSED (7 << 14) */ +# define R500_ALU_RGBA_G_SWIZ_R (0 << 17) +# define R500_ALU_RGBA_G_SWIZ_G (1 << 17) +# define R500_ALU_RGBA_G_SWIZ_B (2 << 17) +# define R500_ALU_RGBA_G_SWIZ_A (3 << 17) +# define R500_ALU_RGBA_G_SWIZ_0 (4 << 17) +# define R500_ALU_RGBA_G_SWIZ_HALF (5 << 17) +# define R500_ALU_RGBA_G_SWIZ_1 (6 << 17) +/* #define R500_ALU_RGBA_G_SWIZ_UNUSED (7 << 17) */ +# define R500_ALU_RGBA_B_SWIZ_R (0 << 20) +# define R500_ALU_RGBA_B_SWIZ_G (1 << 20) +# define R500_ALU_RGBA_B_SWIZ_B (2 << 20) +# define R500_ALU_RGBA_B_SWIZ_A (3 << 20) +# define R500_ALU_RGBA_B_SWIZ_0 (4 << 20) +# define R500_ALU_RGBA_B_SWIZ_HALF (5 << 20) +# define R500_ALU_RGBA_B_SWIZ_1 (6 << 20) +/* #define R500_ALU_RGBA_B_SWIZ_UNUSED (7 << 20) */ +# define R500_ALU_RGBA_MOD_C_NOP (0 << 23) +# define R500_ALU_RGBA_MOD_C_NEG (1 << 23) +# define R500_ALU_RGBA_MOD_C_ABS (2 << 23) +# define R500_ALU_RGBA_MOD_C_NAB (3 << 23) +# define R500_ALU_RGBA_ALPHA_SEL_C_SHIFT 25 +# define R500_ALU_RGBA_ALPHA_SEL_C_SRC0 (0 << 25) +# define R500_ALU_RGBA_ALPHA_SEL_C_SRC1 (1 << 25) +# define R500_ALU_RGBA_ALPHA_SEL_C_SRC2 (2 << 25) +# define R500_ALU_RGBA_ALPHA_SEL_C_SRCP (3 << 25) +# define R500_ALU_RGBA_A_SWIZ_R (0 << 27) +# define R500_ALU_RGBA_A_SWIZ_G (1 << 27) +# define R500_ALU_RGBA_A_SWIZ_B (2 << 27) +# define R500_ALU_RGBA_A_SWIZ_A (3 << 27) +# define R500_ALU_RGBA_A_SWIZ_0 (4 << 27) 
+# define R500_ALU_RGBA_A_SWIZ_HALF (5 << 27) +# define R500_ALU_RGBA_A_SWIZ_1 (6 << 27) +/* #define R500_ALU_RGBA_A_SWIZ_UNUSED (7 << 27) */ +# define R500_ALU_RGBA_ALPHA_MOD_C_NOP (0 << 30) +# define R500_ALU_RGBA_ALPHA_MOD_C_NEG (1 << 30) +# define R500_ALU_RGBA_ALPHA_MOD_C_ABS (2 << 30) +# define R500_ALU_RGBA_ALPHA_MOD_C_NAB (3 << 30) +#define R500_US_ALU_RGB_INST_0 0xa000 +# define R500_ALU_RGB_SEL_A_SHIFT 0 +# define R500_ALU_RGB_SEL_A_SRC0 (0 << 0) +# define R500_ALU_RGB_SEL_A_SRC1 (1 << 0) +# define R500_ALU_RGB_SEL_A_SRC2 (2 << 0) +# define R500_ALU_RGB_SEL_A_SRCP (3 << 0) +# define R500_ALU_RGB_R_SWIZ_A_R (0 << 2) +# define R500_ALU_RGB_R_SWIZ_A_G (1 << 2) +# define R500_ALU_RGB_R_SWIZ_A_B (2 << 2) +# define R500_ALU_RGB_R_SWIZ_A_A (3 << 2) +# define R500_ALU_RGB_R_SWIZ_A_0 (4 << 2) +# define R500_ALU_RGB_R_SWIZ_A_HALF (5 << 2) +# define R500_ALU_RGB_R_SWIZ_A_1 (6 << 2) +/* #define R500_ALU_RGB_R_SWIZ_A_UNUSED (7 << 2) */ +# define R500_ALU_RGB_G_SWIZ_A_R (0 << 5) +# define R500_ALU_RGB_G_SWIZ_A_G (1 << 5) +# define R500_ALU_RGB_G_SWIZ_A_B (2 << 5) +# define R500_ALU_RGB_G_SWIZ_A_A (3 << 5) +# define R500_ALU_RGB_G_SWIZ_A_0 (4 << 5) +# define R500_ALU_RGB_G_SWIZ_A_HALF (5 << 5) +# define R500_ALU_RGB_G_SWIZ_A_1 (6 << 5) +/* #define R500_ALU_RGB_G_SWIZ_A_UNUSED (7 << 5) */ +# define R500_ALU_RGB_B_SWIZ_A_R (0 << 8) +# define R500_ALU_RGB_B_SWIZ_A_G (1 << 8) +# define R500_ALU_RGB_B_SWIZ_A_B (2 << 8) +# define R500_ALU_RGB_B_SWIZ_A_A (3 << 8) +# define R500_ALU_RGB_B_SWIZ_A_0 (4 << 8) +# define R500_ALU_RGB_B_SWIZ_A_HALF (5 << 8) +# define R500_ALU_RGB_B_SWIZ_A_1 (6 << 8) +/* #define R500_ALU_RGB_B_SWIZ_A_UNUSED (7 << 8) */ +# define R500_ALU_RGB_MOD_A_NOP (0 << 11) +# define R500_ALU_RGB_MOD_A_NEG (1 << 11) +# define R500_ALU_RGB_MOD_A_ABS (2 << 11) +# define R500_ALU_RGB_MOD_A_NAB (3 << 11) +# define R500_ALU_RGB_SEL_B_SHIFT 13 +# define R500_ALU_RGB_SEL_B_SRC0 (0 << 13) +# define R500_ALU_RGB_SEL_B_SRC1 (1 << 13) +# define R500_ALU_RGB_SEL_B_SRC2 (2 << 13) +# define R500_ALU_RGB_SEL_B_SRCP (3 << 13) +# define R500_ALU_RGB_R_SWIZ_B_R (0 << 15) +# define R500_ALU_RGB_R_SWIZ_B_G (1 << 15) +# define R500_ALU_RGB_R_SWIZ_B_B (2 << 15) +# define R500_ALU_RGB_R_SWIZ_B_A (3 << 15) +# define R500_ALU_RGB_R_SWIZ_B_0 (4 << 15) +# define R500_ALU_RGB_R_SWIZ_B_HALF (5 << 15) +# define R500_ALU_RGB_R_SWIZ_B_1 (6 << 15) +/* #define R500_ALU_RGB_R_SWIZ_B_UNUSED (7 << 15) */ +# define R500_ALU_RGB_G_SWIZ_B_R (0 << 18) +# define R500_ALU_RGB_G_SWIZ_B_G (1 << 18) +# define R500_ALU_RGB_G_SWIZ_B_B (2 << 18) +# define R500_ALU_RGB_G_SWIZ_B_A (3 << 18) +# define R500_ALU_RGB_G_SWIZ_B_0 (4 << 18) +# define R500_ALU_RGB_G_SWIZ_B_HALF (5 << 18) +# define R500_ALU_RGB_G_SWIZ_B_1 (6 << 18) +/* #define R500_ALU_RGB_G_SWIZ_B_UNUSED (7 << 18) */ +# define R500_ALU_RGB_B_SWIZ_B_R (0 << 21) +# define R500_ALU_RGB_B_SWIZ_B_G (1 << 21) +# define R500_ALU_RGB_B_SWIZ_B_B (2 << 21) +# define R500_ALU_RGB_B_SWIZ_B_A (3 << 21) +# define R500_ALU_RGB_B_SWIZ_B_0 (4 << 21) +# define R500_ALU_RGB_B_SWIZ_B_HALF (5 << 21) +# define R500_ALU_RGB_B_SWIZ_B_1 (6 << 21) +/* #define R500_ALU_RGB_B_SWIZ_B_UNUSED (7 << 21) */ +# define R500_ALU_RGB_MOD_B_NOP (0 << 24) +# define R500_ALU_RGB_MOD_B_NEG (1 << 24) +# define R500_ALU_RGB_MOD_B_ABS (2 << 24) +# define R500_ALU_RGB_MOD_B_NAB (3 << 24) +# define R500_ALU_RGB_OMOD_IDENTITY (0 << 26) +# define R500_ALU_RGB_OMOD_MUL_2 (1 << 26) +# define R500_ALU_RGB_OMOD_MUL_4 (2 << 26) +# define R500_ALU_RGB_OMOD_MUL_8 (3 << 26) +# define R500_ALU_RGB_OMOD_DIV_2 (4 << 26) +# define 
R500_ALU_RGB_OMOD_DIV_4 (5 << 26) +# define R500_ALU_RGB_OMOD_DIV_8 (6 << 26) +# define R500_ALU_RGB_OMOD_DISABLE (7 << 26) +# define R500_ALU_RGB_TARGET(x) ((x) << 29) +# define R500_ALU_RGB_WMASK (1 << 31) +#define R500_US_ALU_RGB_ADDR_0 0x9000 +# define R500_RGB_ADDR0(x) ((x) << 0) +# define R500_RGB_ADDR0_CONST (1 << 8) +# define R500_RGB_ADDR0_REL (1 << 9) +# define R500_RGB_ADDR1(x) ((x) << 10) +# define R500_RGB_ADDR1_CONST (1 << 18) +# define R500_RGB_ADDR1_REL (1 << 19) +# define R500_RGB_ADDR2(x) ((x) << 20) +# define R500_RGB_ADDR2_CONST (1 << 28) +# define R500_RGB_ADDR2_REL (1 << 29) +# define R500_RGB_SRCP_OP_1_MINUS_2RGB0 (0 << 30) +# define R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 (1 << 30) +# define R500_RGB_SRCP_OP_RGB1_PLUS_RGB0 (2 << 30) +# define R500_RGB_SRCP_OP_1_MINUS_RGB0 (3 << 30) +#define R500_US_CMN_INST_0 0xb800 +# define R500_INST_TYPE_MASK (3 << 0) +# define R500_INST_TYPE_ALU (0 << 0) +# define R500_INST_TYPE_OUT (1 << 0) +# define R500_INST_TYPE_FC (2 << 0) +# define R500_INST_TYPE_TEX (3 << 0) +# define R500_INST_TEX_SEM_WAIT (1 << 2) +# define R500_INST_RGB_PRED_SEL_NONE (0 << 3) +# define R500_INST_RGB_PRED_SEL_RGBA (1 << 3) +# define R500_INST_RGB_PRED_SEL_RRRR (2 << 3) +# define R500_INST_RGB_PRED_SEL_GGGG (3 << 3) +# define R500_INST_RGB_PRED_SEL_BBBB (4 << 3) +# define R500_INST_RGB_PRED_SEL_AAAA (5 << 3) +# define R500_INST_RGB_PRED_INV (1 << 6) +# define R500_INST_WRITE_INACTIVE (1 << 7) +# define R500_INST_LAST (1 << 8) +# define R500_INST_NOP (1 << 9) +# define R500_INST_ALU_WAIT (1 << 10) +# define R500_INST_RGB_WMASK_R (1 << 11) +# define R500_INST_RGB_WMASK_G (1 << 12) +# define R500_INST_RGB_WMASK_B (1 << 13) +# define R500_INST_ALPHA_WMASK (1 << 14) +# define R500_INST_RGB_OMASK_R (1 << 15) +# define R500_INST_RGB_OMASK_G (1 << 16) +# define R500_INST_RGB_OMASK_B (1 << 17) +# define R500_INST_RGB_OMASK_RGB (7 << 15) +# define R500_INST_ALPHA_OMASK (1 << 18) +# define R500_INST_RGB_CLAMP (1 << 19) +# define R500_INST_ALPHA_CLAMP (1 << 20) +# define R500_INST_ALU_RESULT_SEL (1 << 21) +# define R500_INST_ALPHA_PRED_INV (1 << 22) +# define R500_INST_ALU_RESULT_OP_EQ (0 << 23) +# define R500_INST_ALU_RESULT_OP_LT (1 << 23) +# define R500_INST_ALU_RESULT_OP_GE (2 << 23) +# define R500_INST_ALU_RESULT_OP_NE (3 << 23) +# define R500_INST_ALPHA_PRED_SEL_NONE (0 << 25) +# define R500_INST_ALPHA_PRED_SEL_RGBA (1 << 25) +# define R500_INST_ALPHA_PRED_SEL_RRRR (2 << 25) +# define R500_INST_ALPHA_PRED_SEL_GGGG (3 << 25) +# define R500_INST_ALPHA_PRED_SEL_BBBB (4 << 25) +# define R500_INST_ALPHA_PRED_SEL_AAAA (5 << 25) +/* XXX next four are kind of guessed */ +# define R500_INST_STAT_WE_R (1 << 28) +# define R500_INST_STAT_WE_G (1 << 29) +# define R500_INST_STAT_WE_B (1 << 30) +# define R500_INST_STAT_WE_A (1 << 31) + +/* note that these are 8 bit lengths, despite the offsets, at least for R500 */ +#define R500_US_CODE_ADDR 0x4630 +# define R500_US_CODE_START_ADDR(x) ((x) << 0) +# define R500_US_CODE_END_ADDR(x) ((x) << 16) +#define R500_US_CODE_OFFSET 0x4638 +# define R500_US_CODE_OFFSET_ADDR(x) ((x) << 0) +#define R500_US_CODE_RANGE 0x4634 +# define R500_US_CODE_RANGE_ADDR(x) ((x) << 0) +# define R500_US_CODE_RANGE_SIZE(x) ((x) << 16) +#define R500_US_CONFIG 0x4600 +# define R500_ZERO_TIMES_ANYTHING_EQUALS_ZERO (1 << 1) +#define R500_US_FC_ADDR_0 0xa000 +# define R500_FC_BOOL_ADDR(x) ((x) << 0) +# define R500_FC_INT_ADDR(x) ((x) << 8) +# define R500_FC_JUMP_ADDR(x) ((x) << 16) +# define R500_FC_JUMP_GLOBAL (1 << 31) +#define R500_US_FC_BOOL_CONST 0x4620 +# 
define R500_FC_KBOOL(x) (x) +#define R500_US_FC_CTRL 0x4624 +# define R500_FC_TEST_EN (1 << 30) +# define R500_FC_FULL_FC_EN (1 << 31) +#define R500_US_FC_INST_0 0x9800 +# define R500_FC_OP_JUMP (0 << 0) +# define R500_FC_OP_LOOP (1 << 0) +# define R500_FC_OP_ENDLOOP (2 << 0) +# define R500_FC_OP_REP (3 << 0) +# define R500_FC_OP_ENDREP (4 << 0) +# define R500_FC_OP_BREAKLOOP (5 << 0) +# define R500_FC_OP_BREAKREP (6 << 0) +# define R500_FC_OP_CONTINUE (7 << 0) +# define R500_FC_B_ELSE (1 << 4) +# define R500_FC_JUMP_ANY (1 << 5) +# define R500_FC_A_OP_NONE (0 << 6) +# define R500_FC_A_OP_POP (1 << 6) +# define R500_FC_A_OP_PUSH (2 << 6) +# define R500_FC_JUMP_FUNC(x) ((x) << 8) +# define R500_FC_B_POP_CNT(x) ((x) << 16) +# define R500_FC_B_OP0_NONE (0 << 24) +# define R500_FC_B_OP0_DECR (1 << 24) +# define R500_FC_B_OP0_INCR (2 << 24) +# define R500_FC_B_OP1_DECR (0 << 26) +# define R500_FC_B_OP1_NONE (1 << 26) +# define R500_FC_B_OP1_INCR (2 << 26) +# define R500_FC_IGNORE_UNCOVERED (1 << 28) +#define R500_US_FC_INT_CONST_0 0x4c00 +# define R500_FC_INT_CONST_KR(x) ((x) << 0) +# define R500_FC_INT_CONST_KG(x) ((x) << 8) +# define R500_FC_INT_CONST_KB(x) ((x) << 16) +/* _0 through _15 */ +#define R500_US_FORMAT0_0 0x4640 +# define R500_FORMAT_TXWIDTH(x) ((x) << 0) +# define R500_FORMAT_TXHEIGHT(x) ((x) << 11) +# define R500_FORMAT_TXDEPTH(x) ((x) << 22) +#define R500_US_PIXSIZE 0x4604 +# define R500_PIX_SIZE(x) (x) +#define R500_US_TEX_ADDR_0 0x9800 +# define R500_TEX_SRC_ADDR(x) ((x) << 0) +# define R500_TEX_SRC_ADDR_REL (1 << 7) +# define R500_TEX_SRC_S_SWIZ_R (0 << 8) +# define R500_TEX_SRC_S_SWIZ_G (1 << 8) +# define R500_TEX_SRC_S_SWIZ_B (2 << 8) +# define R500_TEX_SRC_S_SWIZ_A (3 << 8) +# define R500_TEX_SRC_T_SWIZ_R (0 << 10) +# define R500_TEX_SRC_T_SWIZ_G (1 << 10) +# define R500_TEX_SRC_T_SWIZ_B (2 << 10) +# define R500_TEX_SRC_T_SWIZ_A (3 << 10) +# define R500_TEX_SRC_R_SWIZ_R (0 << 12) +# define R500_TEX_SRC_R_SWIZ_G (1 << 12) +# define R500_TEX_SRC_R_SWIZ_B (2 << 12) +# define R500_TEX_SRC_R_SWIZ_A (3 << 12) +# define R500_TEX_SRC_Q_SWIZ_R (0 << 14) +# define R500_TEX_SRC_Q_SWIZ_G (1 << 14) +# define R500_TEX_SRC_Q_SWIZ_B (2 << 14) +# define R500_TEX_SRC_Q_SWIZ_A (3 << 14) +# define R500_TEX_DST_ADDR(x) ((x) << 16) +# define R500_TEX_DST_ADDR_REL (1 << 23) +# define R500_TEX_DST_R_SWIZ_R (0 << 24) +# define R500_TEX_DST_R_SWIZ_G (1 << 24) +# define R500_TEX_DST_R_SWIZ_B (2 << 24) +# define R500_TEX_DST_R_SWIZ_A (3 << 24) +# define R500_TEX_DST_G_SWIZ_R (0 << 26) +# define R500_TEX_DST_G_SWIZ_G (1 << 26) +# define R500_TEX_DST_G_SWIZ_B (2 << 26) +# define R500_TEX_DST_G_SWIZ_A (3 << 26) +# define R500_TEX_DST_B_SWIZ_R (0 << 28) +# define R500_TEX_DST_B_SWIZ_G (1 << 28) +# define R500_TEX_DST_B_SWIZ_B (2 << 28) +# define R500_TEX_DST_B_SWIZ_A (3 << 28) +# define R500_TEX_DST_A_SWIZ_R (0 << 30) +# define R500_TEX_DST_A_SWIZ_G (1 << 30) +# define R500_TEX_DST_A_SWIZ_B (2 << 30) +# define R500_TEX_DST_A_SWIZ_A (3 << 30) +#define R500_US_TEX_ADDR_DXDY_0 0xa000 +# define R500_DX_ADDR(x) ((x) << 0) +# define R500_DX_ADDR_REL (1 << 7) +# define R500_DX_S_SWIZ_R (0 << 8) +# define R500_DX_S_SWIZ_G (1 << 8) +# define R500_DX_S_SWIZ_B (2 << 8) +# define R500_DX_S_SWIZ_A (3 << 8) +# define R500_DX_T_SWIZ_R (0 << 10) +# define R500_DX_T_SWIZ_G (1 << 10) +# define R500_DX_T_SWIZ_B (2 << 10) +# define R500_DX_T_SWIZ_A (3 << 10) +# define R500_DX_R_SWIZ_R (0 << 12) +# define R500_DX_R_SWIZ_G (1 << 12) +# define R500_DX_R_SWIZ_B (2 << 12) +# define R500_DX_R_SWIZ_A (3 << 12) +# define 
R500_DX_Q_SWIZ_R (0 << 14) +# define R500_DX_Q_SWIZ_G (1 << 14) +# define R500_DX_Q_SWIZ_B (2 << 14) +# define R500_DX_Q_SWIZ_A (3 << 14) +# define R500_DY_ADDR(x) ((x) << 16) +# define R500_DY_ADDR_REL (1 << 17) +# define R500_DY_S_SWIZ_R (0 << 24) +# define R500_DY_S_SWIZ_G (1 << 24) +# define R500_DY_S_SWIZ_B (2 << 24) +# define R500_DY_S_SWIZ_A (3 << 24) +# define R500_DY_T_SWIZ_R (0 << 26) +# define R500_DY_T_SWIZ_G (1 << 26) +# define R500_DY_T_SWIZ_B (2 << 26) +# define R500_DY_T_SWIZ_A (3 << 26) +# define R500_DY_R_SWIZ_R (0 << 28) +# define R500_DY_R_SWIZ_G (1 << 28) +# define R500_DY_R_SWIZ_B (2 << 28) +# define R500_DY_R_SWIZ_A (3 << 28) +# define R500_DY_Q_SWIZ_R (0 << 30) +# define R500_DY_Q_SWIZ_G (1 << 30) +# define R500_DY_Q_SWIZ_B (2 << 30) +# define R500_DY_Q_SWIZ_A (3 << 30) +#define R500_US_TEX_INST_0 0x9000 +# define R500_TEX_ID(x) ((x) << 16) +# define R500_TEX_INST_NOP (0 << 22) +# define R500_TEX_INST_LD (1 << 22) +# define R500_TEX_INST_TEXKILL (2 << 22) +# define R500_TEX_INST_PROJ (3 << 22) +# define R500_TEX_INST_LODBIAS (4 << 22) +# define R500_TEX_INST_LOD (5 << 22) +# define R500_TEX_INST_DXDY (6 << 22) +# define R500_TEX_SEM_ACQUIRE (1 << 25) +# define R500_TEX_IGNORE_UNCOVERED (1 << 26) +# define R500_TEX_UNSCALED (1 << 27) +#define R300_US_W_FMT 0x46b4 +# define R300_W_FMT_W0 (0 << 0) +# define R300_W_FMT_W24 (1 << 0) +# define R300_W_FMT_W24FP (2 << 0) +# define R300_W_SRC_US (0 << 2) +# define R300_W_SRC_RAS (1 << 2) + + +/* Draw a primitive from vertex data in arrays loaded via 3D_LOAD_VBPNTR. + * Two parameter dwords: + * 0. VAP_VTX_FMT: The first parameter is not written to hardware + * 1. VAP_VF_CTL: The second parameter is a standard primitive emission dword. + */ +#define R300_PACKET3_3D_DRAW_VBUF 0x00002800 + +/* Draw a primitive from immediate vertices in this packet + * Up to 16382 dwords: + * 0. VAP_VTX_FMT: The first parameter is not written to hardware + * 1. VAP_VF_CTL: The second parameter is a standard primitive emission dword. + * 2 to end: Up to 16380 dwords of vertex data. + */ +#define R300_PACKET3_3D_DRAW_IMMD 0x00002900 + +/* Draw a primitive from vertex data in arrays loaded via 3D_LOAD_VBPNTR and + * immediate vertices in this packet + * Up to 16382 dwords: + * 0. VAP_VTX_FMT: The first parameter is not written to hardware + * 1. VAP_VF_CTL: The second parameter is a standard primitive emission dword. + * 2 to end: Up to 16380 dwords of vertex data. + */ +#define R300_PACKET3_3D_DRAW_INDX 0x00002A00 + + +/* Specify the full set of vertex arrays as (address, stride). + * The first parameter is the number of vertex arrays specified. + * The rest of the command is a variable length list of blocks, where + * each block is three dwords long and specifies two arrays. + * The first dword of a block is split into two words, the lower significant + * word refers to the first array, the more significant word to the second + * array in the block. + * The low byte of each word contains the size of an array entry in dwords, + * the high byte contains the stride of the array. + * The second dword of a block contains the pointer to the first array, + * the third dword of a block contains the pointer to the second array. + * Note that if the total number of arrays is odd, the third dword of + * the last block is omitted. 
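+ */
+
+/* Illustrative only (an assumption derived from the layout described above,
+ * not from hardware docs): one way to pack the size/stride dword of a
+ * 3D_LOAD_VBPNTR block.  The units of the stride byte are not spelled out
+ * here, so the value is stored as given.
+ */
+#define R300_VBPNTR_SIZE_STRIDE(size0, stride0, size1, stride1) \
+    (((size0) & 0xff) | (((stride0) & 0xff) << 8) | \
+     (((size1) & 0xff) << 16) | (((stride1) & 0xff) << 24))
+
+/* Packet header for the command described above.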
+ */ +#define R300_PACKET3_3D_LOAD_VBPNTR 0x00002F00 + +#define R300_PACKET3_INDX_BUFFER 0x00003300 +# define R300_INDX_BUFFER_DST_SHIFT 0 +# define R300_INDX_BUFFER_SKIP_SHIFT 16 +# define R300_INDX_BUFFER_ONE_REG_WR (1<<31) + +/* Same as R300_PACKET3_3D_DRAW_VBUF but without VAP_VTX_FMT */ +#define R300_PACKET3_3D_DRAW_VBUF_2 0x00003400 +/* Same as R300_PACKET3_3D_DRAW_IMMD but without VAP_VTX_FMT */ +#define R300_PACKET3_3D_DRAW_IMMD_2 0x00003500 +/* Same as R300_PACKET3_3D_DRAW_INDX but without VAP_VTX_FMT */ +#define R300_PACKET3_3D_DRAW_INDX_2 0x00003600 + +/* Clears a portion of hierachical Z RAM + * 3 dword parameters + * 0. START + * 1. COUNT: 13:0 (max is 0x3FFF) + * 2. CLEAR_VALUE: Value to write into HIZ RAM. + */ +#define R300_PACKET3_3D_CLEAR_HIZ 0x00003700 + +/* Draws a set of primitives using vertex buffers pointed by the state data. + * At least 2 Parameters: + * 0. VAP_VF_CNTL: The first parameter is a standard primitive emission dword. + * 2 to end: Data or indices (see other 3D_DRAW_* packets for details) + */ +#define R300_PACKET3_3D_DRAW_128 0x00003900 + +/* END: Packet 3 commands */ + + +/* Color formats for 2d packets + */ +#define R300_CP_COLOR_FORMAT_CI8 2 +#define R300_CP_COLOR_FORMAT_ARGB1555 3 +#define R300_CP_COLOR_FORMAT_RGB565 4 +#define R300_CP_COLOR_FORMAT_ARGB8888 6 +#define R300_CP_COLOR_FORMAT_RGB332 7 +#define R300_CP_COLOR_FORMAT_RGB8 9 +#define R300_CP_COLOR_FORMAT_ARGB4444 15 + +/* + * CP type-3 packets + */ +#define R300_CP_CMD_BITBLT_MULTI 0xC0009B00 + +/* XXX Corbin's stuff from radeon and r200 */ + +#define RADEON_WAIT_UNTIL 0x1720 +# define RADEON_WAIT_CRTC_PFLIP (1 << 0) +# define RADEON_WAIT_2D_IDLECLEAN (1 << 16) +# define RADEON_WAIT_3D_IDLECLEAN (1 << 17) +# define RADEON_WAIT_HOST_IDLECLEAN (1 << 18) + +#define RADEON_CP_PACKET3 0xC0000000 + +#define R200_3D_DRAW_IMMD_2 0xC0003500 + +#endif /* _R300_REG_H */ + +/* *INDENT-ON* */ + +/* vim: set foldenable foldmarker=\\{,\\} foldmethod=marker : */ diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c new file mode 100644 index 0000000000..5ff9015a7b --- /dev/null +++ b/src/gallium/drivers/r300/r300_screen.c @@ -0,0 +1,371 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ + +#include "r300_screen.h" + +/* Return the identifier behind whom the brave coders responsible for this + * amalgamation of code, sweat, and duct tape, routinely obscure their names. + * + * ...I should have just put "Corbin Simpson", but I'm not that cool. + * + * (Or egotistical. Yet.) */ +static const char* r300_get_vendor(struct pipe_screen* pscreen) +{ + return "X.Org R300 Project"; +} + +static const char* chip_families[] = { + "R300", + "R350", + "R360", + "RV350", + "RV370", + "RV380", + "R420", + "R423", + "R430", + "R480", + "R481", + "RV410", + "RS400", + "RC410", + "RS480", + "RS482", + "RS690", + "RS740", + "RV515", + "R520", + "RV530", + "R580", + "RV560", + "RV570" +}; + +static const char* r300_get_name(struct pipe_screen* pscreen) +{ + struct r300_screen* r300screen = r300_screen(pscreen); + + return chip_families[r300screen->caps->family]; +} + +static int r300_get_param(struct pipe_screen* pscreen, int param) +{ + struct r300_screen* r300screen = r300_screen(pscreen); + + switch (param) { + /* XXX cases marked "IN THEORY" are possible on the hardware, + * but haven't been implemented yet. */ + case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS: + /* XXX I'm told this goes up to 16 */ + return 8; + case PIPE_CAP_NPOT_TEXTURES: + /* IN THEORY */ + return 0; + case PIPE_CAP_TWO_SIDED_STENCIL: + if (r300screen->caps->is_r500) { + return 1; + } else { + return 0; + } + return 0; + case PIPE_CAP_GLSL: + /* IN THEORY */ + return 0; + case PIPE_CAP_S3TC: + /* IN THEORY */ + return 0; + case PIPE_CAP_ANISOTROPIC_FILTER: + /* IN THEORY */ + return 0; + case PIPE_CAP_POINT_SPRITE: + /* IN THEORY */ + return 0; + case PIPE_CAP_MAX_RENDER_TARGETS: + return 4; + case PIPE_CAP_OCCLUSION_QUERY: + /* IN THEORY */ + return 0; + case PIPE_CAP_TEXTURE_SHADOW_MAP: + /* IN THEORY */ + return 0; + case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: + if (r300screen->caps->is_r500) { + /* 13 == 4096x4096 */ + return 13; + } else { + /* 12 == 2048x2048 */ + return 12; + } + case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: + /* So, technically, the limit is the same as above, but some math + * shows why this is silly. Assuming RGBA, 4cpp, we can see that + * 4096*4096*4096 = 64.0 GiB exactly, so it's not exactly + * practical. However, if at some point a game really wants this, + * then we can remove or raise this limit. */ + if (r300screen->caps->is_r500) { + /* 9 == 256x256x256 */ + return 9; + } else { + /* 8 == 128*128*128 */ + return 8; + } + case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: + if (r300screen->caps->is_r500) { + /* 13 == 4096x4096 */ + return 13; + } else { + /* 12 == 2048x2048 */ + return 12; + } + case PIPE_CAP_TEXTURE_MIRROR_CLAMP: + return 1; + case PIPE_CAP_TEXTURE_MIRROR_REPEAT: + return 1; + case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS: + /* XXX guessing (what a terrible guess) */ + return 2; + default: + debug_printf("r300: Implementation error: Bad param %d\n", + param); + return 0; + } +} + +static float r300_get_paramf(struct pipe_screen* pscreen, int param) +{ + switch (param) { + case PIPE_CAP_MAX_LINE_WIDTH: + case PIPE_CAP_MAX_LINE_WIDTH_AA: + /* XXX this is the biggest thing that will fit in that register. + * Perhaps the actual rendering limits are less? */ + return 10922.0f; + case PIPE_CAP_MAX_POINT_WIDTH: + case PIPE_CAP_MAX_POINT_WIDTH_AA: + /* XXX this is the biggest thing that will fit in that register. + * Perhaps the actual rendering limits are less? 
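+ * (Presumably 10922 == 65535 / 6: these size registers appear to hold
+ * 16-bit fixed-point values in sixths of a pixel; compare
+ * pack_float_16_6x() in r300_state.c.)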
*/ + return 10922.0f; + case PIPE_CAP_MAX_TEXTURE_ANISOTROPY: + return 16.0f; + case PIPE_CAP_MAX_TEXTURE_LOD_BIAS: + return 16.0f; + default: + debug_printf("r300: Implementation error: Bad paramf %d\n", + param); + return 0.0f; + } +} + +static boolean check_tex_2d_format(enum pipe_format format, boolean is_r500) +{ + switch (format) { + /* Colorbuffer */ + case PIPE_FORMAT_A8R8G8B8_UNORM: + /* Texture */ + case PIPE_FORMAT_I8_UNORM: + /* Z buffer */ + case PIPE_FORMAT_Z16_UNORM: + /* Z buffer with stencil */ + case PIPE_FORMAT_Z24S8_UNORM: + return TRUE; + + /* XXX Supported yet unimplemented formats: */ + case PIPE_FORMAT_A1R5G5B5_UNORM: + case PIPE_FORMAT_R5G6B5_UNORM: + /* XXX These don't even exist + case PIPE_FORMAT_A32R32G32B32: + case PIPE_FORMAT_A16R16G16B16: */ + /* XXX Insert YUV422 packed VYUY and YVYU here */ + /* XXX What the deuce is UV88? (r3xx accel page 14) */ + case PIPE_FORMAT_A4R4G4B4_UNORM: + debug_printf("r300: Warning: Got unimplemented format: %s in %s\n", + pf_name(format), __FUNCTION__); + return FALSE; + + /* XXX Supported yet unimplemented r5xx formats: */ + /* XXX Again, what is UV1010 this time? (r5xx accel page 148) */ + /* XXX Even more that don't exist + case PIPE_FORMAT_A10R10G10B10_UNORM: + case PIPE_FORMAT_A2R10G10B10_UNORM: + case PIPE_FORMAT_I10_UNORM: */ + debug_printf( + "r300: Warning: Got unimplemented r500 format: %s in %s\n", + pf_name(format), __FUNCTION__); + return FALSE; + + default: + debug_printf("r300: Warning: Got unsupported format: %s in %s\n", + pf_name(format), __FUNCTION__); + break; + } + + return FALSE; +} + +/* XXX moar targets */ +static boolean r300_is_format_supported(struct pipe_screen* pscreen, + enum pipe_format format, + enum pipe_texture_target target, + unsigned tex_usage, + unsigned geom_flags) +{ + switch (target) { + case PIPE_TEXTURE_2D: + return check_tex_2d_format(format, + r300_screen(pscreen)->caps->is_r500); + default: + debug_printf("r300: Warning: Got unknown format target: %d\n", + format); + break; + } + + return FALSE; +} + +static struct pipe_transfer* +r300_get_tex_transfer(struct pipe_screen *screen, + struct pipe_texture *texture, + unsigned face, unsigned level, unsigned zslice, + enum pipe_transfer_usage usage, unsigned x, unsigned y, + unsigned w, unsigned h) +{ + struct r300_texture *tex = (struct r300_texture *)texture; + struct r300_transfer *trans; + unsigned offset; /* in bytes */ + + /* XXX Add support for these things */ + if (texture->target == PIPE_TEXTURE_CUBE) { + debug_printf("PIPE_TEXTURE_CUBE is not yet supported.\n"); + /* offset = tex->image_offset[level][face]; */ + } + else if (texture->target == PIPE_TEXTURE_3D) { + debug_printf("PIPE_TEXTURE_3D is not yet supported.\n"); + /* offset = tex->image_offset[level][zslice]; */ + } + else { + offset = tex->offset[level]; + assert(face == 0); + assert(zslice == 0); + } + + trans = CALLOC_STRUCT(r300_transfer); + if (trans) { + trans->transfer.refcount = 1; + pipe_texture_reference(&trans->transfer.texture, texture); + trans->transfer.format = trans->transfer.format; + trans->transfer.width = w; + trans->transfer.height = h; + trans->transfer.block = texture->block; + trans->transfer.nblocksx = texture->nblocksx[level]; + trans->transfer.nblocksy = texture->nblocksy[level]; + trans->transfer.stride = tex->stride; + trans->transfer.usage = usage; + trans->offset = offset; + } + return &trans->transfer; +} + +static void +r300_tex_transfer_release(struct pipe_screen *screen, + struct pipe_transfer **transfer) +{ + struct pipe_transfer 
*trans = *transfer; + + if (--trans->refcount == 0) { + pipe_texture_reference(&trans->texture, NULL); + FREE(trans); + } + + *transfer = NULL; +} + +static void* r300_transfer_map(struct pipe_screen* screen, + struct pipe_transfer* transfer) +{ + struct r300_texture* tex = (struct r300_texture*)transfer->texture; + char* map; + unsigned flags = 0; + + if (transfer->usage != PIPE_TRANSFER_WRITE) { + flags |= PIPE_BUFFER_USAGE_CPU_READ; + } + if (transfer->usage != PIPE_TRANSFER_READ) { + flags |= PIPE_BUFFER_USAGE_CPU_WRITE; + } + + map = pipe_buffer_map(screen, tex->buffer, flags); + + if (!map) { + return NULL; + } + + return map + r300_transfer(transfer)->offset + + transfer->y / transfer->block.height * transfer->stride + + transfer->x / transfer->block.width * transfer->block.size; +} + +static void r300_transfer_unmap(struct pipe_screen* screen, + struct pipe_transfer* transfer) +{ + struct r300_texture* tex = (struct r300_texture*)transfer->texture; + pipe_buffer_unmap(screen, tex->buffer); +} + +static void r300_destroy_screen(struct pipe_screen* pscreen) +{ + struct r300_screen* r300screen = r300_screen(pscreen); + + FREE(r300screen->caps); + FREE(r300screen); +} + +struct pipe_screen* r300_create_screen(struct pipe_winsys* winsys, + struct r300_winsys* r300_winsys) +{ + struct r300_screen* r300screen = CALLOC_STRUCT(r300_screen); + struct r300_capabilities* caps = CALLOC_STRUCT(r300_capabilities); + + if (!r300screen || !caps) + return NULL; + + caps->pci_id = r300_winsys->pci_id; + caps->num_frag_pipes = r300_winsys->gb_pipes; + + r300_parse_chipset(caps); + + r300screen->caps = caps; + r300screen->screen.winsys = winsys; + r300screen->screen.destroy = r300_destroy_screen; + r300screen->screen.get_name = r300_get_name; + r300screen->screen.get_vendor = r300_get_vendor; + r300screen->screen.get_param = r300_get_param; + r300screen->screen.get_paramf = r300_get_paramf; + r300screen->screen.is_format_supported = r300_is_format_supported; + r300screen->screen.get_tex_transfer = r300_get_tex_transfer; + r300screen->screen.tex_transfer_release = r300_tex_transfer_release; + r300screen->screen.transfer_map = r300_transfer_map; + r300screen->screen.transfer_unmap = r300_transfer_unmap; + + r300_init_screen_texture_functions(&r300screen->screen); + u_simple_screen_init(&r300screen->screen); + + return &r300screen->screen; +} diff --git a/src/gallium/drivers/r300/r300_screen.h b/src/gallium/drivers/r300/r300_screen.h new file mode 100644 index 0000000000..6c845144cb --- /dev/null +++ b/src/gallium/drivers/r300/r300_screen.h @@ -0,0 +1,67 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +#ifndef R300_SCREEN_H +#define R300_SCREEN_H + +#include "pipe/p_inlines.h" +#include "pipe/p_screen.h" +#include "util/u_memory.h" +#include "util/u_simple_screen.h" + +#include "r300_chipset.h" +#include "r300_texture.h" +#include "r300_winsys.h" + +struct r300_screen { + /* Parent class */ + struct pipe_screen screen; + + /* Chipset capabilities */ + struct r300_capabilities* caps; +}; + +struct r300_transfer { + /* Parent class */ + struct pipe_transfer transfer; + + /* Offset from start of buffer. */ + unsigned offset; +}; + +/* Convenience cast wrapper. */ +static struct r300_screen* r300_screen(struct pipe_screen* screen) { + return (struct r300_screen*)screen; +} + +/* Convenience cast wrapper. */ +static INLINE struct r300_transfer* +r300_transfer(struct pipe_transfer* transfer) +{ + return (struct r300_transfer*)transfer; +} + +/* Creates a new r300 screen. */ +struct pipe_screen* r300_create_screen(struct pipe_winsys* winsys, + struct r300_winsys* r300_winsys); + +#endif /* R300_SCREEN_H */ diff --git a/src/gallium/drivers/r300/r300_state.c b/src/gallium/drivers/r300/r300_state.c new file mode 100644 index 0000000000..da99a3be6b --- /dev/null +++ b/src/gallium/drivers/r300/r300_state.c @@ -0,0 +1,854 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +#include "util/u_math.h" +#include "util/u_pack_color.h" + +#include "util/u_debug.h" +#include "pipe/internal/p_winsys_screen.h" + +#include "r300_context.h" +#include "r300_reg.h" +#include "r300_state_shader.h" + +/* r300_state: Functions used to intialize state context by translating + * Gallium state objects into semi-native r300 state objects. + * + * XXX break this file up into pieces if it gets too big! */ + +/* Pack a float into a dword. 
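+ * (That is, return the raw IEEE-754 bit pattern unchanged; the polygon
+ * offset and line-stipple-scale registers written in r300_create_rs_state()
+ * below apparently take raw float bits, so no conversion is done here.)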
*/ +static uint32_t pack_float_32(float f) +{ + union { + float f; + uint32_t u; + } u; + + u.f = f; + return u.u; +} + +static uint32_t translate_blend_function(int blend_func) { + switch (blend_func) { + case PIPE_BLEND_ADD: + return R300_COMB_FCN_ADD_CLAMP; + case PIPE_BLEND_SUBTRACT: + return R300_COMB_FCN_SUB_CLAMP; + case PIPE_BLEND_REVERSE_SUBTRACT: + return R300_COMB_FCN_RSUB_CLAMP; + case PIPE_BLEND_MIN: + return R300_COMB_FCN_MIN; + case PIPE_BLEND_MAX: + return R300_COMB_FCN_MAX; + default: + debug_printf("r300: Unknown blend function %d\n", blend_func); + break; + } + return 0; +} + +/* XXX we can also offer the D3D versions of some of these... */ +static uint32_t translate_blend_factor(int blend_fact) { + switch (blend_fact) { + case PIPE_BLENDFACTOR_ONE: + return R300_BLEND_GL_ONE; + case PIPE_BLENDFACTOR_SRC_COLOR: + return R300_BLEND_GL_SRC_COLOR; + case PIPE_BLENDFACTOR_SRC_ALPHA: + return R300_BLEND_GL_SRC_ALPHA; + case PIPE_BLENDFACTOR_DST_ALPHA: + return R300_BLEND_GL_DST_ALPHA; + case PIPE_BLENDFACTOR_DST_COLOR: + return R300_BLEND_GL_DST_COLOR; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + return R300_BLEND_GL_SRC_ALPHA_SATURATE; + case PIPE_BLENDFACTOR_CONST_COLOR: + return R300_BLEND_GL_CONST_COLOR; + case PIPE_BLENDFACTOR_CONST_ALPHA: + return R300_BLEND_GL_CONST_ALPHA; + /* XXX WTF are these? + case PIPE_BLENDFACTOR_SRC1_COLOR: + case PIPE_BLENDFACTOR_SRC1_ALPHA: */ + case PIPE_BLENDFACTOR_ZERO: + return R300_BLEND_GL_ZERO; + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + return R300_BLEND_GL_ONE_MINUS_SRC_COLOR; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + return R300_BLEND_GL_ONE_MINUS_SRC_ALPHA; + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + return R300_BLEND_GL_ONE_MINUS_DST_ALPHA; + case PIPE_BLENDFACTOR_INV_DST_COLOR: + return R300_BLEND_GL_ONE_MINUS_DST_COLOR; + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + return R300_BLEND_GL_ONE_MINUS_CONST_COLOR; + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + return R300_BLEND_GL_ONE_MINUS_CONST_ALPHA; + /* XXX see above + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: */ + default: + debug_printf("r300: Unknown blend factor %d\n", blend_fact); + break; + } + return 0; +} + +/* Create a new blend state based on the CSO blend state. + * + * This encompasses alpha blending, logic/raster ops, and blend dithering. */ +static void* r300_create_blend_state(struct pipe_context* pipe, + const struct pipe_blend_state* state) +{ + struct r300_blend_state* blend = CALLOC_STRUCT(r300_blend_state); + + if (state->blend_enable) { + /* XXX for now, always do separate alpha... + * is it faster to do it with one reg? */ + blend->blend_control = R300_ALPHA_BLEND_ENABLE | + R300_SEPARATE_ALPHA_ENABLE | + R300_READ_ENABLE | + translate_blend_function(state->rgb_func) | + (translate_blend_factor(state->rgb_src_factor) << + R300_SRC_BLEND_SHIFT) | + (translate_blend_factor(state->rgb_dst_factor) << + R300_DST_BLEND_SHIFT); + blend->alpha_blend_control = + translate_blend_function(state->alpha_func) | + (translate_blend_factor(state->alpha_src_factor) << + R300_SRC_BLEND_SHIFT) | + (translate_blend_factor(state->alpha_dst_factor) << + R300_DST_BLEND_SHIFT); + } + + /* PIPE_LOGICOP_* don't need to be translated, fortunately. */ + /* XXX are logicops still allowed if blending's disabled? + * Does Gallium take care of it for us? 
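+ */
+
+ /* For reference, the blend_control word assembled above for the common
+  * GL-style case (func ADD, factors SRC_ALPHA / INV_SRC_ALPHA) works out to:
+  *   R300_ALPHA_BLEND_ENABLE | R300_SEPARATE_ALPHA_ENABLE |
+  *   R300_READ_ENABLE | R300_COMB_FCN_ADD_CLAMP |
+  *   (R300_BLEND_GL_SRC_ALPHA << R300_SRC_BLEND_SHIFT) |
+  *   (R300_BLEND_GL_ONE_MINUS_SRC_ALPHA << R300_DST_BLEND_SHIFT)
+  * (Worked out from the translation tables above, not from hardware docs.)
+  */
+ /* Logic op (ROP) setup: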
*/ + if (state->logicop_enable) { + blend->rop = R300_RB3D_ROPCNTL_ROP_ENABLE | + (state->logicop_func) << R300_RB3D_ROPCNTL_ROP_SHIFT; + } + + if (state->dither) { + blend->dither = R300_RB3D_DITHER_CTL_DITHER_MODE_LUT | + R300_RB3D_DITHER_CTL_ALPHA_DITHER_MODE_LUT; + } + + return (void*)blend; +} + +/* Bind blend state. */ +static void r300_bind_blend_state(struct pipe_context* pipe, + void* state) +{ + struct r300_context* r300 = r300_context(pipe); + + r300->blend_state = (struct r300_blend_state*)state; + r300->dirty_state |= R300_NEW_BLEND; +} + +/* Free blend state. */ +static void r300_delete_blend_state(struct pipe_context* pipe, + void* state) +{ + FREE(state); +} + +/* Set blend color. + * Setup both R300 and R500 registers, figure out later which one to write. */ +static void r300_set_blend_color(struct pipe_context* pipe, + const struct pipe_blend_color* color) +{ + struct r300_context* r300 = r300_context(pipe); + uint32_t r, g, b, a; + ubyte ur, ug, ub, ua; + + r = util_iround(color->color[0] * 1023.0f); + g = util_iround(color->color[1] * 1023.0f); + b = util_iround(color->color[2] * 1023.0f); + a = util_iround(color->color[3] * 1023.0f); + + ur = float_to_ubyte(color->color[0]); + ug = float_to_ubyte(color->color[1]); + ub = float_to_ubyte(color->color[2]); + ua = float_to_ubyte(color->color[3]); + + r300->blend_color_state->blend_color = (a << 24) | (r << 16) | (g << 8) | b; + + r300->blend_color_state->blend_color_red_alpha = ur | (ua << 16); + r300->blend_color_state->blend_color_green_blue = ub | (ug << 16); + + r300->dirty_state |= R300_NEW_BLEND_COLOR; +} + +static void r300_set_clip_state(struct pipe_context* pipe, + const struct pipe_clip_state* state) +{ + struct r300_context* r300 = r300_context(pipe); + /* XXX Draw */ + draw_flush(r300->draw); + draw_set_clip_state(r300->draw, state); +} + +static void + r300_set_constant_buffer(struct pipe_context* pipe, + uint shader, uint index, + const struct pipe_constant_buffer* buffer) +{ + struct r300_context* r300 = r300_context(pipe); + + /* This entire chunk of code seems ever-so-slightly baked. + * It's as if I've got pipe_buffer* matryoshkas... 
*/ + if (buffer && buffer->buffer && buffer->buffer->size) { + void* map = pipe->winsys->buffer_map(pipe->winsys, buffer->buffer, + PIPE_BUFFER_USAGE_CPU_READ); + memcpy(r300->shader_constants[shader].constants, + map, buffer->buffer->size); + pipe->winsys->buffer_unmap(pipe->winsys, buffer->buffer); + + r300->shader_constants[shader].user_count = + buffer->buffer->size / (sizeof(float) * 4); + } else { + r300->shader_constants[shader].user_count = 0; + } + + r300->dirty_state |= R300_NEW_CONSTANTS; +} + +static uint32_t translate_depth_stencil_function(int zs_func) { + switch (zs_func) { + case PIPE_FUNC_NEVER: + return R300_ZS_NEVER; + case PIPE_FUNC_LESS: + return R300_ZS_LESS; + case PIPE_FUNC_EQUAL: + return R300_ZS_EQUAL; + case PIPE_FUNC_LEQUAL: + return R300_ZS_LEQUAL; + case PIPE_FUNC_GREATER: + return R300_ZS_GREATER; + case PIPE_FUNC_NOTEQUAL: + return R300_ZS_NOTEQUAL; + case PIPE_FUNC_GEQUAL: + return R300_ZS_GEQUAL; + case PIPE_FUNC_ALWAYS: + return R300_ZS_ALWAYS; + default: + debug_printf("r300: Unknown depth/stencil function %d\n", + zs_func); + break; + } + return 0; +} + +static uint32_t translate_stencil_op(int s_op) { + switch (s_op) { + case PIPE_STENCIL_OP_KEEP: + return R300_ZS_KEEP; + case PIPE_STENCIL_OP_ZERO: + return R300_ZS_ZERO; + case PIPE_STENCIL_OP_REPLACE: + return R300_ZS_REPLACE; + case PIPE_STENCIL_OP_INCR: + return R300_ZS_INCR; + case PIPE_STENCIL_OP_DECR: + return R300_ZS_DECR; + case PIPE_STENCIL_OP_INCR_WRAP: + return R300_ZS_INCR_WRAP; + case PIPE_STENCIL_OP_DECR_WRAP: + return R300_ZS_DECR_WRAP; + case PIPE_STENCIL_OP_INVERT: + return R300_ZS_INVERT; + default: + debug_printf("r300: Unknown stencil op %d", s_op); + break; + } + return 0; +} + +static uint32_t translate_alpha_function(int alpha_func) { + switch (alpha_func) { + case PIPE_FUNC_NEVER: + return R300_FG_ALPHA_FUNC_NEVER; + case PIPE_FUNC_LESS: + return R300_FG_ALPHA_FUNC_LESS; + case PIPE_FUNC_EQUAL: + return R300_FG_ALPHA_FUNC_EQUAL; + case PIPE_FUNC_LEQUAL: + return R300_FG_ALPHA_FUNC_LE; + case PIPE_FUNC_GREATER: + return R300_FG_ALPHA_FUNC_GREATER; + case PIPE_FUNC_NOTEQUAL: + return R300_FG_ALPHA_FUNC_NOTEQUAL; + case PIPE_FUNC_GEQUAL: + return R300_FG_ALPHA_FUNC_GE; + case PIPE_FUNC_ALWAYS: + return R300_FG_ALPHA_FUNC_ALWAYS; + default: + debug_printf("r300: Unknown alpha function %d", alpha_func); + break; + } + return 0; +} + +/* Create a new depth, stencil, and alpha state based on the CSO dsa state. + * + * This contains the depth buffer, stencil buffer, alpha test, and such. + * On the Radeon, depth and stencil buffer setup are intertwined, which is + * the reason for some of the strange-looking assignments across registers. */ +static void* + r300_create_dsa_state(struct pipe_context* pipe, + const struct pipe_depth_stencil_alpha_state* state) +{ + struct r300_dsa_state* dsa = CALLOC_STRUCT(r300_dsa_state); + + /* Depth test setup. */ + if (state->depth.enabled) { + dsa->z_buffer_control |= R300_Z_ENABLE; + + if (state->depth.writemask) { + dsa->z_buffer_control |= R300_Z_WRITE_ENABLE; + } + + dsa->z_stencil_control |= + (translate_depth_stencil_function(state->depth.func) << + R300_Z_FUNC_SHIFT); + } + + /* Stencil buffer setup. 
*/ + if (state->stencil[0].enabled) { + dsa->z_buffer_control |= R300_STENCIL_ENABLE; + dsa->z_stencil_control |= + (translate_depth_stencil_function(state->stencil[0].func) << + R300_S_FRONT_FUNC_SHIFT) | + (translate_stencil_op(state->stencil[0].fail_op) << + R300_S_FRONT_SFAIL_OP_SHIFT) | + (translate_stencil_op(state->stencil[0].zpass_op) << + R300_S_FRONT_ZPASS_OP_SHIFT) | + (translate_stencil_op(state->stencil[0].zfail_op) << + R300_S_FRONT_ZFAIL_OP_SHIFT); + + dsa->stencil_ref_mask = (state->stencil[0].ref_value) | + (state->stencil[0].valuemask << R300_STENCILMASK_SHIFT) | + (state->stencil[0].writemask << R300_STENCILWRITEMASK_SHIFT); + + if (state->stencil[1].enabled) { + dsa->z_buffer_control |= R300_STENCIL_FRONT_BACK; + dsa->z_stencil_control |= + (translate_depth_stencil_function(state->stencil[1].func) << + R300_S_BACK_FUNC_SHIFT) | + (translate_stencil_op(state->stencil[1].fail_op) << + R300_S_BACK_SFAIL_OP_SHIFT) | + (translate_stencil_op(state->stencil[1].zpass_op) << + R300_S_BACK_ZPASS_OP_SHIFT) | + (translate_stencil_op(state->stencil[1].zfail_op) << + R300_S_BACK_ZFAIL_OP_SHIFT); + + dsa->stencil_ref_bf = (state->stencil[1].ref_value) | + (state->stencil[1].valuemask << R300_STENCILMASK_SHIFT) | + (state->stencil[1].writemask << R300_STENCILWRITEMASK_SHIFT); + } + } + + /* Alpha test setup. */ + if (state->alpha.enabled) { + dsa->alpha_function = translate_alpha_function(state->alpha.func) | + R300_FG_ALPHA_FUNC_ENABLE; + dsa->alpha_reference = CLAMP(state->alpha.ref_value * 1023.0f, + 0, 1023); + } else { + dsa->z_buffer_top = R300_ZTOP_ENABLE; + } + + return (void*)dsa; +} + +/* Bind DSA state. */ +static void r300_bind_dsa_state(struct pipe_context* pipe, + void* state) +{ + struct r300_context* r300 = r300_context(pipe); + + r300->dsa_state = (struct r300_dsa_state*)state; + r300->dirty_state |= R300_NEW_DSA; +} + +/* Free DSA state. */ +static void r300_delete_dsa_state(struct pipe_context* pipe, + void* state) +{ + FREE(state); +} + +static void r300_set_edgeflags(struct pipe_context* pipe, + const unsigned* bitfield) +{ + /* XXX you know it's bad when i915 has this blank too */ +} + +static void + r300_set_framebuffer_state(struct pipe_context* pipe, + const struct pipe_framebuffer_state* state) +{ + struct r300_context* r300 = r300_context(pipe); + + draw_flush(r300->draw); + + r300->framebuffer_state = *state; + + r300->dirty_state |= R300_NEW_FRAMEBUFFERS; +} + +/* Create fragment shader state. */ +static void* r300_create_fs_state(struct pipe_context* pipe, + const struct pipe_shader_state* shader) +{ + struct r300_context* r300 = r300_context(pipe); + struct r3xx_fragment_shader* fs = NULL; + + if (r300_screen(r300->context.screen)->caps->is_r500) { + fs = + (struct r3xx_fragment_shader*)CALLOC_STRUCT(r500_fragment_shader); + } else { + fs = + (struct r3xx_fragment_shader*)CALLOC_STRUCT(r300_fragment_shader); + } + + /* Copy state directly into shader. */ + fs->state = *shader; + + tgsi_scan_shader(shader->tokens, &fs->info); + + return (void*)fs; +} + +/* Bind fragment shader state. 
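+ *
+ * For context, a state tracker typically drives these CSO hooks roughly as
+ * follows (illustrative only; the names are the pipe_context entry points
+ * these functions back):
+ *
+ *    void *fs = pipe->create_fs_state(pipe, &shader);
+ *    pipe->bind_fs_state(pipe, fs);
+ *    ... draw ...
+ *    pipe->bind_fs_state(pipe, NULL);
+ *    pipe->delete_fs_state(pipe, fs);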
*/ +static void r300_bind_fs_state(struct pipe_context* pipe, void* shader) +{ + struct r300_context* r300 = r300_context(pipe); + struct r3xx_fragment_shader* fs = (struct r3xx_fragment_shader*)shader; + + if (fs == NULL) { + r300->fs = NULL; + return; + } else if (!fs->translated) { + if (r300_screen(r300->context.screen)->caps->is_r500) { + r500_translate_fragment_shader(r300, (struct r500_fragment_shader*)fs); + } else { + r300_translate_fragment_shader(r300, (struct r300_fragment_shader*)fs); + } + } + + fs->translated = true; + r300->fs = fs; + + r300->dirty_state |= R300_NEW_FRAGMENT_SHADER; +} + +/* Delete fragment shader state. */ +static void r300_delete_fs_state(struct pipe_context* pipe, void* shader) +{ + FREE(shader); +} + +static void r300_set_polygon_stipple(struct pipe_context* pipe, + const struct pipe_poly_stipple* state) +{ + /* XXX */ +} + +static INLINE int pack_float_16_6x(float f) { + return ((int)(f * 6.0) & 0xffff); +} + +/* Create a new rasterizer state based on the CSO rasterizer state. + * + * This is a very large chunk of state, and covers most of the graphics + * backend (GB), geometry assembly (GA), and setup unit (SU) blocks. + * + * In a not entirely unironic sidenote, this state has nearly nothing to do + * with the actual block on the Radeon called the rasterizer (RS). */ +static void* r300_create_rs_state(struct pipe_context* pipe, + const struct pipe_rasterizer_state* state) +{ + struct r300_rs_state* rs = CALLOC_STRUCT(r300_rs_state); + + /* XXX this is part of HW TCL */ + /* XXX endian control */ + rs->vap_control_status = R300_VAP_TCL_BYPASS; + + rs->point_size = pack_float_16_6x(state->point_size) | + (pack_float_16_6x(state->point_size) << R300_POINTSIZE_X_SHIFT); + + rs->line_control = pack_float_16_6x(state->line_width) | + R300_GA_LINE_CNTL_END_TYPE_COMP; + + /* Radeons don't think in "CW/CCW", they think in "front/back". */ + if (state->front_winding == PIPE_WINDING_CW) { + rs->cull_mode = R300_FRONT_FACE_CW; + + if (state->offset_cw) { + rs->polygon_offset_enable |= R300_FRONT_ENABLE; + } + if (state->offset_ccw) { + rs->polygon_offset_enable |= R300_BACK_ENABLE; + } + } else { + rs->cull_mode = R300_FRONT_FACE_CCW; + + if (state->offset_ccw) { + rs->polygon_offset_enable |= R300_FRONT_ENABLE; + } + if (state->offset_cw) { + rs->polygon_offset_enable |= R300_BACK_ENABLE; + } + } + if (state->front_winding & state->cull_mode) { + rs->cull_mode |= R300_CULL_FRONT; + } + if (~(state->front_winding) & state->cull_mode) { + rs->cull_mode |= R300_CULL_BACK; + } + + if (rs->polygon_offset_enable) { + rs->depth_offset_front = rs->depth_offset_back = + pack_float_32(state->offset_units); + rs->depth_scale_front = rs->depth_scale_back = + pack_float_32(state->offset_scale); + } + + if (state->line_stipple_enable) { + rs->line_stipple_config = + R300_GA_LINE_STIPPLE_CONFIG_LINE_RESET_LINE | + (pack_float_32((float)state->line_stipple_factor) & + R300_GA_LINE_STIPPLE_CONFIG_STIPPLE_SCALE_MASK); + /* XXX this might need to be scaled up */ + rs->line_stipple_value = state->line_stipple_pattern; + } + + rs->rs = *state; + + return (void*)rs; +} + +/* Bind rasterizer state. */ +static void r300_bind_rs_state(struct pipe_context* pipe, void* state) +{ + struct r300_context* r300 = r300_context(pipe); + struct r300_rs_state* rs = (struct r300_rs_state*)state; + + draw_set_rasterizer_state(r300->draw, &rs->rs); + + r300->rs_state = rs; + r300->dirty_state |= R300_NEW_RASTERIZER; +} + +/* Free rasterizer state. 
*/ +static void r300_delete_rs_state(struct pipe_context* pipe, void* state) +{ + FREE(state); +} + +static uint32_t translate_wrap(int wrap) { + switch (wrap) { + case PIPE_TEX_WRAP_REPEAT: + return R300_TX_REPEAT; + case PIPE_TEX_WRAP_CLAMP: + return R300_TX_CLAMP; + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + return R300_TX_CLAMP_TO_EDGE; + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + return R300_TX_CLAMP_TO_BORDER; + case PIPE_TEX_WRAP_MIRROR_REPEAT: + return R300_TX_REPEAT | R300_TX_MIRRORED; + case PIPE_TEX_WRAP_MIRROR_CLAMP: + return R300_TX_CLAMP | R300_TX_MIRRORED; + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: + return R300_TX_CLAMP_TO_EDGE | R300_TX_MIRRORED; + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: + return R300_TX_CLAMP_TO_EDGE | R300_TX_MIRRORED; + default: + debug_printf("r300: Unknown texture wrap %d", wrap); + return 0; + } +} + +static uint32_t translate_tex_filters(int min, int mag, int mip) { + uint32_t retval = 0; + switch (min) { + case PIPE_TEX_FILTER_NEAREST: + retval |= R300_TX_MIN_FILTER_NEAREST; + case PIPE_TEX_FILTER_LINEAR: + retval |= R300_TX_MIN_FILTER_LINEAR; + case PIPE_TEX_FILTER_ANISO: + retval |= R300_TX_MIN_FILTER_ANISO; + default: + debug_printf("r300: Unknown texture filter %d", min); + break; + } + switch (mag) { + case PIPE_TEX_FILTER_NEAREST: + retval |= R300_TX_MAG_FILTER_NEAREST; + case PIPE_TEX_FILTER_LINEAR: + retval |= R300_TX_MAG_FILTER_LINEAR; + case PIPE_TEX_FILTER_ANISO: + retval |= R300_TX_MAG_FILTER_ANISO; + default: + debug_printf("r300: Unknown texture filter %d", mag); + break; + } + switch (mip) { + case PIPE_TEX_MIPFILTER_NONE: + retval |= R300_TX_MIN_FILTER_MIP_NONE; + case PIPE_TEX_MIPFILTER_NEAREST: + retval |= R300_TX_MIN_FILTER_MIP_NEAREST; + case PIPE_TEX_MIPFILTER_LINEAR: + retval |= R300_TX_MIN_FILTER_MIP_LINEAR; + default: + debug_printf("r300: Unknown texture filter %d", mip); + break; + } + + return retval; +} + +static uint32_t anisotropy(float max_aniso) { + if (max_aniso >= 16.0f) { + return R300_TX_MAX_ANISO_16_TO_1; + } else if (max_aniso >= 8.0f) { + return R300_TX_MAX_ANISO_8_TO_1; + } else if (max_aniso >= 4.0f) { + return R300_TX_MAX_ANISO_4_TO_1; + } else if (max_aniso >= 2.0f) { + return R300_TX_MAX_ANISO_2_TO_1; + } else { + return R300_TX_MAX_ANISO_1_TO_1; + } +} + +static void* + r300_create_sampler_state(struct pipe_context* pipe, + const struct pipe_sampler_state* state) +{ + struct r300_context* r300 = r300_context(pipe); + struct r300_sampler_state* sampler = CALLOC_STRUCT(r300_sampler_state); + int lod_bias; + + sampler->filter0 |= + (translate_wrap(state->wrap_s) << R300_TX_WRAP_S_SHIFT) | + (translate_wrap(state->wrap_t) << R300_TX_WRAP_T_SHIFT) | + (translate_wrap(state->wrap_r) << R300_TX_WRAP_R_SHIFT); + + sampler->filter0 |= translate_tex_filters(state->min_img_filter, + state->mag_img_filter, + state->min_mip_filter); + + lod_bias = CLAMP((int)(state->lod_bias * 32), -(1 << 9), (1 << 9) - 1); + + sampler->filter1 |= lod_bias << R300_LOD_BIAS_SHIFT; + + sampler->filter1 |= anisotropy(state->max_anisotropy); + + util_pack_color(state->border_color, PIPE_FORMAT_A8R8G8B8_UNORM, + &sampler->border_color); + + /* R500-specific fixups and optimizations */ + if (r300_screen(r300->context.screen)->caps->is_r500) { + sampler->filter1 |= R500_BORDER_FIX; + } + + return (void*)sampler; +} + +static void r300_bind_sampler_states(struct pipe_context* pipe, + unsigned count, + void** states) +{ + struct r300_context* r300 = r300_context(pipe); + int i; + + if (count > 8) { + return; + } + + for (i = 0; i < count; i++) { + if 
(r300->sampler_states[i] != states[i]) { + r300->sampler_states[i] = (struct r300_sampler_state*)states[i]; + r300->dirty_state |= (R300_NEW_SAMPLER << i); + } + } + + r300->sampler_count = count; +} + +static void r300_delete_sampler_state(struct pipe_context* pipe, void* state) +{ + FREE(state); +} + +static void r300_set_sampler_textures(struct pipe_context* pipe, + unsigned count, + struct pipe_texture** texture) +{ + struct r300_context* r300 = r300_context(pipe); + int i; + + /* XXX magic num */ + if (count > 8) { + return; + } + + for (i = 0; i < count; i++) { + if (r300->textures[i] != (struct r300_texture*)texture[i]) { + pipe_texture_reference((struct pipe_texture**)&r300->textures[i], + texture[i]); + r300->dirty_state |= (R300_NEW_TEXTURE << i); + } + } + + for (i = count; i < 8; i++) { + if (r300->textures[i]) { + pipe_texture_reference((struct pipe_texture**)&r300->textures[i], + NULL); + r300->dirty_state |= (R300_NEW_TEXTURE << i); + } + } + + r300->texture_count = count; +} + +static void r300_set_scissor_state(struct pipe_context* pipe, + const struct pipe_scissor_state* state) +{ + struct r300_context* r300 = r300_context(pipe); + draw_flush(r300->draw); + + r300->scissor_state->scissor_top_left = + (state->minx << R300_SCISSORS_X_SHIFT) | + (state->miny << R300_SCISSORS_Y_SHIFT); + r300->scissor_state->scissor_bottom_right = + (state->maxx << R300_SCISSORS_X_SHIFT) | + (state->maxy << R300_SCISSORS_Y_SHIFT); + + r300->dirty_state |= R300_NEW_SCISSOR; +} + +static void r300_set_viewport_state(struct pipe_context* pipe, + const struct pipe_viewport_state* state) +{ + struct r300_context* r300 = r300_context(pipe); + /* XXX handing this off to Draw for now */ + draw_set_viewport_state(r300->draw, state); +} + +static void r300_set_vertex_buffers(struct pipe_context* pipe, + unsigned count, + const struct pipe_vertex_buffer* buffers) +{ + struct r300_context* r300 = r300_context(pipe); + + memcpy(r300->vertex_buffers, buffers, + sizeof(struct pipe_vertex_buffer) * count); + + r300->vertex_buffer_count = count; + + draw_flush(r300->draw); + draw_set_vertex_buffers(r300->draw, count, buffers); +} + +static void r300_set_vertex_elements(struct pipe_context* pipe, + unsigned count, + const struct pipe_vertex_element* elements) +{ + struct r300_context* r300 = r300_context(pipe); + /* XXX Draw */ + draw_flush(r300->draw); + draw_set_vertex_elements(r300->draw, count, elements); +} + +static void* r300_create_vs_state(struct pipe_context* pipe, + const struct pipe_shader_state* state) +{ + struct r300_context* context = r300_context(pipe); + /* XXX handing this off to Draw for now */ + return draw_create_vertex_shader(context->draw, state); +} + +static void r300_bind_vs_state(struct pipe_context* pipe, void* state) { + struct r300_context* context = r300_context(pipe); + /* XXX handing this off to Draw for now */ + draw_bind_vertex_shader(context->draw, (struct draw_vertex_shader*)state); +} + +static void r300_delete_vs_state(struct pipe_context* pipe, void* state) +{ + struct r300_context* context = r300_context(pipe); + /* XXX handing this off to Draw for now */ + draw_delete_vertex_shader(context->draw, (struct draw_vertex_shader*)state); +} + +void r300_init_state_functions(struct r300_context* r300) +{ + r300->context.create_blend_state = r300_create_blend_state; + r300->context.bind_blend_state = r300_bind_blend_state; + r300->context.delete_blend_state = r300_delete_blend_state; + + r300->context.set_blend_color = r300_set_blend_color; + + r300->context.set_clip_state = 
r300_set_clip_state; + + r300->context.set_constant_buffer = r300_set_constant_buffer; + + r300->context.create_depth_stencil_alpha_state = r300_create_dsa_state; + r300->context.bind_depth_stencil_alpha_state = r300_bind_dsa_state; + r300->context.delete_depth_stencil_alpha_state = r300_delete_dsa_state; + + r300->context.set_edgeflags = r300_set_edgeflags; + + r300->context.set_framebuffer_state = r300_set_framebuffer_state; + + r300->context.create_fs_state = r300_create_fs_state; + r300->context.bind_fs_state = r300_bind_fs_state; + r300->context.delete_fs_state = r300_delete_fs_state; + + r300->context.set_polygon_stipple = r300_set_polygon_stipple; + + r300->context.create_rasterizer_state = r300_create_rs_state; + r300->context.bind_rasterizer_state = r300_bind_rs_state; + r300->context.delete_rasterizer_state = r300_delete_rs_state; + + r300->context.create_sampler_state = r300_create_sampler_state; + r300->context.bind_sampler_states = r300_bind_sampler_states; + r300->context.delete_sampler_state = r300_delete_sampler_state; + + r300->context.set_sampler_textures = r300_set_sampler_textures; + + r300->context.set_scissor_state = r300_set_scissor_state; + + r300->context.set_viewport_state = r300_set_viewport_state; + + r300->context.set_vertex_buffers = r300_set_vertex_buffers; + r300->context.set_vertex_elements = r300_set_vertex_elements; + + r300->context.create_vs_state = r300_create_vs_state; + r300->context.bind_vs_state = r300_bind_vs_state; + r300->context.delete_vs_state = r300_delete_vs_state; +} diff --git a/src/gallium/drivers/r300/r300_state_derived.c b/src/gallium/drivers/r300/r300_state_derived.c new file mode 100644 index 0000000000..a51904096f --- /dev/null +++ b/src/gallium/drivers/r300/r300_state_derived.c @@ -0,0 +1,196 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +#include "r300_state_derived.h" + +/* r300_state_derived: Various bits of state which are dependent upon + * currently bound CSO data. 
*/ + +static uint32_t translate_vertex_data_type(int type) { + switch (type) { + case EMIT_1F: + case EMIT_1F_PSIZE: + return R300_DATA_TYPE_FLOAT_1; + break; + case EMIT_2F: + return R300_DATA_TYPE_FLOAT_2; + break; + case EMIT_3F: + return R300_DATA_TYPE_FLOAT_3; + break; + case EMIT_4F: + return R300_DATA_TYPE_FLOAT_4; + break; + default: + debug_printf("r300: Implementation error: " + "Bad vertex data type!\n"); + break; + } + + return 0; +} + +/* Update the vertex_info struct in our r300_context. + * + * The vertex_info struct describes the post-TCL format of vertices. It is + * required for Draw when doing SW TCL, and also for describing the + * dreaded RS block on R300 chipsets. */ +/* XXX this function should be able to handle vert shaders as well as draw */ +static void r300_update_vertex_layout(struct r300_context* r300) +{ + struct vertex_info vinfo; + boolean pos = false, psize = false, fog = false; + int i, texs = 0, cols = 0; + + struct tgsi_shader_info* info = &r300->fs->info; + memset(&vinfo, 0, sizeof(vinfo)); + + assert(info->num_inputs <= 16); + + /* This is rather lame. Since draw_find_vs_output doesn't return an error + * when it can't find an output, we have to pre-iterate and count each + * output ourselves. */ + for (i = 0; i < info->num_inputs; i++) { + switch (info->input_semantic_name[i]) { + case TGSI_SEMANTIC_POSITION: + pos = true; + break; + case TGSI_SEMANTIC_COLOR: + cols++; + break; + case TGSI_SEMANTIC_FOG: + fog = true; + break; + case TGSI_SEMANTIC_PSIZE: + psize = true; + break; + case TGSI_SEMANTIC_GENERIC: + texs++; + break; + default: + debug_printf("r300: Unknown vertex input %d\n", + info->input_semantic_name[i]); + break; + } + } + + /* Do the actual vertex_info setup. + * + * vertex_info has four uints of hardware-specific data in it. 
+ * vinfo.hwfmt[0] is R300_VAP_VTX_STATE_CNTL + * vinfo.hwfmt[1] is R300_VAP_VSM_VTX_ASSM + * vinfo.hwfmt[2] is R300_VAP_OUTPUT_VTX_FMT_0 + * vinfo.hwfmt[3] is R300_VAP_OUTPUT_VTX_FMT_1 */ + + vinfo.hwfmt[0] = 0x5555; /* XXX this is classic Mesa bonghits */ + + if (!pos) { + debug_printf("r300: Forcing vertex position attribute emit...\n"); + } + + draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_POS, + draw_find_vs_output(r300->draw, TGSI_SEMANTIC_POSITION, 0)); + vinfo.hwfmt[1] |= R300_INPUT_CNTL_POS; + vinfo.hwfmt[2] |= R300_VAP_OUTPUT_VTX_FMT_0__POS_PRESENT; + + if (psize) { + draw_emit_vertex_attr(&vinfo, EMIT_1F_PSIZE, INTERP_LINEAR, + draw_find_vs_output(r300->draw, TGSI_SEMANTIC_PSIZE, 0)); + vinfo.hwfmt[2] |= R300_VAP_OUTPUT_VTX_FMT_0__PT_SIZE_PRESENT; + } + + for (i = 0; i < cols; i++) { + draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_LINEAR, + draw_find_vs_output(r300->draw, TGSI_SEMANTIC_COLOR, i)); + vinfo.hwfmt[1] |= R300_INPUT_CNTL_COLOR; + vinfo.hwfmt[2] |= (R300_VAP_OUTPUT_VTX_FMT_0__COLOR_0_PRESENT << i); + } + + if (fog) { + draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_PERSPECTIVE, + draw_find_vs_output(r300->draw, TGSI_SEMANTIC_FOG, 0)); + vinfo.hwfmt[2] |= + (R300_VAP_OUTPUT_VTX_FMT_0__COLOR_0_PRESENT << cols); + } + + for (i = 0; i < texs; i++) { + draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_LINEAR, + draw_find_vs_output(r300->draw, TGSI_SEMANTIC_GENERIC, i)); + vinfo.hwfmt[1] |= (R300_INPUT_CNTL_TC0 << i); + vinfo.hwfmt[3] |= (4 << (3 * i)); + } + + draw_compute_vertex_size(&vinfo); + + if (memcmp(&r300->vertex_info, &vinfo, sizeof(struct vertex_info))) { + uint32_t temp; + +#define BORING_SWIZZLE \ + ((R300_SWIZZLE_SELECT_X << R300_SWIZZLE_SELECT_X_SHIFT) | \ + (R300_SWIZZLE_SELECT_Y << R300_SWIZZLE_SELECT_Y_SHIFT) | \ + (R300_SWIZZLE_SELECT_Z << R300_SWIZZLE_SELECT_Z_SHIFT) | \ + (R300_SWIZZLE_SELECT_W << R300_SWIZZLE_SELECT_W_SHIFT) | \ + (0xf << R300_WRITE_ENA_SHIFT)) + + for (i = 0; i < vinfo.num_attribs; i++) { + temp = translate_vertex_data_type(vinfo.attrib[i].emit) | + R300_SIGNED; + if (i & 1) { + r300->vertex_info.vap_prog_stream_cntl[i >> 1] &= 0xffff0000; + r300->vertex_info.vap_prog_stream_cntl[i >> 1] |= + (translate_vertex_data_type(vinfo.attrib[i].emit) | + R300_SIGNED) << 16; + } else { + r300->vertex_info.vap_prog_stream_cntl[i >> 1] &= 0xffff; + r300->vertex_info.vap_prog_stream_cntl[i >> 1] |= + translate_vertex_data_type(vinfo.attrib[i].emit) | + R300_SIGNED; + } + + r300->vertex_info.vap_prog_stream_cntl_ext[i >> 1] |= + (BORING_SWIZZLE << (i & 1 ? 16 : 0)); + } + r300->vertex_info.vap_prog_stream_cntl[i >> 1] |= (R300_LAST_VEC << + (i & 1 ? 16 : 0)); + + memcpy(&r300->vertex_info, &vinfo, sizeof(struct vertex_info)); + r300->dirty_state |= R300_NEW_VERTEX_FORMAT; + } +} + +/* Set up the RS block. This is the part of the chipset that actually does + * the rasterization of vertices into fragments. This is also the part of the + * chipset that locks up if any part of it is even slightly wrong. 
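+ *
+ * (The function body below is still a stub. Going by the hardcoded clear
+ * path in r300_surface.c, a real implementation will presumably walk the
+ * colors and texcoords counted in r300_update_vertex_layout, emit one
+ * R300_RS_IP_x / R500_RS_IP_x routing dword and one R300_RS_INST_x /
+ * R500_RS_INST_x dword per interpolated input, and finish with R300_RS_COUNT
+ * holding the interpolator totals plus R300_HIRES_EN. A sketch of intent
+ * only, not a description of the hardware.)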
*/ +void r300_update_rs_block(struct r300_context* r300) +{ +} + +void r300_update_derived_state(struct r300_context* r300) +{ + if (r300->dirty_state & R300_NEW_FRAGMENT_SHADER) { + r300_update_vertex_layout(r300); + } + + if (r300->dirty_state & R300_NEW_VERTEX_FORMAT) { + r300_update_rs_block(r300); + } +} diff --git a/src/gallium/drivers/r300/r300_state_derived.h b/src/gallium/drivers/r300/r300_state_derived.h new file mode 100644 index 0000000000..72ba6b928d --- /dev/null +++ b/src/gallium/drivers/r300/r300_state_derived.h @@ -0,0 +1,33 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +#ifndef R300_STATE_DERIVED_H +#define R300_STATE_DERIVED_H + +#include "draw/draw_vertex.h" + +#include "r300_context.h" +#include "r300_reg.h" + +void r300_update_derived_state(struct r300_context* r300); + +#endif /* R300_STATE_DERIVED_H */ diff --git a/src/gallium/drivers/r300/r300_state_inlines.h b/src/gallium/drivers/r300/r300_state_inlines.h new file mode 100644 index 0000000000..005fb74ed6 --- /dev/null +++ b/src/gallium/drivers/r300/r300_state_inlines.h @@ -0,0 +1,83 @@ +/* + * Copyright 2009 Joakim Sindholt <opensource@zhasha.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ + +#ifndef R300_STATE_INLINES_H +#define R300_STATE_INLINES_H + +#include "pipe/p_format.h" + +#include "r300_reg.h" + +static INLINE uint32_t r300_translate_colorformat(enum pipe_format format) +{ + switch (format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + return R300_COLOR_FORMAT_ARGB8888; + case PIPE_FORMAT_I8_UNORM: + return R300_COLOR_FORMAT_I8; + case PIPE_FORMAT_A1R5G5B5_UNORM: + return R300_COLOR_FORMAT_ARGB1555; + case PIPE_FORMAT_R5G6B5_UNORM: + return R300_COLOR_FORMAT_RGB565; + /* XXX Not in pipe_format + case PIPE_FORMAT_A32R32G32B32: + return R300_COLOR_FORMAT_ARGB32323232; + case PIPE_FORMAT_A16R16G16B16: + return R300_COLOR_FORMAT_ARGB16161616; */ + case PIPE_FORMAT_A4R4G4B4_UNORM: + return R300_COLOR_FORMAT_ARGB4444; + /* XXX Not in pipe_format + case PIPE_FORMAT_A10R10G10B10_UNORM: + return R500_COLOR_FORMAT_ARGB10101010; + case PIPE_FORMAT_A2R10G10B10_UNORM: + return R500_COLOR_FORMAT_ARGB2101010; + case PIPE_FORMAT_I10_UNORM: + return R500_COLOR_FORMAT_I10; */ + default: + debug_printf("r300: Implementation error: " \ + "Got unsupported color format %s in %s\n", + pf_name(format), __FUNCTION__); + break; + } + + return 0; +} + +static INLINE uint32_t r300_translate_zsformat(enum pipe_format format) +{ + switch (format) { + case PIPE_FORMAT_Z16_UNORM: + return R300_DEPTHFORMAT_16BIT_INT_Z; + /* XXX R300_DEPTHFORMAT_16BIT_13E3 anyone? */ + case PIPE_FORMAT_Z24S8_UNORM: + return R300_DEPTHFORMAT_24BIT_INT_Z_8BIT_STENCIL; + default: + debug_printf("r300: Implementation error: " \ + "Got unsupported ZS format %s in %s\n", + pf_name(format), __FUNCTION__); + break; + } + + return 0; +} + +#endif /* R300_STATE_INLINES_H */ diff --git a/src/gallium/drivers/r300/r300_state_shader.c b/src/gallium/drivers/r300/r300_state_shader.c new file mode 100644 index 0000000000..d10ac55580 --- /dev/null +++ b/src/gallium/drivers/r300/r300_state_shader.c @@ -0,0 +1,53 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ + +#include "r300_state_shader.h" + +static void r300_copy_passthrough_shader(struct r300_fragment_shader* fs) +{ + struct r300_fragment_shader* pt = &r300_passthrough_fragment_shader; + fs->shader.stack_size = pt->shader.stack_size; + fs->alu_instruction_count = pt->alu_instruction_count; + fs->tex_instruction_count = pt->tex_instruction_count; + fs->indirections = pt->indirections; + fs->instructions[0] = pt->instructions[0]; +} + +static void r500_copy_passthrough_shader(struct r500_fragment_shader* fs) +{ + struct r500_fragment_shader* pt = &r500_passthrough_fragment_shader; + fs->shader.stack_size = pt->shader.stack_size; + fs->instruction_count = pt->instruction_count; + fs->instructions[0] = pt->instructions[0]; +} + +void r300_translate_fragment_shader(struct r300_context* r300, + struct r300_fragment_shader* fs) +{ + r300_copy_passthrough_shader(fs); +} + +void r500_translate_fragment_shader(struct r300_context* r300, + struct r500_fragment_shader* fs) +{ + r500_copy_passthrough_shader(fs); +} diff --git a/src/gallium/drivers/r300/r300_state_shader.h b/src/gallium/drivers/r300/r300_state_shader.h new file mode 100644 index 0000000000..73025b2dcc --- /dev/null +++ b/src/gallium/drivers/r300/r300_state_shader.h @@ -0,0 +1,90 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +#ifndef R300_STATE_SHADER_H +#define R300_STATE_SHADER_H + +#include "r300_context.h" +#include "r300_reg.h" +#include "r300_screen.h" + +void r300_translate_fragment_shader(struct r300_context* r300, + struct r300_fragment_shader* fs); + +void r500_translate_fragment_shader(struct r300_context* r300, + struct r500_fragment_shader* fs); + +static const struct r300_fragment_shader r300_passthrough_fragment_shader = { + /* XXX This is the emission code. 
TODO: decode + OUT_CS_REG(R300_US_CONFIG, 0); + OUT_CS_REG(R300_US_CODE_OFFSET, 0x0); + OUT_CS_REG(R300_US_CODE_ADDR_0, 0x0); + OUT_CS_REG(R300_US_CODE_ADDR_1, 0x0); + OUT_CS_REG(R300_US_CODE_ADDR_2, 0x0); + OUT_CS_REG(R300_US_CODE_ADDR_3, 0x400000); +*/ + .alu_instruction_count = 1, + .tex_instruction_count = 0, + .indirections = 1, + .shader.stack_size = 2, + + .instructions[0].alu_rgb_inst = R300_RGB_SWIZA(R300_ALU_ARGC_SRC0C_XYZ) | + R300_RGB_SWIZB(R300_ALU_ARGC_ONE) | + R300_RGB_SWIZC(R300_ALU_ARGC_ZERO) | + R300_ALU_OUTC_MAD, + .instructions[0].alu_rgb_addr = R300_RGB_ADDR0(0) | R300_RGB_ADDR1(0) | + R300_RGB_ADDR2(0) | R300_ALU_DSTC_OUTPUT_XYZ, + .instructions[0].alu_alpha_inst = R300_ALPHA_SWIZA(R300_ALU_ARGA_SRC0A) | + R300_ALPHA_SWIZB(R300_ALU_ARGA_ONE) | + R300_ALPHA_SWIZC(R300_ALU_ARGA_ZERO) | + R300_ALU_OUTA_MAD, + .instructions[0].alu_alpha_addr = R300_ALPHA_ADDR0(0) | + R300_ALPHA_ADDR1(0) | R300_ALPHA_ADDR2(0) | R300_ALU_DSTA_OUTPUT, +}; + +static const struct r500_fragment_shader r500_passthrough_fragment_shader = { + .shader.stack_size = 0, + .instruction_count = 1, + .instructions[0].inst0 = R500_INST_TYPE_OUT | + R500_INST_TEX_SEM_WAIT | R500_INST_LAST | + R500_INST_RGB_OMASK_RGB | R500_INST_ALPHA_OMASK | + R500_INST_RGB_CLAMP | R500_INST_ALPHA_CLAMP, + .instructions[0].inst1 = + R500_RGB_ADDR0(0) | R500_RGB_ADDR1(0) | R500_RGB_ADDR1_CONST | + R500_RGB_ADDR2(0) | R500_RGB_ADDR2_CONST, + .instructions[0].inst2 = + R500_ALPHA_ADDR0(0) | R500_ALPHA_ADDR1(0) | R500_ALPHA_ADDR1_CONST | + R500_ALPHA_ADDR2(0) | R500_ALPHA_ADDR2_CONST, + .instructions[0].inst3 = + R500_ALU_RGB_SEL_A_SRC0 | R500_ALU_RGB_R_SWIZ_A_R | + R500_ALU_RGB_G_SWIZ_A_G | R500_ALU_RGB_B_SWIZ_A_B | + R500_ALU_RGB_SEL_B_SRC0 | R500_ALU_RGB_R_SWIZ_B_R | + R500_ALU_RGB_B_SWIZ_B_G | R500_ALU_RGB_G_SWIZ_B_B, + .instructions[0].inst4 = + R500_ALPHA_OP_CMP | R500_ALPHA_SWIZ_A_A | R500_ALPHA_SWIZ_B_A, + .instructions[0].inst5 = + R500_ALU_RGBA_OP_CMP | R500_ALU_RGBA_R_SWIZ_0 | + R500_ALU_RGBA_G_SWIZ_0 | R500_ALU_RGBA_B_SWIZ_0 | + R500_ALU_RGBA_A_SWIZ_0, +}; + +#endif /* R300_STATE_SHADER_H */ diff --git a/src/gallium/drivers/r300/r300_surface.c b/src/gallium/drivers/r300/r300_surface.c new file mode 100644 index 0000000000..49e4a96f83 --- /dev/null +++ b/src/gallium/drivers/r300/r300_surface.c @@ -0,0 +1,326 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * Joakim Sindholt <opensource@zhasha.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +#include "r300_surface.h" + +/* Provides pipe_context's "surface_fill". Commonly used for clearing + * buffers. */ +static void r300_surface_fill(struct pipe_context* pipe, + struct pipe_surface* dest, + unsigned x, unsigned y, + unsigned w, unsigned h, + unsigned color) +{ + struct r300_context* r300 = r300_context(pipe); + CS_LOCALS(r300); + struct r300_capabilities* caps = ((struct r300_screen*)pipe->screen)->caps; + struct r300_texture* tex = (struct r300_texture*)dest->texture; + int i; + float r, g, b, a; + unsigned pixpitch = tex->stride / tex->tex.block.size; + r = (float)((color >> 16) & 0xff) / 255.0f; + g = (float)((color >> 8) & 0xff) / 255.0f; + b = (float)((color >> 0) & 0xff) / 255.0f; + debug_printf("r300: Filling surface %p at (%d,%d)," + " dimensions %dx%d (pixel pitch %d), color 0x%x\n", + dest, x, y, w, h, pixpitch, color); + + /* Fallback? */ + /*if (0) { + debug_printf("r300: Falling back on surface clear..."); + void* map = pipe->screen->surface_map(pipe->screen, dest, + PIPE_BUFFER_USAGE_CPU_WRITE); + pipe_fill_rect(map, &dest->block, &dest->stride, x, y, w, h, color); + pipe->screen->surface_unmap(pipe->screen, dest); + return; + }*/ + + BEGIN_CS(163 + (caps->is_r500 ? 22 : 14) + (caps->has_tcl ? 4 : 2)); + /* Flush PVS. */ + OUT_CS_REG(R300_VAP_PVS_STATE_FLUSH_REG, 0x0); + + OUT_CS_REG(R300_SE_VTE_CNTL, R300_VPORT_X_SCALE_ENA | + R300_VPORT_X_OFFSET_ENA | R300_VPORT_Y_SCALE_ENA | + R300_VPORT_Y_OFFSET_ENA | R300_VPORT_Z_SCALE_ENA | + R300_VPORT_Z_OFFSET_ENA | R300_VTX_W0_FMT); + /* Vertex size. */ + OUT_CS_REG(R300_VAP_VTX_SIZE, 0x8); + /* Max and min vertex index clamp. */ + OUT_CS_REG(R300_VAP_VF_MAX_VTX_INDX, 0xFFFFFF); + OUT_CS_REG(R300_VAP_VF_MIN_VTX_INDX, 0x0); + /* XXX endian */ + OUT_CS_REG(R300_VAP_CNTL_STATUS, R300_VC_NO_SWAP); + OUT_CS_REG(R300_VAP_PROG_STREAM_CNTL_0, 0x0); + /* XXX magic number not in r300_reg */ + OUT_CS_REG(R300_VAP_PSC_SGN_NORM_CNTL, 0xAAAAAAAA); + OUT_CS_REG(R300_VAP_CLIP_CNTL, 0x0); + OUT_CS_REG_SEQ(R300_VAP_GB_VERT_CLIP_ADJ, 4); + OUT_CS_32F(1.0); + OUT_CS_32F(1.0); + OUT_CS_32F(1.0); + OUT_CS_32F(1.0); + /* XXX is this too long? */ + OUT_CS_REG(VAP_PVS_VTX_TIMEOUT_REG, 0xFFFF); + OUT_CS_REG(R300_GB_ENABLE, R300_GB_POINT_STUFF_ENABLE | + R300_GB_LINE_STUFF_ENABLE | R300_GB_TRIANGLE_STUFF_ENABLE); + /* XXX more magic numbers */ + OUT_CS_REG(R300_GB_MSPOS0, 0x66666666); + OUT_CS_REG(R300_GB_MSPOS1, 0x66666666); + /* XXX why doesn't classic Mesa write the number of pipes, too? */ + OUT_CS_REG(R300_GB_TILE_CONFIG, R300_GB_TILE_ENABLE | + R300_GB_TILE_SIZE_16); + OUT_CS_REG(R300_GB_SELECT, R300_GB_FOG_SELECT_1_1_W); + OUT_CS_REG(R300_GB_AA_CONFIG, 0x0); + /* XXX point tex stuffing */ + OUT_CS_REG_SEQ(R300_GA_POINT_S0, 1); + OUT_CS_32F(0.0); + OUT_CS_REG_SEQ(R300_GA_POINT_S1, 1); + OUT_CS_32F(1.0); + OUT_CS_REG(R300_GA_TRIANGLE_STIPPLE, 0x5 | + (0x5 << R300_GA_TRIANGLE_STIPPLE_Y_SHIFT_SHIFT)); + /* XXX should this be related to the actual point size? 
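+     * (Probably not: assuming these fields use the same sixths-of-a-pixel
+     * units as the R300_GA_POINT_SIZE write further down, 0x6 is just a
+     * one-pixel minimum and 0x1800 works out to 6144 / 6 = 1024 pixels,
+     * i.e. an effectively unlimited maximum for any point drawn here.)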
*/ + OUT_CS_REG(R300_GA_POINT_MINMAX, 0x6 | + (0x1800 << R300_GA_POINT_MINMAX_MAX_SHIFT)); + /* XXX this big chunk should be refactored into rs_state */ + OUT_CS_REG(R300_GA_LINE_CNTL, 0x00030006); + OUT_CS_REG(R300_GA_LINE_STIPPLE_CONFIG, 0x3BAAAAAB); + OUT_CS_REG(R300_GA_LINE_STIPPLE_VALUE, 0x00000000); + OUT_CS_REG(R300_GA_LINE_S0, 0x00000000); + OUT_CS_REG(R300_GA_LINE_S1, 0x3F800000); + OUT_CS_REG(R300_GA_ENHANCE, 0x00000002); + OUT_CS_REG(R300_GA_COLOR_CONTROL, 0x0003AAAA); + OUT_CS_REG(R300_GA_SOLID_RG, 0x00000000); + OUT_CS_REG(R300_GA_SOLID_BA, 0x00000000); + OUT_CS_REG(R300_GA_POLY_MODE, 0x00000000); + OUT_CS_REG(R300_GA_ROUND_MODE, 0x00000001); + OUT_CS_REG(R300_GA_OFFSET, 0x00000000); + OUT_CS_REG(R300_GA_FOG_SCALE, 0x3DBF1412); + OUT_CS_REG(R300_GA_FOG_OFFSET, 0x00000000); + OUT_CS_REG(R300_SU_TEX_WRAP, 0x00000000); + OUT_CS_REG(R300_SU_POLY_OFFSET_FRONT_SCALE, 0x00000000); + OUT_CS_REG(R300_SU_POLY_OFFSET_FRONT_OFFSET, 0x00000000); + OUT_CS_REG(R300_SU_POLY_OFFSET_BACK_SCALE, 0x00000000); + OUT_CS_REG(R300_SU_POLY_OFFSET_BACK_OFFSET, 0x00000000); + OUT_CS_REG(R300_SU_POLY_OFFSET_ENABLE, 0x00000000); + OUT_CS_REG(R300_SU_CULL_MODE, 0x00000000); + OUT_CS_REG(R300_SU_DEPTH_SCALE, 0x4B7FFFFF); + OUT_CS_REG(R300_SU_DEPTH_OFFSET, 0x00000000); + OUT_CS_REG(R300_SC_HYPERZ, 0x0000001C); + OUT_CS_REG(R300_SC_EDGERULE, 0x2DA49525); + OUT_CS_REG(R300_FG_FOG_BLEND, 0x00000002); + OUT_CS_REG(R300_FG_FOG_COLOR_R, 0x00000000); + OUT_CS_REG(R300_FG_FOG_COLOR_G, 0x00000000); + OUT_CS_REG(R300_FG_FOG_COLOR_B, 0x00000000); + OUT_CS_REG(R300_FG_DEPTH_SRC, 0x00000000); + OUT_CS_REG(R300_FG_DEPTH_SRC, 0x00000000); + OUT_CS_REG(R300_RB3D_CCTL, 0x00000000); + OUT_CS_REG(RB3D_COLOR_CHANNEL_MASK, 0x0000000F); + + /* XXX: Oh the wonderful unknown. + * Not writing these 8 regs seems to make no difference at all and seeing + * as how they're not documented, we should leave them out for now. + OUT_CS_REG_SEQ(0x4E54, 8); + for (i = 0; i < 8; i++) { + OUT_CS(0x00000000); + } */ + OUT_CS_REG(R300_RB3D_AARESOLVE_CTL, 0x00000000); + OUT_CS_REG(R500_RB3D_DISCARD_SRC_PIXEL_LTE_THRESHOLD, 0x00000000); + OUT_CS_REG(R500_RB3D_DISCARD_SRC_PIXEL_GTE_THRESHOLD, 0xFFFFFFFF); + OUT_CS_REG(R300_ZB_FORMAT, 0x00000002); + OUT_CS_REG(R300_ZB_ZCACHE_CTLSTAT, 0x00000003); + OUT_CS_REG(R300_ZB_BW_CNTL, 0x00000000); + OUT_CS_REG(R300_ZB_DEPTHCLEARVALUE, 0x00000000); + /* XXX Moar unknown that should probably be left out. 
+ OUT_CS_REG(0x4F30, 0x00000000); + OUT_CS_REG(0x4F34, 0x00000000); */ + OUT_CS_REG(R300_ZB_HIZ_OFFSET, 0x00000000); + OUT_CS_REG(R300_ZB_HIZ_PITCH, 0x00000000); + if (caps->has_tcl) { + OUT_CS_REG(R300_VAP_PROG_STREAM_CNTL_0, + (R300_DATA_TYPE_FLOAT_4 << R300_DATA_TYPE_0_SHIFT) | + ((R300_LAST_VEC | (1 << R300_DST_VEC_LOC_SHIFT) | + R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_1_SHIFT)); + } else { + OUT_CS_REG(R300_VAP_PROG_STREAM_CNTL_0, + (R300_DATA_TYPE_FLOAT_4 << R300_DATA_TYPE_0_SHIFT) | + ((R300_LAST_VEC | (2 << R300_DST_VEC_LOC_SHIFT) | + R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_1_SHIFT)); + } + OUT_CS_REG(R300_FG_FOG_BLEND, 0x00000000); + OUT_CS_REG(R300_VAP_PROG_STREAM_CNTL_EXT_0, 0xF688F688); + OUT_CS_REG(R300_VAP_VTX_STATE_CNTL, 0x1); + OUT_CS_REG(R300_VAP_VSM_VTX_ASSM, 0x405); + OUT_CS_REG(R300_SE_VTE_CNTL, 0x0000043F); + OUT_CS_REG(R300_VAP_VTX_SIZE, 0x00000008); + OUT_CS_REG(R300_VAP_PSC_SGN_NORM_CNTL, 0xAAAAAAAA); + OUT_CS_REG(R300_VAP_OUTPUT_VTX_FMT_0, 0x00000003); + OUT_CS_REG(R300_VAP_OUTPUT_VTX_FMT_1, 0x00000000); + OUT_CS_REG(R300_TX_ENABLE, 0x0); + /* XXX viewport setup */ + OUT_CS_REG_SEQ(R300_SE_VPORT_XSCALE, 6); + OUT_CS_32F(1.0); + OUT_CS_32F((float)x); + OUT_CS_32F(1.0); + OUT_CS_32F((float)y); + OUT_CS_32F(1.0); + OUT_CS_32F(0.0); + + if (caps->has_tcl) { + OUT_CS_REG(R300_VAP_CLIP_CNTL, R300_CLIP_DISABLE | + R300_PS_UCP_MODE_CLIP_AS_TRIFAN); + } + + /* The size of the point we're about to draw, in sixths of pixels */ + OUT_CS_REG(R300_GA_POINT_SIZE, + ((h * 6) & R300_POINTSIZE_Y_MASK) | + ((w * 6) << R300_POINTSIZE_X_SHIFT)); + + /* XXX */ + OUT_CS_REG(R300_SC_CLIP_RULE, 0xaaaa); + + /* Pixel scissors */ + OUT_CS_REG_SEQ(R300_SC_SCISSORS_TL, 2); + OUT_CS((x << R300_SCISSORS_X_SHIFT) | (y << R300_SCISSORS_Y_SHIFT)); + OUT_CS((w << R300_SCISSORS_X_SHIFT) | (h << R300_SCISSORS_Y_SHIFT)); + + /* RS block setup */ + if (caps->is_r500) { + /* XXX We seem to be in disagreement about how many of these we have + * RS:RS_IP_[0-15] [R/W] 32 bits Access: 8/16/32 MMReg:0x4074-0x40b0 + * Now that's from the docs. I don't care what the mesa driver says */ + OUT_CS_REG_SEQ(R500_RS_IP_0, 16); + for (i = 0; i < 16; i++) { + OUT_CS((R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_S_SHIFT) | + (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT) | + (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT) | + (R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT)); + } + OUT_CS_REG_SEQ(R300_RS_COUNT, 2); + OUT_CS((1 << R300_IC_COUNT_SHIFT) | R300_HIRES_EN); + OUT_CS(0x00000000); + OUT_CS_REG(R500_RS_INST_0, R500_RS_INST_COL_CN_WRITE); + } else { + OUT_CS_REG_SEQ(R300_RS_IP_0, 8); + for (i = 0; i < 8; i++) { + OUT_CS(R300_RS_SEL_T(R300_RS_SEL_K0) | + R300_RS_SEL_R(R300_RS_SEL_K0) | R300_RS_SEL_Q(R300_RS_SEL_K1)); + } + OUT_CS_REG_SEQ(R300_RS_COUNT, 2); + OUT_CS((1 << R300_IC_COUNT_SHIFT) | R300_HIRES_EN); + /* XXX Shouldn't this be 0? */ + OUT_CS(1); + OUT_CS_REG(R300_RS_INST_0, R300_RS_INST_COL_CN_WRITE); + } + END_CS; + + /* Fragment shader setup */ + if (caps->is_r500) { + r500_emit_fragment_shader(r300, &r500_passthrough_fragment_shader); + } else { + r300_emit_fragment_shader(r300, &r300_passthrough_fragment_shader); + } + + BEGIN_CS(8 + (caps->has_tcl ? 
20 : 2)); + OUT_CS_REG_SEQ(R300_US_OUT_FMT_0, 4); + OUT_CS(R300_C0_SEL_B | R300_C1_SEL_G | R300_C2_SEL_R | R300_C3_SEL_A); + OUT_CS(R300_US_OUT_FMT_UNUSED); + OUT_CS(R300_US_OUT_FMT_UNUSED); + OUT_CS(R300_US_OUT_FMT_UNUSED); + OUT_CS_REG(R300_US_W_FMT, R300_W_FMT_W0); + /* XXX these magic numbers should be explained when + * this becomes a cached state object */ + if (caps->has_tcl) { + OUT_CS_REG(R300_VAP_CNTL, 0xA | + (0x5 << R300_PVS_NUM_CNTLRS_SHIFT) | + (0xB << R300_VF_MAX_VTX_NUM_SHIFT) | + (caps->num_vert_fpus << R300_PVS_NUM_FPUS_SHIFT)); + OUT_CS_REG(R300_VAP_PVS_CODE_CNTL_0, 0x00100000); + OUT_CS_REG(R300_VAP_PVS_CONST_CNTL, 0x00000000); + OUT_CS_REG(R300_VAP_PVS_CODE_CNTL_1, 0x00000001); + /* XXX translate these back into normal instructions */ + OUT_CS_REG(R300_VAP_PVS_STATE_FLUSH_REG, 0x1); + OUT_CS_REG(R300_VAP_PVS_VECTOR_INDX_REG, 0x0); + OUT_CS_ONE_REG(R300_VAP_PVS_UPLOAD_DATA, 8); + OUT_CS(0x00F00203); + OUT_CS(0x00D10001); + OUT_CS(0x01248001); + OUT_CS(0x00000000); + OUT_CS(0x00F02203); + OUT_CS(0x00D10021); + OUT_CS(0x01248021); + OUT_CS(0x00000000); + } else { + OUT_CS_REG(R300_VAP_CNTL, 0xA | + (0x5 << R300_PVS_NUM_CNTLRS_SHIFT) | + (0x5 << R300_VF_MAX_VTX_NUM_SHIFT) | + (caps->num_vert_fpus << R300_PVS_NUM_FPUS_SHIFT)); + } + END_CS; + + r300_emit_blend_state(r300, &blend_clear_state); + r300_emit_blend_color_state(r300, &blend_color_clear_state); + r300_emit_dsa_state(r300, &dsa_clear_state); + + BEGIN_CS(24); + /* Flush colorbuffer and blend caches. */ + OUT_CS_REG(R300_RB3D_DSTCACHE_CTLSTAT, + R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_FLUSH_DIRTY_3D | + R300_RB3D_DSTCACHE_CTLSTAT_DC_FINISH_SIGNAL); + OUT_CS_REG(R300_ZB_ZCACHE_CTLSTAT, + R300_ZB_ZCACHE_CTLSTAT_ZC_FLUSH_FLUSH_AND_FREE | + R300_ZB_ZCACHE_CTLSTAT_ZC_FREE_FREE); + + OUT_CS_REG_SEQ(R300_RB3D_COLOROFFSET0, 1); + OUT_CS_RELOC(tex->buffer, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0); + OUT_CS_REG(R300_RB3D_COLORPITCH0, pixpitch | + r300_translate_colorformat(tex->tex.format)); + OUT_CS_REG(RB3D_COLOR_CHANNEL_MASK, 0x0000000F); + /* XXX Packet3 */ + OUT_CS(CP_PACKET3(R200_3D_DRAW_IMMD_2, 8)); + OUT_CS(R300_PRIM_TYPE_POINT | R300_PRIM_WALK_RING | + (1 << R300_PRIM_NUM_VERTICES_SHIFT)); + OUT_CS_32F(w / 2.0); + OUT_CS_32F(h / 2.0); + /* XXX this should be the depth value to clear to */ + OUT_CS_32F(1.0); + OUT_CS_32F(1.0); + OUT_CS_32F(r); + OUT_CS_32F(g); + OUT_CS_32F(b); + OUT_CS_32F(1.0); + + /* XXX figure out why this is 0xA and not 0x2 */ + OUT_CS_REG(R300_RB3D_DSTCACHE_CTLSTAT, 0xA); + /* XXX OUT_CS_REG(R300_ZB_ZCACHE_CTLSTAT, + R300_ZB_ZCACHE_CTLSTAT_ZC_FLUSH_FLUSH_AND_FREE | + R300_ZB_ZCACHE_CTLSTAT_ZC_FREE_FREE); */ + + END_CS; + + r300->dirty_hw++; +} + +void r300_init_surface_functions(struct r300_context* r300) +{ + r300->context.surface_fill = r300_surface_fill; +} diff --git a/src/gallium/drivers/r300/r300_surface.h b/src/gallium/drivers/r300/r300_surface.h new file mode 100644 index 0000000000..442eac2cf2 --- /dev/null +++ b/src/gallium/drivers/r300/r300_surface.h @@ -0,0 +1,60 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * 
The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +#ifndef R300_SURFACE_H +#define R300_SURFACE_H + +#include "pipe/p_context.h" +#include "pipe/p_screen.h" + +#include "util/u_rect.h" + +#include "r300_context.h" +#include "r300_cs.h" +#include "r300_emit.h" +#include "r300_state_shader.h" +#include "r300_state_inlines.h" + +const struct r300_blend_state blend_clear_state = { + .blend_control = 0x0, + .alpha_blend_control = 0x0, + .rop = 0x0, + .dither = 0x0, +}; + +const struct r300_blend_color_state blend_color_clear_state = { + .blend_color = 0x0, + .blend_color_red_alpha = 0x0, + .blend_color_green_blue = 0x0, +}; + +const struct r300_dsa_state dsa_clear_state = { + .alpha_function = 0x0, + .alpha_reference = 0x0, + .z_buffer_control = 0x0, + .z_stencil_control = 0x0, + .stencil_ref_mask = R300_STENCILWRITEMASK_MASK, + .z_buffer_top = R300_ZTOP_ENABLE, + .stencil_ref_bf = 0x0, +}; + +#endif /* R300_SURFACE_H */ diff --git a/src/gallium/drivers/r300/r300_swtcl_emit.c b/src/gallium/drivers/r300/r300_swtcl_emit.c new file mode 100644 index 0000000000..5b028aaf7b --- /dev/null +++ b/src/gallium/drivers/r300/r300_swtcl_emit.c @@ -0,0 +1,327 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +#include "draw/draw_pipe.h" +#include "draw/draw_vbuf.h" +#include "util/u_memory.h" + +#include "r300_cs.h" +#include "r300_context.h" +#include "r300_reg.h" + +/* r300_swtcl_emit: Vertex and index buffer primitive emission. No HW TCL. 
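+ *
+ * Roughly how the pieces below fit together, as driven by draw's vbuf
+ * stage: get_vertex_info supplies the post-TCL vertex layout,
+ * allocate_vertices and map_vertices hand out space in our VBO for the
+ * stage to fill, draw and draw_arrays emit the actual 3D_DRAW packets once
+ * the buffer has been unmapped, and release_vertices drops the VBO again.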
*/ + +struct r300_swtcl_render { + /* Parent class */ + struct vbuf_render base; + + /* Pipe context */ + struct r300_context* r300; + + /* Vertex information */ + size_t vertex_size; + unsigned prim; + unsigned hwprim; + + /* VBO */ + struct pipe_buffer* vbo; + size_t vbo_size; + size_t vbo_offset; + void* vbo_map; + size_t vbo_alloc_size; + size_t vbo_max_used; +}; + +static INLINE struct r300_swtcl_render* +r300_swtcl_render(struct vbuf_render* render) +{ + return (struct r300_swtcl_render*)render; +} + +static const struct vertex_info* +r300_swtcl_render_get_vertex_info(struct vbuf_render* render) +{ + struct r300_swtcl_render* r300render = r300_swtcl_render(render); + struct r300_context* r300 = r300render->r300; + + r300_update_derived_state(r300); + + return &r300->vertex_info; +} + +static boolean r300_swtcl_render_allocate_vertices(struct vbuf_render* render, + ushort vertex_size, + ushort count) +{ + struct r300_swtcl_render* r300render = r300_swtcl_render(render); + struct r300_context* r300 = r300render->r300; + struct pipe_screen* screen = r300->context.screen; + size_t size = (size_t)vertex_size * (size_t)count; + + if (r300render->vbo) { + pipe_buffer_reference(screen, &r300render->vbo, NULL); + } + + r300render->vbo_size = MAX2(size, r300render->vbo_alloc_size); + r300render->vbo_offset = 0; + r300render->vbo = pipe_buffer_create(screen, + 64, + PIPE_BUFFER_USAGE_VERTEX, + r300render->vbo_size); + + r300render->vertex_size = vertex_size; + + if (r300render->vbo) { + return true; + } else { + return false; + } +} + +static void* r300_swtcl_render_map_vertices(struct vbuf_render* render) +{ + struct r300_swtcl_render* r300render = r300_swtcl_render(render); + struct pipe_screen* screen = r300render->r300->context.screen; + + r300render->vbo_map = pipe_buffer_map(screen, r300render->vbo, + PIPE_BUFFER_USAGE_CPU_WRITE); + + return (unsigned char*)r300render->vbo_map + r300render->vbo_offset; +} + +static void r300_swtcl_render_unmap_vertices(struct vbuf_render* render, + ushort min, + ushort max) +{ + struct r300_swtcl_render* r300render = r300_swtcl_render(render); + struct pipe_screen* screen = r300render->r300->context.screen; + + r300render->vbo_max_used = MAX2(r300render->vbo_max_used, + r300render->vertex_size * (max + 1)); + + pipe_buffer_unmap(screen, r300render->vbo); +} + +static void r300_swtcl_render_release_vertices(struct vbuf_render* render) +{ + struct r300_swtcl_render* r300render = r300_swtcl_render(render); + struct pipe_screen* screen = r300render->r300->context.screen; + + pipe_buffer_reference(screen, &r300render->vbo, NULL); +} + +static boolean r300_swtcl_render_set_primitive(struct vbuf_render* render, + unsigned prim) +{ + struct r300_swtcl_render* r300render = r300_swtcl_render(render); + r300render->prim = prim; + + switch (prim) { + case PIPE_PRIM_POINTS: + r300render->hwprim = R300_VAP_VF_CNTL__PRIM_POINTS; + break; + case PIPE_PRIM_LINES: + r300render->hwprim = R300_VAP_VF_CNTL__PRIM_LINES; + break; + case PIPE_PRIM_LINE_LOOP: + r300render->hwprim = R300_VAP_VF_CNTL__PRIM_LINE_LOOP; + break; + case PIPE_PRIM_LINE_STRIP: + r300render->hwprim = R300_VAP_VF_CNTL__PRIM_LINE_STRIP; + break; + case PIPE_PRIM_TRIANGLES: + r300render->hwprim = R300_VAP_VF_CNTL__PRIM_TRIANGLES; + break; + case PIPE_PRIM_TRIANGLE_STRIP: + r300render->hwprim = R300_VAP_VF_CNTL__PRIM_TRIANGLE_STRIP; + break; + case PIPE_PRIM_TRIANGLE_FAN: + r300render->hwprim = R300_VAP_VF_CNTL__PRIM_TRIANGLE_FAN; + break; + case PIPE_PRIM_QUADS: + r300render->hwprim = 
R300_VAP_VF_CNTL__PRIM_QUADS; + break; + case PIPE_PRIM_QUAD_STRIP: + r300render->hwprim = R300_VAP_VF_CNTL__PRIM_QUAD_STRIP; + break; + case PIPE_PRIM_POLYGON: + r300render->hwprim = R300_VAP_VF_CNTL__PRIM_POLYGON; + break; + default: + return false; + break; + } + + return true; +} + +static void prepare_render(struct r300_swtcl_render* render) +{ + struct r300_context* r300 = render->r300; + int i; + + CS_LOCALS(r300); + + /* Make sure that all possible state is emitted. */ + r300_emit_dirty_state(r300); + + debug_printf("r300: Preparing vertex buffer %p for render, " + "vertex size %d, vertex count %d\n", render->vbo, + r300->vertex_info.vinfo.size, render->vbo_size); + /* Set the pointer to our vertex buffer. The emitted values are this: + * PACKET3 [3D_LOAD_VBPNTR] + * COUNT [1] + * FORMAT [size | stride << 8] + * VBPNTR [relocated BO] + */ + BEGIN_CS(5); + OUT_CS(CP_PACKET3(R300_PACKET3_3D_LOAD_VBPNTR, 3)); + OUT_CS(1); + OUT_CS(r300->vertex_info.vinfo.size | (r300->vertex_info.vinfo.size << 8)); + OUT_CS_RELOC(render->vbo, 0, RADEON_GEM_DOMAIN_GTT, 0, 0); + END_CS; +} + +static void r300_swtcl_render_draw_arrays(struct vbuf_render* render, + unsigned start, + unsigned count) +{ + struct r300_swtcl_render* r300render = r300_swtcl_render(render); + struct r300_context* r300 = r300render->r300; + struct pipe_screen* screen = r300->context.screen; + + CS_LOCALS(r300); + + prepare_render(r300render); + + debug_printf("r300: Doing vbuf render, count %d\n", count); + + BEGIN_CS(2); + OUT_CS(CP_PACKET3(R300_PACKET3_3D_DRAW_VBUF_2, 0)); + OUT_CS(R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_LIST | (count << 16) | + r300render->hwprim | R300_VAP_VF_CNTL__INDEX_SIZE_32bit); + END_CS; +} + +static void r300_swtcl_render_draw(struct vbuf_render* render, + const ushort* indices, + uint count) +{ + struct r300_swtcl_render* r300render = r300_swtcl_render(render); + struct r300_context* r300 = r300render->r300; + struct pipe_screen* screen = r300->context.screen; + struct pipe_buffer* index_buffer; + void* index_map; + + CS_LOCALS(r300); + + prepare_render(r300render); + + /* Send our indices into an index buffer. 
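+     * Note that the vbuf stage hands us ushort (16-bit) indices while the
+     * draw packet below asks for INDEX_SIZE_32bit, so the memcpy of
+     * count * 4 bytes presumably wants to become a widening copy along the
+     * lines of
+     *
+     *    uint32_t* dst = (uint32_t*)index_map;
+     *    for (unsigned i = 0; i < count; i++)
+     *        dst[i] = indices[i];
+     *
+     * (or the packet switched to 16-bit indices).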
*/ + index_buffer = pipe_buffer_create(screen, 64, PIPE_BUFFER_USAGE_VERTEX, + count * 4); + if (!index_buffer) { + return; + } + + index_map = pipe_buffer_map(screen, index_buffer, + PIPE_BUFFER_USAGE_CPU_WRITE); + memcpy(index_map, indices, count * 4); + pipe_buffer_unmap(screen, index_buffer); + + debug_printf("r300: Doing indexbuf render, count %d\n", count); + + BEGIN_CS(5); + OUT_CS(CP_PACKET3(R300_PACKET3_3D_DRAW_INDX_2, 0)); + OUT_CS(R300_VAP_VF_CNTL__PRIM_WALK_INDICES | (count << 16) | + r300render->hwprim | R300_VAP_VF_CNTL__INDEX_SIZE_32bit); + + OUT_CS(CP_PACKET3(R300_PACKET3_INDX_BUFFER, 2)); + OUT_CS(R300_INDX_BUFFER_ONE_REG_WR | (R300_VAP_PORT_IDX0 >> 2)); + OUT_CS_RELOC(index_buffer, 0, RADEON_GEM_DOMAIN_GTT, 0, 0); + END_CS; +} + +static void r300_swtcl_render_destroy(struct vbuf_render* render) +{ + FREE(render); +} + +static struct vbuf_render* r300_swtcl_render_create(struct r300_context* r300) +{ + struct r300_swtcl_render* r300render = CALLOC_STRUCT(r300_swtcl_render); + struct pipe_screen* screen = r300->context.screen; + + r300render->r300 = r300; + + /* XXX find real numbers plz */ + r300render->base.max_vertex_buffer_bytes = 128 * 1024; + r300render->base.max_indices = 16 * 1024; + + r300render->base.get_vertex_info = r300_swtcl_render_get_vertex_info; + r300render->base.allocate_vertices = r300_swtcl_render_allocate_vertices; + r300render->base.map_vertices = r300_swtcl_render_map_vertices; + r300render->base.unmap_vertices = r300_swtcl_render_unmap_vertices; + r300render->base.set_primitive = r300_swtcl_render_set_primitive; + r300render->base.draw = r300_swtcl_render_draw; + r300render->base.draw_arrays = r300_swtcl_render_draw_arrays; + r300render->base.release_vertices = r300_swtcl_render_release_vertices; + r300render->base.destroy = r300_swtcl_render_destroy; + + /* XXX bonghits ahead + r300render->vbo_alloc_size = 128 * 4096; + r300render->vbo_size = r300render->vbo_alloc_size; + r300render->vbo_offset = 0; + r300render->vbo = pipe_buffer_create(screen, + 64, + PIPE_BUFFER_USAGE_VERTEX, + r300render->vbo_size); + r300render->vbo_map = pipe_buffer_map(screen, + r300render->vbo, + PIPE_BUFFER_USAGE_CPU_WRITE); + pipe_buffer_unmap(screen, r300render->vbo); */ + + return &r300render->base; +} + +struct draw_stage* r300_draw_swtcl_stage(struct r300_context* r300) +{ + struct vbuf_render* render; + struct draw_stage* stage; + + render = r300_swtcl_render_create(r300); + + if (!render) { + return NULL; + } + + stage = draw_vbuf_stage(r300->draw, render); + + if (!stage) { + render->destroy(render); + return NULL; + } + + draw_set_render(r300->draw, render); + + return stage; +} diff --git a/src/gallium/drivers/r300/r300_texture.c b/src/gallium/drivers/r300/r300_texture.c new file mode 100644 index 0000000000..edd4370663 --- /dev/null +++ b/src/gallium/drivers/r300/r300_texture.c @@ -0,0 +1,187 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial 
portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#include "r300_texture.h"
+
+static int minify(int i)
+{
+    return MAX2(1, i >> 1);
+}
+
+static void r300_setup_miptree(struct r300_texture* tex)
+{
+    struct pipe_texture* base = &tex->tex;
+    int i, stride, size;
+
+    for (i = 0; i <= base->last_level; i++) {
+        if (i > 0) {
+            base->width[i] = minify(base->width[i-1]);
+            base->height[i] = minify(base->height[i-1]);
+            base->depth[i] = minify(base->depth[i-1]);
+        }
+
+        base->nblocksx[i] = pf_get_nblocksx(&base->block, base->width[i]);
+        base->nblocksy[i] = pf_get_nblocksy(&base->block, base->height[i]);
+
+        /* Radeons enjoy things in multiples of 32. */
+        /* XXX NPOT -> 64, not 32 */
+        stride = (base->nblocksx[i] * base->block.size + 63) & ~63;
+        size = stride * base->nblocksy[i] * base->depth[i];
+
+        /* XXX 64 for NPOT */
+        tex->offset[i] = (tex->size + 63) & ~63;
+        tex->size = tex->offset[i] + size;
+    }
+}
+
+/* Create a new texture. */
+static struct pipe_texture*
+    r300_texture_create(struct pipe_screen* screen,
+                        const struct pipe_texture* template)
+{
+    /* XXX struct r300_screen* r300screen = r300_screen(screen); */
+
+    struct r300_texture* tex = CALLOC_STRUCT(r300_texture);
+
+    if (!tex) {
+        return NULL;
+    }
+
+    tex->tex = *template;
+    tex->tex.refcount = 1;
+    tex->tex.screen = screen;
+
+    r300_setup_miptree(tex);
+
+    tex->buffer = screen->buffer_create(screen, 64,
+                                        PIPE_BUFFER_USAGE_PIXEL,
+                                        tex->size);
+
+    if (!tex->buffer) {
+        FREE(tex);
+        return NULL;
+    }
+
+    return (struct pipe_texture*)tex;
+}
+
+static void r300_texture_release(struct pipe_screen* screen,
+                                 struct pipe_texture** texture)
+{
+    if (!*texture) {
+        return;
+    }
+
+    (*texture)->refcount--;
+
+    if ((*texture)->refcount <= 0) {
+        struct r300_texture* tex = (struct r300_texture*)*texture;
+
+        pipe_buffer_reference(screen, &tex->buffer, NULL);
+
+        FREE(tex);
+    }
+
+    *texture = NULL;
+}
+
+static struct pipe_surface* r300_get_tex_surface(struct pipe_screen* screen,
+                                                 struct pipe_texture* texture,
+                                                 unsigned face,
+                                                 unsigned level,
+                                                 unsigned zslice,
+                                                 unsigned flags)
+{
+    struct r300_texture* tex = (struct r300_texture*)texture;
+    struct pipe_surface* surface = CALLOC_STRUCT(pipe_surface);
+    unsigned offset;
+
+    /* XXX this is certainly dependent on tex target */
+    offset = tex->offset[level];
+
+    if (surface) {
+        surface->refcount = 1;
+        pipe_texture_reference(&surface->texture, texture);
+        surface->format = texture->format;
+        surface->width = texture->width[level];
+        surface->height = texture->height[level];
+        surface->offset = offset;
+        surface->usage = flags;
+        surface->status = PIPE_SURFACE_STATUS_DEFINED;
+    }
+
+    return surface;
+}
+
+static void r300_tex_surface_release(struct pipe_screen* screen,
+                                     struct pipe_surface** surface)
+{
+    struct pipe_surface* s = *surface;
+
+    s->refcount--;
+
+    if (s->refcount <= 0) {
+        pipe_texture_reference(&s->texture, NULL);
+        FREE(s);
+    }
+
+    *surface = NULL;
+}
+
+static struct pipe_texture*
+    r300_texture_blanket(struct pipe_screen* screen,
+                         const struct pipe_texture* base,
+                         const unsigned*

stride, + struct pipe_buffer* buffer) +{ + struct r300_texture* tex; + + if (base->target != PIPE_TEXTURE_2D || + base->last_level != 0 || + base->depth[0] != 1) { + return NULL; + } + + tex = CALLOC_STRUCT(r300_texture); + if (!tex) { + return NULL; + } + + tex->tex = *base; + tex->tex.refcount = 1; + tex->tex.screen = screen; + + tex->stride = *stride; + + pipe_buffer_reference(screen, &tex->buffer, buffer); + + return (struct pipe_texture*)tex; +} + +void r300_init_screen_texture_functions(struct pipe_screen* screen) +{ + screen->texture_create = r300_texture_create; + screen->texture_release = r300_texture_release; + screen->get_tex_surface = r300_get_tex_surface; + screen->tex_surface_release = r300_tex_surface_release; + screen->texture_blanket = r300_texture_blanket; +} diff --git a/src/gallium/drivers/r300/r300_texture.h b/src/gallium/drivers/r300/r300_texture.h new file mode 100644 index 0000000000..7964229a94 --- /dev/null +++ b/src/gallium/drivers/r300/r300_texture.h @@ -0,0 +1,34 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +#ifndef R300_TEXTURE_H +#define R300_TEXTURE_H + +#include "pipe/p_screen.h" + +#include "util/u_math.h" + +#include "r300_context.h" + +void r300_init_screen_texture_functions(struct pipe_screen* screen); + +#endif /* R300_TEXTURE_H */ diff --git a/src/gallium/drivers/r300/r300_winsys.h b/src/gallium/drivers/r300/r300_winsys.h new file mode 100644 index 0000000000..5a3a212892 --- /dev/null +++ b/src/gallium/drivers/r300/r300_winsys.h @@ -0,0 +1,94 @@ +/* + * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +#ifndef R300_WINSYS_H +#define R300_WINSYS_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* The public interface header for the r300 pipe driver. + * Any winsys hosting this pipe needs to implement r300_winsys and then + * call r300_create_context to start things. */ + +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +struct radeon_cs; + +struct r300_winsys { + + /* PCI ID */ + uint32_t pci_id; + + /* GB pipe count */ + uint32_t gb_pipes; + + /* CS object. This is very much like Intel's batchbuffer. + * Fill it full of dwords and relocs and then submit. + * Repeat as needed. */ + /* Note: Unlike Mesa's version of this, we don't keep a copy of the CSM + * that was used to create this CS. Is this a good idea? */ + /* Note: The pipe driver doesn't know how to use this. This is purely + * for the winsys. */ + struct radeon_cs* cs; + + /* Check to see if there's room for commands. */ + boolean (*check_cs)(struct radeon_cs* cs, int size); + + /* Start a command emit. */ + void (*begin_cs)(struct radeon_cs* cs, + int size, + const char* file, + const char* function, + int line); + + /* Write a dword to the command buffer. */ + void (*write_cs_dword)(struct radeon_cs* cs, uint32_t dword); + + /* Write a relocated dword to the command buffer. */ + void (*write_cs_reloc)(struct radeon_cs* cs, + struct pipe_buffer* bo, + uint32_t rd, + uint32_t wd, + uint32_t flags); + + /* Finish a command emit. */ + void (*end_cs)(struct radeon_cs* cs, + const char* file, + const char* function, + int line); + + /* Flush the CS. 
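+     * (A winsys hooking these up will presumably be driven much like the
+     * CS_LOCALS/BEGIN_CS/OUT_CS/OUT_CS_RELOC/END_CS macros in this driver:
+     * check_cs for space, begin_cs sized for the batch, a run of
+     * write_cs_dword and write_cs_reloc calls, end_cs, and finally flush_cs
+     * once the batch should reach the hardware.)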
*/ + void (*flush_cs)(struct radeon_cs* cs); +}; + +struct pipe_context* r300_create_context(struct pipe_screen* screen, + struct pipe_winsys* winsys, + struct r300_winsys* r300_winsys); + +#ifdef __cplusplus +} +#endif + +#endif /* R300_WINSYS_H */ diff --git a/src/gallium/drivers/softpipe/Makefile b/src/gallium/drivers/softpipe/Makefile index 120bdfd9dd..516e3992fd 100644 --- a/src/gallium/drivers/softpipe/Makefile +++ b/src/gallium/drivers/softpipe/Makefile @@ -14,7 +14,7 @@ C_SOURCES = \ sp_draw_arrays.c \ sp_prim_setup.c \ sp_prim_vbuf.c \ - sp_quad.c \ + sp_quad_pipe.c \ sp_quad_alpha_test.c \ sp_quad_blend.c \ sp_quad_colormask.c \ @@ -42,6 +42,3 @@ C_SOURCES = \ sp_surface.c include ../../Makefile.template - -symlinks: - diff --git a/src/gallium/drivers/softpipe/SConscript b/src/gallium/drivers/softpipe/SConscript index c1f7daa8ab..f8720638a7 100644 --- a/src/gallium/drivers/softpipe/SConscript +++ b/src/gallium/drivers/softpipe/SConscript @@ -17,7 +17,7 @@ softpipe = env.ConvenienceLibrary( 'sp_setup.c', 'sp_quad_alpha_test.c', 'sp_quad_blend.c', - 'sp_quad.c', + 'sp_quad_pipe.c', 'sp_quad_colormask.c', 'sp_quad_coverage.c', 'sp_quad_depth_test.c', diff --git a/src/gallium/drivers/softpipe/sp_clear.c b/src/gallium/drivers/softpipe/sp_clear.c index dfa46c9fb7..ad108ec446 100644 --- a/src/gallium/drivers/softpipe/sp_clear.c +++ b/src/gallium/drivers/softpipe/sp_clear.c @@ -85,7 +85,7 @@ softpipe_clear(struct pipe_context *pipe, struct pipe_surface *ps, #endif } - for (i = 0; i < softpipe->framebuffer.num_cbufs; i++) { + for (i = 0; i < softpipe->framebuffer.nr_cbufs; i++) { if (ps == sp_tile_cache_get_surface(softpipe->cbuf_cache[i])) { unsigned cv; if (ps->format != PIPE_FORMAT_A8R8G8B8_UNORM) { diff --git a/src/gallium/drivers/softpipe/sp_context.c b/src/gallium/drivers/softpipe/sp_context.c index cd1e6663d8..ff5d1b54a4 100644 --- a/src/gallium/drivers/softpipe/sp_context.c +++ b/src/gallium/drivers/softpipe/sp_context.c @@ -2,6 +2,7 @@ * * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. * All Rights Reserved. + * Copyright 2008 VMware, Inc. All rights reserved. 
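The r300_winsys command-stream hooks declared earlier (check_cs, begin_cs, write_cs_dword, write_cs_reloc, end_cs, flush_cs) are only declared in this patch, not used. As a rough, hypothetical sketch of the intended calling pattern (the packet header and payload dwords below are invented and the reloc read/write domains are placeholders; only the sequence of calls follows the declared interface), a driver sitting on top of this winsys might emit a buffer-relative write like this:

#include "r300_winsys.h"

static void
emit_example_packet(struct r300_winsys *winsys, struct pipe_buffer *bo)
{
   struct radeon_cs *cs = winsys->cs;

   /* Make sure there is room for three dwords; if not, submit what we have. */
   if (!winsys->check_cs(cs, 3)) {
      winsys->flush_cs(cs);
   }

   winsys->begin_cs(cs, 3, __FILE__, __FUNCTION__, __LINE__);
   winsys->write_cs_dword(cs, 0xdeadbeef);   /* made-up packet header */
   winsys->write_cs_dword(cs, 0x00000000);   /* made-up payload */
   winsys->write_cs_reloc(cs, bo, 0, 0, 0);  /* dword the kernel patches with bo's address */
   winsys->end_cs(cs, __FILE__, __FUNCTION__, __LINE__);
}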
* * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the @@ -52,15 +53,15 @@ * Map any drawing surfaces which aren't already mapped */ void -softpipe_map_surfaces(struct softpipe_context *sp) +softpipe_map_transfers(struct softpipe_context *sp) { unsigned i; - for (i = 0; i < sp->framebuffer.num_cbufs; i++) { - sp_tile_cache_map_surfaces(sp->cbuf_cache[i]); + for (i = 0; i < sp->framebuffer.nr_cbufs; i++) { + sp_tile_cache_map_transfers(sp->cbuf_cache[i]); } - sp_tile_cache_map_surfaces(sp->zsbuf_cache); + sp_tile_cache_map_transfers(sp->zsbuf_cache); } @@ -68,25 +69,25 @@ softpipe_map_surfaces(struct softpipe_context *sp) * Unmap any mapped drawing surfaces */ void -softpipe_unmap_surfaces(struct softpipe_context *sp) +softpipe_unmap_transfers(struct softpipe_context *sp) { uint i; - for (i = 0; i < sp->framebuffer.num_cbufs; i++) + for (i = 0; i < sp->framebuffer.nr_cbufs; i++) sp_flush_tile_cache(sp, sp->cbuf_cache[i]); sp_flush_tile_cache(sp, sp->zsbuf_cache); - for (i = 0; i < sp->framebuffer.num_cbufs; i++) { - sp_tile_cache_unmap_surfaces(sp->cbuf_cache[i]); + for (i = 0; i < sp->framebuffer.nr_cbufs; i++) { + sp_tile_cache_unmap_transfers(sp->cbuf_cache[i]); } - sp_tile_cache_unmap_surfaces(sp->zsbuf_cache); + sp_tile_cache_unmap_transfers(sp->zsbuf_cache); } static void softpipe_destroy( struct pipe_context *pipe ) { struct softpipe_context *softpipe = softpipe_context( pipe ); - struct pipe_winsys *ws = pipe->winsys; + struct pipe_screen *screen = pipe->screen; uint i; if (softpipe->draw) @@ -115,7 +116,7 @@ static void softpipe_destroy( struct pipe_context *pipe ) for (i = 0; i < Elements(softpipe->constants); i++) { if (softpipe->constants[i].buffer) { - winsys_buffer_reference(ws, &softpipe->constants[i].buffer, NULL); + pipe_buffer_reference(screen, &softpipe->constants[i].buffer, NULL); } } @@ -221,6 +222,24 @@ softpipe_create( struct pipe_screen *screen, softpipe->quad[i].output = sp_quad_output_stage(softpipe); } + /* vertex shader samplers */ + for (i = 0; i < PIPE_MAX_SAMPLERS; i++) { + softpipe->tgsi.vert_samplers[i].base.get_samples = sp_get_samples_vertex; + softpipe->tgsi.vert_samplers[i].unit = i; + softpipe->tgsi.vert_samplers[i].sp = softpipe; + softpipe->tgsi.vert_samplers[i].cache = softpipe->tex_cache[i]; + softpipe->tgsi.vert_samplers_list[i] = &softpipe->tgsi.vert_samplers[i]; + } + + /* fragment shader samplers */ + for (i = 0; i < PIPE_MAX_SAMPLERS; i++) { + softpipe->tgsi.frag_samplers[i].base.get_samples = sp_get_samples_fragment; + softpipe->tgsi.frag_samplers[i].unit = i; + softpipe->tgsi.frag_samplers[i].sp = softpipe; + softpipe->tgsi.frag_samplers[i].cache = softpipe->tex_cache[i]; + softpipe->tgsi.frag_samplers_list[i] = &softpipe->tgsi.frag_samplers[i]; + } + /* * Create drawing context and plug our rendering stage into it. 
*/ @@ -228,6 +247,11 @@ softpipe_create( struct pipe_screen *screen, if (!softpipe->draw) goto fail; + draw_texture_samplers(softpipe->draw, + PIPE_MAX_SAMPLERS, + (struct tgsi_sampler **) + softpipe->tgsi.vert_samplers_list); + softpipe->setup = sp_draw_render_stage(softpipe); if (!softpipe->setup) goto fail; diff --git a/src/gallium/drivers/softpipe/sp_context.h b/src/gallium/drivers/softpipe/sp_context.h index 2b9a2a8ee5..59d6df8f2d 100644 --- a/src/gallium/drivers/softpipe/sp_context.h +++ b/src/gallium/drivers/softpipe/sp_context.h @@ -32,11 +32,11 @@ #define SP_CONTEXT_H #include "pipe/p_context.h" -#include "pipe/p_defines.h" #include "draw/draw_vertex.h" -#include "sp_quad.h" +#include "sp_quad_pipe.h" +#include "sp_tex_sample.h" /** @@ -50,7 +50,6 @@ */ #define SP_NUM_QUAD_THREADS 1 -struct softpipe_winsys; struct softpipe_vbuf_render; struct draw_context; struct draw_stage; @@ -62,15 +61,15 @@ struct sp_vertex_shader; struct softpipe_context { struct pipe_context pipe; /**< base class */ - /* The most recent drawing state as set by the driver: - */ - const struct pipe_blend_state *blend; + /** Constant state objects */ + const struct pipe_blend_state *blend; const struct pipe_sampler_state *sampler[PIPE_MAX_SAMPLERS]; - const struct pipe_depth_stencil_alpha_state *depth_stencil; + const struct pipe_depth_stencil_alpha_state *depth_stencil; const struct pipe_rasterizer_state *rasterizer; const struct sp_fragment_shader *fs; const struct sp_vertex_shader *vs; + /** Other rendering state */ struct pipe_blend_color blend_color; struct pipe_clip_state clip; struct pipe_constant_buffer constants[PIPE_SHADER_TYPES]; @@ -81,23 +80,20 @@ struct softpipe_context { struct pipe_viewport_state viewport; struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS]; struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS]; - unsigned dirty; unsigned num_samplers; unsigned num_textures; unsigned num_vertex_elements; unsigned num_vertex_buffers; - boolean no_rast; + unsigned dirty; /**< Mask of SP_NEW_x flags */ /* Counter for occlusion queries. Note this supports overlapping * queries. 
*/ - uint64 occlusion_count; + uint64_t occlusion_count; - /* - * Mapped vertex buffers - */ + /** Mapped vertex buffers */ ubyte *mapped_vbuffer[PIPE_MAX_ATTRIBS]; /** Mapped constant buffers */ @@ -107,16 +103,11 @@ struct softpipe_context { struct vertex_info vertex_info; struct vertex_info vertex_info_vbuf; + /** Which vertex shader output slot contains point size */ int psize_slot; unsigned reduced_api_prim; /**< PIPE_PRIM_POINTS, _LINES or _TRIANGLES */ -#if 0 - /* Stipple derived state: - */ - ubyte stipple_masks[16][16]; -#endif - /** Derived from scissor and surface bounds: */ struct pipe_scissor_state cliprect; @@ -139,6 +130,14 @@ struct softpipe_context { struct quad_stage *first; /**< points to one of the above stages */ } quad[SP_NUM_QUAD_THREADS]; + /** TGSI exec things */ + struct { + struct sp_shader_sampler vert_samplers[PIPE_MAX_SAMPLERS]; + struct sp_shader_sampler *vert_samplers_list[PIPE_MAX_SAMPLERS]; + struct sp_shader_sampler frag_samplers[PIPE_MAX_SAMPLERS]; + struct sp_shader_sampler *frag_samplers_list[PIPE_MAX_SAMPLERS]; + } tgsi; + /** The primitive drawing context */ struct draw_context *draw; struct draw_stage *setup; @@ -150,8 +149,9 @@ struct softpipe_context { struct softpipe_tile_cache *tex_cache[PIPE_MAX_SAMPLERS]; - int use_sse : 1; - int dump_fs : 1; + unsigned use_sse : 1; + unsigned dump_fs : 1; + unsigned no_rast : 1; }; diff --git a/src/gallium/drivers/softpipe/sp_draw_arrays.c b/src/gallium/drivers/softpipe/sp_draw_arrays.c index 424bd56846..f117096bf7 100644 --- a/src/gallium/drivers/softpipe/sp_draw_arrays.c +++ b/src/gallium/drivers/softpipe/sp_draw_arrays.c @@ -33,7 +33,7 @@ #include "pipe/p_defines.h" #include "pipe/p_context.h" -#include "pipe/p_winsys.h" +#include "pipe/internal/p_winsys_screen.h" #include "pipe/p_inlines.h" #include "sp_context.h" @@ -47,16 +47,22 @@ static void softpipe_map_constant_buffers(struct softpipe_context *sp) { struct pipe_winsys *ws = sp->pipe.winsys; - uint i; + uint i, size; + for (i = 0; i < PIPE_SHADER_TYPES; i++) { - if (sp->constants[i].size) + if (sp->constants[i].buffer && sp->constants[i].buffer->size) sp->mapped_constants[i] = ws->buffer_map(ws, sp->constants[i].buffer, PIPE_BUFFER_USAGE_CPU_READ); } + if (sp->constants[PIPE_SHADER_VERTEX].buffer) + size = sp->constants[PIPE_SHADER_VERTEX].buffer->size; + else + size = 0; + draw_set_mapped_constant_buffer(sp->draw, sp->mapped_constants[PIPE_SHADER_VERTEX], - sp->constants[PIPE_SHADER_VERTEX].size); + size); } static void @@ -73,7 +79,7 @@ softpipe_unmap_constant_buffers(struct softpipe_context *sp) draw_set_mapped_constant_buffer(sp->draw, NULL, 0); for (i = 0; i < 2; i++) { - if (sp->constants[i].size) + if (sp->constants[i].buffer && sp->constants[i].buffer->size) ws->buffer_unmap(ws, sp->constants[i].buffer); sp->mapped_constants[i] = NULL; } @@ -128,7 +134,7 @@ softpipe_draw_range_elements(struct pipe_context *pipe, if (sp->dirty) softpipe_update_derived( sp ); - softpipe_map_surfaces(sp); + softpipe_map_transfers(sp); softpipe_map_constant_buffers(sp); /* diff --git a/src/gallium/drivers/softpipe/sp_flush.c b/src/gallium/drivers/softpipe/sp_flush.c index 401764bb43..035f4b963e 100644 --- a/src/gallium/drivers/softpipe/sp_flush.c +++ b/src/gallium/drivers/softpipe/sp_flush.c @@ -57,7 +57,7 @@ softpipe_flush( struct pipe_context *pipe, } if (flags & PIPE_FLUSH_RENDER_CACHE) { - for (i = 0; i < softpipe->framebuffer.num_cbufs; i++) + for (i = 0; i < softpipe->framebuffer.nr_cbufs; i++) if (softpipe->cbuf_cache[i]) sp_flush_tile_cache(softpipe, 
softpipe->cbuf_cache[i]); @@ -70,7 +70,7 @@ softpipe_flush( struct pipe_context *pipe, * that's called before swapbuffers because we don't always want * to unmap surfaces when flushing. */ - softpipe_unmap_surfaces(softpipe); + softpipe_unmap_transfers(softpipe); } /* Enable to dump BMPs of the color/depth buffers each frame */ diff --git a/src/gallium/drivers/softpipe/sp_fs_exec.c b/src/gallium/drivers/softpipe/sp_fs_exec.c index 701ee4c72f..3c7ba565d6 100644 --- a/src/gallium/drivers/softpipe/sp_fs_exec.c +++ b/src/gallium/drivers/softpipe/sp_fs_exec.c @@ -29,7 +29,7 @@ #include "sp_context.h" #include "sp_state.h" #include "sp_fs.h" -#include "sp_headers.h" +#include "sp_quad.h" #include "pipe/p_state.h" @@ -39,11 +39,19 @@ #include "tgsi/tgsi_exec.h" #include "tgsi/tgsi_parse.h" -struct sp_exec_fragment_shader { +struct sp_exec_fragment_shader +{ struct sp_fragment_shader base; }; +/** cast wrapper */ +static INLINE struct sp_exec_fragment_shader * +sp_exec_fragment_shader(const struct sp_fragment_shader *base) +{ + return (struct sp_exec_fragment_shader *) base; +} + /** * Compute quad X,Y,Z,W for the four fragments in a quad. @@ -84,12 +92,18 @@ sp_setup_pos_vector(const struct tgsi_interp_coef *coef, static void exec_prepare( const struct sp_fragment_shader *base, struct tgsi_exec_machine *machine, - struct tgsi_sampler *samplers ) + struct tgsi_sampler **samplers ) { - tgsi_exec_machine_bind_shader( machine, - base->shader.tokens, - PIPE_MAX_SAMPLERS, - samplers ); + /* + * Bind tokens/shader to the interpreter's machine state. + * Avoid redundant binding. + */ + if (machine->Tokens != base->shader.tokens) { + tgsi_exec_machine_bind_shader( machine, + base->shader.tokens, + PIPE_MAX_SAMPLERS, + samplers ); + } } diff --git a/src/gallium/drivers/softpipe/sp_fs_sse.c b/src/gallium/drivers/softpipe/sp_fs_sse.c index 50eb2c07bc..7e22081132 100644 --- a/src/gallium/drivers/softpipe/sp_fs_sse.c +++ b/src/gallium/drivers/softpipe/sp_fs_sse.c @@ -29,7 +29,7 @@ #include "sp_context.h" #include "sp_state.h" #include "sp_fs.h" -#include "sp_headers.h" +#include "sp_quad.h" #include "pipe/p_state.h" @@ -40,7 +40,7 @@ #include "tgsi/tgsi_sse2.h" -#ifdef PIPE_ARCH_X86 +#if defined(PIPE_ARCH_X86) #include "rtasm/rtasm_x86sse.h" @@ -69,7 +69,7 @@ struct sp_sse_fragment_shader { static void fs_sse_prepare( const struct sp_fragment_shader *base, struct tgsi_exec_machine *machine, - struct tgsi_sampler *samplers ) + struct tgsi_sampler **samplers ) { } diff --git a/src/gallium/drivers/softpipe/sp_headers.h b/src/gallium/drivers/softpipe/sp_headers.h deleted file mode 100644 index 4a42cb3c19..0000000000 --- a/src/gallium/drivers/softpipe/sp_headers.h +++ /dev/null @@ -1,95 +0,0 @@ -/************************************************************************** - * - * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - -/* Authors: Keith Whitwell <keith@tungstengraphics.com> - */ - -#ifndef SP_HEADERS_H -#define SP_HEADERS_H - -#include "pipe/p_state.h" -#include "tgsi/tgsi_exec.h" - -#define PRIM_POINT 1 -#define PRIM_LINE 2 -#define PRIM_TRI 3 - - -/* The rasterizer generates 2x2 quads of fragment and feeds them to - * the current fp_machine (see below). - * Remember that Y=0=top with Y increasing down the window. - */ -#define QUAD_TOP_LEFT 0 -#define QUAD_TOP_RIGHT 1 -#define QUAD_BOTTOM_LEFT 2 -#define QUAD_BOTTOM_RIGHT 3 - -#define MASK_TOP_LEFT (1 << QUAD_TOP_LEFT) -#define MASK_TOP_RIGHT (1 << QUAD_TOP_RIGHT) -#define MASK_BOTTOM_LEFT (1 << QUAD_BOTTOM_LEFT) -#define MASK_BOTTOM_RIGHT (1 << QUAD_BOTTOM_RIGHT) -#define MASK_ALL 0xf - - -/** - * Encodes everything we need to know about a 2x2 pixel block. Uses - * "Channel-Serial" or "SoA" layout. - */ -struct quad_header_input -{ - int x0; - int y0; - float coverage[QUAD_SIZE]; /** fragment coverage for antialiasing */ - unsigned facing:1; /**< Front (0) or back (1) facing? */ - unsigned prim:2; /**< PRIM_POINT, LINE, TRI */ -}; - -struct quad_header_inout -{ - unsigned mask:4; -}; - -struct quad_header_output -{ - /** colors in SOA format (rrrr, gggg, bbbb, aaaa) */ - float color[PIPE_MAX_COLOR_BUFS][NUM_CHANNELS][QUAD_SIZE]; - float depth[QUAD_SIZE]; -}; - -struct quad_header { - struct quad_header_input input; - struct quad_header_inout inout; - struct quad_header_output output; - - const struct tgsi_interp_coef *coef; - const struct tgsi_interp_coef *posCoef; - - unsigned nr_attrs; -}; - -#endif /* SP_HEADERS_H */ - diff --git a/src/gallium/drivers/softpipe/sp_prim_vbuf.c b/src/gallium/drivers/softpipe/sp_prim_vbuf.c index 425e13cd28..d56eed80a4 100644 --- a/src/gallium/drivers/softpipe/sp_prim_vbuf.c +++ b/src/gallium/drivers/softpipe/sp_prim_vbuf.c @@ -26,10 +26,10 @@ **************************************************************************/ /** - * Post-transform vertex buffering. This is an optional part of the - * softpipe rendering pipeline. - * Probably not desired in general, but useful for testing/debuggin. - * Enabled/Disabled with SP_VBUF env var. + * Interface between 'draw' module's output and the softpipe rasterizer/setup + * code. When the 'draw' module has finished filling a vertex buffer, the + * draw_arrays() functions below will be called. Loop over the vertices and + * call the point/line/tri setup functions. 
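The new comment above describes the hand-off from the 'draw' module to softpipe's setup code. As an illustrative reduction of sp_vbuf_draw() (the exact parameter list of setup_tri() is assumed from its use here; error handling and the point/line cases are omitted), the indexed triangle path boils down to:

static void
example_draw_indexed_tris(struct setup_context *setup_ctx,
                          const void *vertex_buffer,
                          unsigned stride,
                          const ushort *indices,
                          uint nr)
{
   uint i;
   /* every three indices name one triangle in the post-transform buffer */
   for (i = 0; i + 2 < nr; i += 3) {
      setup_tri(setup_ctx,
                get_vert(vertex_buffer, indices[i + 0], stride),
                get_vert(vertex_buffer, indices[i + 1], stride),
                get_vert(vertex_buffer, indices[i + 2], stride));
   }
}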
* * Authors * Brian Paul @@ -60,6 +60,7 @@ struct softpipe_vbuf_render struct softpipe_context *softpipe; uint prim; uint vertex_size; + uint vertex_buffer_size; void *vertex_buffer; }; @@ -80,26 +81,44 @@ sp_vbuf_get_vertex_info(struct vbuf_render *vbr) } -static void * +static boolean sp_vbuf_allocate_vertices(struct vbuf_render *vbr, - ushort vertex_size, ushort nr_vertices) + ushort vertex_size, ushort nr_vertices) { struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr); - assert(!cvbr->vertex_buffer); - cvbr->vertex_buffer = align_malloc(vertex_size * nr_vertices, 16); + unsigned size = vertex_size * nr_vertices; + + if (cvbr->vertex_buffer_size < size) { + align_free(cvbr->vertex_buffer); + cvbr->vertex_buffer = align_malloc(size, 16); + cvbr->vertex_buffer_size = size; + } + cvbr->vertex_size = vertex_size; - return cvbr->vertex_buffer; + return cvbr->vertex_buffer != NULL; } - static void -sp_vbuf_release_vertices(struct vbuf_render *vbr, void *vertices, - unsigned vertex_size, unsigned vertices_used) +sp_vbuf_release_vertices(struct vbuf_render *vbr) +{ + /* keep the old allocation for next time */ +} + +static void * +sp_vbuf_map_vertices(struct vbuf_render *vbr) +{ + struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr); + return cvbr->vertex_buffer; +} + +static void +sp_vbuf_unmap_vertices(struct vbuf_render *vbr, + ushort min_index, + ushort max_index ) { struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr); - align_free(vertices); - assert(vertices == cvbr->vertex_buffer); - cvbr->vertex_buffer = NULL; + assert( cvbr->vertex_buffer_size >= (max_index+1) * cvbr->vertex_size ); + /* do nothing */ } @@ -115,8 +134,6 @@ sp_vbuf_set_primitive(struct vbuf_render *vbr, unsigned prim) setup_prepare( setup_ctx ); - - cvbr->prim = prim; return TRUE; @@ -131,21 +148,23 @@ static INLINE cptrf4 get_vert( const void *vertex_buffer, } +/** + * draw elements / indexed primitives + */ static void sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr) { struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr); struct softpipe_context *softpipe = cvbr->softpipe; - unsigned stride = softpipe->vertex_info_vbuf.size * sizeof(float); - unsigned i; + const unsigned stride = softpipe->vertex_info_vbuf.size * sizeof(float); const void *vertex_buffer = cvbr->vertex_buffer; + unsigned i; /* XXX: break this dependency - make setup_context live under * softpipe, rename the old "setup" draw stage to something else. */ struct draw_stage *setup = softpipe->setup; - struct setup_context *setup_ctx = sp_draw_setup_context(softpipe->setup); - + struct setup_context *setup_ctx = sp_draw_setup_context(setup); switch (cvbr->prim) { case PIPE_PRIM_POINTS: @@ -258,13 +277,16 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr) { struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr); struct softpipe_context *softpipe = cvbr->softpipe; - struct draw_stage *setup = softpipe->setup; - const void *vertex_buffer = NULL; const unsigned stride = softpipe->vertex_info_vbuf.size * sizeof(float); + const void *vertex_buffer = + (void *) get_vert(cvbr->vertex_buffer, start, stride); unsigned i; - struct setup_context *setup_ctx = sp_draw_setup_context(setup); - vertex_buffer = (void *)get_vert(cvbr->vertex_buffer, start, stride); + /* XXX: break this dependency - make setup_context live under + * softpipe, rename the old "setup" draw stage to something else. 
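The allocate/release changes above stop freeing the post-transform vertex buffer after every draw and instead keep it until a larger one is needed. The same grow-only pattern in isolation (plain malloc/free here for brevity; the driver itself uses 16-byte-aligned allocations):

#include <stdlib.h>

struct grow_only_buffer {
   void *data;
   size_t size;
};

/* Reallocate only when the current buffer is too small, as
 * sp_vbuf_allocate_vertices() now does; otherwise keep it for reuse. */
static int
grow_only_ensure(struct grow_only_buffer *b, size_t needed)
{
   if (b->size < needed) {
      free(b->data);
      b->data = malloc(needed);
      b->size = b->data ? needed : 0;
   }
   return b->data != NULL;
}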
+ */ + struct draw_stage *setup = softpipe->setup; + struct setup_context *setup_ctx = sp_draw_setup_context(setup); switch (cvbr->prim) { case PIPE_PRIM_POINTS: @@ -389,6 +411,8 @@ sp_init_vbuf(struct softpipe_context *sp) sp->vbuf_render->base.get_vertex_info = sp_vbuf_get_vertex_info; sp->vbuf_render->base.allocate_vertices = sp_vbuf_allocate_vertices; + sp->vbuf_render->base.map_vertices = sp_vbuf_map_vertices; + sp->vbuf_render->base.unmap_vertices = sp_vbuf_unmap_vertices; sp->vbuf_render->base.set_primitive = sp_vbuf_set_primitive; sp->vbuf_render->base.draw = sp_vbuf_draw; sp->vbuf_render->base.draw_arrays = sp_vbuf_draw_arrays; diff --git a/src/gallium/drivers/softpipe/sp_quad.h b/src/gallium/drivers/softpipe/sp_quad.h index 08513cb95f..bd6c6cb912 100644 --- a/src/gallium/drivers/softpipe/sp_quad.h +++ b/src/gallium/drivers/softpipe/sp_quad.h @@ -31,39 +31,76 @@ #ifndef SP_QUAD_H #define SP_QUAD_H +#include "pipe/p_state.h" +#include "tgsi/tgsi_exec.h" -struct softpipe_context; -struct quad_header; +#define QUAD_PRIM_POINT 1 +#define QUAD_PRIM_LINE 2 +#define QUAD_PRIM_TRI 3 -struct quad_stage { - struct softpipe_context *softpipe; - struct quad_stage *next; +/* The rasterizer generates 2x2 quads of fragment and feeds them to + * the current fp_machine (see below). + * Remember that Y=0=top with Y increasing down the window. + */ +#define QUAD_TOP_LEFT 0 +#define QUAD_TOP_RIGHT 1 +#define QUAD_BOTTOM_LEFT 2 +#define QUAD_BOTTOM_RIGHT 3 + +#define MASK_TOP_LEFT (1 << QUAD_TOP_LEFT) +#define MASK_TOP_RIGHT (1 << QUAD_TOP_RIGHT) +#define MASK_BOTTOM_LEFT (1 << QUAD_BOTTOM_LEFT) +#define MASK_BOTTOM_RIGHT (1 << QUAD_BOTTOM_RIGHT) +#define MASK_ALL 0xf + + +/** + * Quad stage inputs (pos, coverage, front/back face, etc) + */ +struct quad_header_input +{ + int x0, y0; /**< quad window pos, always even */ + float coverage[QUAD_SIZE]; /**< fragment coverage for antialiasing */ + unsigned facing:1; /**< Front (0) or back (1) facing? */ + unsigned prim:2; /**< QUAD_PRIM_POINT, LINE, TRI */ +}; - void (*begin)(struct quad_stage *qs); - /** the stage action */ - void (*run)(struct quad_stage *qs, struct quad_header *quad); +/** + * Quad stage inputs/outputs. + */ +struct quad_header_inout +{ + unsigned mask:4; +}; + - void (*destroy)(struct quad_stage *qs); +/** + * Quad stage outputs (color & depth). + */ +struct quad_header_output +{ + /** colors in SOA format (rrrr, gggg, bbbb, aaaa) */ + float color[PIPE_MAX_COLOR_BUFS][NUM_CHANNELS][QUAD_SIZE]; + float depth[QUAD_SIZE]; }; -struct quad_stage *sp_quad_polygon_stipple_stage( struct softpipe_context *softpipe ); -struct quad_stage *sp_quad_earlyz_stage( struct softpipe_context *softpipe ); -struct quad_stage *sp_quad_shade_stage( struct softpipe_context *softpipe ); -struct quad_stage *sp_quad_alpha_test_stage( struct softpipe_context *softpipe ); -struct quad_stage *sp_quad_stencil_test_stage( struct softpipe_context *softpipe ); -struct quad_stage *sp_quad_depth_test_stage( struct softpipe_context *softpipe ); -struct quad_stage *sp_quad_occlusion_stage( struct softpipe_context *softpipe ); -struct quad_stage *sp_quad_coverage_stage( struct softpipe_context *softpipe ); -struct quad_stage *sp_quad_blend_stage( struct softpipe_context *softpipe ); -struct quad_stage *sp_quad_colormask_stage( struct softpipe_context *softpipe ); -struct quad_stage *sp_quad_output_stage( struct softpipe_context *softpipe ); +/** + * Encodes everything we need to know about a 2x2 pixel block. Uses + * "Channel-Serial" or "SoA" layout. 
+ */ +struct quad_header { + struct quad_header_input input; + struct quad_header_inout inout; + struct quad_header_output output; -void sp_build_quad_pipeline(struct softpipe_context *sp); + const struct tgsi_interp_coef *coef; + const struct tgsi_interp_coef *posCoef; -void sp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad); + unsigned nr_attrs; +}; #endif /* SP_QUAD_H */ diff --git a/src/gallium/drivers/softpipe/sp_quad_alpha_test.c b/src/gallium/drivers/softpipe/sp_quad_alpha_test.c index 5bebd141e9..0845bae0e6 100644 --- a/src/gallium/drivers/softpipe/sp_quad_alpha_test.c +++ b/src/gallium/drivers/softpipe/sp_quad_alpha_test.c @@ -4,8 +4,8 @@ */ #include "sp_context.h" -#include "sp_headers.h" #include "sp_quad.h" +#include "sp_quad_pipe.h" #include "pipe/p_defines.h" #include "util/u_memory.h" @@ -14,7 +14,7 @@ static void alpha_test_quad(struct quad_stage *qs, struct quad_header *quad) { struct softpipe_context *softpipe = qs->softpipe; - const float ref = softpipe->depth_stencil->alpha.ref; + const float ref = softpipe->depth_stencil->alpha.ref_value; unsigned passMask = 0x0, j; const uint cbuf = 0; /* only output[0].alpha is tested */ const float *aaaa = quad->output.color[cbuf][3]; diff --git a/src/gallium/drivers/softpipe/sp_quad_blend.c b/src/gallium/drivers/softpipe/sp_quad_blend.c index 6f64c6e584..e134e44337 100644 --- a/src/gallium/drivers/softpipe/sp_quad_blend.c +++ b/src/gallium/drivers/softpipe/sp_quad_blend.c @@ -34,10 +34,10 @@ #include "util/u_math.h" #include "util/u_memory.h" #include "sp_context.h" -#include "sp_headers.h" +#include "sp_quad.h" #include "sp_surface.h" #include "sp_tile_cache.h" -#include "sp_quad.h" +#include "sp_quad_pipe.h" #define VEC4_COPY(DST, SRC) \ @@ -105,7 +105,7 @@ logicop_quad(struct quad_stage *qs, struct quad_header *quad) uint cbuf; /* loop over colorbuffer outputs */ - for (cbuf = 0; cbuf < softpipe->framebuffer.num_cbufs; cbuf++) { + for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) { float dest[4][QUAD_SIZE]; ubyte src[4][4], dst[4][4], res[4][4]; uint *src4 = (uint *) src; @@ -239,7 +239,7 @@ blend_quad(struct quad_stage *qs, struct quad_header *quad) } /* loop over colorbuffer outputs */ - for (cbuf = 0; cbuf < softpipe->framebuffer.num_cbufs; cbuf++) { + for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) { float source[4][QUAD_SIZE], dest[4][QUAD_SIZE]; struct softpipe_cached_tile *tile = sp_get_cached_tile(softpipe, diff --git a/src/gallium/drivers/softpipe/sp_quad_bufloop.c b/src/gallium/drivers/softpipe/sp_quad_bufloop.c index 92e9af09c1..953d8516b9 100644 --- a/src/gallium/drivers/softpipe/sp_quad_bufloop.c +++ b/src/gallium/drivers/softpipe/sp_quad_bufloop.c @@ -1,9 +1,9 @@ #include "util/u_memory.h" #include "sp_context.h" -#include "sp_headers.h" -#include "sp_surface.h" #include "sp_quad.h" +#include "sp_surface.h" +#include "sp_quad_pipe.h" /** @@ -17,7 +17,7 @@ cbuf_loop_quad(struct quad_stage *qs, struct quad_header *quad) unsigned i; assert(sizeof(quad->outputs.color) == sizeof(tmp)); - assert(softpipe->framebuffer.num_cbufs <= PIPE_MAX_COLOR_BUFS); + assert(softpipe->framebuffer.nr_cbufs <= PIPE_MAX_COLOR_BUFS); /* make copy of original colors since they can get modified * by blending and masking. 
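For reference, the MASK_* bits defined in sp_quad.h above map onto window pixels as follows (remember Y=0 is the top of the window): a quad covers the 2x2 block at its even (x0, y0), and fragment (x0+dx, y0+dy) corresponds to bit 2*dy+dx. A small illustrative helper, not part of the patch:

#include <assert.h>

static unsigned
quad_mask_bit_for_pixel(int x0, int y0, int x, int y)
{
   const int dx = x - x0;   /* 0 = left column, 1 = right column */
   const int dy = y - y0;   /* 0 = top row,     1 = bottom row   */
   assert(dx == 0 || dx == 1);
   assert(dy == 0 || dy == 1);
   return 1u << (2 * dy + dx);   /* MASK_TOP_LEFT .. MASK_BOTTOM_RIGHT */
}

Stages such as the stipple and depth tests kill individual fragments by clearing these bits in quad->inout.mask; once the mask reaches zero the quad is dropped from the pipeline.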
@@ -28,7 +28,7 @@ cbuf_loop_quad(struct quad_stage *qs, struct quad_header *quad) */ memcpy(tmp, quad->outputs.color, sizeof(tmp)); - for (i = 0; i < softpipe->framebuffer.num_cbufs; i++) { + for (i = 0; i < softpipe->framebuffer.nr_cbufs; i++) { /* set current cbuffer */ #if 0 /* obsolete & going away */ softpipe->current_cbuf = i; diff --git a/src/gallium/drivers/softpipe/sp_quad_colormask.c b/src/gallium/drivers/softpipe/sp_quad_colormask.c index f32bdfab78..dc90e5d5e9 100644 --- a/src/gallium/drivers/softpipe/sp_quad_colormask.c +++ b/src/gallium/drivers/softpipe/sp_quad_colormask.c @@ -34,9 +34,9 @@ #include "util/u_math.h" #include "util/u_memory.h" #include "sp_context.h" -#include "sp_headers.h" -#include "sp_surface.h" #include "sp_quad.h" +#include "sp_surface.h" +#include "sp_quad_pipe.h" #include "sp_tile_cache.h" @@ -51,7 +51,7 @@ colormask_quad(struct quad_stage *qs, struct quad_header *quad) uint cbuf; /* loop over colorbuffer outputs */ - for (cbuf = 0; cbuf < softpipe->framebuffer.num_cbufs; cbuf++) { + for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) { float dest[4][QUAD_SIZE]; struct softpipe_cached_tile *tile = sp_get_cached_tile(softpipe, diff --git a/src/gallium/drivers/softpipe/sp_quad_coverage.c b/src/gallium/drivers/softpipe/sp_quad_coverage.c index ee29aa7dfe..4aeee85870 100644 --- a/src/gallium/drivers/softpipe/sp_quad_coverage.c +++ b/src/gallium/drivers/softpipe/sp_quad_coverage.c @@ -35,8 +35,8 @@ #include "pipe/p_defines.h" #include "util/u_memory.h" #include "sp_context.h" -#include "sp_headers.h" #include "sp_quad.h" +#include "sp_quad_pipe.h" /** @@ -46,14 +46,15 @@ static void coverage_quad(struct quad_stage *qs, struct quad_header *quad) { struct softpipe_context *softpipe = qs->softpipe; + const uint prim = quad->input.prim; - if ((softpipe->rasterizer->poly_smooth && quad->input.prim == PRIM_TRI) || - (softpipe->rasterizer->line_smooth && quad->input.prim == PRIM_LINE) || - (softpipe->rasterizer->point_smooth && quad->input.prim == PRIM_POINT)) { + if ((softpipe->rasterizer->poly_smooth && prim == QUAD_PRIM_TRI) || + (softpipe->rasterizer->line_smooth && prim == QUAD_PRIM_LINE) || + (softpipe->rasterizer->point_smooth && prim == QUAD_PRIM_POINT)) { uint cbuf; /* loop over colorbuffer outputs */ - for (cbuf = 0; cbuf < softpipe->framebuffer.num_cbufs; cbuf++) { + for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) { float (*quadColor)[4] = quad->output.color[cbuf]; unsigned j; for (j = 0; j < QUAD_SIZE; j++) { diff --git a/src/gallium/drivers/softpipe/sp_quad_depth_test.c b/src/gallium/drivers/softpipe/sp_quad_depth_test.c index 523bd3e080..d463930bae 100644 --- a/src/gallium/drivers/softpipe/sp_quad_depth_test.c +++ b/src/gallium/drivers/softpipe/sp_quad_depth_test.c @@ -32,9 +32,9 @@ #include "pipe/p_defines.h" #include "util/u_memory.h" #include "sp_context.h" -#include "sp_headers.h" -#include "sp_surface.h" #include "sp_quad.h" +#include "sp_surface.h" +#include "sp_quad_pipe.h" #include "sp_tile_cache.h" diff --git a/src/gallium/drivers/softpipe/sp_quad_earlyz.c b/src/gallium/drivers/softpipe/sp_quad_earlyz.c index 6e2dde304e..496fd39ed1 100644 --- a/src/gallium/drivers/softpipe/sp_quad_earlyz.c +++ b/src/gallium/drivers/softpipe/sp_quad_earlyz.c @@ -31,8 +31,8 @@ #include "pipe/p_defines.h" #include "util/u_memory.h" -#include "sp_headers.h" #include "sp_quad.h" +#include "sp_quad_pipe.h" /** diff --git a/src/gallium/drivers/softpipe/sp_quad_fs.c b/src/gallium/drivers/softpipe/sp_quad_fs.c index 1f0cb3e035..adca5df73d 
100644 --- a/src/gallium/drivers/softpipe/sp_quad_fs.c +++ b/src/gallium/drivers/softpipe/sp_quad_fs.c @@ -2,6 +2,7 @@ * * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. * All Rights Reserved. + * Copyright 2008 VMware, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the @@ -42,16 +43,15 @@ #include "sp_context.h" #include "sp_state.h" -#include "sp_headers.h" #include "sp_quad.h" +#include "sp_quad_pipe.h" #include "sp_texture.h" #include "sp_tex_sample.h" struct quad_shade_stage { - struct quad_stage stage; - struct tgsi_sampler samplers[PIPE_MAX_SAMPLERS]; + struct quad_stage stage; /**< base class */ struct tgsi_exec_machine machine; struct tgsi_exec_vector *inputs, *outputs; }; @@ -147,18 +147,11 @@ static void shade_begin(struct quad_stage *qs) { struct quad_shade_stage *qss = quad_shade_stage(qs); struct softpipe_context *softpipe = qs->softpipe; - unsigned i; - unsigned num = MAX2(softpipe->num_textures, softpipe->num_samplers); - - /* set TGSI sampler state that varies */ - for (i = 0; i < num; i++) { - qss->samplers[i].state = softpipe->sampler[i]; - qss->samplers[i].texture = softpipe->texture[i]; - } softpipe->fs->prepare( softpipe->fs, &qss->machine, - qss->samplers ); + (struct tgsi_sampler **) + softpipe->tgsi.frag_samplers_list ); qs->next->begin(qs->next); } @@ -178,7 +171,6 @@ static void shade_destroy(struct quad_stage *qs) struct quad_stage *sp_quad_shade_stage( struct softpipe_context *softpipe ) { struct quad_shade_stage *qss = CALLOC_STRUCT(quad_shade_stage); - uint i; /* allocate storage for program inputs/outputs, aligned to 16 bytes */ qss->inputs = MALLOC(PIPE_MAX_ATTRIBS * sizeof(*qss->inputs) + 16); @@ -191,14 +183,6 @@ struct quad_stage *sp_quad_shade_stage( struct softpipe_context *softpipe ) qss->stage.run = shade_quad; qss->stage.destroy = shade_destroy; - /* set TGSI sampler state that's constant */ - for (i = 0; i < PIPE_MAX_SAMPLERS; i++) { - assert(softpipe->tex_cache[i]); - qss->samplers[i].get_samples = sp_get_samples; - qss->samplers[i].pipe = &softpipe->pipe; - qss->samplers[i].cache = softpipe->tex_cache[i]; - } - tgsi_exec_machine_init( &qss->machine ); return &qss->stage; diff --git a/src/gallium/drivers/softpipe/sp_quad_occlusion.c b/src/gallium/drivers/softpipe/sp_quad_occlusion.c index 169bd82876..dfa7ff3b1d 100644 --- a/src/gallium/drivers/softpipe/sp_quad_occlusion.c +++ b/src/gallium/drivers/softpipe/sp_quad_occlusion.c @@ -35,9 +35,9 @@ #include "pipe/p_defines.h" #include "util/u_memory.h" #include "sp_context.h" -#include "sp_headers.h" -#include "sp_surface.h" #include "sp_quad.h" +#include "sp_surface.h" +#include "sp_quad_pipe.h" static unsigned count_bits( unsigned val ) { diff --git a/src/gallium/drivers/softpipe/sp_quad_output.c b/src/gallium/drivers/softpipe/sp_quad_output.c index b7aac7f84a..92d5f9f3c1 100644 --- a/src/gallium/drivers/softpipe/sp_quad_output.c +++ b/src/gallium/drivers/softpipe/sp_quad_output.c @@ -27,9 +27,9 @@ #include "util/u_memory.h" #include "sp_context.h" -#include "sp_headers.h" -#include "sp_surface.h" #include "sp_quad.h" +#include "sp_surface.h" +#include "sp_quad_pipe.h" #include "sp_tile_cache.h" @@ -48,7 +48,7 @@ output_quad(struct quad_stage *qs, struct quad_header *quad) uint cbuf; /* loop over colorbuffer outputs */ - for (cbuf = 0; cbuf < softpipe->framebuffer.num_cbufs; cbuf++) { + for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) { struct 
softpipe_cached_tile *tile = sp_get_cached_tile(softpipe, softpipe->cbuf_cache[cbuf], diff --git a/src/gallium/drivers/softpipe/sp_quad.c b/src/gallium/drivers/softpipe/sp_quad_pipe.c index 892ef87ee9..892ef87ee9 100644 --- a/src/gallium/drivers/softpipe/sp_quad.c +++ b/src/gallium/drivers/softpipe/sp_quad_pipe.c diff --git a/src/gallium/drivers/softpipe/sp_quad_pipe.h b/src/gallium/drivers/softpipe/sp_quad_pipe.h new file mode 100644 index 0000000000..0e40586ffc --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_quad_pipe.h @@ -0,0 +1,74 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* Authors: Keith Whitwell <keith@tungstengraphics.com> + */ + +#ifndef SP_QUAD_PIPE_H +#define SP_QUAD_PIPE_H + + +struct softpipe_context; +struct quad_header; + + +/** + * Fragment processing is performed on 2x2 blocks of pixels called "quads". + * Quad processing is performed with a pipeline of stages represented by + * this type. 
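A minimal sketch of a stage obeying the quad_stage interface above: it does nothing of its own and simply forwards each quad, which is the skeleton the real stages (stipple, shade, alpha test, blend, ...) flesh out. CALLOC_STRUCT and FREE are the usual util/u_memory.h helpers; the stage itself is illustrative only, not part of the patch.

static void
passthrough_begin(struct quad_stage *qs)
{
   qs->next->begin(qs->next);
}

static void
passthrough_run(struct quad_stage *qs, struct quad_header *quad)
{
   /* a real stage would test or modify quad->inout.mask / quad->output here */
   qs->next->run(qs->next, quad);
}

static void
passthrough_destroy(struct quad_stage *qs)
{
   FREE(qs);
}

static struct quad_stage *
sp_quad_passthrough_stage(struct softpipe_context *softpipe)
{
   struct quad_stage *qs = CALLOC_STRUCT(quad_stage);
   if (!qs)
      return NULL;
   qs->softpipe = softpipe;
   qs->begin = passthrough_begin;
   qs->run = passthrough_run;
   qs->destroy = passthrough_destroy;
   return qs;
}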
+ */ +struct quad_stage { + struct softpipe_context *softpipe; + + struct quad_stage *next; + + void (*begin)(struct quad_stage *qs); + + /** the stage action */ + void (*run)(struct quad_stage *qs, struct quad_header *quad); + + void (*destroy)(struct quad_stage *qs); +}; + + +struct quad_stage *sp_quad_polygon_stipple_stage( struct softpipe_context *softpipe ); +struct quad_stage *sp_quad_earlyz_stage( struct softpipe_context *softpipe ); +struct quad_stage *sp_quad_shade_stage( struct softpipe_context *softpipe ); +struct quad_stage *sp_quad_alpha_test_stage( struct softpipe_context *softpipe ); +struct quad_stage *sp_quad_stencil_test_stage( struct softpipe_context *softpipe ); +struct quad_stage *sp_quad_depth_test_stage( struct softpipe_context *softpipe ); +struct quad_stage *sp_quad_occlusion_stage( struct softpipe_context *softpipe ); +struct quad_stage *sp_quad_coverage_stage( struct softpipe_context *softpipe ); +struct quad_stage *sp_quad_blend_stage( struct softpipe_context *softpipe ); +struct quad_stage *sp_quad_colormask_stage( struct softpipe_context *softpipe ); +struct quad_stage *sp_quad_output_stage( struct softpipe_context *softpipe ); + +void sp_build_quad_pipeline(struct softpipe_context *sp); + +void sp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad); + +#endif /* SP_QUAD_PIPE_H */ diff --git a/src/gallium/drivers/softpipe/sp_quad_stencil.c b/src/gallium/drivers/softpipe/sp_quad_stencil.c index abb5487748..5e9d447737 100644 --- a/src/gallium/drivers/softpipe/sp_quad_stencil.c +++ b/src/gallium/drivers/softpipe/sp_quad_stencil.c @@ -5,10 +5,10 @@ #include "sp_context.h" -#include "sp_headers.h" +#include "sp_quad.h" #include "sp_surface.h" #include "sp_tile_cache.h" -#include "sp_quad.h" +#include "sp_quad_pipe.h" #include "pipe/p_defines.h" #include "util/u_memory.h" @@ -222,8 +222,8 @@ stencil_test_quad(struct quad_stage *qs, struct quad_header *quad) zFailOp = softpipe->depth_stencil->stencil[face].zfail_op; zPassOp = softpipe->depth_stencil->stencil[face].zpass_op; ref = softpipe->depth_stencil->stencil[face].ref_value; - wrtMask = softpipe->depth_stencil->stencil[face].write_mask; - valMask = softpipe->depth_stencil->stencil[face].value_mask; + wrtMask = softpipe->depth_stencil->stencil[face].writemask; + valMask = softpipe->depth_stencil->stencil[face].valuemask; assert(ps); /* shouldn't get here if there's no stencil buffer */ diff --git a/src/gallium/drivers/softpipe/sp_quad_stipple.c b/src/gallium/drivers/softpipe/sp_quad_stipple.c index ccf37f6be5..05e862f097 100644 --- a/src/gallium/drivers/softpipe/sp_quad_stipple.c +++ b/src/gallium/drivers/softpipe/sp_quad_stipple.c @@ -4,8 +4,8 @@ */ #include "sp_context.h" -#include "sp_headers.h" #include "sp_quad.h" +#include "sp_quad_pipe.h" #include "pipe/p_defines.h" #include "util/u_memory.h" @@ -19,11 +19,13 @@ stipple_quad(struct quad_stage *qs, struct quad_header *quad) static const uint bit31 = 1 << 31; static const uint bit30 = 1 << 30; - if (quad->input.prim == PRIM_TRI) { + if (quad->input.prim == QUAD_PRIM_TRI) { struct softpipe_context *softpipe = qs->softpipe; /* need to invert Y to index into OpenGL's stipple pattern */ int y0, y1; uint stipple0, stipple1; + const int col0 = quad->input.x0 % 32; + if (softpipe->rasterizer->origin_lower_left) { y0 = softpipe->framebuffer.height - 1 - quad->input.y0; y1 = y0 - 1; @@ -32,12 +34,11 @@ stipple_quad(struct quad_stage *qs, struct quad_header *quad) y0 = quad->input.y0; y1 = y0 + 1; } + stipple0 = softpipe->poly_stipple.stipple[y0 % 32]; 
stipple1 = softpipe->poly_stipple.stipple[y1 % 32]; -#if 1 - { - const int col0 = quad->input.x0 % 32; + /* turn off quad mask bits that fail the stipple test */ if ((stipple0 & (bit31 >> col0)) == 0) quad->inout.mask &= ~MASK_TOP_LEFT; @@ -49,19 +50,11 @@ stipple_quad(struct quad_stage *qs, struct quad_header *quad) if ((stipple1 & (bit30 >> col0)) == 0) quad->inout.mask &= ~MASK_BOTTOM_RIGHT; - } -#else - /* We'd like to use this code, but we'd need to redefine - * MASK_TOP_LEFT to be (1 << 1) and MASK_TOP_RIGHT to be (1 << 0), - * and similarly for the BOTTOM bits. But that may have undesirable - * side effects elsewhere. - */ - const int col0 = 30 - (quad->input.x0 % 32); - quad->inout.mask &= (((stipple0 >> col0) & 0x3) | - (((stipple1 >> col0) & 0x3) << 2)); -#endif - if (!quad->inout.mask) + + if (!quad->inout.mask) { + /* all fragments failed stipple test, end of quad pipeline */ return; + } } qs->next->run(qs->next, quad); diff --git a/src/gallium/drivers/softpipe/sp_query.c b/src/gallium/drivers/softpipe/sp_query.c index 2106ee1d23..b0d8e01426 100644 --- a/src/gallium/drivers/softpipe/sp_query.c +++ b/src/gallium/drivers/softpipe/sp_query.c @@ -37,8 +37,8 @@ #include "sp_query.h" struct softpipe_query { - uint64 start; - uint64 end; + uint64_t start; + uint64_t end; }; @@ -87,7 +87,7 @@ static boolean softpipe_get_query_result(struct pipe_context *pipe, struct pipe_query *q, boolean wait, - uint64 *result ) + uint64_t *result ) { struct softpipe_query *sq = softpipe_query(q); *result = sq->end - sq->start; diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c index 9644dbd168..7380a6ae2b 100644 --- a/src/gallium/drivers/softpipe/sp_screen.c +++ b/src/gallium/drivers/softpipe/sp_screen.c @@ -27,7 +27,8 @@ #include "util/u_memory.h" -#include "pipe/p_winsys.h" +#include "util/u_simple_screen.h" +#include "pipe/internal/p_winsys_screen.h" #include "pipe/p_defines.h" #include "pipe/p_screen.h" @@ -55,7 +56,9 @@ softpipe_get_param(struct pipe_screen *screen, int param) { switch (param) { case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS: - return 8; + return PIPE_MAX_SAMPLERS; + case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS: + return PIPE_MAX_SAMPLERS; case PIPE_CAP_NPOT_TEXTURES: return 1; case PIPE_CAP_TWO_SIDED_STENCIL: @@ -172,6 +175,7 @@ softpipe_create_screen(struct pipe_winsys *winsys) screen->base.is_format_supported = softpipe_is_format_supported; softpipe_init_screen_texture_funcs(&screen->base); + u_simple_screen_init(&screen->base); return &screen->base; } diff --git a/src/gallium/drivers/softpipe/sp_setup.c b/src/gallium/drivers/softpipe/sp_setup.c index 13d8017393..0925653b5d 100644 --- a/src/gallium/drivers/softpipe/sp_setup.c +++ b/src/gallium/drivers/softpipe/sp_setup.c @@ -32,13 +32,12 @@ * \author Brian Paul */ -#include "sp_setup.h" - #include "sp_context.h" -#include "sp_headers.h" +#include "sp_prim_setup.h" #include "sp_quad.h" +#include "sp_quad_pipe.h" +#include "sp_setup.h" #include "sp_state.h" -#include "sp_prim_setup.h" #include "draw/draw_context.h" #include "draw/draw_private.h" #include "draw/draw_vertex.h" @@ -265,17 +264,20 @@ is_inf_or_nan(float x) } -static boolean cull_tri( struct setup_context *setup, - float det ) +/** + * Do triangle cull test using tri determinant (sign indicates orientation) + * \return true if triangle is to be culled. 
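The sp_query.c change above only swaps uint64 for uint64_t, but it is worth spelling out how the occlusion_count field in softpipe_context supports overlapping queries: begin/end simply snapshot the context-wide counter (which the occlusion quad stage increments for every surviving fragment), and the reported result is the difference of the two snapshots. The begin/end bodies below are a sketch of that idea, not code from this patch:

static void
example_begin_query(struct softpipe_context *softpipe,
                    struct softpipe_query *sq)
{
   sq->start = softpipe->occlusion_count;
}

static void
example_end_query(struct softpipe_context *softpipe,
                  struct softpipe_query *sq)
{
   sq->end = softpipe->occlusion_count;
   /* softpipe_get_query_result() then reports sq->end - sq->start */
}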
+ */ +static INLINE boolean +cull_tri(const struct setup_context *setup, float det) { - if (det != 0) - { + if (det != 0) { /* if (det < 0 then Z points toward camera and triangle is * counter-clockwise winding. */ unsigned winding = (det < 0) ? PIPE_WINDING_CCW : PIPE_WINDING_CW; - - if ((winding & setup->winding) == 0) + + if ((winding & setup->winding) == 0) return FALSE; } @@ -968,7 +970,7 @@ void setup_tri( struct setup_context *setup, setup_tri_coefficients( setup ); setup_tri_edges( setup ); - setup->quad.input.prim = PRIM_TRI; + setup->quad.input.prim = QUAD_PRIM_TRI; setup->span.y = 0; setup->span.y_flags = 0; @@ -1009,7 +1011,7 @@ void setup_tri( struct setup_context *setup, * for a line. */ static void -line_linear_coeff(struct setup_context *setup, +line_linear_coeff(const struct setup_context *setup, struct tgsi_interp_coef *coef, uint vertSlot, uint i) { @@ -1029,9 +1031,9 @@ line_linear_coeff(struct setup_context *setup, * for a line. */ static void -line_persp_coeff(struct setup_context *setup, - struct tgsi_interp_coef *coef, - uint vertSlot, uint i) +line_persp_coeff(const struct setup_context *setup, + struct tgsi_interp_coef *coef, + uint vertSlot, uint i) { /* XXX double-check/verify this arithmetic */ const float a0 = setup->vmin[vertSlot][i] * setup->vmin[0][3]; @@ -1206,7 +1208,7 @@ setup_line(struct setup_context *setup, setup->quad.input.x0 = setup->quad.input.y0 = -1; setup->quad.inout.mask = 0x0; - setup->quad.input.prim = PRIM_LINE; + setup->quad.input.prim = QUAD_PRIM_LINE; /* XXX temporary: set coverage to 1.0 so the line appears * if AA mode happens to be enabled. */ @@ -1266,7 +1268,7 @@ setup_line(struct setup_context *setup, static void -point_persp_coeff(struct setup_context *setup, +point_persp_coeff(const struct setup_context *setup, const float (*vert)[4], struct tgsi_interp_coef *coef, uint vertSlot, uint i) @@ -1361,7 +1363,7 @@ setup_point( struct setup_context *setup, } } - setup->quad.input.prim = PRIM_POINT; + setup->quad.input.prim = QUAD_PRIM_POINT; if (halfSize <= 0.5 && !round) { /* special case for 1-pixel points */ @@ -1497,7 +1499,7 @@ void setup_prepare( struct setup_context *setup ) } /* Mark surfaces as defined now */ - for (i = 0; i < sp->framebuffer.num_cbufs; i++){ + for (i = 0; i < sp->framebuffer.nr_cbufs; i++){ if (sp->framebuffer.cbufs[i]) { sp->framebuffer.cbufs[i]->status = PIPE_SURFACE_STATUS_DEFINED; } diff --git a/src/gallium/drivers/softpipe/sp_state.h b/src/gallium/drivers/softpipe/sp_state.h index 476ef3dc8f..6f558e6da5 100644 --- a/src/gallium/drivers/softpipe/sp_state.h +++ b/src/gallium/drivers/softpipe/sp_state.h @@ -69,7 +69,7 @@ struct sp_fragment_shader { void (*prepare)( const struct sp_fragment_shader *shader, struct tgsi_exec_machine *machine, - struct tgsi_sampler *samplers); + struct tgsi_sampler **samplers); /* Run the shader - this interface will get cleaned up in the * future: @@ -184,10 +184,10 @@ softpipe_set_edgeflags(struct pipe_context *pipe, const unsigned *edgeflags); void -softpipe_map_surfaces(struct softpipe_context *sp); +softpipe_map_transfers(struct softpipe_context *sp); void -softpipe_unmap_surfaces(struct softpipe_context *sp); +softpipe_unmap_transfers(struct softpipe_context *sp); void softpipe_map_texture_surfaces(struct softpipe_context *sp); diff --git a/src/gallium/drivers/softpipe/sp_state_fs.c b/src/gallium/drivers/softpipe/sp_state_fs.c index e5b609cf6c..4d01a9dbe1 100644 --- a/src/gallium/drivers/softpipe/sp_state_fs.c +++ b/src/gallium/drivers/softpipe/sp_state_fs.c @@ -32,7 
+32,7 @@ #include "pipe/p_defines.h" #include "util/u_memory.h" #include "pipe/p_inlines.h" -#include "pipe/p_winsys.h" +#include "pipe/internal/p_winsys_screen.h" #include "pipe/p_shader_tokens.h" #include "draw/draw_context.h" #include "tgsi/tgsi_dump.h" @@ -146,16 +146,15 @@ softpipe_set_constant_buffer(struct pipe_context *pipe, const struct pipe_constant_buffer *buf) { struct softpipe_context *softpipe = softpipe_context(pipe); - struct pipe_winsys *ws = pipe->winsys; + struct pipe_screen *screen = pipe->screen; assert(shader < PIPE_SHADER_TYPES); assert(index == 0); /* note: reference counting */ - winsys_buffer_reference(ws, + pipe_buffer_reference(screen, &softpipe->constants[shader].buffer, buf ? buf->buffer : NULL); - softpipe->constants[shader].size = buf ? buf->size : 0; softpipe->dirty |= SP_NEW_CONSTANTS; } diff --git a/src/gallium/drivers/softpipe/sp_state_surface.c b/src/gallium/drivers/softpipe/sp_state_surface.c index b5376e522d..1493c65884 100644 --- a/src/gallium/drivers/softpipe/sp_state_surface.c +++ b/src/gallium/drivers/softpipe/sp_state_surface.c @@ -64,7 +64,7 @@ softpipe_set_framebuffer_state(struct pipe_context *pipe, } } - sp->framebuffer.num_cbufs = fb->num_cbufs; + sp->framebuffer.nr_cbufs = fb->nr_cbufs; /* zbuf changing? */ if (sp->framebuffer.zsbuf != fb->zsbuf) { diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c b/src/gallium/drivers/softpipe/sp_tex_sample.c index 49250ec084..adbd0cb7f0 100644 --- a/src/gallium/drivers/softpipe/sp_tex_sample.c +++ b/src/gallium/drivers/softpipe/sp_tex_sample.c @@ -2,6 +2,7 @@ * * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. * All Rights Reserved. + * Copyright 2008 VMware, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the @@ -33,17 +34,18 @@ */ #include "sp_context.h" -#include "sp_headers.h" +#include "sp_quad.h" #include "sp_surface.h" +#include "sp_texture.h" #include "sp_tex_sample.h" #include "sp_tile_cache.h" #include "pipe/p_context.h" #include "pipe/p_defines.h" -#include "tgsi/tgsi_exec.h" #include "util/u_math.h" #include "util/u_memory.h" + /* * Note, the FRAC macro has to work perfectly. Otherwise you'll sometimes * see 1-pixel bands of improperly weighted linear-filtered textures. @@ -57,7 +59,11 @@ /** * Linear interpolation macro */ -#define LERP(T, A, B) ( (A) + (T) * ((B) - (A)) ) +static INLINE float +lerp(float a, float v0, float v1) +{ + return v0 + a * (v1 - v0); +} /** @@ -72,13 +78,28 @@ static INLINE float lerp_2d(float a, float b, float v00, float v10, float v01, float v11) { - const float temp0 = LERP(a, v00, v10); - const float temp1 = LERP(a, v01, v11); - return LERP(b, temp0, temp1); + const float temp0 = lerp(a, v00, v10); + const float temp1 = lerp(a, v01, v11); + return lerp(b, temp0, temp1); } /** + * As above, but 3D interpolation of 8 values. + */ +static INLINE float +lerp_3d(float a, float b, float c, + float v000, float v100, float v010, float v110, + float v001, float v101, float v011, float v111) +{ + const float temp0 = lerp_2d(a, b, v000, v100, v010, v110); + const float temp1 = lerp_2d(a, b, v001, v101, v011, v111); + return lerp(c, temp0, temp1); +} + + + +/** * If A is a signed integer, A % B doesn't give the right value for A < 0 * (in terms of texture repeat). Just casting to unsigned fixes that. */ @@ -86,250 +107,275 @@ lerp_2d(float a, float b, /** - * Apply texture coord wrapping mode and return integer texture index. 
+ * Apply texture coord wrapping mode and return integer texture indexes + * for a vector of four texcoords (S or T or P). * \param wrapMode PIPE_TEX_WRAP_x - * \param s the texcoord + * \param s the incoming texcoords * \param size the texture image size + * \param icoord returns the integer texcoords * \return integer texture index */ -static INLINE int -nearest_texcoord(unsigned wrapMode, float s, unsigned size) +static INLINE void +nearest_texcoord_4(unsigned wrapMode, const float s[4], unsigned size, + int icoord[4]) { - int i; + uint ch; switch (wrapMode) { case PIPE_TEX_WRAP_REPEAT: /* s limited to [0,1) */ /* i limited to [0,size-1] */ - i = util_ifloor(s * size); - i = REMAINDER(i, size); - return i; + for (ch = 0; ch < 4; ch++) { + int i = util_ifloor(s[ch] * size); + icoord[ch] = REMAINDER(i, size); + } + return; case PIPE_TEX_WRAP_CLAMP: /* s limited to [0,1] */ /* i limited to [0,size-1] */ - if (s <= 0.0F) - i = 0; - else if (s >= 1.0F) - i = size - 1; - else - i = util_ifloor(s * size); - return i; + for (ch = 0; ch < 4; ch++) { + if (s[ch] <= 0.0F) + icoord[ch] = 0; + else if (s[ch] >= 1.0F) + icoord[ch] = size - 1; + else + icoord[ch] = util_ifloor(s[ch] * size); + } + return; case PIPE_TEX_WRAP_CLAMP_TO_EDGE: { /* s limited to [min,max] */ /* i limited to [0, size-1] */ const float min = 1.0F / (2.0F * size); const float max = 1.0F - min; - if (s < min) - i = 0; - else if (s > max) - i = size - 1; - else - i = util_ifloor(s * size); + for (ch = 0; ch < 4; ch++) { + if (s[ch] < min) + icoord[ch] = 0; + else if (s[ch] > max) + icoord[ch] = size - 1; + else + icoord[ch] = util_ifloor(s[ch] * size); + } } - return i; + return; case PIPE_TEX_WRAP_CLAMP_TO_BORDER: { /* s limited to [min,max] */ /* i limited to [-1, size] */ const float min = -1.0F / (2.0F * size); const float max = 1.0F - min; - if (s <= min) - i = -1; - else if (s >= max) - i = size; - else - i = util_ifloor(s * size); + for (ch = 0; ch < 4; ch++) { + if (s[ch] <= min) + icoord[ch] = -1; + else if (s[ch] >= max) + icoord[ch] = size; + else + icoord[ch] = util_ifloor(s[ch] * size); + } } - return i; + return; case PIPE_TEX_WRAP_MIRROR_REPEAT: { const float min = 1.0F / (2.0F * size); const float max = 1.0F - min; - const int flr = util_ifloor(s); - float u; - if (flr & 1) - u = 1.0F - (s - (float) flr); - else - u = s - (float) flr; - if (u < min) - i = 0; - else if (u > max) - i = size - 1; - else - i = util_ifloor(u * size); + for (ch = 0; ch < 4; ch++) { + const int flr = util_ifloor(s[ch]); + float u; + if (flr & 1) + u = 1.0F - (s[ch] - (float) flr); + else + u = s[ch] - (float) flr; + if (u < min) + icoord[ch] = 0; + else if (u > max) + icoord[ch] = size - 1; + else + icoord[ch] = util_ifloor(u * size); + } } - return i; + return; case PIPE_TEX_WRAP_MIRROR_CLAMP: - { + for (ch = 0; ch < 4; ch++) { /* s limited to [0,1] */ /* i limited to [0,size-1] */ - const float u = fabsf(s); + const float u = fabsf(s[ch]); if (u <= 0.0F) - i = 0; + icoord[ch] = 0; else if (u >= 1.0F) - i = size - 1; + icoord[ch] = size - 1; else - i = util_ifloor(u * size); + icoord[ch] = util_ifloor(u * size); } - return i; + return; case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: { /* s limited to [min,max] */ /* i limited to [0, size-1] */ const float min = 1.0F / (2.0F * size); const float max = 1.0F - min; - const float u = fabsf(s); - if (u < min) - i = 0; - else if (u > max) - i = size - 1; - else - i = util_ifloor(u * size); + for (ch = 0; ch < 4; ch++) { + const float u = fabsf(s[ch]); + if (u < min) + icoord[ch] = 0; + else if (u > 
max) + icoord[ch] = size - 1; + else + icoord[ch] = util_ifloor(u * size); + } } - return i; + return; case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: { /* s limited to [min,max] */ /* i limited to [0, size-1] */ const float min = -1.0F / (2.0F * size); const float max = 1.0F - min; - const float u = fabsf(s); - if (u < min) - i = -1; - else if (u > max) - i = size; - else - i = util_ifloor(u * size); + for (ch = 0; ch < 4; ch++) { + const float u = fabsf(s[ch]); + if (u < min) + icoord[ch] = -1; + else if (u > max) + icoord[ch] = size; + else + icoord[ch] = util_ifloor(u * size); + } } - return i; + return; default: assert(0); - return 0; } } /** - * Used to compute texel locations for linear sampling. + * Used to compute texel locations for linear sampling for four texcoords. * \param wrapMode PIPE_TEX_WRAP_x - * \param s the texcoord + * \param s the texcoords * \param size the texture image size - * \param i0 returns first texture index - * \param i1 returns second texture index (usually *i0 + 1) - * \param a returns blend factor/weight between texture indexes + * \param icoord0 returns first texture indexes + * \param icoord1 returns second texture indexes (usually icoord0 + 1) + * \param w returns blend factor/weight between texture indexes + * \param icoord returns the computed integer texture coords */ static INLINE void -linear_texcoord(unsigned wrapMode, float s, unsigned size, - int *i0, int *i1, float *a) +linear_texcoord_4(unsigned wrapMode, const float s[4], unsigned size, + int icoord0[4], int icoord1[4], float w[4]) { - float u; + uint ch; + switch (wrapMode) { case PIPE_TEX_WRAP_REPEAT: - u = s * size - 0.5F; - *i0 = REMAINDER(util_ifloor(u), size); - *i1 = REMAINDER(*i0 + 1, size); - break; + for (ch = 0; ch < 4; ch++) { + float u = s[ch] * size - 0.5F; + icoord0[ch] = REMAINDER(util_ifloor(u), size); + icoord1[ch] = REMAINDER(icoord0[ch] + 1, size); + w[ch] = FRAC(u); + } + break;; case PIPE_TEX_WRAP_CLAMP: - if (s <= 0.0F) - u = 0.0F; - else if (s >= 1.0F) - u = (float) size; - else - u = s * size; - u -= 0.5F; - *i0 = util_ifloor(u); - *i1 = *i0 + 1; - break; + for (ch = 0; ch < 4; ch++) { + float u = CLAMP(s[ch], 0.0F, 1.0F); + u = u * size - 0.5f; + icoord0[ch] = util_ifloor(u); + icoord1[ch] = icoord0[ch] + 1; + w[ch] = FRAC(u); + } + break;; case PIPE_TEX_WRAP_CLAMP_TO_EDGE: - if (s <= 0.0F) - u = 0.0F; - else if (s >= 1.0F) - u = (float) size; - else - u = s * size; - u -= 0.5F; - *i0 = util_ifloor(u); - *i1 = *i0 + 1; - if (*i0 < 0) - *i0 = 0; - if (*i1 >= (int) size) - *i1 = size - 1; - break; + for (ch = 0; ch < 4; ch++) { + float u = CLAMP(s[ch], 0.0F, 1.0F); + u = u * size - 0.5f; + icoord0[ch] = util_ifloor(u); + icoord1[ch] = icoord0[ch] + 1; + if (icoord0[ch] < 0) + icoord0[ch] = 0; + if (icoord1[ch] >= (int) size) + icoord1[ch] = size - 1; + w[ch] = FRAC(u); + } + break;; case PIPE_TEX_WRAP_CLAMP_TO_BORDER: { const float min = -1.0F / (2.0F * size); const float max = 1.0F - min; - if (s <= min) - u = min * size; - else if (s >= max) - u = max * size; - else - u = s * size; - u -= 0.5F; - *i0 = util_ifloor(u); - *i1 = *i0 + 1; + for (ch = 0; ch < 4; ch++) { + float u = CLAMP(s[ch], min, max); + u = u * size - 0.5f; + icoord0[ch] = util_ifloor(u); + icoord1[ch] = icoord0[ch] + 1; + w[ch] = FRAC(u); + } } - break; + break;; case PIPE_TEX_WRAP_MIRROR_REPEAT: - { - const int flr = util_ifloor(s); + for (ch = 0; ch < 4; ch++) { + const int flr = util_ifloor(s[ch]); + float u; if (flr & 1) - u = 1.0F - (s - (float) flr); + u = 1.0F - (s[ch] - (float) flr); else - u = 
s - (float) flr; - u = (u * size) - 0.5F; - *i0 = util_ifloor(u); - *i1 = *i0 + 1; - if (*i0 < 0) - *i0 = 0; - if (*i1 >= (int) size) - *i1 = size - 1; + u = s[ch] - (float) flr; + u = u * size - 0.5F; + icoord0[ch] = util_ifloor(u); + icoord1[ch] = icoord0[ch] + 1; + if (icoord0[ch] < 0) + icoord0[ch] = 0; + if (icoord1[ch] >= (int) size) + icoord1[ch] = size - 1; + w[ch] = FRAC(u); } - break; + break;; case PIPE_TEX_WRAP_MIRROR_CLAMP: - u = fabsf(s); - if (u >= 1.0F) - u = (float) size; - else - u *= size; - u -= 0.5F; - *i0 = util_ifloor(u); - *i1 = *i0 + 1; - break; + for (ch = 0; ch < 4; ch++) { + float u = fabsf(s[ch]); + if (u >= 1.0F) + u = (float) size; + else + u *= size; + u -= 0.5F; + icoord0[ch] = util_ifloor(u); + icoord1[ch] = icoord0[ch] + 1; + w[ch] = FRAC(u); + } + break;; case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: - u = fabsf(s); - if (u >= 1.0F) - u = (float) size; - else - u *= size; - u -= 0.5F; - *i0 = util_ifloor(u); - *i1 = *i0 + 1; - if (*i0 < 0) - *i0 = 0; - if (*i1 >= (int) size) - *i1 = size - 1; - break; + for (ch = 0; ch < 4; ch++) { + float u = fabsf(s[ch]); + if (u >= 1.0F) + u = (float) size; + else + u *= size; + u -= 0.5F; + icoord0[ch] = util_ifloor(u); + icoord1[ch] = icoord0[ch] + 1; + if (icoord0[ch] < 0) + icoord0[ch] = 0; + if (icoord1[ch] >= (int) size) + icoord1[ch] = size - 1; + w[ch] = FRAC(u); + } + break;; case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: { const float min = -1.0F / (2.0F * size); const float max = 1.0F - min; - u = fabsf(s); - if (u <= min) - u = min * size; - else if (u >= max) - u = max * size; - else - u *= size; - u -= 0.5F; - *i0 = util_ifloor(u); - *i1 = *i0 + 1; + for (ch = 0; ch < 4; ch++) { + float u = fabsf(s[ch]); + if (u <= min) + u = min * size; + else if (u >= max) + u = max * size; + else + u *= size; + u -= 0.5F; + icoord0[ch] = util_ifloor(u); + icoord1[ch] = icoord0[ch] + 1; + w[ch] = FRAC(u); + } } - break; + break;; default: assert(0); } - *a = FRAC(u); } @@ -337,21 +383,27 @@ linear_texcoord(unsigned wrapMode, float s, unsigned size, * For RECT textures / unnormalized texcoords * Only a subset of wrap modes supported. */ -static INLINE int -nearest_texcoord_unnorm(unsigned wrapMode, float s, unsigned size) +static INLINE void +nearest_texcoord_unnorm_4(unsigned wrapMode, const float s[4], unsigned size, + int icoord[4]) { - int i; + uint ch; switch (wrapMode) { case PIPE_TEX_WRAP_CLAMP: - i = util_ifloor(s); - return CLAMP(i, 0, (int) size-1); + for (ch = 0; ch < 4; ch++) { + int i = util_ifloor(s[ch]); + icoord[ch]= CLAMP(i, 0, (int) size-1); + } + return; case PIPE_TEX_WRAP_CLAMP_TO_EDGE: /* fall-through */ case PIPE_TEX_WRAP_CLAMP_TO_BORDER: - return util_ifloor( CLAMP(s, 0.5F, (float) size - 0.5F) ); + for (ch = 0; ch < 4; ch++) { + icoord[ch]= util_ifloor( CLAMP(s[ch], 0.5F, (float) size - 0.5F) ); + } + return; default: assert(0); - return 0; } } @@ -361,30 +413,36 @@ nearest_texcoord_unnorm(unsigned wrapMode, float s, unsigned size) * Only a subset of wrap modes supported. 
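
For RECT textures the _unnorm helpers (nearest above, linear just below) skip the scale by the texture size and only clamp, since the coordinates already arrive in texel units. A minimal sketch of the clamp-to-edge/border variants, with illustrative names:

#include <math.h>

#define CLAMPF(x, lo, hi) ((x) < (lo) ? (lo) : ((x) > (hi) ? (hi) : (x)))

/* Nearest filtering on unnormalized coords: clamp to the texel-centre
 * range [0.5, size-0.5] and truncate, giving an index in [0, size-1]. */
static int unnorm_nearest_clamp(float s, int size)
{
   return (int) floorf(CLAMPF(s, 0.5f, (float) size - 0.5f));
}

/* Linear filtering: same clamp, then shift by half a texel to get the
 * lower neighbour, its successor and the blend weight. */
static void unnorm_linear_clamp(float s, int size, int *i0, int *i1, float *w)
{
   float u = CLAMPF(s, 0.5f, (float) size - 0.5f) - 0.5f;
   *i0 = (int) floorf(u);
   *i1 = *i0 + 1;
   if (*i1 > size - 1)
      *i1 = size - 1;
   *w = u - floorf(u);
}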
*/ static INLINE void -linear_texcoord_unnorm(unsigned wrapMode, float s, unsigned size, - int *i0, int *i1, float *a) +linear_texcoord_unnorm_4(unsigned wrapMode, const float s[4], unsigned size, + int icoord0[4], int icoord1[4], float w[4]) { + uint ch; switch (wrapMode) { case PIPE_TEX_WRAP_CLAMP: - /* Not exactly what the spec says, but it matches NVIDIA output */ - s = CLAMP(s - 0.5F, 0.0f, (float) size - 1.0f); - *i0 = util_ifloor(s); - *i1 = *i0 + 1; - break; + for (ch = 0; ch < 4; ch++) { + /* Not exactly what the spec says, but it matches NVIDIA output */ + float u = CLAMP(s[ch] - 0.5F, 0.0f, (float) size - 1.0f); + icoord0[ch] = util_ifloor(u); + icoord1[ch] = icoord0[ch] + 1; + w[ch] = FRAC(u); + } + return; case PIPE_TEX_WRAP_CLAMP_TO_EDGE: /* fall-through */ case PIPE_TEX_WRAP_CLAMP_TO_BORDER: - s = CLAMP(s, 0.5F, (float) size - 0.5F); - s -= 0.5F; - *i0 = util_ifloor(s); - *i1 = *i0 + 1; - if (*i1 > (int) size - 1) - *i1 = size - 1; + for (ch = 0; ch < 4; ch++) { + float u = CLAMP(s[ch], 0.5F, (float) size - 0.5F); + u -= 0.5F; + icoord0[ch] = util_ifloor(u); + icoord1[ch] = icoord0[ch] + 1; + if (icoord1[ch] > (int) size - 1) + icoord1[ch] = size - 1; + w[ch] = FRAC(u); + } break; default: assert(0); } - *a = FRAC(s); } @@ -463,7 +521,8 @@ choose_cube_face(float rx, float ry, float rz, float *newS, float *newT) * This is only done for fragment shaders, not vertex shaders. */ static float -compute_lambda(struct tgsi_sampler *sampler, +compute_lambda(const struct pipe_texture *tex, + const struct pipe_sampler_state *sampler, const float s[QUAD_SIZE], const float t[QUAD_SIZE], const float p[QUAD_SIZE], @@ -471,7 +530,7 @@ compute_lambda(struct tgsi_sampler *sampler, { float rho, lambda; - assert(sampler->state->normalized_coords); + assert(sampler->normalized_coords); assert(s); { @@ -479,7 +538,7 @@ compute_lambda(struct tgsi_sampler *sampler, float dsdy = s[QUAD_TOP_LEFT] - s[QUAD_BOTTOM_LEFT]; dsdx = fabsf(dsdx); dsdy = fabsf(dsdy); - rho = MAX2(dsdx, dsdy) * sampler->texture->width[0]; + rho = MAX2(dsdx, dsdy) * tex->width[0]; } if (t) { float dtdx = t[QUAD_BOTTOM_RIGHT] - t[QUAD_BOTTOM_LEFT]; @@ -487,7 +546,7 @@ compute_lambda(struct tgsi_sampler *sampler, float max; dtdx = fabsf(dtdx); dtdy = fabsf(dtdy); - max = MAX2(dtdx, dtdy) * sampler->texture->height[0]; + max = MAX2(dtdx, dtdy) * tex->height[0]; rho = MAX2(rho, max); } if (p) { @@ -496,13 +555,13 @@ compute_lambda(struct tgsi_sampler *sampler, float max; dpdx = fabsf(dpdx); dpdy = fabsf(dpdy); - max = MAX2(dpdx, dpdy) * sampler->texture->depth[0]; + max = MAX2(dpdx, dpdy) * tex->depth[0]; rho = MAX2(rho, max); } lambda = util_fast_log2(rho); - lambda += lodbias + sampler->state->lod_bias; - lambda = CLAMP(lambda, sampler->state->min_lod, sampler->state->max_lod); + lambda += lodbias + sampler->lod_bias; + lambda = CLAMP(lambda, sampler->min_lod, sampler->max_lod); return lambda; } @@ -514,68 +573,74 @@ compute_lambda(struct tgsi_sampler *sampler, * 2. Determine if we're minifying or magnifying * 3. If minifying, choose mipmap levels * 4. 
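
compute_lambda above collapses the quad's texcoord derivatives into a single scalar LOD. Ignoring the t/p axes, and assuming a quad layout in which index 0 is the bottom-left, 1 the bottom-right and 2 the top-left fragment (an ordering assumed for this sketch), the calculation is roughly:

#include <math.h>

#define CLAMPF(x, lo, hi) ((x) < (lo) ? (lo) : ((x) > (hi) ? (hi) : (x)))

/* Approximate LOD from the s texcoords of a 2x2 quad: rho is the larger
 * screen-space derivative scaled to texel units, lambda = log2(rho) plus
 * the shader and sampler biases, clamped to the sampler's LOD range. */
static float quad_lambda(const float s[4], int tex_width,
                         float shader_bias, float sampler_bias,
                         float min_lod, float max_lod)
{
   float dsdx = fabsf(s[1] - s[0]);   /* bottom-right - bottom-left */
   float dsdy = fabsf(s[2] - s[0]);   /* top-left - bottom-left */
   float rho = (dsdx > dsdy ? dsdx : dsdy) * (float) tex_width;
   float lambda = log2f(rho) + shader_bias + sampler_bias;
   return CLAMPF(lambda, min_lod, max_lod);
}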
Return image filter to use within mipmap images + * \param level0 Returns first mipmap level to sample from + * \param level1 Returns second mipmap level to sample from + * \param levelBlend Returns blend factor between levels, in [0,1] + * \param imgFilter Returns either the min or mag filter, depending on lambda */ static void -choose_mipmap_levels(struct tgsi_sampler *sampler, +choose_mipmap_levels(const struct pipe_texture *texture, + const struct pipe_sampler_state *sampler, const float s[QUAD_SIZE], const float t[QUAD_SIZE], const float p[QUAD_SIZE], + boolean computeLambda, float lodbias, unsigned *level0, unsigned *level1, float *levelBlend, unsigned *imgFilter) { - if (sampler->state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) { + if (sampler->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) { /* no mipmap selection needed */ - *level0 = *level1 = CLAMP((int) sampler->state->min_lod, - 0, (int) sampler->texture->last_level); + *level0 = *level1 = CLAMP((int) sampler->min_lod, + 0, (int) texture->last_level); - if (sampler->state->min_img_filter != sampler->state->mag_img_filter) { + if (sampler->min_img_filter != sampler->mag_img_filter) { /* non-mipmapped texture, but still need to determine if doing * minification or magnification. */ - float lambda = compute_lambda(sampler, s, t, p, lodbias); + float lambda = compute_lambda(texture, sampler, s, t, p, lodbias); if (lambda <= 0.0) { - *imgFilter = sampler->state->mag_img_filter; + *imgFilter = sampler->mag_img_filter; } else { - *imgFilter = sampler->state->min_img_filter; + *imgFilter = sampler->min_img_filter; } } else { - *imgFilter = sampler->state->mag_img_filter; + *imgFilter = sampler->mag_img_filter; } } else { float lambda; - if (1) + if (computeLambda) /* fragment shader */ - lambda = compute_lambda(sampler, s, t, p, lodbias); + lambda = compute_lambda(texture, sampler, s, t, p, lodbias); else /* vertex shader */ lambda = lodbias; /* not really a bias, but absolute LOD */ if (lambda <= 0.0) { /* XXX threshold depends on the filter */ /* magnifying */ - *imgFilter = sampler->state->mag_img_filter; + *imgFilter = sampler->mag_img_filter; *level0 = *level1 = 0; } else { /* minifying */ - *imgFilter = sampler->state->min_img_filter; + *imgFilter = sampler->min_img_filter; /* choose mipmap level(s) and compute the blend factor between them */ - if (sampler->state->min_mip_filter == PIPE_TEX_MIPFILTER_NEAREST) { + if (sampler->min_mip_filter == PIPE_TEX_MIPFILTER_NEAREST) { /* Nearest mipmap level */ const int lvl = (int) (lambda + 0.5); *level0 = - *level1 = CLAMP(lvl, 0, (int) sampler->texture->last_level); + *level1 = CLAMP(lvl, 0, (int) texture->last_level); } else { /* Linear interpolation between mipmap levels */ const int lvl = (int) lambda; - *level0 = CLAMP(lvl, 0, (int) sampler->texture->last_level); - *level1 = CLAMP(lvl + 1, 0, (int) sampler->texture->last_level); + *level0 = CLAMP(lvl, 0, (int) texture->last_level); + *level1 = CLAMP(lvl + 1, 0, (int) texture->last_level); *levelBlend = FRAC(lambda); /* blending weight between levels */ } } @@ -598,23 +663,29 @@ choose_mipmap_levels(struct tgsi_sampler *sampler, * sp_get_cached_tile_tex() function. Also, get 4 texels instead of 1... 
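
Given that lambda, the level selection in choose_mipmap_levels amounts to the following simplified sketch, with plain ints standing in for the pipe sampler state:

/* Pick the mipmap level(s) and blend factor for a given lambda.
 * A NEAREST mip filter rounds to one level; LINEAR uses two adjacent
 * levels plus the fractional weight between them. */
static void pick_levels(float lambda, int last_level, int mip_linear,
                        int *level0, int *level1, float *blend)
{
   if (lambda <= 0.0f) {
      *level0 = *level1 = 0;             /* magnifying: base level only */
      *blend = 0.0f;
   }
   else if (!mip_linear) {
      int lvl = (int) (lambda + 0.5f);   /* nearest level */
      if (lvl > last_level)
         lvl = last_level;
      *level0 = *level1 = lvl;
      *blend = 0.0f;
   }
   else {
      int lvl = (int) lambda;
      *level0 = lvl > last_level ? last_level : lvl;
      *level1 = lvl + 1 > last_level ? last_level : lvl + 1;
      *blend = lambda - (float) lvl;     /* FRAC(lambda) */
   }
}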
*/ static void -get_texel(struct tgsi_sampler *sampler, +get_texel(const struct tgsi_sampler *tgsi_sampler, unsigned face, unsigned level, int x, int y, int z, float rgba[NUM_CHANNELS][QUAD_SIZE], unsigned j) { - if (x < 0 || x >= (int) sampler->texture->width[level] || - y < 0 || y >= (int) sampler->texture->height[level] || - z < 0 || z >= (int) sampler->texture->depth[level]) { - rgba[0][j] = sampler->state->border_color[0]; - rgba[1][j] = sampler->state->border_color[1]; - rgba[2][j] = sampler->state->border_color[2]; - rgba[3][j] = sampler->state->border_color[3]; + const struct sp_shader_sampler *samp = sp_shader_sampler(tgsi_sampler); + struct softpipe_context *sp = samp->sp; + const uint unit = samp->unit; + const struct pipe_texture *texture = sp->texture[unit]; + const struct pipe_sampler_state *sampler = sp->sampler[unit]; + + if (x < 0 || x >= (int) texture->width[level] || + y < 0 || y >= (int) texture->height[level] || + z < 0 || z >= (int) texture->depth[level]) { + rgba[0][j] = sampler->border_color[0]; + rgba[1][j] = sampler->border_color[1]; + rgba[2][j] = sampler->border_color[2]; + rgba[3][j] = sampler->border_color[3]; } else { const int tx = x % TILE_SIZE; const int ty = y % TILE_SIZE; const struct softpipe_cached_tile *tile - = sp_get_cached_tile_tex(sampler->pipe, sampler->cache, + = sp_get_cached_tile_tex(sp, samp->cache, x, y, z, face, level); rgba[0][j] = tile->data.color[ty][tx][0]; rgba[1][j] = tile->data.color[ty][tx][1]; @@ -624,7 +695,7 @@ get_texel(struct tgsi_sampler *sampler, { debug_printf("Get texel %f %f %f %f from %s\n", rgba[0][j], rgba[1][j], rgba[2][j], rgba[3][j], - pf_name(sampler->texture->format)); + pf_name(texture->format)); } } } @@ -682,103 +753,124 @@ shadow_compare(uint compare_func, * Could probably extend for 3D... 
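
shadow_compare, referenced above but not shown in this hunk, implements the R-to-texture depth comparison: the reference value p is tested against the sampled depth and the texel is replaced by 0.0 or 1.0. The idea, with a hypothetical standalone enum standing in for the PIPE_FUNC_* values:

enum cmp_func { CMP_NEVER, CMP_LESS, CMP_LEQUAL, CMP_GREATER,
                CMP_GEQUAL, CMP_EQUAL, CMP_NOTEQUAL, CMP_ALWAYS };

/* Compare the reference depth p against the sampled depth texel and
 * return the pass/fail result as a float. */
static float depth_compare(enum cmp_func func, float p, float texel)
{
   int pass;
   switch (func) {
   case CMP_LESS:     pass = p <  texel; break;
   case CMP_LEQUAL:   pass = p <= texel; break;
   case CMP_GREATER:  pass = p >  texel; break;
   case CMP_GEQUAL:   pass = p >= texel; break;
   case CMP_EQUAL:    pass = p == texel; break;
   case CMP_NOTEQUAL: pass = p != texel; break;
   case CMP_ALWAYS:   pass = 1; break;
   default:           pass = 0; break;   /* CMP_NEVER */
   }
   return pass ? 1.0f : 0.0f;
}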
*/ static void -sp_get_samples_2d_common(struct tgsi_sampler *sampler, +sp_get_samples_2d_common(const struct tgsi_sampler *tgsi_sampler, const float s[QUAD_SIZE], const float t[QUAD_SIZE], const float p[QUAD_SIZE], + boolean computeLambda, float lodbias, float rgba[NUM_CHANNELS][QUAD_SIZE], const unsigned faces[4]) { - const uint compare_func = sampler->state->compare_func; + const struct sp_shader_sampler *samp = sp_shader_sampler(tgsi_sampler); + const struct softpipe_context *sp = samp->sp; + const uint unit = samp->unit; + const struct pipe_texture *texture = sp->texture[unit]; + const struct pipe_sampler_state *sampler = sp->sampler[unit]; + const uint compare_func = sampler->compare_func; unsigned level0, level1, j, imgFilter; int width, height; float levelBlend; - choose_mipmap_levels(sampler, s, t, p, lodbias, + choose_mipmap_levels(texture, sampler, s, t, p, computeLambda, lodbias, &level0, &level1, &levelBlend, &imgFilter); - assert(sampler->state->normalized_coords); + assert(sampler->normalized_coords); - width = sampler->texture->width[level0]; - height = sampler->texture->height[level0]; + width = texture->width[level0]; + height = texture->height[level0]; assert(width > 0); switch (imgFilter) { case PIPE_TEX_FILTER_NEAREST: - for (j = 0; j < QUAD_SIZE; j++) { - int x = nearest_texcoord(sampler->state->wrap_s, s[j], width); - int y = nearest_texcoord(sampler->state->wrap_t, t[j], height); - get_texel(sampler, faces[j], level0, x, y, 0, rgba, j); - if (sampler->state->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) { - shadow_compare(compare_func, rgba, p, j); - } - - if (level0 != level1) { - /* get texels from second mipmap level and blend */ - float rgba2[4][4]; - unsigned c; - x = x / 2; - y = y / 2; - get_texel(sampler, faces[j], level1, x, y, 0, rgba2, j); - if (sampler->state->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE){ - shadow_compare(compare_func, rgba2, p, j); + { + int x[4], y[4]; + nearest_texcoord_4(sampler->wrap_s, s, width, x); + nearest_texcoord_4(sampler->wrap_t, t, height, y); + + for (j = 0; j < QUAD_SIZE; j++) { + get_texel(tgsi_sampler, faces[j], level0, x[j], y[j], 0, rgba, j); + if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) { + shadow_compare(compare_func, rgba, p, j); } - for (c = 0; c < NUM_CHANNELS; c++) { - rgba[c][j] = LERP(levelBlend, rgba[c][j], rgba2[c][j]); + if (level0 != level1) { + /* get texels from second mipmap level and blend */ + float rgba2[4][4]; + unsigned c; + x[j] /= 2; + y[j] /= 2; + get_texel(tgsi_sampler, faces[j], level1, x[j], y[j], 0, + rgba2, j); + if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE){ + shadow_compare(compare_func, rgba2, p, j); + } + + for (c = 0; c < NUM_CHANNELS; c++) { + rgba[c][j] = lerp(levelBlend, rgba[c][j], rgba2[c][j]); + } } } } break; case PIPE_TEX_FILTER_LINEAR: case PIPE_TEX_FILTER_ANISO: - for (j = 0; j < QUAD_SIZE; j++) { - float tx[4][4], a, b; - int x0, y0, x1, y1, c; - linear_texcoord(sampler->state->wrap_s, s[j], width, &x0, &x1, &a); - linear_texcoord(sampler->state->wrap_t, t[j], height, &y0, &y1, &b); - get_texel(sampler, faces[j], level0, x0, y0, 0, tx, 0); - get_texel(sampler, faces[j], level0, x1, y0, 0, tx, 1); - get_texel(sampler, faces[j], level0, x0, y1, 0, tx, 2); - get_texel(sampler, faces[j], level0, x1, y1, 0, tx, 3); - if (sampler->state->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) { - shadow_compare(compare_func, tx, p, 0); - shadow_compare(compare_func, tx, p, 1); - shadow_compare(compare_func, tx, p, 2); - shadow_compare(compare_func, tx, p, 
3); - } - - for (c = 0; c < 4; c++) { - rgba[c][j] = lerp_2d(a, b, tx[c][0], tx[c][1], tx[c][2], tx[c][3]); - } - - if (level0 != level1) { - /* get texels from second mipmap level and blend */ - float rgba2[4][4]; - x0 = x0 / 2; - y0 = y0 / 2; - x1 = x1 / 2; - y1 = y1 / 2; - get_texel(sampler, faces[j], level1, x0, y0, 0, tx, 0); - get_texel(sampler, faces[j], level1, x1, y0, 0, tx, 1); - get_texel(sampler, faces[j], level1, x0, y1, 0, tx, 2); - get_texel(sampler, faces[j], level1, x1, y1, 0, tx, 3); - if (sampler->state->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE){ + { + int x0[4], y0[4], x1[4], y1[4]; + float xw[4], yw[4]; /* weights */ + + linear_texcoord_4(sampler->wrap_s, s, width, x0, x1, xw); + linear_texcoord_4(sampler->wrap_t, t, height, y0, y1, yw); + + for (j = 0; j < QUAD_SIZE; j++) { + float tx[4][4]; /* texels */ + int c; + get_texel(tgsi_sampler, faces[j], level0, x0[j], y0[j], 0, tx, 0); + get_texel(tgsi_sampler, faces[j], level0, x1[j], y0[j], 0, tx, 1); + get_texel(tgsi_sampler, faces[j], level0, x0[j], y1[j], 0, tx, 2); + get_texel(tgsi_sampler, faces[j], level0, x1[j], y1[j], 0, tx, 3); + if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) { shadow_compare(compare_func, tx, p, 0); shadow_compare(compare_func, tx, p, 1); shadow_compare(compare_func, tx, p, 2); shadow_compare(compare_func, tx, p, 3); } + /* interpolate R, G, B, A */ for (c = 0; c < 4; c++) { - rgba2[c][j] = lerp_2d(a, b, - tx[c][0], tx[c][1], tx[c][2], tx[c][3]); + rgba[c][j] = lerp_2d(xw[j], yw[j], + tx[c][0], tx[c][1], + tx[c][2], tx[c][3]); } - for (c = 0; c < NUM_CHANNELS; c++) { - rgba[c][j] = LERP(levelBlend, rgba[c][j], rgba2[c][j]); + if (level0 != level1) { + /* get texels from second mipmap level and blend */ + float rgba2[4][4]; + x0[j] /= 2; + y0[j] /= 2; + x1[j] /= 2; + y1[j] /= 2; + get_texel(tgsi_sampler, faces[j], level1, x0[j], y0[j], 0, tx, 0); + get_texel(tgsi_sampler, faces[j], level1, x1[j], y0[j], 0, tx, 1); + get_texel(tgsi_sampler, faces[j], level1, x0[j], y1[j], 0, tx, 2); + get_texel(tgsi_sampler, faces[j], level1, x1[j], y1[j], 0, tx, 3); + if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE){ + shadow_compare(compare_func, tx, p, 0); + shadow_compare(compare_func, tx, p, 1); + shadow_compare(compare_func, tx, p, 2); + shadow_compare(compare_func, tx, p, 3); + } + + /* interpolate R, G, B, A */ + for (c = 0; c < 4; c++) { + rgba2[c][j] = lerp_2d(xw[j], yw[j], + tx[c][0], tx[c][1], tx[c][2], tx[c][3]); + } + + for (c = 0; c < NUM_CHANNELS; c++) { + rgba[c][j] = lerp(levelBlend, rgba[c][j], rgba2[c][j]); + } } } } @@ -789,55 +881,65 @@ sp_get_samples_2d_common(struct tgsi_sampler *sampler, } -static void -sp_get_samples_1d(struct tgsi_sampler *sampler, +static INLINE void +sp_get_samples_1d(const struct tgsi_sampler *sampler, const float s[QUAD_SIZE], const float t[QUAD_SIZE], const float p[QUAD_SIZE], + boolean computeLambda, float lodbias, float rgba[NUM_CHANNELS][QUAD_SIZE]) { static const unsigned faces[4] = {0, 0, 0, 0}; static const float tzero[4] = {0, 0, 0, 0}; - sp_get_samples_2d_common(sampler, s, tzero, NULL, lodbias, rgba, faces); + sp_get_samples_2d_common(sampler, s, tzero, NULL, + computeLambda, lodbias, rgba, faces); } -static void -sp_get_samples_2d(struct tgsi_sampler *sampler, +static INLINE void +sp_get_samples_2d(const struct tgsi_sampler *sampler, const float s[QUAD_SIZE], const float t[QUAD_SIZE], const float p[QUAD_SIZE], + boolean computeLambda, float lodbias, float rgba[NUM_CHANNELS][QUAD_SIZE]) { static const unsigned faces[4] = {0, 0, 
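
The linear path gathers four texels per fragment, blends them with lerp_2d, and, when two mipmap levels are active, blends the per-level results with levelBlend. lerp_2d and lerp themselves are not shown in this hunk; assuming ordinary bilinear interpolation, the arithmetic is:

/* 1D and 2D linear interpolation, as used conceptually above. */
static float lerp1(float a, float v0, float v1)
{
   return v0 + a * (v1 - v0);
}

static float lerp2(float a, float b,
                   float v00, float v10, float v01, float v11)
{
   return lerp1(b, lerp1(a, v00, v10), lerp1(a, v01, v11));
}

/* Bilinear sample from each level (texels ordered (x0,y0), (x1,y0),
 * (x0,y1), (x1,y1) as in the code above), then blend the two levels. */
static float sample_bilinear_mip(float xw, float yw, float level_blend,
                                 const float t0[4], const float t1[4])
{
   float c0 = lerp2(xw, yw, t0[0], t0[1], t0[2], t0[3]);
   float c1 = lerp2(xw, yw, t1[0], t1[1], t1[2], t1[3]);
   return lerp1(level_blend, c0, c1);
}

The 3D path below does the same thing with lerp_3d, which adds one more interpolation along z between the two slices of eight texels.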
0, 0}; - sp_get_samples_2d_common(sampler, s, t, p, lodbias, rgba, faces); + sp_get_samples_2d_common(sampler, s, t, p, + computeLambda, lodbias, rgba, faces); } -static void -sp_get_samples_3d(struct tgsi_sampler *sampler, +static INLINE void +sp_get_samples_3d(const struct tgsi_sampler *tgsi_sampler, const float s[QUAD_SIZE], const float t[QUAD_SIZE], const float p[QUAD_SIZE], + boolean computeLambda, float lodbias, float rgba[NUM_CHANNELS][QUAD_SIZE]) { + const struct sp_shader_sampler *samp = sp_shader_sampler(tgsi_sampler); + const struct softpipe_context *sp = samp->sp; + const uint unit = samp->unit; + const struct pipe_texture *texture = sp->texture[unit]; + const struct pipe_sampler_state *sampler = sp->sampler[unit]; /* get/map pipe_surfaces corresponding to 3D tex slices */ unsigned level0, level1, j, imgFilter; int width, height, depth; float levelBlend; const uint face = 0; - choose_mipmap_levels(sampler, s, t, p, lodbias, + choose_mipmap_levels(texture, sampler, s, t, p, computeLambda, lodbias, &level0, &level1, &levelBlend, &imgFilter); - assert(sampler->state->normalized_coords); + assert(sampler->normalized_coords); - width = sampler->texture->width[level0]; - height = sampler->texture->height[level0]; - depth = sampler->texture->depth[level0]; + width = texture->width[level0]; + height = texture->height[level0]; + depth = texture->depth[level0]; assert(width > 0); assert(height > 0); @@ -845,89 +947,89 @@ sp_get_samples_3d(struct tgsi_sampler *sampler, switch (imgFilter) { case PIPE_TEX_FILTER_NEAREST: - for (j = 0; j < QUAD_SIZE; j++) { - int x = nearest_texcoord(sampler->state->wrap_s, s[j], width); - int y = nearest_texcoord(sampler->state->wrap_t, t[j], height); - int z = nearest_texcoord(sampler->state->wrap_r, p[j], depth); - get_texel(sampler, face, level0, x, y, z, rgba, j); - - if (level0 != level1) { - /* get texels from second mipmap level and blend */ - float rgba2[4][4]; - unsigned c; - x /= 2; - y /= 2; - z /= 2; - get_texel(sampler, face, level1, x, y, z, rgba2, j); - for (c = 0; c < NUM_CHANNELS; c++) { - rgba[c][j] = LERP(levelBlend, rgba2[c][j], rgba[c][j]); + { + int x[4], y[4], z[4]; + nearest_texcoord_4(sampler->wrap_s, s, width, x); + nearest_texcoord_4(sampler->wrap_t, t, height, y); + nearest_texcoord_4(sampler->wrap_r, p, depth, z); + for (j = 0; j < QUAD_SIZE; j++) { + get_texel(tgsi_sampler, face, level0, x[j], y[j], z[j], rgba, j); + if (level0 != level1) { + /* get texels from second mipmap level and blend */ + float rgba2[4][4]; + unsigned c; + x[j] /= 2; + y[j] /= 2; + z[j] /= 2; + get_texel(tgsi_sampler, face, level1, x[j], y[j], z[j], rgba2, j); + for (c = 0; c < NUM_CHANNELS; c++) { + rgba[c][j] = lerp(levelBlend, rgba2[c][j], rgba[c][j]); + } } } } break; case PIPE_TEX_FILTER_LINEAR: case PIPE_TEX_FILTER_ANISO: - for (j = 0; j < QUAD_SIZE; j++) { - float texel0[4][4], texel1[4][4]; - float xw, yw, zw; /* interpolation weights */ - int x0, x1, y0, y1, z0, z1, c; - linear_texcoord(sampler->state->wrap_s, s[j], width, &x0, &x1, &xw); - linear_texcoord(sampler->state->wrap_t, t[j], height, &y0, &y1, &yw); - linear_texcoord(sampler->state->wrap_r, p[j], depth, &z0, &z1, &zw); - get_texel(sampler, face, level0, x0, y0, z0, texel0, 0); - get_texel(sampler, face, level0, x1, y0, z0, texel0, 1); - get_texel(sampler, face, level0, x0, y1, z0, texel0, 2); - get_texel(sampler, face, level0, x1, y1, z0, texel0, 3); - get_texel(sampler, face, level0, x0, y0, z1, texel1, 0); - get_texel(sampler, face, level0, x1, y0, z1, texel1, 1); - 
get_texel(sampler, face, level0, x0, y1, z1, texel1, 2); - get_texel(sampler, face, level0, x1, y1, z1, texel1, 3); - - /* 3D lerp */ - for (c = 0; c < 4; c++) { - float ctemp0[4][4], ctemp1[4][4]; - ctemp0[c][j] = lerp_2d(xw, yw, - texel0[c][0], texel0[c][1], - texel0[c][2], texel0[c][3]); - ctemp1[c][j] = lerp_2d(xw, yw, - texel1[c][0], texel1[c][1], - texel1[c][2], texel1[c][3]); - rgba[c][j] = LERP(zw, ctemp0[c][j], ctemp1[c][j]); - } - - if (level0 != level1) { - /* get texels from second mipmap level and blend */ - float rgba2[4][4]; - x0 /= 2; - y0 /= 2; - z0 /= 2; - x1 /= 2; - y1 /= 2; - z1 /= 2; - get_texel(sampler, face, level1, x0, y0, z0, texel0, 0); - get_texel(sampler, face, level1, x1, y0, z0, texel0, 1); - get_texel(sampler, face, level1, x0, y1, z0, texel0, 2); - get_texel(sampler, face, level1, x1, y1, z0, texel0, 3); - get_texel(sampler, face, level1, x0, y0, z1, texel1, 0); - get_texel(sampler, face, level1, x1, y0, z1, texel1, 1); - get_texel(sampler, face, level1, x0, y1, z1, texel1, 2); - get_texel(sampler, face, level1, x1, y1, z1, texel1, 3); - - /* 3D lerp */ + { + int x0[4], x1[4], y0[4], y1[4], z0[4], z1[4]; + float xw[4], yw[4], zw[4]; /* interpolation weights */ + linear_texcoord_4(sampler->wrap_s, s, width, x0, x1, xw); + linear_texcoord_4(sampler->wrap_t, t, height, y0, y1, yw); + linear_texcoord_4(sampler->wrap_r, p, depth, z0, z1, zw); + + for (j = 0; j < QUAD_SIZE; j++) { + int c; + float tx0[4][4], tx1[4][4]; + get_texel(tgsi_sampler, face, level0, x0[j], y0[j], z0[j], tx0, 0); + get_texel(tgsi_sampler, face, level0, x1[j], y0[j], z0[j], tx0, 1); + get_texel(tgsi_sampler, face, level0, x0[j], y1[j], z0[j], tx0, 2); + get_texel(tgsi_sampler, face, level0, x1[j], y1[j], z0[j], tx0, 3); + get_texel(tgsi_sampler, face, level0, x0[j], y0[j], z1[j], tx1, 0); + get_texel(tgsi_sampler, face, level0, x1[j], y0[j], z1[j], tx1, 1); + get_texel(tgsi_sampler, face, level0, x0[j], y1[j], z1[j], tx1, 2); + get_texel(tgsi_sampler, face, level0, x1[j], y1[j], z1[j], tx1, 3); + + /* interpolate R, G, B, A */ for (c = 0; c < 4; c++) { - float ctemp0[4][4], ctemp1[4][4]; - ctemp0[c][j] = lerp_2d(xw, yw, - texel0[c][0], texel0[c][1], - texel0[c][2], texel0[c][3]); - ctemp1[c][j] = lerp_2d(xw, yw, - texel1[c][0], texel1[c][1], - texel1[c][2], texel1[c][3]); - rgba2[c][j] = LERP(zw, ctemp0[c][j], ctemp1[c][j]); + rgba[c][j] = lerp_3d(xw[j], yw[j], zw[j], + tx0[c][0], tx0[c][1], + tx0[c][2], tx0[c][3], + tx1[c][0], tx1[c][1], + tx1[c][2], tx1[c][3]); } - /* blend mipmap levels */ - for (c = 0; c < NUM_CHANNELS; c++) { - rgba[c][j] = LERP(levelBlend, rgba[c][j], rgba2[c][j]); + if (level0 != level1) { + /* get texels from second mipmap level and blend */ + float rgba2[4][4]; + x0[j] /= 2; + y0[j] /= 2; + z0[j] /= 2; + x1[j] /= 2; + y1[j] /= 2; + z1[j] /= 2; + get_texel(tgsi_sampler, face, level1, x0[j], y0[j], z0[j], tx0, 0); + get_texel(tgsi_sampler, face, level1, x1[j], y0[j], z0[j], tx0, 1); + get_texel(tgsi_sampler, face, level1, x0[j], y1[j], z0[j], tx0, 2); + get_texel(tgsi_sampler, face, level1, x1[j], y1[j], z0[j], tx0, 3); + get_texel(tgsi_sampler, face, level1, x0[j], y0[j], z1[j], tx1, 0); + get_texel(tgsi_sampler, face, level1, x1[j], y0[j], z1[j], tx1, 1); + get_texel(tgsi_sampler, face, level1, x0[j], y1[j], z1[j], tx1, 2); + get_texel(tgsi_sampler, face, level1, x1[j], y1[j], z1[j], tx1, 3); + + /* interpolate R, G, B, A */ + for (c = 0; c < 4; c++) { + rgba2[c][j] = lerp_3d(xw[j], yw[j], zw[j], + tx0[c][0], tx0[c][1], + tx0[c][2], tx0[c][3], + tx1[c][0], 
tx1[c][1], + tx1[c][2], tx1[c][3]); + } + + /* blend mipmap levels */ + for (c = 0; c < NUM_CHANNELS; c++) { + rgba[c][j] = lerp(levelBlend, rgba[c][j], rgba2[c][j]); + } } } } @@ -939,10 +1041,11 @@ sp_get_samples_3d(struct tgsi_sampler *sampler, static void -sp_get_samples_cube(struct tgsi_sampler *sampler, +sp_get_samples_cube(const struct tgsi_sampler *sampler, const float s[QUAD_SIZE], const float t[QUAD_SIZE], const float p[QUAD_SIZE], + boolean computeLambda, float lodbias, float rgba[NUM_CHANNELS][QUAD_SIZE]) { @@ -951,67 +1054,80 @@ sp_get_samples_cube(struct tgsi_sampler *sampler, for (j = 0; j < QUAD_SIZE; j++) { faces[j] = choose_cube_face(s[j], t[j], p[j], ssss + j, tttt + j); } - sp_get_samples_2d_common(sampler, ssss, tttt, NULL, lodbias, rgba, faces); + sp_get_samples_2d_common(sampler, ssss, tttt, NULL, + computeLambda, lodbias, rgba, faces); } static void -sp_get_samples_rect(struct tgsi_sampler *sampler, +sp_get_samples_rect(const struct tgsi_sampler *tgsi_sampler, const float s[QUAD_SIZE], const float t[QUAD_SIZE], const float p[QUAD_SIZE], + boolean computeLambda, float lodbias, float rgba[NUM_CHANNELS][QUAD_SIZE]) { - //sp_get_samples_2d_common(sampler, s, t, p, lodbias, rgba, faces); - static const uint face = 0; - const uint compare_func = sampler->state->compare_func; + const struct sp_shader_sampler *samp = sp_shader_sampler(tgsi_sampler); + const struct softpipe_context *sp = samp->sp; + const uint unit = samp->unit; + const struct pipe_texture *texture = sp->texture[unit]; + const struct pipe_sampler_state *sampler = sp->sampler[unit]; + const uint face = 0; + const uint compare_func = sampler->compare_func; unsigned level0, level1, j, imgFilter; int width, height; float levelBlend; - choose_mipmap_levels(sampler, s, t, p, lodbias, + choose_mipmap_levels(texture, sampler, s, t, p, computeLambda, lodbias, &level0, &level1, &levelBlend, &imgFilter); /* texture RECTS cannot be mipmapped */ assert(level0 == level1); - width = sampler->texture->width[level0]; - height = sampler->texture->height[level0]; + width = texture->width[level0]; + height = texture->height[level0]; assert(width > 0); switch (imgFilter) { case PIPE_TEX_FILTER_NEAREST: - for (j = 0; j < QUAD_SIZE; j++) { - int x = nearest_texcoord_unnorm(sampler->state->wrap_s, s[j], width); - int y = nearest_texcoord_unnorm(sampler->state->wrap_t, t[j], height); - get_texel(sampler, face, level0, x, y, 0, rgba, j); - if (sampler->state->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) { - shadow_compare(compare_func, rgba, p, j); + { + int x[4], y[4]; + nearest_texcoord_unnorm_4(sampler->wrap_s, s, width, x); + nearest_texcoord_unnorm_4(sampler->wrap_t, t, height, y); + for (j = 0; j < QUAD_SIZE; j++) { + get_texel(tgsi_sampler, face, level0, x[j], y[j], 0, rgba, j); + if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) { + shadow_compare(compare_func, rgba, p, j); + } } } break; case PIPE_TEX_FILTER_LINEAR: case PIPE_TEX_FILTER_ANISO: - for (j = 0; j < QUAD_SIZE; j++) { - float tx[4][4], a, b; - int x0, y0, x1, y1, c; - linear_texcoord_unnorm(sampler->state->wrap_s, s[j], width, &x0, &x1, &a); - linear_texcoord_unnorm(sampler->state->wrap_t, t[j], height, &y0, &y1, &b); - get_texel(sampler, face, level0, x0, y0, 0, tx, 0); - get_texel(sampler, face, level0, x1, y0, 0, tx, 1); - get_texel(sampler, face, level0, x0, y1, 0, tx, 2); - get_texel(sampler, face, level0, x1, y1, 0, tx, 3); - if (sampler->state->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) { - shadow_compare(compare_func, tx, p, 0); - 
shadow_compare(compare_func, tx, p, 1); - shadow_compare(compare_func, tx, p, 2); - shadow_compare(compare_func, tx, p, 3); - } - - for (c = 0; c < 4; c++) { - rgba[c][j] = lerp_2d(a, b, tx[c][0], tx[c][1], tx[c][2], tx[c][3]); + { + int x0[4], y0[4], x1[4], y1[4]; + float xw[4], yw[4]; /* weights */ + linear_texcoord_unnorm_4(sampler->wrap_s, s, width, x0, x1, xw); + linear_texcoord_unnorm_4(sampler->wrap_t, t, height, y0, y1, yw); + for (j = 0; j < QUAD_SIZE; j++) { + float tx[4][4]; /* texels */ + int c; + get_texel(tgsi_sampler, face, level0, x0[j], y0[j], 0, tx, 0); + get_texel(tgsi_sampler, face, level0, x1[j], y0[j], 0, tx, 1); + get_texel(tgsi_sampler, face, level0, x0[j], y1[j], 0, tx, 2); + get_texel(tgsi_sampler, face, level0, x1[j], y1[j], 0, tx, 3); + if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) { + shadow_compare(compare_func, tx, p, 0); + shadow_compare(compare_func, tx, p, 1); + shadow_compare(compare_func, tx, p, 2); + shadow_compare(compare_func, tx, p, 3); + } + for (c = 0; c < 4; c++) { + rgba[c][j] = lerp_2d(xw[j], yw[j], + tx[c][0], tx[c][1], tx[c][2], tx[c][3]); + } } } break; @@ -1021,49 +1137,45 @@ sp_get_samples_rect(struct tgsi_sampler *sampler, } - - /** - * Called via tgsi_sampler::get_samples() - * Use the sampler's state setting to get a filtered RGBA value - * from the sampler's texture. - * - * XXX we can implement many versions of this function, each - * tightly coded for a specific combination of sampler state - * (nearest + repeat), (bilinear mipmap + clamp), etc. - * - * The update_samplers() function in st_atom_sampler.c could create - * a new tgsi_sampler object for each state combo it finds.... + * Common code for vertex/fragment program texture sampling. */ -void -sp_get_samples(struct tgsi_sampler *sampler, +static INLINE void +sp_get_samples(struct tgsi_sampler *tgsi_sampler, const float s[QUAD_SIZE], const float t[QUAD_SIZE], const float p[QUAD_SIZE], + boolean computeLambda, float lodbias, float rgba[NUM_CHANNELS][QUAD_SIZE]) { - if (!sampler->texture) + const struct sp_shader_sampler *samp = sp_shader_sampler(tgsi_sampler); + const struct softpipe_context *sp = samp->sp; + const uint unit = samp->unit; + const struct pipe_texture *texture = sp->texture[unit]; + const struct pipe_sampler_state *sampler = sp->sampler[unit]; + + if (!texture) return; - switch (sampler->texture->target) { + switch (texture->target) { case PIPE_TEXTURE_1D: - assert(sampler->state->normalized_coords); - sp_get_samples_1d(sampler, s, t, p, lodbias, rgba); + assert(sampler->normalized_coords); + sp_get_samples_1d(tgsi_sampler, s, t, p, computeLambda, lodbias, rgba); break; case PIPE_TEXTURE_2D: - if (sampler->state->normalized_coords) - sp_get_samples_2d(sampler, s, t, p, lodbias, rgba); + if (sampler->normalized_coords) + sp_get_samples_2d(tgsi_sampler, s, t, p, computeLambda, lodbias, rgba); else - sp_get_samples_rect(sampler, s, t, p, lodbias, rgba); + sp_get_samples_rect(tgsi_sampler, s, t, p, computeLambda, lodbias, rgba); break; case PIPE_TEXTURE_3D: - assert(sampler->state->normalized_coords); - sp_get_samples_3d(sampler, s, t, p, lodbias, rgba); + assert(sampler->normalized_coords); + sp_get_samples_3d(tgsi_sampler, s, t, p, computeLambda, lodbias, rgba); break; case PIPE_TEXTURE_CUBE: - assert(sampler->state->normalized_coords); - sp_get_samples_cube(sampler, s, t, p, lodbias, rgba); + assert(sampler->normalized_coords); + sp_get_samples_cube(tgsi_sampler, s, t, p, computeLambda, lodbias, rgba); break; default: assert(0); @@ -1084,3 +1196,34 @@ 
sp_get_samples(struct tgsi_sampler *sampler, #endif } + +/** + * Called via tgsi_sampler::get_samples() when running a fragment shader. + * Get four filtered RGBA values from the sampler's texture. + */ +void +sp_get_samples_fragment(struct tgsi_sampler *tgsi_sampler, + const float s[QUAD_SIZE], + const float t[QUAD_SIZE], + const float p[QUAD_SIZE], + float lodbias, + float rgba[NUM_CHANNELS][QUAD_SIZE]) +{ + sp_get_samples(tgsi_sampler, s, t, p, TRUE, lodbias, rgba); +} + + +/** + * Called via tgsi_sampler::get_samples() when running a vertex shader. + * Get four filtered RGBA values from the sampler's texture. + */ +void +sp_get_samples_vertex(struct tgsi_sampler *tgsi_sampler, + const float s[QUAD_SIZE], + const float t[QUAD_SIZE], + const float p[QUAD_SIZE], + float lodbias, + float rgba[NUM_CHANNELS][QUAD_SIZE]) +{ + sp_get_samples(tgsi_sampler, s, t, p, FALSE, lodbias, rgba); +} diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.h b/src/gallium/drivers/softpipe/sp_tex_sample.h index 404bfd0c36..40d8eb2c2a 100644 --- a/src/gallium/drivers/softpipe/sp_tex_sample.h +++ b/src/gallium/drivers/softpipe/sp_tex_sample.h @@ -1,17 +1,73 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + #ifndef SP_TEX_SAMPLE_H #define SP_TEX_SAMPLE_H -struct tgsi_sampler; +#include "tgsi/tgsi_exec.h" + + +/** + * Subclass of tgsi_sampler + */ +struct sp_shader_sampler +{ + struct tgsi_sampler base; /**< base class */ + + uint unit; + struct softpipe_context *sp; + struct softpipe_tile_cache *cache; +}; + +static INLINE const struct sp_shader_sampler * +sp_shader_sampler(const struct tgsi_sampler *sampler) +{ + return (const struct sp_shader_sampler *) sampler; +} + + +extern void +sp_get_samples_fragment(struct tgsi_sampler *tgsi_sampler, + const float s[QUAD_SIZE], + const float t[QUAD_SIZE], + const float p[QUAD_SIZE], + float lodbias, + float rgba[NUM_CHANNELS][QUAD_SIZE]); + extern void -sp_get_samples(struct tgsi_sampler *sampler, - const float s[QUAD_SIZE], - const float t[QUAD_SIZE], - const float p[QUAD_SIZE], - float lodbias, - float rgba[NUM_CHANNELS][QUAD_SIZE]); +sp_get_samples_vertex(struct tgsi_sampler *tgsi_sampler, + const float s[QUAD_SIZE], + const float t[QUAD_SIZE], + const float p[QUAD_SIZE], + float lodbias, + float rgba[NUM_CHANNELS][QUAD_SIZE]); #endif /* SP_TEX_SAMPLE_H */ diff --git a/src/gallium/drivers/softpipe/sp_texture.c b/src/gallium/drivers/softpipe/sp_texture.c index a64dc89f43..28a9784b16 100644 --- a/src/gallium/drivers/softpipe/sp_texture.c +++ b/src/gallium/drivers/softpipe/sp_texture.c @@ -33,7 +33,7 @@ #include "pipe/p_context.h" #include "pipe/p_defines.h" #include "pipe/p_inlines.h" -#include "pipe/p_winsys.h" +#include "pipe/internal/p_winsys_screen.h" #include "util/u_math.h" #include "util/u_memory.h" @@ -94,49 +94,23 @@ softpipe_texture_layout(struct pipe_screen *screen, return spt->buffer != NULL; } -/* Hack it up to use the old winsys->surface_alloc_storage() - * method for now: - */ static boolean softpipe_displaytarget_layout(struct pipe_screen *screen, struct softpipe_texture * spt) { struct pipe_winsys *ws = screen->winsys; - struct pipe_surface surf; - unsigned flags = (PIPE_BUFFER_USAGE_CPU_READ | - PIPE_BUFFER_USAGE_CPU_WRITE | - PIPE_BUFFER_USAGE_GPU_READ | - PIPE_BUFFER_USAGE_GPU_WRITE); - int ret; - - - memset(&surf, 0, sizeof(surf)); - - ret =ws->surface_alloc_storage( ws, - &surf, - spt->base.width[0], - spt->base.height[0], - spt->base.format, - flags, - spt->base.tex_usage); - if(ret != 0) - return FALSE; - - if (!surf.buffer) { - /* allocation failed */ - return FALSE; - } + unsigned usage = (PIPE_BUFFER_USAGE_CPU_READ_WRITE | + PIPE_BUFFER_USAGE_GPU_READ_WRITE); - /* Now extract the goodies: - */ spt->base.nblocksx[0] = pf_get_nblocksx(&spt->base.block, spt->base.width[0]); spt->base.nblocksy[0] = pf_get_nblocksy(&spt->base.block, spt->base.height[0]); - spt->stride[0] = surf.stride; - /* Transfer the reference: - */ - spt->buffer = surf.buffer; - surf.buffer = NULL; + spt->buffer = ws->surface_buffer_create( ws, + spt->base.width[0], + spt->base.height[0], + spt->base.format, + usage, + &spt->stride[0]); return spt->buffer != NULL; } @@ -231,28 +205,21 @@ softpipe_get_tex_surface(struct pipe_screen *screen, unsigned face, unsigned level, unsigned zslice, unsigned usage) { - struct pipe_winsys *ws = screen->winsys; struct softpipe_texture *spt = softpipe_texture(pt); struct pipe_surface *ps; assert(level <= pt->last_level); ps = CALLOC_STRUCT(pipe_surface); - ps->refcount = 1; if (ps) { - assert(ps->refcount); + ps->refcount = 1; pipe_texture_reference(&ps->texture, pt); - pipe_buffer_reference(screen, &ps->buffer, spt->buffer); ps->format = 
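
sp_shader_sampler above uses the usual Gallium subclass-by-embedding idiom: the base struct is the first member, so a pointer to the base can be cast back to the derived type. A generic, self-contained illustration with made-up names:

#include <stdio.h>

/* "Base class": a table of callbacks the core code invokes. */
struct base_sampler {
   void (*get_samples)(struct base_sampler *samp, float s, float *out);
};

/* "Derived class": base must be the first member so a base_sampler*
 * and a my_sampler* share the same address. */
struct my_sampler {
   struct base_sampler base;
   int unit;                  /* driver-private state */
};

static struct my_sampler *my_sampler(struct base_sampler *samp)
{
   return (struct my_sampler *) samp;   /* safe: base is first member */
}

static void my_get_samples(struct base_sampler *samp, float s, float *out)
{
   struct my_sampler *ms = my_sampler(samp);
   *out = s + (float) ms->unit;         /* placeholder behaviour */
}

int main(void)
{
   struct my_sampler ms = { { my_get_samples }, 3 };
   float r;
   ms.base.get_samples(&ms.base, 0.5f, &r);
   printf("%f\n", r);
   return 0;
}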
pt->format; - ps->block = pt->block; ps->width = pt->width[level]; ps->height = pt->height[level]; - ps->nblocksx = pt->nblocksx[level]; - ps->nblocksy = pt->nblocksy[level]; - ps->stride = spt->stride[level]; ps->offset = spt->level_offset[level]; ps->usage = usage; - + /* Because we are softpipe, anything that the state tracker * thought was going to be done with the GPU will actually get * done with the CPU. Let's adjust the flags to take that into @@ -278,8 +245,7 @@ softpipe_get_tex_surface(struct pipe_screen *screen, if (pt->target == PIPE_TEXTURE_CUBE || pt->target == PIPE_TEXTURE_3D) { ps->offset += ((pt->target == PIPE_TEXTURE_CUBE) ? face : zslice) * - ps->nblocksy * - ps->stride; + pt->nblocksy[level] * spt->stride[level]; } else { assert(face == 0); @@ -299,37 +265,108 @@ softpipe_tex_surface_release(struct pipe_screen *screen, * needed post-processing to put them into hardware layout, this is * where it would happen. For softpipe, nothing to do. */ - assert ((*s)->texture); + assert(surf->texture); if (--surf->refcount == 0) { - pipe_texture_reference(&surf->texture, NULL); - pipe_buffer_reference(screen, &surf->buffer, NULL); + pipe_texture_reference(&surf->texture, NULL); FREE(surf); } *s = NULL; } +static struct pipe_transfer * +softpipe_get_tex_transfer(struct pipe_screen *screen, + struct pipe_texture *texture, + unsigned face, unsigned level, unsigned zslice, + enum pipe_transfer_usage usage, + unsigned x, unsigned y, unsigned w, unsigned h) +{ + struct softpipe_texture *sptex = softpipe_texture(texture); + struct softpipe_transfer *spt; + struct pipe_transfer *pt; + + assert(texture); + assert(level <= texture->last_level); + + spt = CALLOC_STRUCT(softpipe_transfer); + pt = &spt->base; + if (spt) { + pt->refcount = 1; + pipe_texture_reference(&pt->texture, texture); + pt->format = texture->format; + pt->block = texture->block; + pt->x = x; + pt->y = y; + pt->width = w; + pt->height = h; + pt->nblocksx = texture->nblocksx[level]; + pt->nblocksy = texture->nblocksy[level]; + pt->stride = sptex->stride[level]; + spt->offset = sptex->level_offset[level]; + pt->usage = usage; + pt->face = face; + pt->level = level; + pt->zslice = zslice; + + if (texture->target == PIPE_TEXTURE_CUBE || + texture->target == PIPE_TEXTURE_3D) { + spt->offset += ((texture->target == PIPE_TEXTURE_CUBE) ? face : + zslice) * pt->nblocksy * pt->stride; + } + else { + assert(face == 0); + assert(zslice == 0); + } + } + return pt; +} + + +static void +softpipe_tex_transfer_release(struct pipe_screen *screen, + struct pipe_transfer **t) +{ + struct softpipe_transfer *transfer = softpipe_transfer(*t); + /* Effectively do the texture_update work here - if texture images + * needed post-processing to put them into hardware layout, this is + * where it would happen. For softpipe, nothing to do. 
+ */ + assert (transfer->base.texture); + if (--transfer->base.refcount == 0) { + pipe_texture_reference(&transfer->base.texture, NULL); + FREE(transfer); + } + *t = NULL; +} + + static void * -softpipe_surface_map( struct pipe_screen *screen, - struct pipe_surface *surface, - unsigned flags ) +softpipe_transfer_map( struct pipe_screen *screen, + struct pipe_transfer *transfer ) { ubyte *map; + struct softpipe_texture *spt; + unsigned flags = 0; - if (flags & ~surface->usage) { - assert(0); - return NULL; + assert(transfer->texture); + spt = softpipe_texture(transfer->texture); + + if (transfer->usage != PIPE_TRANSFER_READ) { + flags |= PIPE_BUFFER_USAGE_CPU_WRITE; + } + + if (transfer->usage != PIPE_TRANSFER_WRITE) { + flags |= PIPE_BUFFER_USAGE_CPU_READ; } - map = pipe_buffer_map( screen, surface->buffer, flags ); + map = pipe_buffer_map(screen, spt->buffer, flags); if (map == NULL) return NULL; /* May want to different things here depending on read/write nature * of the map: */ - if (surface->texture && - (flags & PIPE_BUFFER_USAGE_CPU_WRITE)) + if (transfer->texture && transfer->usage != PIPE_TRANSFER_READ) { /* Do something to notify sharing contexts of a texture change. * In softpipe, that would mean flushing the texture cache. @@ -337,15 +374,22 @@ softpipe_surface_map( struct pipe_screen *screen, softpipe_screen(screen)->timestamp++; } - return map + surface->offset; + return map + softpipe_transfer(transfer)->offset + + transfer->y / transfer->block.height * transfer->stride + + transfer->x / transfer->block.width * transfer->block.size; } static void -softpipe_surface_unmap(struct pipe_screen *screen, - struct pipe_surface *surface) +softpipe_transfer_unmap(struct pipe_screen *screen, + struct pipe_transfer *transfer) { - pipe_buffer_unmap( screen, surface->buffer ); + struct softpipe_texture *spt; + + assert(transfer->texture); + spt = softpipe_texture(transfer->texture); + + pipe_buffer_unmap( screen, spt->buffer ); } @@ -365,6 +409,8 @@ softpipe_init_screen_texture_funcs(struct pipe_screen *screen) screen->get_tex_surface = softpipe_get_tex_surface; screen->tex_surface_release = softpipe_tex_surface_release; - screen->surface_map = softpipe_surface_map; - screen->surface_unmap = softpipe_surface_unmap; + screen->get_tex_transfer = softpipe_get_tex_transfer; + screen->tex_transfer_release = softpipe_tex_transfer_release; + screen->transfer_map = softpipe_transfer_map; + screen->transfer_unmap = softpipe_transfer_unmap; } diff --git a/src/gallium/drivers/softpipe/sp_texture.h b/src/gallium/drivers/softpipe/sp_texture.h index bf437a7c61..893aa7d11d 100644 --- a/src/gallium/drivers/softpipe/sp_texture.h +++ b/src/gallium/drivers/softpipe/sp_texture.h @@ -42,7 +42,7 @@ struct softpipe_texture struct pipe_texture base; unsigned long level_offset[PIPE_MAX_TEXTURE_LEVELS]; - unsigned long stride[PIPE_MAX_TEXTURE_LEVELS]; + unsigned stride[PIPE_MAX_TEXTURE_LEVELS]; /* The data is held here: */ @@ -51,14 +51,27 @@ struct softpipe_texture boolean modified; }; +struct softpipe_transfer +{ + struct pipe_transfer base; + + unsigned long offset; +}; + -/** cast wrapper */ +/** cast wrappers */ static INLINE struct softpipe_texture * softpipe_texture(struct pipe_texture *pt) { return (struct softpipe_texture *) pt; } +static INLINE struct softpipe_transfer * +softpipe_transfer(struct pipe_transfer *pt) +{ + return (struct softpipe_transfer *) pt; +} + extern void softpipe_init_texture_funcs( struct softpipe_context *softpipe ); diff --git a/src/gallium/drivers/softpipe/sp_tile_cache.c 
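
softpipe_transfer_map above converts the transfer's (x, y) origin into a byte offset using the format's block layout; in isolation the arithmetic looks like this (names are illustrative):

#include <stddef.h>

/* Byte offset of the transfer's (x, y) origin within the mapped buffer:
 * the level offset, plus whole block rows, plus whole blocks within the
 * row.  block_w/block_h are the pixel dimensions of a format block
 * (1x1 for plain formats, 4x4 for DXT-style compression), block_size is
 * the block's size in bytes and stride the size of one block row. */
static size_t transfer_offset(size_t level_offset,
                              unsigned x, unsigned y,
                              unsigned block_w, unsigned block_h,
                              unsigned block_size, unsigned stride)
{
   return level_offset
        + (size_t) (y / block_h) * stride
        + (size_t) (x / block_w) * block_size;
}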
b/src/gallium/drivers/softpipe/sp_tile_cache.c index b50c984513..593360aab0 100644 --- a/src/gallium/drivers/softpipe/sp_tile_cache.c +++ b/src/gallium/drivers/softpipe/sp_tile_cache.c @@ -26,7 +26,7 @@ **************************************************************************/ /** - * Framebuffer/surface tile caching. + * Texture tile caching. * * Author: * Brian Paul @@ -40,7 +40,7 @@ #include "sp_texture.h" #include "sp_tile_cache.h" -#define NUM_ENTRIES 32 +#define NUM_ENTRIES 50 /** XXX move these */ @@ -52,7 +52,8 @@ struct softpipe_tile_cache { struct pipe_screen *screen; struct pipe_surface *surface; /**< the surface we're caching */ - void *surface_map; + struct pipe_transfer *transfer; + void *transfer_map; struct pipe_texture *texture; /**< if caching a texture */ struct softpipe_cached_tile entries[NUM_ENTRIES]; uint clear_flags[(MAX_WIDTH / TILE_SIZE) * (MAX_HEIGHT / TILE_SIZE) / 32]; @@ -60,8 +61,8 @@ struct softpipe_tile_cache uint clear_val; boolean depth_stencil; /** Is the surface a depth/stencil format? */ - struct pipe_surface *tex_surf; - void *tex_surf_map; + struct pipe_transfer *tex_trans; + void *tex_trans_map; int tex_face, tex_level, tex_z; struct softpipe_cached_tile tile; /**< scratch tile for clears */ @@ -131,16 +132,19 @@ sp_create_tile_cache( struct pipe_screen *screen ) void sp_destroy_tile_cache(struct softpipe_tile_cache *tc) { + struct pipe_screen *screen; uint pos; for (pos = 0; pos < NUM_ENTRIES; pos++) { /*assert(tc->entries[pos].x < 0);*/ } - if (tc->surface) { - pipe_surface_reference(&tc->surface, NULL); + if (tc->transfer) { + screen = tc->transfer->texture->screen; + screen->tex_transfer_release(screen, &tc->transfer); } - if (tc->tex_surf) { - pipe_surface_reference(&tc->tex_surf, NULL); + if (tc->tex_trans) { + screen = tc->tex_trans->texture->screen; + screen->tex_transfer_release(screen, &tc->tex_trans); } FREE( tc ); @@ -156,18 +160,29 @@ sp_tile_cache_set_surface(struct softpipe_tile_cache *tc, { assert(!tc->texture); - if (tc->surface_map) { - tc->screen->surface_unmap(tc->screen, tc->surface); - tc->surface_map = NULL; + if (tc->transfer) { + struct pipe_screen *screen = tc->transfer->texture->screen; + + if (ps == tc->surface) + return; + + if (tc->transfer_map) { + tc->screen->transfer_unmap(tc->screen, tc->transfer); + tc->transfer_map = NULL; + } + + screen->tex_transfer_release(screen, &tc->transfer); } - pipe_surface_reference(&tc->surface, ps); + tc->surface = ps; + + if (ps) { + struct pipe_screen *screen = ps->texture->screen; - if (tc->surface) { - if (tc->surface_map) /* XXX: this is always NULL!? */ - tc->surface_map = tc->screen->surface_map(tc->screen, tc->surface, - PIPE_BUFFER_USAGE_CPU_READ | - PIPE_BUFFER_USAGE_CPU_WRITE); + tc->transfer = screen->get_tex_transfer(screen, ps->texture, ps->face, + ps->level, ps->zslice, + PIPE_TRANSFER_READ_WRITE, + 0, 0, ps->width, ps->height); tc->depth_stencil = (ps->format == PIPE_FORMAT_S8Z24_UNORM || ps->format == PIPE_FORMAT_X8Z24_UNORM || @@ -181,7 +196,7 @@ sp_tile_cache_set_surface(struct softpipe_tile_cache *tc, /** - * Return the surface being cached. + * Return the transfer being cached. 
*/ struct pipe_surface * sp_tile_cache_get_surface(struct softpipe_tile_cache *tc) @@ -191,30 +206,27 @@ sp_tile_cache_get_surface(struct softpipe_tile_cache *tc) void -sp_tile_cache_map_surfaces(struct softpipe_tile_cache *tc) +sp_tile_cache_map_transfers(struct softpipe_tile_cache *tc) { - if (tc->surface && !tc->surface_map) - tc->surface_map = tc->screen->surface_map(tc->screen, tc->surface, - PIPE_BUFFER_USAGE_CPU_WRITE | - PIPE_BUFFER_USAGE_CPU_READ); - - if (tc->tex_surf && !tc->tex_surf_map) - tc->tex_surf_map = tc->screen->surface_map(tc->screen, tc->tex_surf, - PIPE_BUFFER_USAGE_CPU_READ); + if (tc->transfer && !tc->transfer_map) + tc->transfer_map = tc->screen->transfer_map(tc->screen, tc->transfer); + + if (tc->tex_trans && !tc->tex_trans_map) + tc->tex_trans_map = tc->screen->transfer_map(tc->screen, tc->tex_trans); } void -sp_tile_cache_unmap_surfaces(struct softpipe_tile_cache *tc) +sp_tile_cache_unmap_transfers(struct softpipe_tile_cache *tc) { - if (tc->surface_map) { - tc->screen->surface_unmap(tc->screen, tc->surface); - tc->surface_map = NULL; + if (tc->transfer_map) { + tc->screen->transfer_unmap(tc->screen, tc->transfer); + tc->transfer_map = NULL; } - if (tc->tex_surf_map) { - tc->screen->surface_unmap(tc->screen, tc->tex_surf); - tc->tex_surf_map = NULL; + if (tc->tex_trans_map) { + tc->screen->transfer_unmap(tc->screen, tc->tex_trans); + tc->tex_trans_map = NULL; } } @@ -229,15 +241,20 @@ sp_tile_cache_set_texture(struct pipe_context *pipe, { uint i; - assert(!tc->surface); + assert(!tc->transfer); pipe_texture_reference(&tc->texture, texture); - if (tc->tex_surf_map) { - tc->screen->surface_unmap(tc->screen, tc->tex_surf); - tc->tex_surf_map = NULL; + if (tc->transfer) { + struct pipe_screen *screen = tc->transfer->texture->screen; + + if (tc->tex_trans_map) { + tc->screen->transfer_unmap(tc->screen, tc->tex_trans); + tc->tex_trans_map = NULL; + } + + screen->tex_transfer_release(screen, &tc->tex_trans); } - pipe_surface_reference(&tc->tex_surf, NULL); /* mark as entries as invalid/empty */ /* XXX we should try to avoid this when the teximage hasn't changed */ @@ -328,20 +345,20 @@ static void sp_tile_cache_flush_clear(struct pipe_context *pipe, struct softpipe_tile_cache *tc) { - struct pipe_surface *ps = tc->surface; - const uint w = tc->surface->width; - const uint h = tc->surface->height; + struct pipe_transfer *pt = tc->transfer; + const uint w = tc->transfer->width; + const uint h = tc->transfer->height; uint x, y; uint numCleared = 0; /* clear the scratch tile to the clear value */ - clear_tile(&tc->tile, ps->format, tc->clear_val); + clear_tile(&tc->tile, pt->format, tc->clear_val); /* push the tile to all positions marked as clear */ for (y = 0; y < h; y += TILE_SIZE) { for (x = 0; x < w; x += TILE_SIZE) { if (is_clear_flag_set(tc->clear_flags, x, y)) { - pipe_put_tile_raw(ps, + pipe_put_tile_raw(pt, x, y, TILE_SIZE, TILE_SIZE, tc->tile.data.color32, 0/*STRIDE*/); @@ -359,28 +376,28 @@ sp_tile_cache_flush_clear(struct pipe_context *pipe, /** - * Flush the tile cache: write all dirty tiles back to the surface. + * Flush the tile cache: write all dirty tiles back to the transfer. * any tiles "flagged" as cleared will be "really" cleared. 
*/ void sp_flush_tile_cache(struct softpipe_context *softpipe, struct softpipe_tile_cache *tc) { - struct pipe_surface *ps = tc->surface; + struct pipe_transfer *pt = tc->transfer; int inuse = 0, pos; - if (ps && ps->buffer) { - /* caching a drawing surface */ + if (pt) { + /* caching a drawing transfer */ for (pos = 0; pos < NUM_ENTRIES; pos++) { struct softpipe_cached_tile *tile = tc->entries + pos; if (tile->x >= 0) { if (tc->depth_stencil) { - pipe_put_tile_raw(ps, + pipe_put_tile_raw(pt, tile->x, tile->y, TILE_SIZE, TILE_SIZE, tile->data.depth32, 0/*STRIDE*/); } else { - pipe_put_tile_rgba(ps, + pipe_put_tile_rgba(pt, tile->x, tile->y, TILE_SIZE, TILE_SIZE, (float *) tile->data.color); } @@ -415,7 +432,7 @@ struct softpipe_cached_tile * sp_get_cached_tile(struct softpipe_context *softpipe, struct softpipe_tile_cache *tc, int x, int y) { - struct pipe_surface *ps = tc->surface; + struct pipe_transfer *pt = tc->transfer; /* tile pos in framebuffer: */ const int tile_x = x & ~(TILE_SIZE - 1); @@ -431,12 +448,12 @@ sp_get_cached_tile(struct softpipe_context *softpipe, if (tile->x != -1) { /* put dirty tile back in framebuffer */ if (tc->depth_stencil) { - pipe_put_tile_raw(ps, + pipe_put_tile_raw(pt, tile->x, tile->y, TILE_SIZE, TILE_SIZE, tile->data.depth32, 0/*STRIDE*/); } else { - pipe_put_tile_rgba(ps, + pipe_put_tile_rgba(pt, tile->x, tile->y, TILE_SIZE, TILE_SIZE, (float *) tile->data.color); } @@ -448,22 +465,22 @@ sp_get_cached_tile(struct softpipe_context *softpipe, if (is_clear_flag_set(tc->clear_flags, x, y)) { /* don't get tile from framebuffer, just clear it */ if (tc->depth_stencil) { - clear_tile(tile, ps->format, tc->clear_val); + clear_tile(tile, pt->format, tc->clear_val); } else { - clear_tile_rgba(tile, ps->format, tc->clear_color); + clear_tile_rgba(tile, pt->format, tc->clear_color); } clear_clear_flag(tc->clear_flags, x, y); } else { - /* get new tile data from surface */ + /* get new tile data from transfer */ if (tc->depth_stencil) { - pipe_get_tile_raw(ps, + pipe_get_tile_raw(pt, tile->x, tile->y, TILE_SIZE, TILE_SIZE, tile->data.depth32, 0/*STRIDE*/); } else { - pipe_get_tile_rgba(ps, + pipe_get_tile_rgba(pt, tile->x, tile->y, TILE_SIZE, TILE_SIZE, (float *) tile->data.color); } @@ -484,7 +501,7 @@ sp_get_cached_tile(struct softpipe_context *softpipe, static INLINE uint tex_cache_pos(int x, int y, int z, int face, int level) { - uint entry = x + y * 5 + z * 4 + face + level; + uint entry = x + y * 9 + z * 3 + face + level * 7; return entry % NUM_ENTRIES; } @@ -494,11 +511,11 @@ tex_cache_pos(int x, int y, int z, int face, int level) * Tiles are read-only and indexed with more params. 
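
sp_get_cached_tile above behaves like a direct-mapped, write-back cache of framebuffer tiles: hash the tile-aligned position, and on a miss flush the evicted dirty tile before loading the requested one. A stripped-down sketch, with a toy hash and callbacks standing in for pipe_get_tile_rgba/pipe_put_tile_rgba:

#define TILE_SIZE   32
#define NUM_TILES   50

struct tile {
   int x, y;                  /* tile origin, or -1 if the slot is empty */
   float data[TILE_SIZE][TILE_SIZE];
};

typedef void (*tile_io_fn)(int x, int y, float data[TILE_SIZE][TILE_SIZE]);

static struct tile cache[NUM_TILES];

static void init_cache(void)
{
   int i;
   for (i = 0; i < NUM_TILES; i++)
      cache[i].x = cache[i].y = -1;
}

static struct tile *get_cached_tile(int x, int y,
                                    tile_io_fn read_tile,
                                    tile_io_fn write_tile)
{
   int tx = x & ~(TILE_SIZE - 1);       /* tile-aligned position */
   int ty = y & ~(TILE_SIZE - 1);
   unsigned pos = ((unsigned) tx + (unsigned) ty * 5) % NUM_TILES;  /* toy hash */
   struct tile *t = &cache[pos];

   if (t->x != tx || t->y != ty) {
      if (t->x != -1)
         write_tile(t->x, t->y, t->data);   /* flush the evicted tile */
      read_tile(tx, ty, t->data);           /* load the requested tile */
      t->x = tx;
      t->y = ty;
   }
   return t;
}

The real code additionally consults the clear flags so that tiles in cleared regions can be synthesized from the clear value instead of being read back.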
*/ const struct softpipe_cached_tile * -sp_get_cached_tile_tex(struct pipe_context *pipe, +sp_get_cached_tile_tex(struct softpipe_context *sp, struct softpipe_tile_cache *tc, int x, int y, int z, int face, int level) { - struct pipe_screen *screen = pipe->screen; + struct pipe_screen *screen = sp->pipe.screen; /* tile pos in framebuffer: */ const int tile_x = x & ~(TILE_SIZE - 1); const int tile_y = y & ~(TILE_SIZE - 1); @@ -510,8 +527,12 @@ sp_get_cached_tile_tex(struct pipe_context *pipe, if (tc->texture) { struct softpipe_texture *spt = softpipe_texture(tc->texture); if (spt->modified) { - /* texture was modified, force a cache reload */ - tile->x = -1; + /* texture was modified, invalidate all cached tiles */ + uint p; + for (p = 0; p < NUM_ENTRIES; p++) { + tile = tc->entries + p; + tile->x = -1; + } spt->modified = FALSE; } } @@ -523,28 +544,37 @@ sp_get_cached_tile_tex(struct pipe_context *pipe, level != tile->level) { /* cache miss */ - /* check if we need to get a new surface */ - if (!tc->tex_surf || +#if 0 + printf("miss at %u x=%d y=%d z=%d face=%d level=%d\n", pos, + x/TILE_SIZE, y/TILE_SIZE, z, face, level); +#endif + /* check if we need to get a new transfer */ + if (!tc->tex_trans || tc->tex_face != face || tc->tex_level != level || tc->tex_z != z) { - /* get new surface (view into texture) */ + /* get new transfer (view into texture) */ + + if (tc->transfer) { + if (tc->tex_trans_map) + tc->screen->transfer_unmap(tc->screen, tc->tex_trans); - if (tc->tex_surf_map) - tc->screen->surface_unmap(tc->screen, tc->tex_surf); + screen->tex_transfer_release(screen, &tc->tex_trans); + } - tc->tex_surf = screen->get_tex_surface(screen, tc->texture, face, level, z, - PIPE_BUFFER_USAGE_CPU_READ); - tc->tex_surf_map = screen->surface_map(screen, tc->tex_surf, - PIPE_BUFFER_USAGE_CPU_READ); + tc->tex_trans = screen->get_tex_transfer(screen, tc->texture, face, level, z, + PIPE_TRANSFER_READ, 0, 0, + tc->texture->width[level], + tc->texture->height[level]); + tc->tex_trans_map = screen->transfer_map(screen, tc->tex_trans); tc->tex_face = face; tc->tex_level = level; tc->tex_z = z; } - /* get tile from the surface (view into texture) */ - pipe_get_tile_rgba(tc->tex_surf, + /* get tile from the transfer (view into texture) */ + pipe_get_tile_rgba(tc->tex_trans, tile_x, tile_y, TILE_SIZE, TILE_SIZE, (float *) tile->data.color); tile->x = tile_x; @@ -571,7 +601,7 @@ sp_tile_cache_clear(struct softpipe_tile_cache *tc, uint clearValue) tc->clear_val = clearValue; - switch (tc->surface->format) { + switch (tc->transfer->format) { case PIPE_FORMAT_R8G8B8A8_UNORM: r = (clearValue >> 24) & 0xff; g = (clearValue >> 16) & 0xff; diff --git a/src/gallium/drivers/softpipe/sp_tile_cache.h b/src/gallium/drivers/softpipe/sp_tile_cache.h index bc96c941f6..9ac3fdda94 100644 --- a/src/gallium/drivers/softpipe/sp_tile_cache.h +++ b/src/gallium/drivers/softpipe/sp_tile_cache.h @@ -74,10 +74,10 @@ extern struct pipe_surface * sp_tile_cache_get_surface(struct softpipe_tile_cache *tc); extern void -sp_tile_cache_map_surfaces(struct softpipe_tile_cache *tc); +sp_tile_cache_map_transfers(struct softpipe_tile_cache *tc); extern void -sp_tile_cache_unmap_surfaces(struct softpipe_tile_cache *tc); +sp_tile_cache_unmap_transfers(struct softpipe_tile_cache *tc); extern void sp_tile_cache_set_texture(struct pipe_context *pipe, @@ -96,7 +96,7 @@ sp_get_cached_tile(struct softpipe_context *softpipe, struct softpipe_tile_cache *tc, int x, int y); extern const struct softpipe_cached_tile * -sp_get_cached_tile_tex(struct 
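
tex_cache_pos, changed above, spreads texture tiles across the (now 50-entry) cache by mixing the tile coordinates, face and level; x and y are assumed here to arrive already divided down to tile-grid units:

#define NUM_ENTRIES 50

/* Slot index for a texture tile, per the updated tex_cache_pos(). */
static unsigned tex_cache_pos(int x, int y, int z, int face, int level)
{
   unsigned entry = x + y * 9 + z * 3 + face + level * 7;
   return entry % NUM_ENTRIES;
}

The larger multipliers reduce collisions between neighbouring tiles and mip levels compared with the old x + y * 5 + z * 4 + face + level formula.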
diff --git a/src/gallium/drivers/trace/Makefile b/src/gallium/drivers/trace/Makefile
new file mode 100644
index 0000000000..e1bd970937
--- /dev/null
+++ b/src/gallium/drivers/trace/Makefile
@@ -0,0 +1,14 @@
+TOP = ../../../..
+include $(TOP)/configs/current
+
+LIBNAME = trace
+
+C_SOURCES = \
+	tr_context.c \
+	tr_dump.c \
+	tr_screen.c \
+	tr_state.c \
+	tr_texture.c \
+	tr_winsys.c
+
+include ../../Makefile.template
diff --git a/src/gallium/drivers/trace/tr_context.c b/src/gallium/drivers/trace/tr_context.c
index 1dd7719379..ec8be27077 100644
--- a/src/gallium/drivers/trace/tr_context.c
+++ b/src/gallium/drivers/trace/tr_context.c
@@ -274,11 +274,11 @@ static INLINE boolean
 trace_context_get_query_result(struct pipe_context *_pipe,
                                struct pipe_query *query,
                                boolean wait,
-                               uint64 *presult)
+                               uint64_t *presult)
 {
    struct trace_context *tr_ctx = trace_context(_pipe);
    struct pipe_context *pipe = tr_ctx->pipe;
-   uint64 result;
+   uint64_t result;
    boolean _result;
 
    trace_dump_call_begin("pipe_context", "get_query_result");
@@ -722,9 +722,9 @@ trace_context_set_framebuffer_state(struct pipe_context *_pipe,
 
    /* Unwrap the input state */
    memcpy(&unwrapped_state, state, sizeof(unwrapped_state));
-   for(i = 0; i < state->num_cbufs; ++i)
+   for(i = 0; i < state->nr_cbufs; ++i)
      unwrapped_state.cbufs[i] = trace_surface_unwrap(tr_ctx, state->cbufs[i]);
-   for(i = state->num_cbufs; i < PIPE_MAX_COLOR_BUFS; ++i)
+   for(i = state->nr_cbufs; i < PIPE_MAX_COLOR_BUFS; ++i)
      unwrapped_state.cbufs[i] = NULL;
    unwrapped_state.zsbuf = trace_surface_unwrap(tr_ctx, state->zsbuf);
    state = &unwrapped_state;
diff --git a/src/gallium/drivers/trace/tr_context.h b/src/gallium/drivers/trace/tr_context.h
index 7831900ec2..6704175964 100644
--- a/src/gallium/drivers/trace/tr_context.h
+++ b/src/gallium/drivers/trace/tr_context.h
@@ -30,7 +30,7 @@
 
 #include "pipe/p_compiler.h"
-#include "pipe/p_debug.h"
+#include "util/u_debug.h"
 #include "pipe/p_context.h"
 
diff --git a/src/gallium/drivers/trace/tr_dump.c b/src/gallium/drivers/trace/tr_dump.c
index a0ead0ded3..d98cef221b 100644
--- a/src/gallium/drivers/trace/tr_dump.c
+++ b/src/gallium/drivers/trace/tr_dump.c
@@ -45,7 +45,7 @@
 #endif
 
 #include "pipe/p_compiler.h"
-#include "pipe/p_debug.h"
+#include "util/u_debug.h"
 #include "util/u_memory.h"
 #include "util/u_string.h"
 #include "util/u_stream.h"
diff --git a/src/gallium/drivers/trace/tr_screen.c b/src/gallium/drivers/trace/tr_screen.c
index 8789f86b1a..164c6bbc4d 100644
--- a/src/gallium/drivers/trace/tr_screen.c
+++ b/src/gallium/drivers/trace/tr_screen.c
@@ -315,26 +315,101 @@ trace_screen_tex_surface_release(struct pipe_screen *_screen,
 }
 
 
+static struct pipe_transfer *
+trace_screen_get_tex_transfer(struct pipe_screen *_screen,
+                              struct pipe_texture *texture,
+                              unsigned face, unsigned level,
+                              unsigned zslice,
+                              enum pipe_transfer_usage usage,
+                              unsigned x, unsigned y, unsigned w, unsigned h)
+{
+   struct trace_screen *tr_scr = trace_screen(_screen);
+   struct pipe_screen *screen = tr_scr->screen;
+   struct trace_texture *tr_tex;
+   struct pipe_transfer *result;
+
+   assert(texture);
+   tr_tex = trace_texture(tr_scr, texture);
+   texture = tr_tex->texture;
+   assert(texture->screen == screen);
+
+   trace_dump_call_begin("pipe_screen", "get_tex_transfer");
+
+   trace_dump_arg(ptr, screen);
+   trace_dump_arg(ptr, texture);
+   trace_dump_arg(uint, face);
+   trace_dump_arg(uint, level);
+   trace_dump_arg(uint, zslice);
+   trace_dump_arg(uint, usage);
+
+   result = screen->get_tex_transfer(screen, texture, face, level, zslice, usage,
+                                     x, y, w, h);
+
+   trace_dump_ret(ptr, result);
+
+   trace_dump_call_end();
+
+   result = trace_transfer_create(tr_tex, result);
+
+   return result;
+}
+
+
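Taken together, the screen hooks wrapped above define the read path callers now follow with transfers instead of surfaces. A hedged end-to-end sketch of that sequence, built only from calls and fields that appear in this diff; error handling is omitted and all names other than the vtable calls are illustrative:

   /* Sketch: read back a w x h region of one mipmap level through the
    * transfer interface (get -> map -> read tile -> unmap -> release). */
   static void
   read_level_region(struct pipe_screen *screen, struct pipe_texture *tex,
                     unsigned face, unsigned level, unsigned zslice,
                     unsigned w, unsigned h, float *rgba)
   {
      struct pipe_transfer *pt =
         screen->get_tex_transfer(screen, tex, face, level, zslice,
                                  PIPE_TRANSFER_READ, 0, 0, w, h);

      screen->transfer_map(screen, pt);          /* CPU view becomes valid */
      pipe_get_tile_rgba(pt, 0, 0, w, h, rgba);  /* convert into floats */
      screen->transfer_unmap(screen, pt);

      screen->tex_transfer_release(screen, &pt); /* drops the reference */
   }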
+static void
+trace_screen_tex_transfer_release(struct pipe_screen *_screen,
+                                  struct pipe_transfer **ptransfer)
+{
+   struct trace_screen *tr_scr = trace_screen(_screen);
+   struct pipe_screen *screen = tr_scr->screen;
+   struct trace_texture *tr_tex;
+   struct trace_transfer *tr_trans;
+   struct pipe_transfer *transfer;
+
+   assert(ptransfer);
+   if(*ptransfer) {
+      tr_tex = trace_texture(tr_scr, (*ptransfer)->texture);
+      tr_trans = trace_transfer(tr_tex, *ptransfer);
+      transfer = tr_trans->transfer;
+   }
+   else
+      transfer = NULL;
+
+   if (*ptransfer) {
+      if (!--(*ptransfer)->refcount) {
+         trace_dump_call_begin("pipe_screen", "tex_transfer_destroy");
+
+         trace_dump_arg(ptr, screen);
+         trace_dump_arg(ptr, transfer);
+
+         trace_transfer_destroy(tr_tex, *ptransfer);
+
+         trace_dump_call_end();
+      }
+
+      *ptransfer = NULL;
+   }
+}
+
+
 static void *
-trace_screen_surface_map(struct pipe_screen *_screen,
-                         struct pipe_surface *surface,
-                         unsigned flags)
+trace_screen_transfer_map(struct pipe_screen *_screen,
+                          struct pipe_transfer *transfer)
 {
    struct trace_screen *tr_scr = trace_screen(_screen);
    struct pipe_screen *screen = tr_scr->screen;
    struct trace_texture *tr_tex;
-   struct trace_surface *tr_surf;
+   struct trace_transfer *tr_trans;
    void *map;
 
-   tr_tex = trace_texture(tr_scr, surface->texture);
-   tr_surf = trace_surface(tr_tex, surface);
-   surface = tr_surf->surface;
+   tr_tex = trace_texture(tr_scr, transfer->texture);
+   tr_trans = trace_transfer(tr_tex, transfer);
+   transfer = tr_trans->transfer;
 
-   map = screen->surface_map(screen, surface, flags);
+   map = screen->transfer_map(screen, transfer);
    if(map) {
-      if(flags & PIPE_BUFFER_USAGE_CPU_WRITE) {
-         assert(!tr_surf->map);
-         tr_surf->map = map;
+      if(transfer->usage != PIPE_TRANSFER_READ) {
+         assert(!tr_trans->map);
+         tr_trans->map = map;
       }
    }
@@ -343,33 +418,33 @@ trace_screen_surface_map(struct pipe_screen *_screen,
 
 
 static void
-trace_screen_surface_unmap(struct pipe_screen *_screen,
-                           struct pipe_surface *surface)
+trace_screen_transfer_unmap(struct pipe_screen *_screen,
+                            struct pipe_transfer *transfer)
 {
    struct trace_screen *tr_scr = trace_screen(_screen);
    struct pipe_screen *screen = tr_scr->screen;
    struct trace_texture *tr_tex;
-   struct trace_surface *tr_surf;
+   struct trace_transfer *tr_trans;
 
-   tr_tex = trace_texture(tr_scr, surface->texture);
-   tr_surf = trace_surface(tr_tex, surface);
-   surface = tr_surf->surface;
+   tr_tex = trace_texture(tr_scr, transfer->texture);
+   tr_trans = trace_transfer(tr_tex, transfer);
+   transfer = tr_trans->transfer;
 
-   if(tr_surf->map) {
-      size_t size = surface->nblocksy * surface->stride;
+   if(tr_trans->map) {
+      size_t size = transfer->nblocksy * transfer->stride;
 
-      trace_dump_call_begin("pipe_winsys", "surface_write");
+      trace_dump_call_begin("pipe_winsys", "transfer_write");
 
       trace_dump_arg(ptr, screen);
 
-      trace_dump_arg(ptr, surface);
+      trace_dump_arg(ptr, transfer);
 
       trace_dump_arg_begin("data");
-      trace_dump_bytes(tr_surf->map, size);
+      trace_dump_bytes(tr_trans->map, size);
      trace_dump_arg_end();
 
      trace_dump_arg_begin("stride");
-      trace_dump_uint(surface->stride);
+      trace_dump_uint(transfer->stride);
      trace_dump_arg_end();
 
      trace_dump_arg_begin("size");
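The map/unmap wrappers above only remember the mapping when the transfer is writable, so the unmap hook can dump what the caller wrote. A small sketch of that decision, restricted to fields shown in the hunks (usage, map, nblocksy, stride); both helper names are illustrative:

   /* Sketch: remember writable mappings so their contents can be dumped
    * as "transfer_write" when the transfer is unmapped. */
   static void
   record_if_written(struct trace_transfer *tr_trans, void *map)
   {
      if (tr_trans->base.usage != PIPE_TRANSFER_READ) {
         assert(!tr_trans->map);
         tr_trans->map = map;
      }
   }

   /* Sketch: number of bytes the unmap hook dumps for a recorded mapping. */
   static size_t
   written_size(const struct pipe_transfer *transfer)
   {
      return (size_t) transfer->nblocksy * transfer->stride;
   }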
@@ -378,10 +453,10 @@ trace_screen_surface_unmap(struct pipe_screen *_screen,
 
       trace_dump_call_end();
 
-      tr_surf->map = NULL;
+      tr_trans->map = NULL;
    }
 
-   screen->surface_unmap(screen, surface);
+   screen->transfer_unmap(screen, transfer);
 }
 
 
@@ -437,8 +512,10 @@ trace_screen_create(struct pipe_screen *screen)
    tr_scr->base.texture_release = trace_screen_texture_release;
    tr_scr->base.get_tex_surface = trace_screen_get_tex_surface;
    tr_scr->base.tex_surface_release = trace_screen_tex_surface_release;
-   tr_scr->base.surface_map = trace_screen_surface_map;
-   tr_scr->base.surface_unmap = trace_screen_surface_unmap;
+   tr_scr->base.get_tex_transfer = trace_screen_get_tex_transfer;
+   tr_scr->base.tex_transfer_release = trace_screen_tex_transfer_release;
+   tr_scr->base.transfer_map = trace_screen_transfer_map;
+   tr_scr->base.transfer_unmap = trace_screen_transfer_unmap;
 
    tr_scr->screen = screen;
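With the wrappers in place, trace_screen_create() above swaps the old surface_map/surface_unmap entries for the four transfer hooks. A hedged sketch of the same wiring, using only the function-pointer names visible in the hunk; the helper itself is illustrative:

   /* Sketch: install the transfer entry points on the wrapping screen and
    * remember the wrapped screen to forward to. */
   static void
   install_transfer_hooks(struct trace_screen *tr_scr, struct pipe_screen *screen)
   {
      tr_scr->base.get_tex_transfer     = trace_screen_get_tex_transfer;
      tr_scr->base.tex_transfer_release = trace_screen_tex_transfer_release;
      tr_scr->base.transfer_map         = trace_screen_transfer_map;
      tr_scr->base.transfer_unmap       = trace_screen_transfer_unmap;
      tr_scr->screen = screen;   /* calls are forwarded to this screen */
   }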
trace_dump_struct_begin("pipe_transfer"); + + trace_dump_member(format, state, format); + trace_dump_member(uint, state, width); + trace_dump_member(uint, state, height); + trace_dump_member_begin("block"); trace_dump_block(&state->block); trace_dump_member_end(); @@ -412,8 +437,6 @@ void trace_dump_surface(const struct pipe_surface *state) trace_dump_member(uint, state, nblocksx); trace_dump_member(uint, state, nblocksy); trace_dump_member(uint, state, stride); - trace_dump_member(uint, state, layout); - trace_dump_member(uint, state, offset); trace_dump_member(uint, state, refcount); trace_dump_member(uint, state, usage); @@ -435,7 +458,7 @@ void trace_dump_vertex_buffer(const struct pipe_vertex_buffer *state) trace_dump_struct_begin("pipe_vertex_buffer"); - trace_dump_member(uint, state, pitch); + trace_dump_member(uint, state, stride); trace_dump_member(uint, state, max_index); trace_dump_member(uint, state, buffer_offset); trace_dump_member(ptr, state, buffer); diff --git a/src/gallium/drivers/trace/tr_state.h b/src/gallium/drivers/trace/tr_state.h index 5ae533dc66..513ed0ac98 100644 --- a/src/gallium/drivers/trace/tr_state.h +++ b/src/gallium/drivers/trace/tr_state.h @@ -68,6 +68,8 @@ void trace_dump_sampler_state(const struct pipe_sampler_state *state); void trace_dump_surface(const struct pipe_surface *state); +void trace_dump_transfer(const struct pipe_transfer *state); + void trace_dump_vertex_buffer(const struct pipe_vertex_buffer *state); void trace_dump_vertex_element(const struct pipe_vertex_element *state); diff --git a/src/gallium/drivers/trace/tr_texture.c b/src/gallium/drivers/trace/tr_texture.c index 440a78704a..120ba0dd31 100644 --- a/src/gallium/drivers/trace/tr_texture.c +++ b/src/gallium/drivers/trace/tr_texture.c @@ -87,7 +87,6 @@ trace_surface_create(struct trace_texture *tr_tex, memcpy(&tr_surf->base, surface, sizeof(struct pipe_surface)); - tr_surf->base.winsys = tr_tex->base.screen->winsys; tr_surf->base.texture = NULL; pipe_texture_reference(&tr_surf->base.texture, &tr_tex->base); tr_surf->surface = surface; @@ -110,3 +109,43 @@ trace_surface_destroy(struct trace_texture *tr_tex, FREE(tr_surf); } + +struct pipe_transfer * +trace_transfer_create(struct trace_texture *tr_tex, + struct pipe_transfer *transfer) +{ + struct trace_transfer *tr_trans; + + if(!transfer) + goto error; + + assert(transfer->texture == tr_tex->texture); + + tr_trans = CALLOC_STRUCT(trace_transfer); + if(!tr_trans) + goto error; + + memcpy(&tr_trans->base, transfer, sizeof(struct pipe_transfer)); + + tr_trans->base.texture = NULL; + pipe_texture_reference(&tr_trans->base.texture, &tr_tex->base); + tr_trans->transfer = transfer; + + return &tr_trans->base; + +error: + pipe_transfer_reference(&transfer, NULL); + return NULL; +} + + +void +trace_transfer_destroy(struct trace_texture *tr_tex, + struct pipe_transfer *transfer) +{ + struct trace_transfer *tr_trans = trace_transfer(tr_tex, transfer); + pipe_texture_reference(&tr_trans->base.texture, NULL); + pipe_transfer_reference(&tr_trans->transfer, NULL); + FREE(tr_trans); +} + diff --git a/src/gallium/drivers/trace/tr_texture.h b/src/gallium/drivers/trace/tr_texture.h index 9e72edb8a3..168cefd53d 100644 --- a/src/gallium/drivers/trace/tr_texture.h +++ b/src/gallium/drivers/trace/tr_texture.h @@ -48,6 +48,14 @@ struct trace_surface struct pipe_surface base; struct pipe_surface *surface; +}; + + +struct trace_transfer +{ + struct pipe_transfer base; + + struct pipe_transfer *transfer; void *map; }; @@ -75,6 +83,17 @@ trace_surface(struct 
@@ -75,6 +83,17 @@ trace_surface(struct trace_texture *tr_tex,
 }
 
 
+static INLINE struct trace_transfer *
+trace_transfer(struct trace_texture *tr_tex,
+               struct pipe_transfer *transfer)
+{
+   if(!transfer)
+      return NULL;
+   assert(transfer->texture == &tr_tex->base);
+   return (struct trace_transfer *)transfer;
+}
+
+
 struct pipe_texture *
 trace_texture_create(struct trace_screen *tr_scr,
                      struct pipe_texture *texture);
@@ -91,5 +110,13 @@ void
 trace_surface_destroy(struct trace_texture *tr_tex,
                       struct pipe_surface *surface);
 
+struct pipe_transfer *
+trace_transfer_create(struct trace_texture *tr_tex,
+                      struct pipe_transfer *transfer);
+
+void
+trace_transfer_destroy(struct trace_texture *tr_tex,
+                       struct pipe_transfer *transfer);
+
 #endif /* TR_TEXTURE_H_ */
diff --git a/src/gallium/drivers/trace/tr_winsys.c b/src/gallium/drivers/trace/tr_winsys.c
index 177835854e..c4148fe810 100644
--- a/src/gallium/drivers/trace/tr_winsys.c
+++ b/src/gallium/drivers/trace/tr_winsys.c
@@ -98,86 +98,41 @@ trace_winsys_flush_frontbuffer(struct pipe_winsys *_winsys,
 }
 
 
-static struct pipe_surface *
-trace_winsys_surface_alloc(struct pipe_winsys *_winsys)
-{
-   struct trace_winsys *tr_ws = trace_winsys(_winsys);
-   struct pipe_winsys *winsys = tr_ws->winsys;
-   struct pipe_surface *result;
-
-   trace_dump_call_begin("pipe_winsys", "surface_alloc");
-
-   trace_dump_arg(ptr, winsys);
-
-   result = winsys->surface_alloc(winsys);
-
-   trace_dump_ret(ptr, result);
-
-   trace_dump_call_end();
-
-   assert(!result || !result->texture);
-
-   return result;
-}
-
-
-static int
-trace_winsys_surface_alloc_storage(struct pipe_winsys *_winsys,
-                                   struct pipe_surface *surface,
+static struct pipe_buffer *
+trace_winsys_surface_buffer_create(struct pipe_winsys *_winsys,
                                    unsigned width, unsigned height,
                                    enum pipe_format format,
-                                   unsigned flags,
-                                   unsigned tex_usage)
+                                   unsigned usage,
+                                   unsigned *pstride)
 {
    struct trace_winsys *tr_ws = trace_winsys(_winsys);
    struct pipe_winsys *winsys = tr_ws->winsys;
-   int result;
+   unsigned stride;
+   struct pipe_buffer *result;
 
-   assert(surface && !surface->texture);
-
-   trace_dump_call_begin("pipe_winsys", "surface_alloc_storage");
+   trace_dump_call_begin("pipe_winsys", "surface_buffer_create");
 
    trace_dump_arg(ptr, winsys);
-   trace_dump_arg(ptr, surface);
    trace_dump_arg(uint, width);
    trace_dump_arg(uint, height);
    trace_dump_arg(format, format);
-   trace_dump_arg(uint, flags);
-   trace_dump_arg(uint, tex_usage);
+   trace_dump_arg(uint, usage);
 
-   result = winsys->surface_alloc_storage(winsys,
-                                          surface,
+   result = winsys->surface_buffer_create(winsys,
                                           width, height,
                                           format,
-                                          flags,
-                                          tex_usage);
+                                          usage,
+                                          pstride);
 
-   trace_dump_ret(int, result);
+   stride = *pstride;
 
-   trace_dump_call_end();
+   trace_dump_arg(uint, stride);
 
-   return result;
-}
-
-
-static void
-trace_winsys_surface_release(struct pipe_winsys *_winsys,
-                             struct pipe_surface **psurface)
-{
-   struct trace_winsys *tr_ws = trace_winsys(_winsys);
-   struct pipe_winsys *winsys = tr_ws->winsys;
-   struct pipe_surface *surface = *psurface;
-
-   assert(psurface && *psurface && !(*psurface)->texture);
-
-   trace_dump_call_begin("pipe_winsys", "surface_release");
-
-   trace_dump_arg(ptr, winsys);
-   trace_dump_arg(ptr, surface);
-
-   winsys->surface_release(winsys, psurface);
+   trace_dump_ret(ptr, result);
 
    trace_dump_call_end();
+
+   return result;
 }
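The winsys hunk above folds surface_alloc, surface_alloc_storage and surface_release into a single surface_buffer_create() that returns the backing pipe_buffer and reports the stride through a pointer. A hedged sketch of a caller allocating surface storage with the new entry point; everything except the vtable call is illustrative:

   /* Sketch: allocate the buffer behind a width x height surface and get
    * the row stride chosen by the winsys. One call replaces the old
    * alloc + alloc_storage + release triple. */
   static struct pipe_buffer *
   alloc_surface_storage(struct pipe_winsys *winsys,
                         unsigned width, unsigned height,
                         enum pipe_format format, unsigned usage,
                         unsigned *stride)
   {
      return winsys->surface_buffer_create(winsys, width, height,
                                           format, usage, stride);
   }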
@@ -465,9 +420,7 @@ trace_winsys_create(struct pipe_winsys *winsys)
    tr_ws->base.destroy = trace_winsys_destroy;
    tr_ws->base.get_name = trace_winsys_get_name;
    tr_ws->base.flush_frontbuffer = trace_winsys_flush_frontbuffer;
-   tr_ws->base.surface_alloc = trace_winsys_surface_alloc;
-   tr_ws->base.surface_alloc_storage = trace_winsys_surface_alloc_storage;
-   tr_ws->base.surface_release = trace_winsys_surface_release;
+   tr_ws->base.surface_buffer_create = trace_winsys_surface_buffer_create;
    tr_ws->base.buffer_create = trace_winsys_buffer_create;
    tr_ws->base.user_buffer_create = trace_winsys_user_buffer_create;
    tr_ws->base.buffer_map = trace_winsys_buffer_map;
diff --git a/src/gallium/drivers/trace/tr_winsys.h b/src/gallium/drivers/trace/tr_winsys.h
index 062ddf66a0..3670cb915e 100644
--- a/src/gallium/drivers/trace/tr_winsys.h
+++ b/src/gallium/drivers/trace/tr_winsys.h
@@ -30,8 +30,8 @@
 
 #include "pipe/p_compiler.h"
-#include "pipe/p_debug.h"
-#include "pipe/p_winsys.h"
+#include "util/u_debug.h"
+#include "pipe/internal/p_winsys_screen.h"
 
 /**