51 files changed, 1434 insertions, 967 deletions
diff --git a/src/gallium/drivers/i915/i915_state_emit.c b/src/gallium/drivers/i915/i915_state_emit.c
index a3d4e3b04e..ecbcabb440 100644
--- a/src/gallium/drivers/i915/i915_state_emit.c
+++ b/src/gallium/drivers/i915/i915_state_emit.c
@@ -244,6 +244,7 @@ i915_emit_hardware_state(struct i915_context *i915 )
 
          OUT_BATCH(_3DSTATE_BUF_INFO_CMD);
 
+         assert(tex);
          OUT_BATCH(BUF_3D_ID_DEPTH |
                    BUF_3D_PITCH(tex->stride) |  /* pitch in bytes */
                    ztile);
diff --git a/src/gallium/drivers/i915/i915_texture.c b/src/gallium/drivers/i915/i915_texture.c
index e101c8683e..7ba222c78b 100644
--- a/src/gallium/drivers/i915/i915_texture.c
+++ b/src/gallium/drivers/i915/i915_texture.c
@@ -223,7 +223,7 @@ i915_miptree_layout_2d(struct i915_texture *tex)
       if (i915_scanout_layout(tex))
          return;
 
-   /* for shared buffers we use some very like scanout */
+   /* for shared buffers we use something very like scanout */
    if (pt->tex_usage & PIPE_TEXTURE_USAGE_DISPLAY_TARGET)
       if (i915_display_target_layout(tex))
          return;
diff --git a/src/gallium/drivers/i965/brw_draw_upload.c b/src/gallium/drivers/i965/brw_draw_upload.c
index d59261557b..9f136eec71 100644
--- a/src/gallium/drivers/i965/brw_draw_upload.c
+++ b/src/gallium/drivers/i965/brw_draw_upload.c
@@ -359,9 +359,9 @@ static int brw_emit_vertex_elements(struct brw_context *brw)
       uint32_t comp3 = BRW_VE1_COMPONENT_STORE_SRC;
 
       switch (input->nr_components) {
-      case 0: comp0 = BRW_VE1_COMPONENT_STORE_0;
-      case 1: comp1 = BRW_VE1_COMPONENT_STORE_0;
-      case 2: comp2 = BRW_VE1_COMPONENT_STORE_0;
+      case 0: comp0 = BRW_VE1_COMPONENT_STORE_0; /* fallthrough */
+      case 1: comp1 = BRW_VE1_COMPONENT_STORE_0; /* fallthrough */
+      case 2: comp2 = BRW_VE1_COMPONENT_STORE_0; /* fallthrough */
       case 3: comp3 = BRW_VE1_COMPONENT_STORE_1_FLT;
 	 break;
       }
diff --git a/src/gallium/drivers/i965/brw_screen_texture.c b/src/gallium/drivers/i965/brw_screen_texture.c
index 8bdd43cf14..38e9961398 100644
--- a/src/gallium/drivers/i965/brw_screen_texture.c
+++ b/src/gallium/drivers/i965/brw_screen_texture.c
@@ -250,7 +250,7 @@ static struct pipe_texture *brw_texture_create( struct pipe_screen *screen,
    tex->ss.ss0.mipmap_layout_mode = BRW_SURFACE_MIPMAPLAYOUT_BELOW;
    tex->ss.ss0.surface_type = translate_tex_target(tex->base.target);
 
-   format = translate_tex_target(tex->base.format);
+   format = translate_tex_format(tex->base.format);
    assert(format != BRW_SURFACEFORMAT_INVALID);
    tex->ss.ss0.surface_format = format;
 
diff --git a/src/gallium/drivers/identity/id_drm.c b/src/gallium/drivers/identity/id_drm.c
index b89724e4f3..f258c38cd7 100644
--- a/src/gallium/drivers/identity/id_drm.c
+++ b/src/gallium/drivers/identity/id_drm.c
@@ -28,11 +28,11 @@
 #include "state_tracker/drm_api.h"
 
 #include "util/u_memory.h"
-#include "identity/id_drm.h"
-#include "identity/id_screen.h"
-#include "identity/id_public.h"
-#include "identity/id_screen.h"
-#include "identity/id_objects.h"
+#include "id_drm.h"
+#include "id_screen.h"
+#include "id_public.h"
+#include "id_screen.h"
+#include "id_objects.h"
 
 struct identity_drm_api
 {
diff --git a/src/gallium/drivers/llvmpipe/Makefile b/src/gallium/drivers/llvmpipe/Makefile
index 3173251437..41ac1cee72 100644
--- a/src/gallium/drivers/llvmpipe/Makefile
+++ b/src/gallium/drivers/llvmpipe/Makefile
@@ -37,6 +37,7 @@ C_SOURCES = \
 	lp_surface.c \
 	lp_tex_sample_llvm.c \
 	lp_texture.c \
+	lp_tile_image.c \
 	lp_tile_soa.c
 
 CPP_SOURCES = \
diff --git a/src/gallium/drivers/llvmpipe/README b/src/gallium/drivers/llvmpipe/README
index 72d9f39658..ae2c1ba943 100644
--- a/src/gallium/drivers/llvmpipe/README
+++ b/src/gallium/drivers/llvmpipe/README
@@ -86,7 +86,7 @@ Building
 
 To build everything on Linux invoke scons as:
 
-  scons debug=yes statetrackers=mesa drivers=trace,llvmpipe winsys=xlib dri=false
+  scons debug=yes statetrackers=mesa drivers=llvmpipe winsys=xlib dri=false
 
 Alternatively, you can build it with GNU make, if you prefer, by invoking it as
 
@@ -96,7 +96,7 @@ but the rest of these instructions assume that scons is used.
 
 For windows is everything the except except the winsys:
 
-  scons debug=yes statetrackers=mesa drivers=trace,llvmpipe winsys=gdi dri=false
+  scons debug=yes statetrackers=mesa drivers=llvmpipe winsys=gdi dri=false
 
 Using
 =====
diff --git a/src/gallium/drivers/llvmpipe/SConscript b/src/gallium/drivers/llvmpipe/SConscript
index a39283e5e8..13c1a13e87 100644
--- a/src/gallium/drivers/llvmpipe/SConscript
+++ b/src/gallium/drivers/llvmpipe/SConscript
@@ -18,6 +18,13 @@ env.CodeGenerate(
 	command = 'python $SCRIPT $SOURCE > $TARGET'
 )
 
+# XXX: Our dependency scanner only finds depended modules in relative dirs.
+env.Depends('lp_tile_soa.c', [
+    '#src/gallium/auxiliary/util/u_format_parse.py', 
+    '#src/gallium/auxiliary/util/u_format_pack.py', 
+    '#src/gallium/auxiliary/util/u_format_access.py',
+])
+
 llvmpipe = env.ConvenienceLibrary(
 	target = 'llvmpipe',
 	source = [
@@ -52,6 +59,7 @@ llvmpipe = env.ConvenienceLibrary(
 		'lp_surface.c',
 		'lp_tex_sample_llvm.c',
 		'lp_texture.c',
+		'lp_tile_image.c',
 		'lp_tile_soa.c',
 	])
 
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c
index 6dbcb3c9b3..82c006d78b 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast.c
@@ -156,14 +156,13 @@ lp_rast_end( struct lp_rasterizer *rast )
  * \param y  window Y position of the tile, in pixels
  */
 static void
-lp_rast_start_tile( struct lp_rasterizer *rast,
-                    unsigned thread_index,
-                    unsigned x, unsigned y )
+lp_rast_start_tile(struct lp_rasterizer_task *task,
+                   unsigned x, unsigned y)
 {
    LP_DBG(DEBUG_RAST, "%s %d,%d\n", __FUNCTION__, x, y);
 
-   rast->tasks[thread_index].x = x;
-   rast->tasks[thread_index].y = y;
+   task->x = x;
+   task->y = y;
 }
 
 
@@ -171,12 +170,13 @@ lp_rast_start_tile( struct lp_rasterizer *rast,
  * Clear the rasterizer's current color tile.
  * This is a bin command called during bin processing.
  */
-void lp_rast_clear_color( struct lp_rasterizer *rast,
-                          unsigned thread_index,
-                          const union lp_rast_cmd_arg arg )
+void
+lp_rast_clear_color(struct lp_rasterizer_task *task,
+                    const union lp_rast_cmd_arg arg)
 {
+   struct lp_rasterizer *rast = task->rast;
    const uint8_t *clear_color = arg.clear_color;
-   uint8_t **color_tile = rast->tasks[thread_index].tile.color;
+   uint8_t **color_tile = task->tile.color;
    unsigned i;
 
    LP_DBG(DEBUG_RAST, "%s 0x%x,0x%x,0x%x,0x%x\n", __FUNCTION__, 
@@ -225,11 +225,11 @@ void lp_rast_clear_color( struct lp_rasterizer *rast,
  * Clear the rasterizer's current z/stencil tile.
  * This is a bin command called during bin processing.
  */
-void lp_rast_clear_zstencil( struct lp_rasterizer *rast,
-                             unsigned thread_index,
-                             const union lp_rast_cmd_arg arg)
+void
+lp_rast_clear_zstencil(struct lp_rasterizer_task *task,
+                       const union lp_rast_cmd_arg arg)
 {
-   struct lp_rasterizer_task *task = &rast->tasks[thread_index];
+   struct lp_rasterizer *rast = task->rast;
    const unsigned tile_x = task->x;
    const unsigned tile_y = task->y;
    const unsigned height = TILE_SIZE/TILE_VECTOR_HEIGHT;
@@ -288,13 +288,12 @@ void lp_rast_clear_zstencil( struct lp_rasterizer *rast,
  * Load tile color from the framebuffer surface.
  * This is a bin command called during bin processing.
  */
-void lp_rast_load_color( struct lp_rasterizer *rast,
-                         unsigned thread_index,
-                         const union lp_rast_cmd_arg arg)
+void
+lp_rast_load_color(struct lp_rasterizer_task *task,
+                   const union lp_rast_cmd_arg arg)
 {
-   struct lp_rasterizer_task *task = &rast->tasks[thread_index];
-   const unsigned x = task->x;
-   const unsigned y = task->y;
+   struct lp_rasterizer *rast = task->rast;
+   const unsigned x = task->x, y = task->y;
    unsigned i;
 
    LP_DBG(DEBUG_RAST, "%s at %u, %u\n", __FUNCTION__, x, y);
@@ -304,10 +303,7 @@ void lp_rast_load_color( struct lp_rasterizer *rast,
       int w = TILE_SIZE;
       int h = TILE_SIZE;
 
-      if (x >= transfer->width)
-	 continue;
-
-      if (y >= transfer->height)
+      if (x >= transfer->width || y >= transfer->height)
 	 continue;
 
       assert(w >= 0);
@@ -327,16 +323,16 @@ void lp_rast_load_color( struct lp_rasterizer *rast,
 }
 
 
-void lp_rast_set_state( struct lp_rasterizer *rast,
-                        unsigned thread_index,
-                        const union lp_rast_cmd_arg arg )
+void
+lp_rast_set_state(struct lp_rasterizer_task *task,
+                  const union lp_rast_cmd_arg arg)
 {
    const struct lp_rast_state *state = arg.set_state;
 
    LP_DBG(DEBUG_RAST, "%s %p\n", __FUNCTION__, (void *) state);
 
    /* just set the current state pointer for this rasterizer */
-   rast->tasks[thread_index].current_state = state;
+   task->current_state = state;
 }
 
 
@@ -346,16 +342,15 @@ void lp_rast_set_state( struct lp_rasterizer *rast,
  * completely contained inside a triangle.
  * This is a bin command called during bin processing.
  */
-void lp_rast_shade_tile( struct lp_rasterizer *rast,
-                         unsigned thread_index,
-                         const union lp_rast_cmd_arg arg )
+void
+lp_rast_shade_tile(struct lp_rasterizer_task *task,
+                   const union lp_rast_cmd_arg arg)
 {
-   struct lp_rasterizer_task *task = &rast->tasks[thread_index];
+   struct lp_rasterizer *rast = task->rast;
    const struct lp_rast_state *state = task->current_state;
    struct lp_rast_tile *tile = &task->tile;
    const struct lp_rast_shader_inputs *inputs = arg.shade_tile;
-   const unsigned tile_x = task->x;
-   const unsigned tile_y = task->y;
+   const unsigned tile_x = task->x, tile_y = task->y;
    unsigned x, y;
 
    LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__);
@@ -396,14 +391,13 @@ void lp_rast_shade_tile( struct lp_rasterizer *rast,
  * Compute shading for a 4x4 block of pixels.
  * This is a bin command called during bin processing.
  */
-void lp_rast_shade_quads( struct lp_rasterizer *rast,
-                          unsigned thread_index,
+void lp_rast_shade_quads( struct lp_rasterizer_task *task,
                           const struct lp_rast_shader_inputs *inputs,
                           unsigned x, unsigned y,
                           int32_t c1, int32_t c2, int32_t c3)
 {
-   struct lp_rasterizer_task *task = &rast->tasks[thread_index];
    const struct lp_rast_state *state = task->current_state;
+   struct lp_rasterizer *rast = task->rast;
    struct lp_rast_tile *tile = &task->tile;
    uint8_t *color[PIPE_MAX_COLOR_BUFS];
    void *depth;
@@ -515,12 +509,11 @@ outline_subtiles(uint8_t *tile)
 /**
  * Write the rasterizer's color tile to the framebuffer.
  */
-static void lp_rast_store_color( struct lp_rasterizer *rast,
-                                 unsigned thread_index)
+static void
+lp_rast_store_color(struct lp_rasterizer_task *task)
 {
-   struct lp_rasterizer_task *task = &rast->tasks[thread_index];
-   const unsigned x = task->x;
-   const unsigned y = task->y;
+   struct lp_rasterizer *rast = task->rast;
+   const unsigned x = task->x, y = task->y;
    unsigned i;
 
    for (i = 0; i < rast->state.fb.nr_cbufs; i++) {
@@ -535,7 +528,7 @@ static void lp_rast_store_color( struct lp_rasterizer *rast,
 	 continue;
 
       LP_DBG(DEBUG_RAST, "%s [%u] %d,%d %dx%d\n", __FUNCTION__,
-	     thread_index, x, y, w, h);
+	     task->thread_index, x, y, w, h);
 
       if (LP_DEBUG & DEBUG_SHOW_SUBTILES)
          outline_subtiles(task->tile.color[i]);
@@ -558,13 +551,14 @@ static void lp_rast_store_color( struct lp_rasterizer *rast,
  * Write the rasterizer's tiles to the framebuffer.
  */
 static void
-lp_rast_end_tile( struct lp_rasterizer *rast,
-                  unsigned thread_index )
+lp_rast_end_tile(struct lp_rasterizer_task *task)
 {
+   struct lp_rasterizer *rast = task->rast;
+
    LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__);
 
    if (rast->state.write_color)
-      lp_rast_store_color(rast, thread_index);
+      lp_rast_store_color(task);
 }
 
 
@@ -572,9 +566,9 @@ lp_rast_end_tile( struct lp_rasterizer *rast,
  * Signal on a fence.  This is called during bin execution/rasterization.
  * Called per thread.
  */
-void lp_rast_fence( struct lp_rasterizer *rast,
-                    unsigned thread_index,
-                    const union lp_rast_cmd_arg arg )
+void
+lp_rast_fence(struct lp_rasterizer_task *task,
+              const union lp_rast_cmd_arg arg)
 {
    struct lp_fence *fence = arg.fence;
 
@@ -603,6 +597,9 @@ release_scene( struct lp_rasterizer *rast,
    util_unreference_framebuffer_state( &scene->fb );
 
    lp_scene_reset( scene );
+
+   assert(lp_scene_is_empty(scene));
+
    lp_scene_enqueue( rast->empty_scenes, scene );
    rast->curr_scene = NULL;
 }
@@ -615,25 +612,24 @@ release_scene( struct lp_rasterizer *rast,
  * Called per thread.
  */
 static void
-rasterize_bin( struct lp_rasterizer *rast,
-               unsigned thread_index,
-               const struct cmd_bin *bin,
-               int x, int y)
+rasterize_bin(struct lp_rasterizer_task *task,
+              const struct cmd_bin *bin,
+              int x, int y)
 {
    const struct cmd_block_list *commands = &bin->commands;
    struct cmd_block *block;
    unsigned k;
 
-   lp_rast_start_tile( rast, thread_index, x, y );
+   lp_rast_start_tile( task, x, y );
 
    /* simply execute each of the commands in the block list */
    for (block = commands->head; block; block = block->next) {
       for (k = 0; k < block->count; k++) {
-         block->cmd[k]( rast, thread_index, block->arg[k] );
+         block->cmd[k]( task, block->arg[k] );
       }
    }
 
-   lp_rast_end_tile( rast, thread_index );
+   lp_rast_end_tile( task );
 }
 
 
@@ -717,10 +713,9 @@ is_empty_bin( const struct cmd_bin *bin )
  * Called per thread.
  */
 static void
-rasterize_scene( struct lp_rasterizer *rast,
-                unsigned thread_index,
+rasterize_scene(struct lp_rasterizer_task *task,
                 struct lp_scene *scene,
-                bool write_depth )
+                bool write_depth)
 {
    /* loop over scene bins, rasterize each */
 #if 0
@@ -728,9 +723,8 @@ rasterize_scene( struct lp_rasterizer *rast,
       unsigned i, j;
       for (i = 0; i < scene->tiles_x; i++) {
          for (j = 0; j < scene->tiles_y; j++) {
-            struct cmd_bin *bin = lp_get_bin(scene, i, j);
-            rasterize_bin( rast, thread_index,
-                           bin, i * TILE_SIZE, j * TILE_SIZE );
+            struct cmd_bin *bin = lp_scene_get_bin(scene, i, j);
+            rasterize_bin(task, bin, i * TILE_SIZE, j * TILE_SIZE);
          }
       }
    }
@@ -742,7 +736,7 @@ rasterize_scene( struct lp_rasterizer *rast,
       assert(scene);
       while ((bin = lp_scene_bin_iter_next(scene, &x, &y))) {
          if (!is_empty_bin( bin ))
-            rasterize_bin( rast, thread_index, bin, x * TILE_SIZE, y * TILE_SIZE);
+            rasterize_bin(task, bin, x * TILE_SIZE, y * TILE_SIZE);
       }
    }
 #endif
@@ -786,7 +780,7 @@ lp_rasterize_scene( struct lp_rasterizer *rast,
                      fb->zsbuf != NULL && write_depth );
 
       lp_scene_bin_iter_begin( scene );
-      rasterize_scene( rast, 0, scene, write_depth );
+      rasterize_scene( &rast->tasks[0], scene, write_depth );
 
       release_scene( rast, scene );
 
@@ -832,6 +826,9 @@ static PIPE_THREAD_ROUTINE( thread_func, init_data )
          debug_printf("thread %d waiting for work\n", task->thread_index);
       pipe_semaphore_wait(&task->work_ready);
 
+      if (rast->exit_flag)
+         break;
+
       if (task->thread_index == 0) {
          /* thread[0]:
           *  - get next scene to rasterize
@@ -860,10 +857,9 @@ static PIPE_THREAD_ROUTINE( thread_func, init_data )
       /* do work */
       if (debug)
          debug_printf("thread %d doing work\n", task->thread_index);
-      rasterize_scene(rast, 
-		     task->thread_index,
-                     rast->curr_scene, 
-		     rast->curr_scene->write_depth);
+      rasterize_scene(task,
+                      rast->curr_scene, 
+                      rast->curr_scene->write_depth);
       
       /* wait for all threads to finish with this scene */
       pipe_barrier_wait( &rast->barrier );
@@ -968,6 +964,20 @@ void lp_rast_destroy( struct lp_rasterizer *rast )
 	 align_free(rast->tasks[i].tile.color[cbuf]);
    }
 
+   /* Set exit_flag and signal each thread's work_ready semaphore.
+    * Each thread will be woken up, notice that the exit_flag is set and
+    * break out of its main loop.  The thread will then exit.
+    */
+   rast->exit_flag = TRUE;
+   for (i = 0; i < rast->num_threads; i++) {
+      pipe_semaphore_signal(&rast->tasks[i].work_ready);
+   }
+
+   for (i = 0; i < rast->num_threads; i++) {
+      pipe_semaphore_destroy(&rast->tasks[i].work_ready);
+      pipe_semaphore_destroy(&rast->tasks[i].work_done);
+   }
+
    /* for synchronizing rasterization threads */
    pipe_barrier_destroy( &rast->barrier );
 
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h
index 875f18e0c0..1ed2700191 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast.h
@@ -53,6 +53,9 @@ struct pipe_screen;
 #define FIXED_ONE (1<<FIXED_ORDER)
 
 
+struct lp_rasterizer_task;
+
+
 /**
  * Rasterization state.
  * Objects of this type are put into the shared data bin and pointed
@@ -201,32 +204,25 @@ lp_rast_arg_null( void )
  * the bins are executed.
  */
 
-void lp_rast_clear_color( struct lp_rasterizer *, 
-                          unsigned thread_index,
+void lp_rast_clear_color( struct lp_rasterizer_task *, 
                           const union lp_rast_cmd_arg );
 
-void lp_rast_clear_zstencil( struct lp_rasterizer *, 
-                             unsigned thread_index,
+void lp_rast_clear_zstencil( struct lp_rasterizer_task *, 
                              const union lp_rast_cmd_arg );
 
-void lp_rast_load_color( struct lp_rasterizer *, 
-                         unsigned thread_index,
+void lp_rast_load_color( struct lp_rasterizer_task *, 
                          const union lp_rast_cmd_arg );
 
-void lp_rast_set_state( struct lp_rasterizer *, 
-                        unsigned thread_index,
+void lp_rast_set_state( struct lp_rasterizer_task *, 
                         const union lp_rast_cmd_arg );
 
-void lp_rast_triangle( struct lp_rasterizer *, 
-                       unsigned thread_index,
+void lp_rast_triangle( struct lp_rasterizer_task *, 
                        const union lp_rast_cmd_arg );
 
-void lp_rast_shade_tile( struct lp_rasterizer *,
-                         unsigned thread_index,
+void lp_rast_shade_tile( struct lp_rasterizer_task *,
                          const union lp_rast_cmd_arg );
 
-void lp_rast_fence( struct lp_rasterizer *,
-                    unsigned thread_index,
+void lp_rast_fence( struct lp_rasterizer_task *,
                     const union lp_rast_cmd_arg );
 
 #endif
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_priv.h b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
index 5c5497e092..abc5a9ad89 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_priv.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
@@ -84,6 +84,7 @@ struct lp_rasterizer
 {
    boolean clipped_tile;
    boolean check_for_clipped_tiles;
+   boolean exit_flag;
 
    /* Framebuffer stuff
     */
@@ -121,8 +122,7 @@ struct lp_rasterizer
 };
 
 
-void lp_rast_shade_quads( struct lp_rasterizer *rast,
-                          unsigned thread_index,
+void lp_rast_shade_quads( struct lp_rasterizer_task *task,
                           const struct lp_rast_shader_inputs *inputs,
                           unsigned x, unsigned y,
                           int32_t c1, int32_t c2, int32_t c3);
@@ -159,13 +159,13 @@ lp_rast_depth_pointer( struct lp_rasterizer *rast,
  * \param x, y location of 4x4 block in window coords
  */
 static INLINE void
-lp_rast_shade_quads_all( struct lp_rasterizer *rast,
-                         unsigned thread_index,
+lp_rast_shade_quads_all( struct lp_rasterizer_task *task,
                          const struct lp_rast_shader_inputs *inputs,
                          unsigned x, unsigned y )
 {
-   const struct lp_rast_state *state = rast->tasks[thread_index].current_state;
-   struct lp_rast_tile *tile = &rast->tasks[thread_index].tile;
+   struct lp_rasterizer *rast = task->rast;
+   const struct lp_rast_state *state = task->current_state;
+   struct lp_rast_tile *tile = &task->tile;
    const unsigned ix = x % TILE_SIZE, iy = y % TILE_SIZE;
    uint8_t *color[PIPE_MAX_COLOR_BUFS];
    void *depth;
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
index 0334705ef7..a5f0d14c95 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
@@ -89,14 +89,11 @@ static const int pos_table16[16][2] = {
  * Shade all pixels in a 4x4 block.
  */
 static void
-block_full_4( struct lp_rasterizer_task *rast_task,
-              const struct lp_rast_triangle *tri,
-              int x, int y )
+block_full_4(struct lp_rasterizer_task *task,
+             const struct lp_rast_triangle *tri,
+             int x, int y)
 {
-   lp_rast_shade_quads_all(rast_task->rast,
-                           rast_task->thread_index,
-                           &tri->inputs, 
-                           x, y);
+   lp_rast_shade_quads_all(task, &tri->inputs, x, y);
 }
 
 
@@ -104,16 +101,16 @@ block_full_4( struct lp_rasterizer_task *rast_task,
  * Shade all pixels in a 16x16 block.
  */
 static void
-block_full_16( struct lp_rasterizer_task *rast_task,
-               const struct lp_rast_triangle *tri,
-               int x, int y )
+block_full_16(struct lp_rasterizer_task *task,
+              const struct lp_rast_triangle *tri,
+              int x, int y)
 {
    unsigned ix, iy;
    assert(x % 16 == 0);
    assert(y % 16 == 0);
    for (iy = 0; iy < 16; iy += 4)
       for (ix = 0; ix < 16; ix += 4)
-	 block_full_4(rast_task, tri, x + ix, y + iy);
+	 block_full_4(task, tri, x + ix, y + iy);
 }
 
 
@@ -123,18 +120,15 @@ block_full_16( struct lp_rasterizer_task *rast_task,
  * will be done as part of the fragment shader.
  */
 static void
-do_block_4( struct lp_rasterizer_task *rast_task,
-	    const struct lp_rast_triangle *tri,
-	    int x, int y,
-	    int c1,
-	    int c2,
-	    int c3 )
+do_block_4(struct lp_rasterizer_task *task,
+           const struct lp_rast_triangle *tri,
+           int x, int y,
+           int c1, int c2, int c3)
 {
-   lp_rast_shade_quads(rast_task->rast,
-                       rast_task->thread_index,
-                       &tri->inputs, 
-                       x, y,
-                       -c1, -c2, -c3);
+   assert(x >= 0);
+   assert(y >= 0);
+
+   lp_rast_shade_quads(task, &tri->inputs, x, y, -c1, -c2, -c3);
 }
 
 
@@ -143,18 +137,18 @@ do_block_4( struct lp_rasterizer_task *rast_task,
  * of the triangle's bounds.
  */
 static void
-do_block_16( struct lp_rasterizer_task *rast_task,
-             const struct lp_rast_triangle *tri,
-             int x, int y,
-             int c0,
-             int c1,
-             int c2 )
+do_block_16(struct lp_rasterizer_task *task,
+            const struct lp_rast_triangle *tri,
+            int x, int y,
+            int c0, int c1, int c2)
 {
    unsigned mask = 0;
    int eo[3];
    int c[3];
    int i, j;
 
+   assert(x >= 0);
+   assert(y >= 0);
    assert(x % 16 == 0);
    assert(y % 16 == 0);
 
@@ -193,7 +187,7 @@ do_block_16( struct lp_rasterizer_task *rast_task,
        * the triangle.  It's a little faster to do it in the jit code.
        */
       LP_COUNT(nr_non_empty_4);
-      do_block_4(rast_task, tri, px, py, cx1, cx2, cx3);
+      do_block_4(task, tri, px, py, cx1, cx2, cx3);
    }
 }
 
@@ -203,15 +197,11 @@ do_block_16( struct lp_rasterizer_task *rast_task,
  * for this triangle.
  */
 void
-lp_rast_triangle( struct lp_rasterizer *rast,
-                  unsigned thread_index,
-                  const union lp_rast_cmd_arg arg )
+lp_rast_triangle(struct lp_rasterizer_task *task,
+                 const union lp_rast_cmd_arg arg)
 {
-   struct lp_rasterizer_task *rast_task = &rast->tasks[thread_index];
    const struct lp_rast_triangle *tri = arg.triangle;
-
-   int x = rast_task->x;
-   int y = rast_task->y;
+   const int x = task->x, y = task->y;
    int ei[3], eo[3], c[3];
    unsigned outmask, inmask, partial_mask;
    unsigned i, j;
@@ -272,7 +262,7 @@ lp_rast_triangle( struct lp_rasterizer *rast,
       partial_mask &= ~(1 << i);
 
       LP_COUNT(nr_partially_covered_16);
-      do_block_16(rast_task, tri, px, py, cx1, cx2, cx3);
+      do_block_16(task, tri, px, py, cx1, cx2, cx3);
    }
 
    /* Iterate over fulls: 
@@ -285,6 +275,6 @@ lp_rast_triangle( struct lp_rasterizer *rast,
       inmask &= ~(1 << i);
 
       LP_COUNT(nr_fully_covered_16);
-      block_full_16(rast_task, tri, px, py);
+      block_full_16(task, tri, px, py);
    }
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_scene.c b/src/gallium/drivers/llvmpipe/lp_scene.c
index b7116297ec..cba0e21298 100644
--- a/src/gallium/drivers/llvmpipe/lp_scene.c
+++ b/src/gallium/drivers/llvmpipe/lp_scene.c
@@ -100,6 +100,9 @@ lp_scene_bin_reset(struct lp_scene *scene, unsigned x, unsigned y)
    struct cmd_block *block;
    struct cmd_block *tmp;
 
+   assert(x < TILES_X);
+   assert(y < TILES_Y);
+
    for (block = list->head; block != list->tail; block = tmp) {
       tmp = block->next;
       FREE(block);
diff --git a/src/gallium/drivers/llvmpipe/lp_scene.h b/src/gallium/drivers/llvmpipe/lp_scene.h
index fb478cc2eb..8d725cd437 100644
--- a/src/gallium/drivers/llvmpipe/lp_scene.h
+++ b/src/gallium/drivers/llvmpipe/lp_scene.h
@@ -56,8 +56,7 @@
 
 /* switch to a non-pointer value for this:
  */
-typedef void (*lp_rast_cmd)( struct lp_rasterizer *,
-                             unsigned thread_index,
+typedef void (*lp_rast_cmd)( struct lp_rasterizer_task *,
                              const union lp_rast_cmd_arg );
 
 struct cmd_block {
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index 1cd3ea9a84..f84ede675b 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -83,7 +83,7 @@ llvmpipe_get_param(struct pipe_screen *screen, int param)
    case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
       return PIPE_MAX_SAMPLERS;
    case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS:
-      return PIPE_MAX_VERTEX_SAMPLERS;
+      return 0;
    case PIPE_CAP_MAX_COMBINED_SAMPLERS:
       return PIPE_MAX_SAMPLERS + PIPE_MAX_VERTEX_SAMPLERS;
    case PIPE_CAP_NPOT_TEXTURES:
@@ -194,9 +194,7 @@ llvmpipe_is_format_supported( struct pipe_screen *_screen,
          format_desc->block.height != 1)
          return FALSE;
 
-      if(format_desc->layout != UTIL_FORMAT_LAYOUT_SCALAR &&
-         format_desc->layout != UTIL_FORMAT_LAYOUT_ARITH &&
-         format_desc->layout != UTIL_FORMAT_LAYOUT_ARRAY)
+      if(format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN)
          return FALSE;
 
       if(format_desc->colorspace != UTIL_FORMAT_COLORSPACE_RGB &&
@@ -224,15 +222,16 @@ llvmpipe_is_format_supported( struct pipe_screen *_screen,
          format_desc->block.height != 1)
          return FALSE;
 
-      if(format_desc->layout != UTIL_FORMAT_LAYOUT_SCALAR &&
-         format_desc->layout != UTIL_FORMAT_LAYOUT_ARITH &&
-         format_desc->layout != UTIL_FORMAT_LAYOUT_ARRAY)
+      if(format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN)
          return FALSE;
 
       if(format_desc->colorspace != UTIL_FORMAT_COLORSPACE_RGB &&
-         format_desc->colorspace != UTIL_FORMAT_COLORSPACE_SRGB &&
          format_desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS)
          return FALSE;
+
+      /* not supported yet */
+      if (format == PIPE_FORMAT_Z16_UNORM)
+         return FALSE;
    }
 
    return TRUE;
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
index cb873667a2..3aec9de373 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -64,6 +64,8 @@ lp_setup_get_current_scene(struct setup_context *setup)
        */
       setup->scene = lp_scene_dequeue(setup->empty_scenes, TRUE);
 
+      assert(lp_scene_is_empty(setup->scene));
+
       if(0)lp_scene_reset( setup->scene ); /* XXX temporary? */
 
       lp_scene_set_framebuffer_size(setup->scene,
diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c b/src/gallium/drivers/llvmpipe/lp_texture.c
index 022bf92cb4..7f45635542 100644
--- a/src/gallium/drivers/llvmpipe/lp_texture.c
+++ b/src/gallium/drivers/llvmpipe/lp_texture.c
@@ -124,12 +124,6 @@ llvmpipe_texture_create(struct pipe_screen *_screen,
    pipe_reference_init(&lpt->base.reference, 1);
    lpt->base.screen = &screen->base;
 
-   /* XXX: The xlib state tracker is brain-dead and will request
-    * PIPE_FORMAT_Z16_UNORM no matter how much we tell it we don't support it.
-    */
-   if (lpt->base.format == PIPE_FORMAT_Z16_UNORM)
-      lpt->base.format = PIPE_FORMAT_Z32_UNORM;
-
    if (lpt->base.tex_usage & (PIPE_TEXTURE_USAGE_DISPLAY_TARGET |
                               PIPE_TEXTURE_USAGE_PRIMARY)) {
       if (!llvmpipe_displaytarget_layout(screen, lpt))
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_image.c b/src/gallium/drivers/llvmpipe/lp_tile_image.c
new file mode 100644
index 0000000000..c1980b316d
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_tile_image.c
@@ -0,0 +1,126 @@
+/**************************************************************************
+ * 
+ * Copyright 2010 VMware, Inc.  All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "lp_tile_soa.h"
+#include "lp_tile_image.h"
+
+
+#define BYTES_PER_TILE (TILE_SIZE * TILE_SIZE * 4)
+
+
+/**
+ * Convert a tiled image into a linear image.
+ * \param src_stride  source row stride in bytes (bytes per row of tiles)
+ * \param dst_stride  dest row stride in bytes
+ */
+void
+lp_tiled_to_linear(const uint8_t *src,
+                   uint8_t *dst,
+                   unsigned width, unsigned height,
+                   enum pipe_format format,
+                   unsigned src_stride,
+                   unsigned dst_stride)
+{
+   const unsigned tiles_per_row = src_stride / BYTES_PER_TILE;
+   unsigned i, j;
+
+   for (j = 0; j < height; j += TILE_SIZE) {
+      for (i = 0; i < width; i += TILE_SIZE) {
+         unsigned tile_offset =
+            ((j / TILE_SIZE) * tiles_per_row + i / TILE_SIZE);
+         unsigned byte_offset = tile_offset * BYTES_PER_TILE;
+         const uint8_t *src_tile = src + byte_offset;
+
+         lp_tile_write_4ub(format,
+                           src_tile,
+                           dst,
+                           dst_stride,
+                           i, j, TILE_SIZE, TILE_SIZE);
+      }
+   }
+}
+
+
+/**
+ * Convert a linear image into a tiled image.
+ * \param src_stride  source row stride in bytes
+ * \param dst_stride  dest row stride in bytes (bytes per row of tiles)
+ */
+void
+lp_linear_to_tiled(const uint8_t *src,
+                   uint8_t *dst,
+                   unsigned width, unsigned height,
+                   enum pipe_format format,
+                   unsigned src_stride,
+                   unsigned dst_stride)
+{
+   const unsigned tiles_per_row = dst_stride / BYTES_PER_TILE;
+   unsigned i, j;
+
+   for (j = 0; j < height; j += TILE_SIZE) {
+      for (i = 0; i < width; i += TILE_SIZE) {
+         unsigned tile_offset =
+            ((j / TILE_SIZE) * tiles_per_row + i / TILE_SIZE);
+         unsigned byte_offset = tile_offset * BYTES_PER_TILE;
+         uint8_t *dst_tile = dst + byte_offset;
+
+         lp_tile_read_4ub(format,
+                          dst_tile,
+                          src,
+                          src_stride,
+                          i, j, TILE_SIZE, TILE_SIZE);
+      }
+   }
+}
+
+
+/**
+ * For testing only.
+ */
+void
+test_tiled_linear_conversion(uint8_t *data,
+                             enum pipe_format format,
+                             unsigned width, unsigned height,
+                             unsigned stride)
+{
+   /* size in tiles */
+   unsigned wt = (width + TILE_SIZE - 1) / TILE_SIZE;
+   unsigned ht = (height + TILE_SIZE - 1) / TILE_SIZE;
+
+   uint8_t *tiled = malloc(wt * ht * TILE_SIZE * TILE_SIZE * 4);
+
+   unsigned tiled_stride = wt * TILE_SIZE * TILE_SIZE * 4;
+
+   lp_linear_to_tiled(data, tiled, width, height, format,
+                      stride, tiled_stride);
+
+   lp_tiled_to_linear(tiled, data, width, height, format,
+                      tiled_stride, stride);
+
+   free(tiled);
+}
+
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_image.h b/src/gallium/drivers/llvmpipe/lp_tile_image.h
new file mode 100644
index 0000000000..60d472e8c5
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_tile_image.h
@@ -0,0 +1,57 @@
+/**************************************************************************
+ * 
+ * Copyright 2010 VMware, Inc.  All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef LP_TILE_IMAGE_H
+#define LP_TILE_IMAGE_H
+
+
+void
+lp_tiled_to_linear(const uint8_t *src,
+                   uint8_t *dst,
+                   unsigned width, unsigned height,
+                   enum pipe_format format,
+                   unsigned src_stride,
+                   unsigned dst_stride);
+
+
+void
+lp_linear_to_tiled(const uint8_t *src,
+                   uint8_t *dst,
+                   unsigned width, unsigned height,
+                   enum pipe_format format,
+                   unsigned src_stride,
+                   unsigned dst_stride);
+
+
+void
+test_tiled_linear_conversion(uint8_t *data,
+                             enum pipe_format format,
+                             unsigned width, unsigned height,
+                             unsigned stride);
+
+
+#endif /* LP_TILE_IMAGE_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_soa.py b/src/gallium/drivers/llvmpipe/lp_tile_soa.py
index 5d53689a3d..00b8d4fc38 100644
--- a/src/gallium/drivers/llvmpipe/lp_tile_soa.py
+++ b/src/gallium/drivers/llvmpipe/lp_tile_soa.py
@@ -45,10 +45,10 @@ sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), '../../auxiliary/u
 from u_format_access import *
 
 
-def generate_format_read(format, dst_type, dst_native_type, dst_suffix):
+def generate_format_read(format, dst_channel, dst_native_type, dst_suffix):
     '''Generate the function to read pixels from a particular format'''
 
-    name = short_name(format)
+    name = format.short_name()
 
     src_native_type = native_type(format)
 
@@ -64,11 +64,11 @@ def generate_format_read(format, dst_type, dst_native_type, dst_suffix):
     names = ['']*4
     if format.colorspace == 'rgb':
         for i in range(4):
-            swizzle = format.out_swizzle[i]
+            swizzle = format.swizzles[i]
             if swizzle < 4:
                 names[swizzle] += 'rgba'[i]
     elif format.colorspace == 'zs':
-        swizzle = format.out_swizzle[0]
+        swizzle = format.swizzles[0]
         if swizzle < 4:
             names[swizzle] = 'z'
         else:
@@ -76,48 +76,49 @@ def generate_format_read(format, dst_type, dst_native_type, dst_suffix):
     else:
         assert False
 
-    if format.layout == ARITH:
-        print '         %s pixel = *src_pixel++;' % src_native_type
-        shift = 0;
-        for i in range(4):
-            src_type = format.in_types[i]
-            width = src_type.size
-            if names[i]:
-                value = 'pixel'
-                mask = (1 << width) - 1
-                if shift:
-                    value = '(%s >> %u)' % (value, shift)
-                if shift + width < format.block_size():
-                    value = '(%s & 0x%x)' % (value, mask)
-                value = conversion_expr(src_type, dst_type, dst_native_type, value)
-                print '         %s %s = %s;' % (dst_native_type, names[i], value)
-            shift += width
-    elif format.layout == ARRAY:
-        for i in range(4):
-            src_type = format.in_types[i]
-            if names[i]:
-                value = '(*src_pixel++)'
-                value = conversion_expr(src_type, dst_type, dst_native_type, value)
-                print '         %s %s = %s;' % (dst_native_type, names[i], value)
+    if format.layout == PLAIN:
+        if not format.is_array():
+            print '         %s pixel = *src_pixel++;' % src_native_type
+            shift = 0;
+            for i in range(4):
+                src_channel = format.channels[i]
+                width = src_channel.size
+                if names[i]:
+                    value = 'pixel'
+                    mask = (1 << width) - 1
+                    if shift:
+                        value = '(%s >> %u)' % (value, shift)
+                    if shift + width < format.block_size():
+                        value = '(%s & 0x%x)' % (value, mask)
+                    value = conversion_expr(src_channel, dst_channel, dst_native_type, value, clamp=False)
+                    print '         %s %s = %s;' % (dst_native_type, names[i], value)
+                shift += width
+        else:
+            for i in range(4):
+                src_channel = format.channels[i]
+                if names[i]:
+                    value = '(*src_pixel++)'
+                    value = conversion_expr(src_channel, dst_channel, dst_native_type, value, clamp=False)
+                    print '         %s %s = %s;' % (dst_native_type, names[i], value)
     else:
         assert False
 
     for i in range(4):
         if format.colorspace == 'rgb':
-            swizzle = format.out_swizzle[i]
+            swizzle = format.swizzles[i]
             if swizzle < 4:
                 value = names[swizzle]
             elif swizzle == SWIZZLE_0:
                 value = '0'
             elif swizzle == SWIZZLE_1:
-                value = '1'
+                value = get_one(dst_channel)
             else:
                 assert False
         elif format.colorspace == 'zs':
             if i < 3:
                 value = 'z'
             else:
-                value = '1'
+                value = get_one(dst_channel)
         else:
             assert False
         print '         TILE_PIXEL(dst, x, y, %u) = %s; /* %s */' % (i, value, 'rgba'[i])
@@ -129,31 +130,16 @@ def generate_format_read(format, dst_type, dst_native_type, dst_suffix):
     print
     
 
-def compute_inverse_swizzle(format):
-    '''Return an array[4] of inverse swizzle terms'''
-    inv_swizzle = [None]*4
-    if format.colorspace == 'rgb':
-        for i in range(4):
-            swizzle = format.out_swizzle[i]
-            if swizzle < 4:
-                inv_swizzle[swizzle] = i
-    elif format.colorspace == 'zs':
-        swizzle = format.out_swizzle[0]
-        if swizzle < 4:
-            inv_swizzle[swizzle] = 0
-    return inv_swizzle
-
-
-def pack_rgba(format, src_type, r, g, b, a):
+def pack_rgba(format, src_channel, r, g, b, a):
     """Return an expression for packing r, g, b, a into a pixel of the
     given format.  Ex: '(b << 24) | (g << 16) | (r << 8) | (a << 0)'
     """
     assert format.colorspace == 'rgb'
-    inv_swizzle = compute_inverse_swizzle(format)
+    inv_swizzle = format.inv_swizzles()
     shift = 0
     expr = None
     for i in range(4):
-		# choose r, g, b, or a depending on the inverse swizzle term
+        # choose r, g, b, or a depending on the inverse swizzle term
         if inv_swizzle[i] == 0:
             value = r
         elif inv_swizzle[i] == 1:
@@ -166,25 +152,25 @@ def pack_rgba(format, src_type, r, g, b, a):
             value = None
 
         if value:
-            dst_type = format.in_types[i]
+            dst_channel = format.channels[i]
             dst_native_type = native_type(format)
-            value = conversion_expr(src_type, dst_type, dst_native_type, value)
+            value = conversion_expr(src_channel, dst_channel, dst_native_type, value, clamp=False)
             term = "((%s) << %d)" % (value, shift)
             if expr:
                 expr = expr + " | " + term
             else:
                 expr = term
 
-        width = format.in_types[i].size
+        width = format.channels[i].size
         shift = shift + width
     return expr
 
 
-def emit_unrolled_write_code(format, src_type):
+def emit_unrolled_write_code(format, src_channel):
     '''Emit code for writing a block based on unrolled loops.
     This is considerably faster than the TILE_PIXEL-based code below.
     '''
-    dst_native_type = native_type(format)
+    dst_native_type = 'uint%u_t' % format.block_size()
     print '   const unsigned dstpix_stride = dst_stride / %d;' % format.stride()
     print '   %s *dstpix = (%s *) dst;' % (dst_native_type, dst_native_type)
     print '   unsigned int qx, qy, i;'
@@ -199,8 +185,8 @@ def emit_unrolled_write_code(format, src_type):
     print '         const uint8_t *a = src + 3 * TILE_C_STRIDE;'
     print '         (void) r; (void) g; (void) b; (void) a; /* silence warnings */'
     print '         for (i = 0; i < TILE_C_STRIDE; i += 2) {'
-    print '            const uint32_t pixel0 = %s;' % pack_rgba(format, src_type, "r[i+0]", "g[i+0]", "b[i+0]", "a[i+0]")
-    print '            const uint32_t pixel1 = %s;' % pack_rgba(format, src_type, "r[i+1]", "g[i+1]", "b[i+1]", "a[i+1]")
+    print '            const uint32_t pixel0 = %s;' % pack_rgba(format, src_channel, "r[i+0]", "g[i+0]", "b[i+0]", "a[i+0]")
+    print '            const uint32_t pixel1 = %s;' % pack_rgba(format, src_channel, "r[i+1]", "g[i+1]", "b[i+1]", "a[i+1]")
     print '            const unsigned offset = (py + tile_y_offset[i]) * dstpix_stride + (px + tile_x_offset[i]);'
     print '            dstpix[offset + 0] = pixel0;'
     print '            dstpix[offset + 1] = pixel1;'
@@ -210,11 +196,11 @@ def emit_unrolled_write_code(format, src_type):
     print '   }'
 
 
-def emit_tile_pixel_write_code(format, src_type):
+def emit_tile_pixel_write_code(format, src_channel):
     '''Emit code for writing a block based on the TILE_PIXEL macro.'''
     dst_native_type = native_type(format)
 
-    inv_swizzle = compute_inverse_swizzle(format)
+    inv_swizzle = format.inv_swizzles()
 
     print '   unsigned x, y;'
     print '   uint8_t *dst_row = dst + y0*dst_stride;'
@@ -222,27 +208,28 @@ def emit_tile_pixel_write_code(format, src_type):
     print '      %s *dst_pixel = (%s *)(dst_row + x0*%u);' % (dst_native_type, dst_native_type, format.stride())
     print '      for (x = 0; x < w; ++x) {'
 
-    if format.layout == ARITH:
-        print '         %s pixel = 0;' % dst_native_type
-        shift = 0;
-        for i in range(4):
-            dst_type = format.in_types[i]
-            width = dst_type.size
-            if inv_swizzle[i] is not None:
-                value = 'TILE_PIXEL(src, x, y, %u)' % inv_swizzle[i]
-                value = conversion_expr(src_type, dst_type, dst_native_type, value)
-                if shift:
-                    value = '(%s << %u)' % (value, shift)
-                print '         pixel |= %s;' % value
-            shift += width
-        print '         *dst_pixel++ = pixel;'
-    elif format.layout == ARRAY:
-        for i in range(4):
-            dst_type = format.in_types[i]
-            if inv_swizzle[i] is not None:
-                value = 'TILE_PIXEL(src, x, y, %u)' % inv_swizzle[i]
-                value = conversion_expr(src_type, dst_type, dst_native_type, value)
-                print '         *dst_pixel++ = %s;' % value
+    if format.layout == PLAIN:
+        if not format.is_array():
+            print '         %s pixel = 0;' % dst_native_type
+            shift = 0;
+            for i in range(4):
+                dst_channel = format.channels[i]
+                width = dst_channel.size
+                if inv_swizzle[i] is not None:
+                    value = 'TILE_PIXEL(src, x, y, %u)' % inv_swizzle[i]
+                    value = conversion_expr(src_channel, dst_channel, dst_native_type, value, clamp=False)
+                    if shift:
+                        value = '(%s << %u)' % (value, shift)
+                    print '         pixel |= %s;' % value
+                shift += width
+            print '         *dst_pixel++ = pixel;'
+        else:
+            for i in range(4):
+                dst_channel = format.channels[i]
+                if inv_swizzle[i] is not None:
+                    value = 'TILE_PIXEL(src, x, y, %u)' % inv_swizzle[i]
+                    value = conversion_expr(src_channel, dst_channel, dst_native_type, value, clamp=False)
+                    print '         *dst_pixel++ = %s;' % value
     else:
         assert False
 
@@ -251,28 +238,33 @@ def emit_tile_pixel_write_code(format, src_type):
     print '   }'
 
 
-def generate_format_write(format, src_type, src_native_type, src_suffix):
+def generate_format_write(format, src_channel, src_native_type, src_suffix):
     '''Generate the function to write pixels to a particular format'''
 
-    name = short_name(format)
+    name = format.short_name()
 
     print 'static void'
     print 'lp_tile_%s_write_%s(const %s *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)' % (name, src_suffix, src_native_type)
     print '{'
-    if format.layout == ARITH and format.colorspace == 'rgb':
-        emit_unrolled_write_code(format, src_type)
+    if format.layout == PLAIN \
+        and format.colorspace == 'rgb' \
+        and format.block_size() <= 32 \
+        and format.is_pot() \
+        and not format.is_mixed() \
+        and format.channels[0].type == UNSIGNED:
+        emit_unrolled_write_code(format, src_channel)
     else:
-        emit_tile_pixel_write_code(format, src_type)
+        emit_tile_pixel_write_code(format, src_channel)
     print '}'
     print
     
 
-def generate_read(formats, dst_type, dst_native_type, dst_suffix):
+def generate_read(formats, dst_channel, dst_native_type, dst_suffix):
     '''Generate the dispatch function to read pixels from any format'''
 
     for format in formats:
         if is_format_supported(format):
-            generate_format_read(format, dst_type, dst_native_type, dst_suffix)
+            generate_format_read(format, dst_channel, dst_native_type, dst_suffix)
 
     print 'void'
     print 'lp_tile_read_%s(enum pipe_format format, %s *dst, const void *src, unsigned src_stride, unsigned x, unsigned y, unsigned w, unsigned h)' % (dst_suffix, dst_native_type)
@@ -282,7 +274,7 @@ def generate_read(formats, dst_type, dst_native_type, dst_suffix):
     for format in formats:
         if is_format_supported(format):
             print '   case %s:' % format.name
-            print '      func = &lp_tile_%s_read_%s;' % (short_name(format), dst_suffix)
+            print '      func = &lp_tile_%s_read_%s;' % (format.short_name(), dst_suffix)
             print '      break;'
     print '   default:'
     print '      debug_printf("unsupported format\\n");'
@@ -293,12 +285,12 @@ def generate_read(formats, dst_type, dst_native_type, dst_suffix):
     print
 
 
-def generate_write(formats, src_type, src_native_type, src_suffix):
+def generate_write(formats, src_channel, src_native_type, src_suffix):
     '''Generate the dispatch function to write pixels to any format'''
 
     for format in formats:
         if is_format_supported(format):
-            generate_format_write(format, src_type, src_native_type, src_suffix)
+            generate_format_write(format, src_channel, src_native_type, src_suffix)
 
     print 'void'
     print 'lp_tile_write_%s(enum pipe_format format, const %s *src, void *dst, unsigned dst_stride, unsigned x, unsigned y, unsigned w, unsigned h)' % (src_suffix, src_native_type)
@@ -309,7 +301,7 @@ def generate_write(formats, src_type, src_native_type, src_suffix):
     for format in formats:
         if is_format_supported(format):
             print '   case %s:' % format.name
-            print '      func = &lp_tile_%s_write_%s;' % (short_name(format), src_suffix)
+            print '      func = &lp_tile_%s_write_%s;' % (format.short_name(), src_suffix)
             print '      break;'
     print '   default:'
     print '      debug_printf("unsupported format\\n");'
@@ -359,12 +351,12 @@ def main():
 
     generate_clamp()
 
-    type = Type(UNSIGNED, True, 8)
+    channel = Channel(UNSIGNED, True, 8)
     native_type = 'uint8_t'
     suffix = '4ub'
 
-    generate_read(formats, type, native_type, suffix)
-    generate_write(formats, type, native_type, suffix)
+    generate_read(formats, channel, native_type, suffix)
+    generate_write(formats, channel, native_type, suffix)
 
 
 if __name__ == '__main__':
diff --git a/src/gallium/drivers/nouveau/nouveau_context.c b/src/gallium/drivers/nouveau/nouveau_context.c
index 23443869e6..15174983e7 100644
--- a/src/gallium/drivers/nouveau/nouveau_context.c
+++ b/src/gallium/drivers/nouveau/nouveau_context.c
@@ -1,5 +1,5 @@
-#include <pipe/p_defines.h>
-#include <pipe/p_context.h>
+#include "pipe/p_defines.h"
+#include "pipe/p_context.h"
 
 #include "nouveau/nouveau_screen.h"
 #include "nouveau/nouveau_context.h"
diff --git a/src/gallium/drivers/nouveau/nouveau_screen.c b/src/gallium/drivers/nouveau/nouveau_screen.c
index 81bc296ab4..3c2f771b51 100644
--- a/src/gallium/drivers/nouveau/nouveau_screen.c
+++ b/src/gallium/drivers/nouveau/nouveau_screen.c
@@ -1,9 +1,9 @@
-#include <pipe/p_defines.h>
-#include <pipe/p_screen.h>
-#include <pipe/p_state.h>
+#include "pipe/p_defines.h"
+#include "pipe/p_screen.h"
+#include "pipe/p_state.h"
 
-#include <util/u_memory.h>
-#include <util/u_inlines.h>
+#include "util/u_memory.h"
+#include "util/u_inlines.h"
 
 #include <stdio.h>
 #include <errno.h>
diff --git a/src/gallium/drivers/nv30/nv30_transfer.c b/src/gallium/drivers/nv30/nv30_transfer.c
index 554bcbbdd0..3aeda51ea1 100644
--- a/src/gallium/drivers/nv30/nv30_transfer.c
+++ b/src/gallium/drivers/nv30/nv30_transfer.c
@@ -1,10 +1,10 @@
-#include <pipe/p_state.h>
-#include <pipe/p_defines.h>
-#include <util/u_inlines.h>
-#include <util/u_format.h>
-#include <util/u_memory.h>
-#include <util/u_math.h>
-#include <nouveau/nouveau_winsys.h>
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
+#include "util/u_format.h"
+#include "util/u_memory.h"
+#include "util/u_math.h"
+#include "nouveau/nouveau_winsys.h"
 #include "nv30_context.h"
 #include "nv30_screen.h"
 #include "nv30_state.h"
diff --git a/src/gallium/drivers/nv40/nv40_transfer.c b/src/gallium/drivers/nv40/nv40_transfer.c
index ee266c6cfb..0462a042c3 100644
--- a/src/gallium/drivers/nv40/nv40_transfer.c
+++ b/src/gallium/drivers/nv40/nv40_transfer.c
@@ -1,10 +1,10 @@
-#include <pipe/p_state.h>
-#include <pipe/p_defines.h>
-#include <util/u_inlines.h>
-#include <util/u_format.h>
-#include <util/u_memory.h>
-#include <util/u_math.h>
-#include <nouveau/nouveau_winsys.h>
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
+#include "util/u_format.h"
+#include "util/u_memory.h"
+#include "util/u_math.h"
+#include "nouveau/nouveau_winsys.h"
 #include "nv40_context.h"
 #include "nv40_screen.h"
 #include "nv40_state.h"
diff --git a/src/gallium/drivers/r300/Makefile b/src/gallium/drivers/r300/Makefile
index afddcb161f..1f69daec81 100644
--- a/src/gallium/drivers/r300/Makefile
+++ b/src/gallium/drivers/r300/Makefile
@@ -32,7 +32,5 @@ EXTRA_OBJECTS = \
 
 include ../../Makefile.template
 
-.PHONY : $(COMPILER_ARCHIVE)
-
 $(COMPILER_ARCHIVE):
 	$(MAKE) -C $(TOP)/src/mesa/drivers/dri/r300/compiler
diff --git a/src/gallium/drivers/r300/r300_blit.c b/src/gallium/drivers/r300/r300_blit.c
index ec7414dc36..8acb1098b9 100644
--- a/src/gallium/drivers/r300/r300_blit.c
+++ b/src/gallium/drivers/r300/r300_blit.c
@@ -33,7 +33,7 @@ static void r300_blitter_save_states(struct r300_context* r300)
     util_blitter_save_stencil_ref(r300->blitter, &(r300->stencil_ref));
     util_blitter_save_rasterizer(r300->blitter, r300->rs_state.state);
     util_blitter_save_fragment_shader(r300->blitter, r300->fs);
-    util_blitter_save_vertex_shader(r300->blitter, r300->vs);
+    util_blitter_save_vertex_shader(r300->blitter, r300->vs_state.state);
     util_blitter_save_viewport(r300->blitter, &r300->viewport);
     util_blitter_save_clip(r300->blitter, &r300->clip);
 }
@@ -100,6 +100,8 @@ static void r300_hw_copy(struct pipe_context* pipe,
                          unsigned width, unsigned height)
 {
     struct r300_context* r300 = r300_context(pipe);
+    struct r300_textures_state* state =
+        (struct r300_textures_state*)r300->textures_state.state;
 
     /* Yeah we have to save all those states to ensure this blitter operation
      * is really transparent. The states will be restored by the blitter once
@@ -108,11 +110,11 @@ static void r300_hw_copy(struct pipe_context* pipe,
     util_blitter_save_framebuffer(r300->blitter, r300->fb_state.state);
 
     util_blitter_save_fragment_sampler_states(
-        r300->blitter, r300->sampler_count, (void**)r300->sampler_states);
+        r300->blitter, state->sampler_count, (void**)state->sampler_states);
 
     util_blitter_save_fragment_sampler_textures(
-        r300->blitter, r300->texture_count,
-        (struct pipe_texture**)r300->textures);
+        r300->blitter, state->texture_count,
+        (struct pipe_texture**)state->textures);
 
     /* Do a copy */
     util_blitter_copy(r300->blitter,
diff --git a/src/gallium/drivers/r300/r300_context.c b/src/gallium/drivers/r300/r300_context.c
index f631b4ed27..86b98a4ba5 100644
--- a/src/gallium/drivers/r300/r300_context.c
+++ b/src/gallium/drivers/r300/r300_context.c
@@ -59,7 +59,9 @@ static void r300_destroy_context(struct pipe_context* context)
     FREE(r300->fb_state.state);
     FREE(r300->rs_block_state.state);
     FREE(r300->scissor_state.state);
-    FREE(r300->vertex_format_state.state);
+    FREE(r300->textures_state.state);
+    FREE(r300->vertex_stream_state.state);
+    FREE(r300->vap_output_state.state);
     FREE(r300->viewport_state.state);
     FREE(r300->ztop_state.state);
     FREE(r300);
@@ -96,39 +98,55 @@ static void r300_flush_cb(void *data)
 }
 
 #define R300_INIT_ATOM(atomname, atomsize) \
-    r300->atomname##_state.name = #atomname; \
-    r300->atomname##_state.state = NULL; \
-    r300->atomname##_state.size = atomsize; \
-    r300->atomname##_state.emit = r300_emit_##atomname##_state; \
-    r300->atomname##_state.dirty = FALSE; \
-    insert_at_tail(&r300->atom_list, &r300->atomname##_state);
+    r300->atomname.name = #atomname; \
+    r300->atomname.state = NULL; \
+    r300->atomname.size = atomsize; \
+    r300->atomname.emit = r300_emit_##atomname; \
+    r300->atomname.dirty = FALSE; \
+    insert_at_tail(&r300->atom_list, &r300->atomname);
 
 static void r300_setup_atoms(struct r300_context* r300)
 {
+    boolean is_r500 = r300_screen(r300->context.screen)->caps->is_r500;
+    boolean has_tcl = r300_screen(r300->context.screen)->caps->has_tcl;
+
     /* Create the actual atom list.
      *
      * Each atom is examined and emitted in the order it appears here, which
      * can affect performance and conformance if not handled with care.
      *
-     * Some atoms never change size, others change every emit. This is just
-     * an upper bound on each atom, to keep the emission machinery from
-     * underallocating space. */
+     * Some atoms never change size, others change every emit - those have
+     * the size of 0 here. */
     make_empty_list(&r300->atom_list);
-    R300_INIT_ATOM(invariant, 71);
-    R300_INIT_ATOM(ztop, 2);
-    R300_INIT_ATOM(blend, 8);
-    R300_INIT_ATOM(blend_color, 3);
-    R300_INIT_ATOM(clip, 29);
-    R300_INIT_ATOM(dsa, 8);
-    R300_INIT_ATOM(fb, 56);
-    R300_INIT_ATOM(rs, 25);
-    R300_INIT_ATOM(scissor, 3);
-    R300_INIT_ATOM(viewport, 9);
-    R300_INIT_ATOM(rs_block, 21);
-    R300_INIT_ATOM(vertex_format, 26);
+    R300_INIT_ATOM(invariant_state, 71);
+    R300_INIT_ATOM(ztop_state, 2);
+    R300_INIT_ATOM(blend_state, 8);
+    R300_INIT_ATOM(blend_color_state, is_r500 ? 3 : 2);
+    R300_INIT_ATOM(clip_state, has_tcl ? 5 + (6 * 4) : 2);
+    R300_INIT_ATOM(dsa_state, is_r500 ? 8 : 6);
+    R300_INIT_ATOM(fb_state, 0);
+    R300_INIT_ATOM(rs_state, 0);
+    R300_INIT_ATOM(scissor_state, 3);
+    R300_INIT_ATOM(viewport_state, 9);
+    R300_INIT_ATOM(rs_block_state, 0);
+    R300_INIT_ATOM(vertex_stream_state, 0);
+    R300_INIT_ATOM(vap_output_state, 6);
+    R300_INIT_ATOM(pvs_flush, 2);
+    R300_INIT_ATOM(vs_state, 0);
+    R300_INIT_ATOM(texture_cache_inval, 2);
+    R300_INIT_ATOM(textures_state, 0);
 
     /* Some non-CSO atoms need explicit space to store the state locally. */
+    r300->blend_color_state.state = CALLOC_STRUCT(r300_blend_color_state);
+    r300->clip_state.state = CALLOC_STRUCT(pipe_clip_state);
     r300->fb_state.state = CALLOC_STRUCT(pipe_framebuffer_state);
+    r300->rs_block_state.state = CALLOC_STRUCT(r300_rs_block);
+    r300->scissor_state.state = CALLOC_STRUCT(pipe_scissor_state);
+    r300->textures_state.state = CALLOC_STRUCT(r300_textures_state);
+    r300->vertex_stream_state.state = CALLOC_STRUCT(r300_vertex_stream_state);
+    r300->vap_output_state.state = CALLOC_STRUCT(r300_vap_output_state);
+    r300->viewport_state.state = CALLOC_STRUCT(r300_viewport_state);
+    r300->ztop_state.state = CALLOC_STRUCT(r300_ztop_state);
 }
 
 struct pipe_context* r300_create_context(struct pipe_screen* screen,
@@ -178,14 +196,6 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
 
     r300_setup_atoms(r300);
 
-    r300->blend_color_state.state = CALLOC_STRUCT(r300_blend_color_state);
-    r300->clip_state.state = CALLOC_STRUCT(pipe_clip_state);
-    r300->rs_block_state.state = CALLOC_STRUCT(r300_rs_block);
-    r300->scissor_state.state = CALLOC_STRUCT(pipe_scissor_state);
-    r300->vertex_format_state.state = CALLOC_STRUCT(r300_vertex_info);
-    r300->viewport_state.state = CALLOC_STRUCT(r300_viewport_state);
-    r300->ztop_state.state = CALLOC_STRUCT(r300_ztop_state);
-
     /* Open up the OQ BO. */
     r300->oqbo = screen->buffer_create(screen, 4096,
             PIPE_BUFFER_USAGE_VERTEX, 4096);
diff --git a/src/gallium/drivers/r300/r300_context.h b/src/gallium/drivers/r300/r300_context.h
index 97100c08cc..0d1518a05b 100644
--- a/src/gallium/drivers/r300/r300_context.h
+++ b/src/gallium/drivers/r300/r300_context.h
@@ -45,7 +45,7 @@ struct r300_atom {
     /* Opaque state. */
     void* state;
     /* Emit the state to the context. */
-    void (*emit)(struct r300_context*, void*);
+    void (*emit)(struct r300_context*, unsigned, void*);
     /* Upper bound on number of dwords to emit. */
     unsigned size;
     /* Whether this atom should be emitted. */
@@ -86,7 +86,6 @@ struct r300_rs_state {
     uint32_t vap_control_status;    /* R300_VAP_CNTL_STATUS: 0x2140 */
     uint32_t antialiasing_config;   /* R300_GB_AA_CONFIG: 0x4020 */
     uint32_t point_size;            /* R300_GA_POINT_SIZE: 0x421c */
-    uint32_t point_minmax;          /* R300_GA_POINT_MINMAX: 0x4230 */
     uint32_t line_control;          /* R300_GA_LINE_CNTL: 0x4234 */
     float depth_scale;            /* R300_SU_POLY_OFFSET_FRONT_SCALE: 0x42a4 */
                                   /* R300_SU_POLY_OFFSET_BACK_SCALE: 0x42ac */
@@ -119,7 +118,7 @@ struct r300_sampler_state {
     unsigned min_lod, max_lod;
 };
 
-struct r300_texture_state {
+struct r300_texture_format_state {
     uint32_t format0; /* R300_TX_FORMAT0: 0x4480 */
     uint32_t format1; /* R300_TX_FORMAT1: 0x44c0 */
     uint32_t format2; /* R300_TX_FORMAT2: 0x4500 */
@@ -135,6 +134,40 @@ struct r300_texture_fb_state {
     uint32_t zb_format; /* R300_ZB_FORMAT */
 };
 
+struct r300_textures_state {
+    /* Textures. */
+    struct r300_texture *textures[8];
+    int texture_count;
+    /* Sampler states. */
+    struct r300_sampler_state *sampler_states[8];
+    int sampler_count;
+
+    /* These is the merge of the texture and sampler states. */
+    unsigned count;
+    uint32_t tx_enable;         /* R300_TX_ENABLE: 0x4101 */
+    struct r300_texture_sampler_state {
+        uint32_t format[3];     /* R300_TX_FORMAT[0-2] */
+        uint32_t filter[2];     /* R300_TX_FILTER[0-1] */
+        uint32_t border_color;  /* R300_TX_BORDER_COLOR: 0x45c0 */
+        uint32_t tile_config;   /* R300_TX_OFFSET (subset thereof) */
+    } regs[8];
+};
+
+struct r300_vertex_stream_state {
+    /* R300_VAP_PROG_STREAK_CNTL_[0-7] */
+    uint32_t vap_prog_stream_cntl[8];
+    /* R300_VAP_PROG_STREAK_CNTL_EXT_[0-7] */
+    uint32_t vap_prog_stream_cntl_ext[8];
+
+    unsigned count;
+};
+
+struct r300_vap_output_state {
+    uint32_t vap_vtx_state_cntl;  /* R300_VAP_VTX_STATE_CNTL: 0x2180 */
+    uint32_t vap_vsm_vtx_assm;    /* R300_VAP_VSM_VTX_ASSM: 0x2184 */
+    uint32_t vap_out_vtx_fmt[2];  /* R300_VAP_OUTPUT_VTX_FMT_[0-1]: 0x2090 */
+};
+
 struct r300_viewport_state {
     float xscale;         /* R300_VAP_VPORT_XSCALE:  0x2098 */
     float xoffset;        /* R300_VAP_VPORT_XOFFSET: 0x209c */
@@ -151,11 +184,6 @@ struct r300_ztop_state {
 
 #define R300_NEW_FRAGMENT_SHADER 0x00000020
 #define R300_NEW_FRAGMENT_SHADER_CONSTANTS    0x00000040
-#define R300_NEW_SAMPLER         0x00000200
-#define R300_ANY_NEW_SAMPLERS    0x0001fe00
-#define R300_NEW_TEXTURE         0x00040000
-#define R300_ANY_NEW_TEXTURES    0x03fc0000
-#define R300_NEW_VERTEX_SHADER   0x08000000
 #define R300_NEW_VERTEX_SHADER_CONSTANTS    0x10000000
 #define R300_NEW_QUERY           0x40000000
 #define R300_NEW_KITCHEN_SINK    0x7fffffff
@@ -241,23 +269,13 @@ struct r300_texture {
     struct pipe_buffer* buffer;
 
     /* Registers carrying texture format data. */
-    struct r300_texture_state state;
+    struct r300_texture_format_state state;
     struct r300_texture_fb_state fb_state;
 
     /* Buffer tiling */
     enum r300_buffer_tiling microtile, macrotile;
 };
 
-struct r300_vertex_info {
-    /* Parent class */
-    struct vertex_info vinfo;
-
-    /* R300_VAP_PROG_STREAK_CNTL_[0-7] */
-    uint32_t vap_prog_stream_cntl[8];
-    /* R300_VAP_PROG_STREAK_CNTL_EXT_[0-7] */
-    uint32_t vap_prog_stream_cntl_ext[8];
-};
-
 extern struct pipe_viewport_state r300_viewport_identity;
 
 struct r300_context {
@@ -282,9 +300,6 @@ struct r300_context {
     struct r300_query *query_current;
     struct r300_query query_list;
 
-    /* Vertex formatting information. */
-    struct r300_atom vertex_format_state;
-
     /* Various CSO state objects. */
     /* Beginning of atom list. */
     struct r300_atom atom_list;
@@ -306,20 +321,24 @@ struct r300_context {
     struct r300_atom rs_state;
     /* RS block state. */
     struct r300_atom rs_block_state;
-    /* Sampler states. */
-    struct r300_sampler_state* sampler_states[8];
-    int sampler_count;
     /* Scissor state. */
     struct r300_atom scissor_state;
-    /* Texture states. */
-    struct r300_texture* textures[8];
-    int texture_count;
+    /* Textures state. */
+    struct r300_atom textures_state;
+    /* Vertex stream formatting state. */
+    struct r300_atom vertex_stream_state;
+    /* VAP (vertex shader) output mapping state. */
+    struct r300_atom vap_output_state;
     /* Vertex shader. */
-    struct r300_vertex_shader* vs;
+    struct r300_atom vs_state;
     /* Viewport state. */
     struct r300_atom viewport_state;
     /* ZTOP state. */
     struct r300_atom ztop_state;
+    /* PVS flush. */
+    struct r300_atom pvs_flush;
+    /* Texture cache invalidate. */
+    struct r300_atom texture_cache_inval;
 
     /* Invariant state. This must be emitted to get the engine started. */
     struct r300_atom invariant_state;
@@ -327,10 +346,14 @@ struct r300_context {
     /* Vertex buffers for Gallium. */
     struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
     int vertex_buffer_count;
+    int vertex_buffer_max_index;
     /* Vertex elements for Gallium. */
     struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS];
     int vertex_element_count;
 
+    /* Vertex info for Draw. */
+    struct vertex_info vertex_info;
+
     struct pipe_stencil_ref stencil_ref;
 
     struct pipe_clip_state clip;
diff --git a/src/gallium/drivers/r300/r300_emit.c b/src/gallium/drivers/r300/r300_emit.c
index 37ebe6c49d..addb28bded 100644
--- a/src/gallium/drivers/r300/r300_emit.c
+++ b/src/gallium/drivers/r300/r300_emit.c
@@ -34,14 +34,15 @@
 #include "r300_screen.h"
 #include "r300_vs.h"
 
-void r300_emit_blend_state(struct r300_context* r300, void* state)
+void r300_emit_blend_state(struct r300_context* r300,
+                           unsigned size, void* state)
 {
     struct r300_blend_state* blend = (struct r300_blend_state*)state;
     struct pipe_framebuffer_state* fb =
         (struct pipe_framebuffer_state*)r300->fb_state.state;
     CS_LOCALS(r300);
 
-    BEGIN_CS(8);
+    BEGIN_CS(size);
     OUT_CS_REG(R300_RB3D_ROPCNTL, blend->rop);
     OUT_CS_REG_SEQ(R300_RB3D_CBLEND, 3);
     if (fb->nr_cbufs) {
@@ -58,26 +59,28 @@ void r300_emit_blend_state(struct r300_context* r300, void* state)
     END_CS;
 }
 
-void r300_emit_blend_color_state(struct r300_context* r300, void* state)
+void r300_emit_blend_color_state(struct r300_context* r300,
+                                 unsigned size, void* state)
 {
     struct r300_blend_color_state* bc = (struct r300_blend_color_state*)state;
     struct r300_screen* r300screen = r300_screen(r300->context.screen);
     CS_LOCALS(r300);
 
     if (r300screen->caps->is_r500) {
-        BEGIN_CS(3);
+        BEGIN_CS(size);
         OUT_CS_REG_SEQ(R500_RB3D_CONSTANT_COLOR_AR, 2);
         OUT_CS(bc->blend_color_red_alpha);
         OUT_CS(bc->blend_color_green_blue);
         END_CS;
     } else {
-        BEGIN_CS(2);
+        BEGIN_CS(size);
         OUT_CS_REG(R300_RB3D_BLEND_COLOR, bc->blend_color);
         END_CS;
     }
 }
 
-void r300_emit_clip_state(struct r300_context* r300, void* state)
+void r300_emit_clip_state(struct r300_context* r300,
+                          unsigned size, void* state)
 {
     struct pipe_clip_state* clip = (struct pipe_clip_state*)state;
     int i;
@@ -85,7 +88,7 @@ void r300_emit_clip_state(struct r300_context* r300, void* state)
     CS_LOCALS(r300);
 
     if (r300screen->caps->has_tcl) {
-        BEGIN_CS(5 + (6 * 4));
+        BEGIN_CS(size);
         OUT_CS_REG(R300_VAP_PVS_VECTOR_INDX_REG,
                 (r300screen->caps->is_r500 ?
                  R500_PVS_UCP_START : R300_PVS_UCP_START));
@@ -100,14 +103,14 @@ void r300_emit_clip_state(struct r300_context* r300, void* state)
                 R300_PS_UCP_MODE_CLIP_AS_TRIFAN);
         END_CS;
     } else {
-        BEGIN_CS(2);
+        BEGIN_CS(size);
         OUT_CS_REG(R300_VAP_CLIP_CNTL, R300_CLIP_DISABLE);
         END_CS;
     }
 
 }
 
-void r300_emit_dsa_state(struct r300_context* r300, void* state)
+void r300_emit_dsa_state(struct r300_context* r300, unsigned size, void* state)
 {
     struct r300_dsa_state* dsa = (struct r300_dsa_state*)state;
     struct r300_screen* r300screen = r300_screen(r300->context.screen);
@@ -116,7 +119,7 @@ void r300_emit_dsa_state(struct r300_context* r300, void* state)
     struct pipe_stencil_ref stencil_ref = r300->stencil_ref;
     CS_LOCALS(r300);
 
-    BEGIN_CS(r300screen->caps->is_r500 ? 8 : 6);
+    BEGIN_CS(size);
     OUT_CS_REG(R300_FG_ALPHA_FUNC, dsa->alpha_function);
     OUT_CS_REG_SEQ(R300_ZB_CNTL, 3);
 
@@ -143,6 +146,8 @@ static const float * get_shader_constant(
 {
     struct r300_viewport_state* viewport =
         (struct r300_viewport_state*)r300->viewport_state.state;
+    struct r300_textures_state* texstate =
+        (struct r300_textures_state*)r300->textures_state.state;
     static float vec[4] = { 0.0, 0.0, 0.0, 1.0 };
     struct pipe_texture *tex;
 
@@ -158,7 +163,7 @@ static const float * get_shader_constant(
                 /* Factor for converting rectangle coords to
                  * normalized coords. Should only show up on non-r500. */
                 case RC_STATE_R300_TEXRECT_FACTOR:
-                    tex = &r300->textures[constant->u.State[1]]->tex;
+                    tex = &texstate->textures[constant->u.State[1]]->tex;
                     vec[0] = 1.0 / tex->width0;
                     vec[1] = 1.0 / tex->height0;
                     break;
@@ -370,7 +375,7 @@ void r500_emit_fs_constant_buffer(struct r300_context* r300,
     END_CS;
 }
 
-void r300_emit_fb_state(struct r300_context* r300, void* state)
+void r300_emit_fb_state(struct r300_context* r300, unsigned size, void* state)
 {
     struct pipe_framebuffer_state* fb = (struct pipe_framebuffer_state*)state;
     struct r300_screen* r300screen = r300_screen(r300->context.screen);
@@ -379,7 +384,7 @@ void r300_emit_fb_state(struct r300_context* r300, void* state)
     int i;
     CS_LOCALS(r300);
 
-    BEGIN_CS((10 * fb->nr_cbufs) + (fb->zsbuf ? 10 : 0) + 6);
+    BEGIN_CS(size);
 
     /* Flush and free renderbuffer caches. */
     OUT_CS_REG(R300_RB3D_DSTCACHE_CTLSTAT,
@@ -418,6 +423,9 @@ void r300_emit_fb_state(struct r300_context* r300, void* state)
 
         OUT_CS_REG(R300_US_OUT_FMT_0 + (4 * i), tex->fb_state.us_out_fmt);
     }
+    for (; i < 4; i++) {
+        OUT_CS_REG(R300_US_OUT_FMT_0 + (4 * i), R300_US_OUT_FMT_UNUSED);
+    }
 
     /* Set up a zbuffer. */
     if (fb->zsbuf) {
@@ -435,6 +443,8 @@ void r300_emit_fb_state(struct r300_context* r300, void* state)
                      0, RADEON_GEM_DOMAIN_VRAM, 0);
     }
 
+    OUT_CS_REG(R300_GA_POINT_MINMAX,
+        (MAX2(fb->width, fb->height) * 6) << R300_GA_POINT_MINMAX_MAX_SHIFT);
     END_CS;
 }
 
@@ -564,21 +574,19 @@ void r300_emit_query_end(struct r300_context* r300)
         r300_emit_query_finish(r300, query);
 }
 
-void r300_emit_rs_state(struct r300_context* r300, void* state)
+void r300_emit_rs_state(struct r300_context* r300, unsigned size, void* state)
 {
     struct r300_rs_state* rs = (struct r300_rs_state*)state;
     float scale, offset;
     CS_LOCALS(r300);
 
-    BEGIN_CS(18 + (rs->polygon_offset_enable ? 5 : 0));
+    BEGIN_CS(size);
     OUT_CS_REG(R300_VAP_CNTL_STATUS, rs->vap_control_status);
 
     OUT_CS_REG(R300_GB_AA_CONFIG, rs->antialiasing_config);
 
     OUT_CS_REG(R300_GA_POINT_SIZE, rs->point_size);
-    OUT_CS_REG_SEQ(R300_GA_POINT_MINMAX, 2);
-    OUT_CS(rs->point_minmax);
-    OUT_CS(rs->line_control);
+    OUT_CS_REG(R300_GA_LINE_CNTL, rs->line_control);
 
     if (rs->polygon_offset_enable) {
         scale = rs->depth_scale * 12;
@@ -609,7 +617,8 @@ void r300_emit_rs_state(struct r300_context* r300, void* state)
     END_CS;
 }
 
-void r300_emit_rs_block_state(struct r300_context* r300, void* state)
+void r300_emit_rs_block_state(struct r300_context* r300,
+                              unsigned size, void* state)
 {
     struct r300_rs_block* rs = (struct r300_rs_block*)state;
     unsigned i;
@@ -620,7 +629,7 @@ void r300_emit_rs_block_state(struct r300_context* r300, void* state)
 
     DBG(r300, DBG_DRAW, "r300: RS emit:\n");
 
-    BEGIN_CS(5 + count*2);
+    BEGIN_CS(size);
     if (r300screen->caps->is_r500) {
         OUT_CS_REG_SEQ(R500_RS_IP_0, count);
     } else {
@@ -651,7 +660,8 @@ void r300_emit_rs_block_state(struct r300_context* r300, void* state)
     END_CS;
 }
 
-void r300_emit_scissor_state(struct r300_context* r300, void* state)
+void r300_emit_scissor_state(struct r300_context* r300,
+                             unsigned size, void* state)
 {
     unsigned minx, miny, maxx, maxy;
     uint32_t top_left, bottom_right;
@@ -705,56 +715,42 @@ void r300_emit_scissor_state(struct r300_context* r300, void* state)
             (((maxy - 1) + 1440) << R300_SCISSORS_Y_SHIFT);
     }
 
-    BEGIN_CS(3);
+    BEGIN_CS(size);
     OUT_CS_REG_SEQ(R300_SC_SCISSORS_TL, 2);
     OUT_CS(top_left);
     OUT_CS(bottom_right);
     END_CS;
 }
 
-void r300_emit_texture(struct r300_context* r300,
-                       struct r300_sampler_state* sampler,
-                       struct r300_texture* tex,
-                       unsigned offset)
+void r300_emit_textures_state(struct r300_context *r300,
+                              unsigned size, void *state)
 {
-    uint32_t filter0 = sampler->filter0;
-    uint32_t format0 = tex->state.format0;
-    unsigned min_level, max_level;
+    struct r300_textures_state *allstate = (struct r300_textures_state*)state;
+    struct r300_texture_sampler_state *texstate;
+    unsigned i;
     CS_LOCALS(r300);
 
-    /* to emulate 1D textures through 2D ones correctly */
-    if (tex->tex.target == PIPE_TEXTURE_1D) {
-        filter0 &= ~R300_TX_WRAP_T_MASK;
-        filter0 |= R300_TX_WRAP_T(R300_TX_CLAMP_TO_EDGE);
-    }
+    BEGIN_CS(size);
+    OUT_CS_REG(R300_TX_ENABLE, allstate->tx_enable);
 
-    if (tex->is_npot) {
-        /* NPOT textures don't support mip filter, unfortunately.
-         * This prevents incorrect rendering. */
-        filter0 &= ~R300_TX_MIN_FILTER_MIP_MASK;
-    } else {
-        /* determine min/max levels */
-        /* the MAX_MIP level is the largest (finest) one */
-        max_level = MIN2(sampler->max_lod, tex->tex.last_level);
-        min_level = MIN2(sampler->min_lod, max_level);
-        format0 |= R300_TX_NUM_LEVELS(max_level);
-        filter0 |= R300_TX_MAX_MIP_LEVEL(min_level);
-    }
+    for (i = 0; i < allstate->count; i++) {
+        if ((1 << i) & allstate->tx_enable) {
+            texstate = &allstate->regs[i];
 
-    BEGIN_CS(16);
-    OUT_CS_REG(R300_TX_FILTER0_0 + (offset * 4), filter0 |
-        (offset << 28));
-    OUT_CS_REG(R300_TX_FILTER1_0 + (offset * 4), sampler->filter1);
-    OUT_CS_REG(R300_TX_BORDER_COLOR_0 + (offset * 4), sampler->border_color);
-
-    OUT_CS_REG(R300_TX_FORMAT0_0 + (offset * 4), format0);
-    OUT_CS_REG(R300_TX_FORMAT1_0 + (offset * 4), tex->state.format1);
-    OUT_CS_REG(R300_TX_FORMAT2_0 + (offset * 4), tex->state.format2);
-    OUT_CS_REG_SEQ(R300_TX_OFFSET_0 + (offset * 4), 1);
-    OUT_CS_RELOC(tex->buffer,
-                 R300_TXO_MACRO_TILE(tex->macrotile) |
-                 R300_TXO_MICRO_TILE(tex->microtile),
-                 RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0, 0);
+            OUT_CS_REG(R300_TX_FILTER0_0 + (i * 4), texstate->filter[0]);
+            OUT_CS_REG(R300_TX_FILTER1_0 + (i * 4), texstate->filter[1]);
+            OUT_CS_REG(R300_TX_BORDER_COLOR_0 + (i * 4),
+                       texstate->border_color);
+
+            OUT_CS_REG(R300_TX_FORMAT0_0 + (i * 4), texstate->format[0]);
+            OUT_CS_REG(R300_TX_FORMAT1_0 + (i * 4), texstate->format[1]);
+            OUT_CS_REG(R300_TX_FORMAT2_0 + (i * 4), texstate->format[2]);
+
+            OUT_CS_REG_SEQ(R300_TX_OFFSET_0 + (i * 4), 1);
+            OUT_CS_RELOC(allstate->textures[i]->buffer, texstate->tile_config,
+                         RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0, 0);
+        }
+    }
     END_CS;
 }
 
@@ -798,58 +794,76 @@ void r300_emit_aos(struct r300_context* r300, unsigned offset)
     END_CS;
 }
 
-void r300_emit_vertex_format_state(struct r300_context* r300, void* state)
+void r300_emit_vertex_stream_state(struct r300_context* r300,
+                                   unsigned size, void* state)
 {
-    struct r300_vertex_info* vertex_info = (struct r300_vertex_info*)state;
+    struct r300_vertex_stream_state *streams =
+        (struct r300_vertex_stream_state*)state;
     unsigned i;
     CS_LOCALS(r300);
 
-    DBG(r300, DBG_DRAW, "r300: VAP/PSC emit:\n");
-
-    BEGIN_CS(26);
-    OUT_CS_REG(R300_VAP_VTX_SIZE, vertex_info->vinfo.size);
+    DBG(r300, DBG_DRAW, "r300: PSC emit:\n");
 
-    OUT_CS_REG_SEQ(R300_VAP_VTX_STATE_CNTL, 2);
-    OUT_CS(vertex_info->vinfo.hwfmt[0]);
-    OUT_CS(vertex_info->vinfo.hwfmt[1]);
-    OUT_CS_REG_SEQ(R300_VAP_OUTPUT_VTX_FMT_0, 2);
-    OUT_CS(vertex_info->vinfo.hwfmt[2]);
-    OUT_CS(vertex_info->vinfo.hwfmt[3]);
-    for (i = 0; i < 4; i++) {
-       DBG(r300, DBG_DRAW, "    : hwfmt%d: 0x%08x\n", i,
-               vertex_info->vinfo.hwfmt[i]);
-    }
-
-    OUT_CS_REG_SEQ(R300_VAP_PROG_STREAM_CNTL_0, 8);
-    for (i = 0; i < 8; i++) {
-        OUT_CS(vertex_info->vap_prog_stream_cntl[i]);
+    BEGIN_CS(size);
+    OUT_CS_REG_SEQ(R300_VAP_PROG_STREAM_CNTL_0, streams->count);
+    for (i = 0; i < streams->count; i++) {
+        OUT_CS(streams->vap_prog_stream_cntl[i]);
         DBG(r300, DBG_DRAW, "    : prog_stream_cntl%d: 0x%08x\n", i,
-               vertex_info->vap_prog_stream_cntl[i]);
+               streams->vap_prog_stream_cntl[i]);
     }
-    OUT_CS_REG_SEQ(R300_VAP_PROG_STREAM_CNTL_EXT_0, 8);
-    for (i = 0; i < 8; i++) {
-        OUT_CS(vertex_info->vap_prog_stream_cntl_ext[i]);
+    OUT_CS_REG_SEQ(R300_VAP_PROG_STREAM_CNTL_EXT_0, streams->count);
+    for (i = 0; i < streams->count; i++) {
+        OUT_CS(streams->vap_prog_stream_cntl_ext[i]);
         DBG(r300, DBG_DRAW, "    : prog_stream_cntl_ext%d: 0x%08x\n", i,
-               vertex_info->vap_prog_stream_cntl_ext[i]);
+               streams->vap_prog_stream_cntl_ext[i]);
     }
     END_CS;
 }
 
+void r300_emit_vap_output_state(struct r300_context* r300,
+                               unsigned size, void* state)
+{
+    struct r300_vap_output_state *vap_out_state =
+        (struct r300_vap_output_state*)state;
+    CS_LOCALS(r300);
+
+    DBG(r300, DBG_DRAW, "r300: VAP emit:\n");
+
+    BEGIN_CS(size);
+    OUT_CS_REG_SEQ(R300_VAP_VTX_STATE_CNTL, 2);
+    OUT_CS(vap_out_state->vap_vtx_state_cntl);
+    OUT_CS(vap_out_state->vap_vsm_vtx_assm);
+    OUT_CS_REG_SEQ(R300_VAP_OUTPUT_VTX_FMT_0, 2);
+    OUT_CS(vap_out_state->vap_out_vtx_fmt[0]);
+    OUT_CS(vap_out_state->vap_out_vtx_fmt[1]);
+    END_CS;
+}
 
-void r300_emit_vertex_program_code(struct r300_context* r300,
-                                   struct r300_vertex_program_code* code)
+void r300_emit_pvs_flush(struct r300_context* r300, unsigned size, void* state)
 {
-    int i;
+    CS_LOCALS(r300);
+
+    BEGIN_CS(size);
+    OUT_CS_REG(R300_VAP_PVS_STATE_FLUSH_REG, 0x0);
+    END_CS;
+}
+
+void r300_emit_vs_state(struct r300_context* r300, unsigned size, void* state)
+{
+    struct r300_vertex_shader* vs = (struct r300_vertex_shader*)state;
+    struct r300_vertex_program_code* code = &vs->code;
     struct r300_screen* r300screen = r300_screen(r300->context.screen);
     unsigned instruction_count = code->length / 4;
+    unsigned i;
 
-    int vtx_mem_size = r300screen->caps->is_r500 ? 128 : 72;
-    int input_count = MAX2(util_bitcount(code->InputsRead), 1);
-    int output_count = MAX2(util_bitcount(code->OutputsWritten), 1);
-    int temp_count = MAX2(code->num_temporaries, 1);
-    int pvs_num_slots = MIN3(vtx_mem_size / input_count,
-                             vtx_mem_size / output_count, 10);
-    int pvs_num_controllers = MIN2(vtx_mem_size / temp_count, 6);
+    unsigned vtx_mem_size = r300screen->caps->is_r500 ? 128 : 72;
+    unsigned input_count = MAX2(util_bitcount(code->InputsRead), 1);
+    unsigned output_count = MAX2(util_bitcount(code->OutputsWritten), 1);
+    unsigned temp_count = MAX2(code->num_temporaries, 1);
+
+    unsigned pvs_num_slots = MIN3(vtx_mem_size / input_count,
+                                  vtx_mem_size / output_count, 10);
+    unsigned pvs_num_controllers = MIN2(vtx_mem_size / temp_count, 6);
 
     CS_LOCALS(r300);
 
@@ -859,7 +873,7 @@ void r300_emit_vertex_program_code(struct r300_context* r300,
         return;
     }
 
-    BEGIN_CS(9 + code->length);
+    BEGIN_CS(size);
     /* R300_VAP_PVS_CODE_CNTL_0
      * R300_VAP_PVS_CONST_CNTL
      * R300_VAP_PVS_CODE_CNTL_1
@@ -873,8 +887,9 @@ void r300_emit_vertex_program_code(struct r300_context* r300,
 
     OUT_CS_REG(R300_VAP_PVS_VECTOR_INDX_REG, 0);
     OUT_CS_ONE_REG(R300_VAP_PVS_UPLOAD_DATA, code->length);
-    for (i = 0; i < code->length; i++)
+    for (i = 0; i < code->length; i++) {
         OUT_CS(code->body.d[i]);
+    }
 
     OUT_CS_REG(R300_VAP_CNTL, R300_PVS_NUM_SLOTS(pvs_num_slots) |
             R300_PVS_NUM_CNTLRS(pvs_num_controllers) |
@@ -884,12 +899,6 @@ void r300_emit_vertex_program_code(struct r300_context* r300,
     END_CS;
 }
 
-void r300_emit_vertex_shader(struct r300_context* r300,
-                             struct r300_vertex_shader* vs)
-{
-    r300_emit_vertex_program_code(r300, &vs->code);
-}
-
 void r300_emit_vs_constant_buffer(struct r300_context* r300,
                                   struct rc_constant_list* constants)
 {
@@ -923,77 +932,56 @@ void r300_emit_vs_constant_buffer(struct r300_context* r300,
     END_CS;
 }
 
-void r300_emit_viewport_state(struct r300_context* r300, void* state)
+void r300_emit_viewport_state(struct r300_context* r300,
+                              unsigned size, void* state)
 {
     struct r300_viewport_state* viewport = (struct r300_viewport_state*)state;
     CS_LOCALS(r300);
 
-    BEGIN_CS(9);
-    OUT_CS_REG_SEQ(R300_SE_VPORT_XSCALE, 6);
-    OUT_CS_32F(viewport->xscale);
-    OUT_CS_32F(viewport->xoffset);
-    OUT_CS_32F(viewport->yscale);
-    OUT_CS_32F(viewport->yoffset);
-    OUT_CS_32F(viewport->zscale);
-    OUT_CS_32F(viewport->zoffset);
-    OUT_CS_REG(R300_VAP_VTE_CNTL, viewport->vte_control);
-    END_CS;
-}
-
-void r300_emit_texture_count(struct r300_context* r300)
-{
-    uint32_t tx_enable = 0;
-    int i;
-    CS_LOCALS(r300);
-
-    /* Notice that texture_count and sampler_count are just sizes
-     * of the respective arrays. We still have to check for the individual
-     * elements. */
-    for (i = 0; i < MIN2(r300->sampler_count, r300->texture_count); i++) {
-        if (r300->textures[i]) {
-            tx_enable |= 1 << i;
-        }
-    }
-
-    BEGIN_CS(2);
-    OUT_CS_REG(R300_TX_ENABLE, tx_enable);
-    END_CS;
-
+     BEGIN_CS(size);
+     OUT_CS_REG_SEQ(R300_SE_VPORT_XSCALE, 6);
+     OUT_CS_32F(viewport->xscale);
+     OUT_CS_32F(viewport->xoffset);
+     OUT_CS_32F(viewport->yscale);
+     OUT_CS_32F(viewport->yoffset);
+     OUT_CS_32F(viewport->zscale);
+     OUT_CS_32F(viewport->zoffset);
+     OUT_CS_REG(R300_VAP_VTE_CNTL, viewport->vte_control);
+     END_CS;
 }
 
-void r300_emit_ztop_state(struct r300_context* r300, void* state)
+void r300_emit_ztop_state(struct r300_context* r300,
+                          unsigned size, void* state)
 {
     struct r300_ztop_state* ztop = (struct r300_ztop_state*)state;
     CS_LOCALS(r300);
 
-    BEGIN_CS(2);
+    BEGIN_CS(size);
     OUT_CS_REG(R300_ZB_ZTOP, ztop->z_buffer_top);
     END_CS;
 }
 
-void r300_flush_textures(struct r300_context* r300)
+void r300_emit_texture_cache_inval(struct r300_context* r300, unsigned size, void* state)
 {
     CS_LOCALS(r300);
 
-    BEGIN_CS(2);
+    BEGIN_CS(size);
     OUT_CS_REG(R300_TX_INVALTAGS, 0);
     END_CS;
 }
 
-static void r300_flush_pvs(struct r300_context* r300)
-{
-    CS_LOCALS(r300);
-
-    BEGIN_CS(2);
-    OUT_CS_REG(R300_VAP_PVS_STATE_FLUSH_REG, 0x0);
-    END_CS;
-}
-
-void r300_emit_buffer_validate(struct r300_context *r300)
+void r300_emit_buffer_validate(struct r300_context *r300,
+                               boolean do_validate_vertex_buffers,
+                               struct pipe_buffer *index_buffer)
 {
     struct pipe_framebuffer_state* fb =
         (struct pipe_framebuffer_state*)r300->fb_state.state;
+    struct r300_textures_state *texstate =
+        (struct r300_textures_state*)r300->textures_state.state;
     struct r300_texture* tex;
+    struct pipe_vertex_buffer *vbuf = r300->vertex_buffer;
+    struct pipe_vertex_element *velem = r300->vertex_element;
+    struct pipe_buffer *pbuf;
     unsigned i;
     boolean invalid = FALSE;
 
@@ -1022,9 +1010,9 @@ validate:
         }
     }
     /* ...textures... */
-    for (i = 0; i < r300->texture_count; i++) {
-        tex = r300->textures[i];
-        if (!tex)
+    for (i = 0; i < texstate->count; i++) {
+        tex = texstate->textures[i];
+        if (!tex || !texstate->sampler_states[i])
             continue;
         if (!r300->winsys->add_buffer(r300->winsys, tex->buffer,
                     RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0)) {
@@ -1040,16 +1028,35 @@ validate:
             goto validate;
         }
     }
-    /* ...and vertex buffer. */
+    /* ...vertex buffer for SWTCL path... */
     if (r300->vbo) {
         if (!r300->winsys->add_buffer(r300->winsys, r300->vbo,
                     RADEON_GEM_DOMAIN_GTT, 0)) {
             r300->context.flush(&r300->context, 0, NULL);
             goto validate;
         }
-    } else {
-        /* debug_printf("No VBO while emitting dirty state!\n"); */
     }
+    /* ...vertex buffers for HWTCL path... */
+    if (do_validate_vertex_buffers) {
+        for (i = 0; i < r300->vertex_element_count; i++) {
+            pbuf = vbuf[velem[i].vertex_buffer_index].buffer;
+
+            if (!r300->winsys->add_buffer(r300->winsys, pbuf,
+                                          RADEON_GEM_DOMAIN_GTT, 0)) {
+                r300->context.flush(&r300->context, 0, NULL);
+                goto validate;
+            }
+        }
+    }
+    /* ...and index buffer for HWTCL path. */
+    if (index_buffer) {
+        if (!r300->winsys->add_buffer(r300->winsys, index_buffer,
+                                      RADEON_GEM_DOMAIN_GTT, 0)) {
+            r300->context.flush(&r300->context, 0, NULL);
+            goto validate;
+        }
+    }
+
     if (!r300->winsys->validate(r300->winsys)) {
         r300->context.flush(&r300->context, 0, NULL);
         if (invalid) {
@@ -1062,16 +1069,10 @@ validate:
     }
 }
 
-/* Emit all dirty state. */
-void r300_emit_dirty_state(struct r300_context* r300)
+unsigned r300_get_num_dirty_dwords(struct r300_context *r300)
 {
-    struct r300_screen* r300screen = r300_screen(r300->context.screen);
     struct r300_atom* atom;
-    unsigned i, dwords = 1024;
-    int dirty_tex = 0;
-
-    /* Check the required number of dwords against the space remaining in the
-     * current CS object. If we need more, then flush. */
+    unsigned dwords = 0;
 
     foreach(atom, &r300->atom_list) {
         if (atom->dirty || atom->always_dirty) {
@@ -1079,12 +1080,17 @@ void r300_emit_dirty_state(struct r300_context* r300)
         }
     }
 
-    /* Make sure we have at least 2*1024 spare dwords. */
-    /* XXX It would be nice to know the number of dwords we really need to
-     * XXX emit. */
-    while (!r300->winsys->check_cs(r300->winsys, dwords)) {
-        r300->context.flush(&r300->context, 0, NULL);
-    }
+    /* XXX This is the compensation for the non-atomized states. */
+    dwords += 1024;
+
+    return dwords;
+}
+
+/* Emit all dirty state. */
+void r300_emit_dirty_state(struct r300_context* r300)
+{
+    struct r300_screen* r300screen = r300_screen(r300->context.screen);
+    struct r300_atom* atom;
 
     if (r300->dirty_state & R300_NEW_QUERY) {
         r300_emit_query_start(r300);
@@ -1093,7 +1099,7 @@ void r300_emit_dirty_state(struct r300_context* r300)
 
     foreach(atom, &r300->atom_list) {
         if (atom->dirty || atom->always_dirty) {
-            atom->emit(r300, atom->state);
+            atom->emit(r300, atom->size, atom->state);
             atom->dirty = FALSE;
         }
     }
@@ -1119,43 +1125,9 @@ void r300_emit_dirty_state(struct r300_context* r300)
         r300->dirty_state &= ~R300_NEW_FRAGMENT_SHADER_CONSTANTS;
     }
 
-    /* Samplers and textures are tracked separately but emitted together. */
-    if (r300->dirty_state &
-            (R300_ANY_NEW_SAMPLERS | R300_ANY_NEW_TEXTURES)) {
-        r300_emit_texture_count(r300);
-
-        for (i = 0; i < MIN2(r300->sampler_count, r300->texture_count); i++) {
-  	    if (r300->dirty_state &
-		((R300_NEW_SAMPLER << i) | (R300_NEW_TEXTURE << i))) {
-		if (r300->textures[i]) {
-		    r300_emit_texture(r300,
-				      r300->sampler_states[i],
-				      r300->textures[i],
-				      i);
-                    dirty_tex |= r300->dirty_state & (R300_NEW_TEXTURE << i);
-                }
-                r300->dirty_state &=
-                    ~((R300_NEW_SAMPLER << i) | (R300_NEW_TEXTURE << i));
-            }
-        }
-        r300->dirty_state &= ~(R300_ANY_NEW_SAMPLERS | R300_ANY_NEW_TEXTURES);
-    }
-
-    if (dirty_tex) {
-        r300_flush_textures(r300);
-    }
-
-    if (r300->dirty_state & (R300_NEW_VERTEX_SHADER | R300_NEW_VERTEX_SHADER_CONSTANTS)) {
-        r300_flush_pvs(r300);
-    }
-
-    if (r300->dirty_state & R300_NEW_VERTEX_SHADER) {
-        r300_emit_vertex_shader(r300, r300->vs);
-        r300->dirty_state &= ~R300_NEW_VERTEX_SHADER;
-    }
-
     if (r300->dirty_state & R300_NEW_VERTEX_SHADER_CONSTANTS) {
-        r300_emit_vs_constant_buffer(r300, &r300->vs->code.constants);
+        struct r300_vertex_shader* vs = r300->vs_state.state;
+        r300_emit_vs_constant_buffer(r300, &vs->code.constants);
         r300->dirty_state &= ~R300_NEW_VERTEX_SHADER_CONSTANTS;
     }
 
diff --git a/src/gallium/drivers/r300/r300_emit.h b/src/gallium/drivers/r300/r300_emit.h
index 6b96d9b57c..449e640a88 100644
--- a/src/gallium/drivers/r300/r300_emit.h
+++ b/src/gallium/drivers/r300/r300_emit.h
@@ -31,13 +31,17 @@ struct r300_vertex_program_code;
 
 void r300_emit_aos(struct r300_context* r300, unsigned offset);
 
-void r300_emit_blend_state(struct r300_context* r300, void* state);
+void r300_emit_blend_state(struct r300_context* r300,
+                           unsigned size, void* state);
 
-void r300_emit_blend_color_state(struct r300_context* r300, void* state);
+void r300_emit_blend_color_state(struct r300_context* r300,
+                                 unsigned size, void* state);
 
-void r300_emit_clip_state(struct r300_context* r300, void* state);
+void r300_emit_clip_state(struct r300_context* r300,
+                          unsigned size, void* state);
 
-void r300_emit_dsa_state(struct r300_context* r300, void* state);
+void r300_emit_dsa_state(struct r300_context* r300,
+                         unsigned size, void* state);
 
 void r300_emit_fragment_program_code(struct r300_context* r300,
                                      struct rX00_fragment_program_code* generic_code);
@@ -51,48 +55,54 @@ void r500_emit_fragment_program_code(struct r300_context* r300,
 void r500_emit_fs_constant_buffer(struct r300_context* r300,
                                   struct rc_constant_list* constants);
 
-void r300_emit_fb_state(struct r300_context* r300, void* state);
+void r300_emit_fb_state(struct r300_context* r300, unsigned size, void* state);
 
 void r300_emit_query_begin(struct r300_context* r300,
                            struct r300_query* query);
 
 void r300_emit_query_end(struct r300_context* r300);
 
-void r300_emit_rs_state(struct r300_context* r300, void* state);
+void r300_emit_rs_state(struct r300_context* r300, unsigned size, void* state);
 
-void r300_emit_rs_block_state(struct r300_context* r300, void* state);
+void r300_emit_rs_block_state(struct r300_context* r300,
+                              unsigned size, void* state);
 
-void r300_emit_scissor_state(struct r300_context* r300, void* state);
+void r300_emit_scissor_state(struct r300_context* r300,
+                             unsigned size, void* state);
 
-void r300_emit_texture(struct r300_context* r300,
-                       struct r300_sampler_state* sampler,
-                       struct r300_texture* tex,
-                       unsigned offset);
+void r300_emit_textures_state(struct r300_context *r300,
+                              unsigned size, void *state);
 
 void r300_emit_vertex_buffer(struct r300_context* r300);
 
-void r300_emit_vertex_format_state(struct r300_context* r300, void* state);
+void r300_emit_vertex_stream_state(struct r300_context* r300,
+                                   unsigned size, void* state);
 
-void r300_emit_vertex_program_code(struct r300_context* r300,
-                                   struct r300_vertex_program_code* code);
+void r300_emit_vap_output_state(struct r300_context* r300,
+                               unsigned size, void* state);
 
 void r300_emit_vs_constant_buffer(struct r300_context* r300,
                                   struct rc_constant_list* constants);
 
-void r300_emit_vertex_shader(struct r300_context* r300,
-                             struct r300_vertex_shader* vs);
+void r300_emit_vs_state(struct r300_context* r300, unsigned size, void* state);
 
-void r300_emit_viewport_state(struct r300_context* r300, void* state);
+void r300_emit_viewport_state(struct r300_context* r300,
+                              unsigned size, void* state);
 
-void r300_emit_texture_count(struct r300_context* r300);
+void r300_emit_ztop_state(struct r300_context* r300,
+                          unsigned size, void* state);
 
-void r300_emit_ztop_state(struct r300_context* r300, void* state);
+void r300_emit_pvs_flush(struct r300_context* r300, unsigned size, void* state);
 
-void r300_flush_textures(struct r300_context* r300);
+void r300_emit_texture_cache_inval(struct r300_context* r300, unsigned size, void* state);
+
+unsigned r300_get_num_dirty_dwords(struct r300_context *r300);
 
 /* Emit all dirty state. */
 void r300_emit_dirty_state(struct r300_context* r300);
 
-void r300_emit_buffer_validate(struct r300_context *r300);
+void r300_emit_buffer_validate(struct r300_context *r300,
+                               boolean do_validate_vertex_buffers,
+                               struct pipe_buffer *index_buffer);
 
 #endif /* R300_EMIT_H */
diff --git a/src/gallium/drivers/r300/r300_fs.c b/src/gallium/drivers/r300/r300_fs.c
index ae4c62b2f1..3c2625269b 100644
--- a/src/gallium/drivers/r300/r300_fs.c
+++ b/src/gallium/drivers/r300/r300_fs.c
@@ -133,10 +133,13 @@ static void get_compare_state(
     struct r300_fragment_program_external_state* state,
     unsigned shadow_samplers)
 {
+    struct r300_textures_state *texstate =
+        (struct r300_textures_state*)r300->textures_state.state;
+
     memset(state, 0, sizeof(*state));
 
-    for (int i = 0; i < r300->sampler_count; i++) {
-        struct r300_sampler_state* s = r300->sampler_states[i];
+    for (int i = 0; i < texstate->sampler_count; i++) {
+        struct r300_sampler_state* s = texstate->sampler_states[i];
 
         if (s && s->state.compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
             /* XXX Gallium doesn't provide us with any information regarding
diff --git a/src/gallium/drivers/r300/r300_render.c b/src/gallium/drivers/r300/r300_render.c
index 754eb4dc76..770a92be74 100644
--- a/src/gallium/drivers/r300/r300_render.c
+++ b/src/gallium/drivers/r300/r300_render.c
@@ -118,6 +118,18 @@ static uint32_t r300_provoking_vertex_fixes(struct r300_context *r300,
     return color_control;
 }
 
+/* Check if the requested number of dwords is available in the CS and
+ * if not, flush. Return TRUE if the flush occured. */
+static boolean r300_reserve_cs_space(struct r300_context *r300,
+                                     unsigned dwords)
+{
+    if (!r300->winsys->check_cs(r300->winsys, dwords)) {
+        r300->context.flush(&r300->context, 0, NULL);
+        return TRUE;
+    }
+    return FALSE;
+}
+
 static boolean immd_is_good_idea(struct r300_context *r300,
                                       unsigned count)
 {
@@ -132,7 +144,7 @@ static void r300_emit_draw_arrays_immediate(struct r300_context *r300,
     struct pipe_vertex_element* velem;
     struct pipe_vertex_buffer* vbuf;
     unsigned vertex_element_count = r300->vertex_element_count;
-    unsigned i, v, vbi, dw, elem_offset;
+    unsigned i, v, vbi, dw, elem_offset, dwords;
 
     /* Size of the vertex, in dwords. */
     unsigned vertex_size = 0;
@@ -171,9 +183,13 @@ static void r300_emit_draw_arrays_immediate(struct r300_context *r300,
         }
     }
 
+    dwords = 10 + count * vertex_size;
+
+    r300_reserve_cs_space(r300, r300_get_num_dirty_dwords(r300) + dwords);
+    r300_emit_buffer_validate(r300, FALSE, 0);
     r300_emit_dirty_state(r300);
 
-    BEGIN_CS(10 + count * vertex_size);
+    BEGIN_CS(dwords);
     OUT_CS_REG(R300_GA_COLOR_CONTROL,
             r300_provoking_vertex_fixes(r300, mode));
     OUT_CS_REG(R300_VAP_VTX_SIZE, vertex_size);
@@ -258,11 +274,6 @@ static void r300_emit_draw_elements(struct r300_context *r300,
 
     assert((start * indexSize)  % 4 == 0);
 
-    /* XXX Non-zero offset locks up. */
-    if (offset_dwords != 0) {
-        return;
-    }
-
     if (alt_num_verts) {
         assert(count < (1 << 24));
         BEGIN_CS(16);
@@ -276,13 +287,13 @@ static void r300_emit_draw_elements(struct r300_context *r300,
     OUT_CS_REG(R300_VAP_VF_MAX_VTX_INDX, maxIndex);
     OUT_CS_PKT3(R300_PACKET3_3D_DRAW_INDX_2, 0);
     if (indexSize == 4) {
-        count_dwords = count + start;
+        count_dwords = count;
         OUT_CS(R300_VAP_VF_CNTL__PRIM_WALK_INDICES | (count << 16) |
                R300_VAP_VF_CNTL__INDEX_SIZE_32bit |
                r300_translate_primitive(mode) |
                (alt_num_verts ? R500_VAP_VF_CNTL__USE_ALT_NUM_VERTS : 0));
     } else {
-        count_dwords = (count + start + 1) / 2;
+        count_dwords = (count + 1) / 2;
         OUT_CS(R300_VAP_VF_CNTL__PRIM_WALK_INDICES | (count << 16) |
                r300_translate_primitive(mode) |
                (alt_num_verts ? R500_VAP_VF_CNTL__USE_ALT_NUM_VERTS : 0));
@@ -303,31 +314,6 @@ static void r300_emit_draw_elements(struct r300_context *r300,
     END_CS;
 }
 
-static boolean r300_setup_vertex_buffers(struct r300_context *r300)
-{
-    struct pipe_vertex_buffer *vbuf = r300->vertex_buffer;
-    struct pipe_vertex_element *velem = r300->vertex_element;
-    struct pipe_buffer *pbuf;
-
-validate:
-    for (int i = 0; i < r300->vertex_element_count; i++) {
-        pbuf = vbuf[velem[i].vertex_buffer_index].buffer;
-
-        if (!r300->winsys->add_buffer(r300->winsys, pbuf,
-                                      RADEON_GEM_DOMAIN_GTT, 0)) {
-            r300->context.flush(&r300->context, 0, NULL);
-            goto validate;
-        }
-    }
-
-    if (!r300->winsys->validate(r300->winsys)) {
-        r300->context.flush(&r300->context, 0, NULL);
-        return r300->winsys->validate(r300->winsys);
-    }
-
-    return TRUE;
-}
-
 static void r300_shorten_ubyte_elts(struct r300_context* r300,
                                     struct pipe_buffer** elts,
                                     unsigned count)
@@ -383,30 +369,17 @@ void r300_draw_range_elements(struct pipe_context* pipe,
         return;
     }
 
-    r300_update_derived_state(r300);
-
-    r300_emit_buffer_validate(r300);
-
-    if (!r300_setup_vertex_buffers(r300)) {
-        return;
-    }
-
     if (indexSize == 1) {
         r300_shorten_ubyte_elts(r300, &indexBuffer, count);
         indexSize = 2;
     }
 
-    if (!r300->winsys->add_buffer(r300->winsys, indexBuffer,
-                                  RADEON_GEM_DOMAIN_GTT, 0)) {
-        goto cleanup;
-    }
-
-    if (!r300->winsys->validate(r300->winsys)) {
-        goto cleanup;
-    }
+    r300_update_derived_state(r300);
 
+    /* 128 dwords for emit_aos and emit_draw_elements */
+    r300_reserve_cs_space(r300, r300_get_num_dirty_dwords(r300) + 128);
+    r300_emit_buffer_validate(r300, TRUE, indexBuffer);
     r300_emit_dirty_state(r300);
-
     r300_emit_aos(r300, 0);
 
     if (alt_num_verts || count <= 65535) {
@@ -420,10 +393,16 @@ void r300_draw_range_elements(struct pipe_context* pipe,
 
             start += short_count;
             count -= short_count;
+
+            /* 16 spare dwords are enough for emit_draw_elements. */
+            if (count && r300_reserve_cs_space(r300, 16)) {
+                r300_emit_buffer_validate(r300, TRUE, indexBuffer);
+                r300_emit_dirty_state(r300);
+                r300_emit_aos(r300, 0);
+            }
         } while (count);
     }
 
-cleanup:
     if (indexBuffer != orgIndexBuffer) {
         pipe->screen->buffer_destroy(indexBuffer);
     }
@@ -435,8 +414,11 @@ void r300_draw_elements(struct pipe_context* pipe,
                         unsigned indexSize, unsigned mode,
                         unsigned start, unsigned count)
 {
-   pipe->draw_range_elements(pipe, indexBuffer, indexSize, 0, ~0,
-                             mode, start, count);
+    struct r300_context *r300 = r300_context(pipe);
+
+    pipe->draw_range_elements(pipe, indexBuffer, indexSize, 0,
+                              r300->vertex_buffer_max_index,
+                              mode, start, count);
 }
 
 void r300_draw_arrays(struct pipe_context* pipe, unsigned mode,
@@ -457,15 +439,13 @@ void r300_draw_arrays(struct pipe_context* pipe, unsigned mode,
 
     r300_update_derived_state(r300);
 
-    r300_emit_buffer_validate(r300);
-
     if (immd_is_good_idea(r300, count)) {
         r300_emit_draw_arrays_immediate(r300, mode, start, count);
     } else {
-        if (!r300_setup_vertex_buffers(r300)) {
-            return;
-        }
-
+        /* Make sure there are at least 128 spare dwords in the command buffer.
+         * (most of it being consumed by emit_aos) */
+        r300_reserve_cs_space(r300, r300_get_num_dirty_dwords(r300) + 128);
+        r300_emit_buffer_validate(r300, TRUE, 0);
         r300_emit_dirty_state(r300);
 
         if (alt_num_verts || count <= 65535) {
@@ -479,6 +459,13 @@ void r300_draw_arrays(struct pipe_context* pipe, unsigned mode,
 
                 start += short_count;
                 count -= short_count;
+
+                /* Again, we emit both AOS and draw_arrays so there should be
+                 * at least 128 spare dwords. */
+                if (count && r300_reserve_cs_space(r300, 128)) {
+                    r300_emit_buffer_validate(r300, TRUE, 0);
+                    r300_emit_dirty_state(r300);
+                }
             } while (count);
         }
     }
@@ -610,7 +597,7 @@ r300_render_get_vertex_info(struct vbuf_render* render)
 
     r300_update_derived_state(r300);
 
-    return (struct vertex_info*)r300->vertex_format_state.state;
+    return &r300->vertex_info;
 }
 
 static boolean r300_render_allocate_vertices(struct vbuf_render* render,
@@ -695,6 +682,7 @@ static void r300_render_draw_arrays(struct vbuf_render* render,
 
     CS_LOCALS(r300);
 
+    r300_reserve_cs_space(r300, r300_get_num_dirty_dwords(r300) + 2);
     r300_emit_dirty_state(r300);
 
     DBG(r300, DBG_DRAW, "r300: Doing vbuf render, count %d\n", count);
@@ -713,12 +701,14 @@ static void r300_render_draw(struct vbuf_render* render,
     struct r300_render* r300render = r300_render(render);
     struct r300_context* r300 = r300render->r300;
     int i;
+    unsigned dwords = 2 + (count+1)/2;
 
     CS_LOCALS(r300);
 
+    r300_reserve_cs_space(r300, r300_get_num_dirty_dwords(r300) + dwords);
     r300_emit_dirty_state(r300);
 
-    BEGIN_CS(2 + (count+1)/2);
+    BEGIN_CS(dwords);
     OUT_CS_PKT3(R300_PACKET3_3D_DRAW_INDX_2, (count+1)/2);
     OUT_CS(R300_VAP_VF_CNTL__PRIM_WALK_INDICES | (count << 16) |
            r300render->hwprim);
diff --git a/src/gallium/drivers/r300/r300_state.c b/src/gallium/drivers/r300/r300_state.c
index ebb859138f..b732380a14 100644
--- a/src/gallium/drivers/r300/r300_state.c
+++ b/src/gallium/drivers/r300/r300_state.c
@@ -600,7 +600,8 @@ static void
 
     memcpy(r300->fb_state.state, state, sizeof(struct pipe_framebuffer_state));
 
-    r300->fb_state.size = (10 * state->nr_cbufs) + (state->zsbuf ? 10 : 0) + 6;
+    r300->fb_state.size = (10 * state->nr_cbufs) + (2 * (4 - state->nr_cbufs)) +
+                          (state->zsbuf ? 10 : 0) + 8;
 
     r300_fb_update_tiling_flags(r300, r300->fb_state.state, state);
 
@@ -660,8 +661,10 @@ static void r300_bind_fs_state(struct pipe_context* pipe, void* shader)
     r300->fs = fs;
     r300_pick_fragment_shader(r300);
 
-    if (r300->vs && r300_vertex_shader_setup_wpos(r300)) {
-        r300->vertex_format_state.dirty = TRUE;
+    r300->rs_block_state.dirty = TRUE; /* Will be updated before the emission. */
+
+    if (r300->vs_state.state && r300_vertex_shader_setup_wpos(r300)) {
+        r300->vap_output_state.dirty = TRUE;
     }
 
     r300->dirty_state |= R300_NEW_FRAGMENT_SHADER | R300_NEW_FRAGMENT_SHADER_CONSTANTS;
@@ -720,22 +723,6 @@ static void* r300_create_rs_state(struct pipe_context* pipe,
     rs->point_size = pack_float_16_6x(state->point_size) |
         (pack_float_16_6x(state->point_size) << R300_POINTSIZE_X_SHIFT);
 
-        /* Point minimum and maximum sizes. This register has to be emitted,
-         * and it'd be a step backwards to put it in invariant state. */
-        if (r300screen->caps->is_r500) {
-            rs->point_minmax =
-            ((int)(0.0 * 6.0) << R300_GA_POINT_MINMAX_MIN_SHIFT) |
-            ((int)(4096.0 * 6.0) << R300_GA_POINT_MINMAX_MAX_SHIFT);
-        } else if (r300screen->caps->is_r400) {
-            rs->point_minmax =
-            ((int)(0.0 * 6.0) << R300_GA_POINT_MINMAX_MIN_SHIFT) |
-            ((int)(4021.0 * 6.0) << R300_GA_POINT_MINMAX_MAX_SHIFT);
-        } else {
-            rs->point_minmax =
-            ((int)(0.0 * 6.0) << R300_GA_POINT_MINMAX_MIN_SHIFT) |
-            ((int)(2560.0 * 6.0) << R300_GA_POINT_MINMAX_MAX_SHIFT);
-        }
-
     rs->line_control = pack_float_16_6x(state->line_width) |
         R300_GA_LINE_CNTL_END_TYPE_COMP;
 
@@ -826,12 +813,13 @@ static void r300_bind_rs_state(struct pipe_context* pipe, void* state)
 
     if (rs) {
         r300->polygon_offset_enabled = rs->rs.offset_cw || rs->rs.offset_ccw;
+        r300->rs_state.dirty = TRUE;
     } else {
         r300->polygon_offset_enabled = FALSE;
     }
 
     r300->rs_state.state = rs;
-    r300->rs_state.dirty = TRUE;
+    r300->rs_state.size = 17 + (r300->polygon_offset_enabled ? 5 : 0);
     /* XXX Why is this still needed, dammit!? */
     r300->scissor_state.dirty = TRUE;
     r300->viewport_state.dirty = TRUE;
@@ -870,7 +858,7 @@ static void*
                                                    state->max_anisotropy > 0);
 
     /* Unfortunately, r300-r500 don't support floating-point mipmap lods. */
-    /* We must pass these to the emit function to clamp them properly. */
+    /* We must pass these to the merge function to clamp them properly. */
     sampler->min_lod = MAX2((unsigned)state->min_lod, 0);
     sampler->max_lod = MAX2((unsigned)ceilf(state->max_lod), 0);
 
@@ -896,23 +884,20 @@ static void r300_bind_sampler_states(struct pipe_context* pipe,
                                      void** states)
 {
     struct r300_context* r300 = r300_context(pipe);
-    int i;
+    struct r300_textures_state* state =
+        (struct r300_textures_state*)r300->textures_state.state;
 
     if (count > 8) {
         return;
     }
 
-    for (i = 0; i < count; i++) {
-        if (r300->sampler_states[i] != states[i]) {
-            r300->sampler_states[i] = (struct r300_sampler_state*)states[i];
-            r300->dirty_state |= (R300_NEW_SAMPLER << i);
-        }
-    }
+    memcpy(state->sampler_states, states, sizeof(void*) * count);
+    state->sampler_count = count;
 
-    r300->sampler_count = count;
+    r300->textures_state.dirty = TRUE;
 
     /* Pick a fragment shader based on the texture compare state. */
-    if (r300->fs && (r300->dirty_state & R300_ANY_NEW_SAMPLERS)) {
+    if (r300->fs && count) {
         if (r300_pick_fragment_shader(r300)) {
             r300->dirty_state |= R300_NEW_FRAGMENT_SHADER |
                                  R300_NEW_FRAGMENT_SHADER_CONSTANTS;
@@ -936,22 +921,25 @@ static void r300_set_sampler_textures(struct pipe_context* pipe,
                                       struct pipe_texture** texture)
 {
     struct r300_context* r300 = r300_context(pipe);
+    struct r300_textures_state* state =
+        (struct r300_textures_state*)r300->textures_state.state;
+    unsigned i;
     boolean is_r500 = r300_screen(r300->context.screen)->caps->is_r500;
-    int i;
+    boolean dirty_tex = FALSE;
 
     /* XXX magic num */
     if (count > 8) {
         return;
     }
-    
+
     for (i = 0; i < count; i++) {
-        if (r300->textures[i] != (struct r300_texture*)texture[i]) {
-            pipe_texture_reference((struct pipe_texture**)&r300->textures[i],
-                texture[i]);
-            r300->dirty_state |= (R300_NEW_TEXTURE << i);
+        if (state->textures[i] != (struct r300_texture*)texture[i]) {
+            pipe_texture_reference((struct pipe_texture**)&state->textures[i],
+                                   texture[i]);
+            dirty_tex = TRUE;
 
-            /* R300-specific - set the texrect factor in a fragment shader */
-            if (!is_r500 && r300->textures[i]->is_npot) {
+            /* R300-specific - set the texrect factor in the fragment shader */
+            if (!is_r500 && state->textures[i]->is_npot) {
                 /* XXX It would be nice to re-emit just 1 constant,
                  * XXX not all of them */
                 r300->dirty_state |= R300_NEW_FRAGMENT_SHADER_CONSTANTS;
@@ -960,14 +948,19 @@ static void r300_set_sampler_textures(struct pipe_context* pipe,
     }
 
     for (i = count; i < 8; i++) {
-        if (r300->textures[i]) {
-            pipe_texture_reference((struct pipe_texture**)&r300->textures[i],
+        if (state->textures[i]) {
+            pipe_texture_reference((struct pipe_texture**)&state->textures[i],
                 NULL);
-            r300->dirty_state |= (R300_NEW_TEXTURE << i);
         }
     }
 
-    r300->texture_count = count;
+    state->texture_count = count;
+
+    r300->textures_state.dirty = TRUE;
+
+    if (dirty_tex) {
+        r300->texture_cache_inval.dirty = TRUE;
+    }
 }
 
 static void r300_set_scissor_state(struct pipe_context* pipe,
@@ -1029,17 +1022,24 @@ static void r300_set_vertex_buffers(struct pipe_context* pipe,
                                     const struct pipe_vertex_buffer* buffers)
 {
     struct r300_context* r300 = r300_context(pipe);
+    unsigned i, max_index = ~0;
 
     memcpy(r300->vertex_buffer, buffers,
         sizeof(struct pipe_vertex_buffer) * count);
+
+    for (i = 0; i < count; i++) {
+        max_index = MIN2(buffers[i].max_index, max_index);
+    }
+
     r300->vertex_buffer_count = count;
+    r300->vertex_buffer_max_index = max_index;
 
     if (r300->draw) {
         draw_flush(r300->draw);
         draw_set_vertex_buffers(r300->draw, count, buffers);
+    } else {
+        r300->vertex_stream_state.dirty = TRUE;
     }
-
-    r300->vertex_format_state.dirty = TRUE;
 }
 
 static boolean r300_validate_aos(struct r300_context *r300)
@@ -1108,21 +1108,26 @@ static void r300_bind_vs_state(struct pipe_context* pipe, void* shader)
         struct r300_vertex_shader* vs = (struct r300_vertex_shader*)shader;
 
         if (vs == NULL) {
-            r300->vs = NULL;
+            r300->vs_state.state = NULL;
             return;
         } else if (!vs->translated) {
             r300_translate_vertex_shader(r300, vs);
         }
 
-        r300->vs = vs;
+        r300->vs_state.state = vs;
+        r300->vs_state.size = vs->code.length + 9;
+        r300->vs_state.dirty = TRUE;
+
+        r300->rs_block_state.dirty = TRUE; /* Will be updated before the emission. */
+        r300->vap_output_state.dirty = TRUE;
+        r300->vertex_stream_state.dirty = TRUE; /* XXX needed for TCL bypass */
+        r300->pvs_flush.dirty = TRUE;
+
         if (r300->fs) {
             r300_vertex_shader_setup_wpos(r300);
         }
 
-        r300->vertex_format_state.dirty = TRUE;
-
-        r300->dirty_state |=
-            R300_NEW_VERTEX_SHADER | R300_NEW_VERTEX_SHADER_CONSTANTS;
+        r300->dirty_state |= R300_NEW_VERTEX_SHADER_CONSTANTS;
     } else {
         draw_flush(r300->draw);
         draw_bind_vertex_shader(r300->draw,
@@ -1194,8 +1199,10 @@ static void r300_set_constant_buffer(struct pipe_context *pipe,
     r300->shader_constants[shader].count = buf->size / (4 * sizeof(float));
     pipe_buffer_unmap(pipe->screen, buf);
 
-    if (shader == PIPE_SHADER_VERTEX)
+    if (shader == PIPE_SHADER_VERTEX) {
         r300->dirty_state |= R300_NEW_VERTEX_SHADER_CONSTANTS;
+        r300->pvs_flush.dirty = TRUE;
+    }
     else if (shader == PIPE_SHADER_FRAGMENT)
         r300->dirty_state |= R300_NEW_FRAGMENT_SHADER_CONSTANTS;
 }
diff --git a/src/gallium/drivers/r300/r300_state_derived.c b/src/gallium/drivers/r300/r300_state_derived.c
index 778eaaacd9..9c8e907fdf 100644
--- a/src/gallium/drivers/r300/r300_state_derived.c
+++ b/src/gallium/drivers/r300/r300_state_derived.c
@@ -42,20 +42,20 @@ static void r300_draw_emit_attrib(struct r300_context* r300,
                                   enum interp_mode interp,
                                   int index)
 {
-    struct tgsi_shader_info* info = &r300->vs->info;
+    struct r300_vertex_shader* vs = r300->vs_state.state;
+    struct tgsi_shader_info* info = &vs->info;
     int output;
 
     output = draw_find_shader_output(r300->draw,
                                      info->output_semantic_name[index],
                                      info->output_semantic_index[index]);
-    draw_emit_vertex_attr(
-        (struct vertex_info*)r300->vertex_format_state.state,
-        emit, interp, output);
+    draw_emit_vertex_attr(&r300->vertex_info, emit, interp, output);
 }
 
 static void r300_draw_emit_all_attribs(struct r300_context* r300)
 {
-    struct r300_shader_semantics* vs_outputs = &r300->vs->outputs;
+    struct r300_vertex_shader* vs = r300->vs_state.state;
+    struct r300_shader_semantics* vs_outputs = &vs->outputs;
     int i, gen_count;
 
     /* Position. */
@@ -104,16 +104,21 @@ static void r300_draw_emit_all_attribs(struct r300_context* r300)
 }
 
 /* Update the PSC tables. */
+/* XXX move this function into r300_state.c after TCL-bypass gets removed
+ * XXX because this one is dependent only on vertex elements. */
 static void r300_vertex_psc(struct r300_context* r300)
 {
-    struct r300_vertex_info *vformat =
-        (struct r300_vertex_info*)r300->vertex_format_state.state;
+    struct r300_vertex_shader* vs = r300->vs_state.state;
+    struct r300_vertex_stream_state *vformat =
+        (struct r300_vertex_stream_state*)r300->vertex_stream_state.state;
     uint16_t type, swizzle;
     enum pipe_format format;
     unsigned i;
     int identity[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
     int* stream_tab;
 
+    memset(vformat, 0, sizeof(struct r300_vertex_stream_state));
+
     stream_tab = identity;
 
     /* Vertex shaders have no semantics on their inputs,
@@ -121,7 +126,7 @@ static void r300_vertex_psc(struct r300_context* r300)
      * and not on attrib information. */
     DBG(r300, DBG_DRAW, "r300: vs expects %d attribs, routing %d elements"
             " in psc\n",
-            r300->vs->info.num_inputs,
+            vs->info.num_inputs,
             r300->vertex_element_count);
 
     for (i = 0; i < r300->vertex_element_count; i++) {
@@ -148,18 +153,24 @@ static void r300_vertex_psc(struct r300_context* r300)
     }
     vformat->vap_prog_stream_cntl[i >> 1] |=
         (R300_LAST_VEC << (i & 1 ? 16 : 0));
+
+    vformat->count = (i >> 1) + 1;
+    r300->vertex_stream_state.size = (1 + vformat->count) * 2;
 }
 
 /* Update the PSC tables for SW TCL, using Draw. */
 static void r300_swtcl_vertex_psc(struct r300_context* r300)
 {
-    struct r300_vertex_info *vformat =
-        (struct r300_vertex_info*)r300->vertex_format_state.state;
-    struct vertex_info* vinfo = &vformat->vinfo;
+    struct r300_vertex_shader* vs = r300->vs_state.state;
+    struct r300_vertex_stream_state *vformat =
+        (struct r300_vertex_stream_state*)r300->vertex_stream_state.state;
+    struct vertex_info* vinfo = &r300->vertex_info;
     uint16_t type, swizzle;
     enum pipe_format format;
     unsigned i, attrib_count;
-    int* vs_output_tab = r300->vs->stream_loc_notcl;
+    int* vs_output_tab = vs->stream_loc_notcl;
+
+    memset(vformat, 0, sizeof(struct r300_vertex_stream_state));
 
     /* For each Draw attribute, route it to the fragment shader according
      * to the vs_output_tab. */
@@ -202,6 +213,9 @@ static void r300_swtcl_vertex_psc(struct r300_context* r300)
     }
     vformat->vap_prog_stream_cntl[i >> 1] |=
         (R300_LAST_VEC << (i & 1 ? 16 : 0));
+
+    vformat->count = (i >> 1) + 1;
+    r300->vertex_stream_state.size = (1 + vformat->count) * 2;
 }
 
 static void r300_rs_col(struct r300_rs_block* rs, int id, int ptr,
@@ -410,31 +424,29 @@ static void r300_update_rs_block(struct r300_context* r300,
     /* Now, after all that, see if we actually need to update the state. */
     if (memcmp(r300->rs_block_state.state, &rs, sizeof(struct r300_rs_block))) {
         memcpy(r300->rs_block_state.state, &rs, sizeof(struct r300_rs_block));
-        r300->rs_block_state.size = 5 + count;
-        r300->rs_block_state.dirty = TRUE;
+        r300->rs_block_state.size = 5 + count*2;
     }
 }
 
 /* Update the shader-dependant states. */
 static void r300_update_derived_shader_state(struct r300_context* r300)
 {
+    struct r300_vertex_shader* vs = r300->vs_state.state;
     struct r300_screen* r300screen = r300_screen(r300->context.screen);
-    struct r300_vertex_info *vformat =
-        (struct r300_vertex_info*)r300->vertex_format_state.state;
-    struct vertex_info* vinfo = &vformat->vinfo;
+    struct r300_vap_output_state *vap_out =
+        (struct r300_vap_output_state*)r300->vap_output_state.state;
 
-    /* Mmm, delicious hax */
-    memset(r300->vertex_format_state.state, 0, sizeof(struct r300_vertex_info));
-    memcpy(vinfo->hwfmt, r300->vs->hwfmt, sizeof(uint)*4);
+    /* XXX Mmm, delicious hax */
+    memset(&r300->vertex_info, 0, sizeof(struct vertex_info));
+    memcpy(vap_out, vs->hwfmt, sizeof(uint)*4);
 
-    r300_update_rs_block(r300, &r300->vs->outputs, &r300->fs->inputs);
+    r300_update_rs_block(r300, &vs->outputs, &r300->fs->inputs);
 
     if (r300screen->caps->has_tcl) {
         r300_vertex_psc(r300);
     } else {
         r300_draw_emit_all_attribs(r300);
-        draw_compute_vertex_size(
-            (struct vertex_info*)r300->vertex_format_state.state);
+        draw_compute_vertex_size(&r300->vertex_info);
         r300_swtcl_vertex_psc(r300);
     }
 }
@@ -510,14 +522,74 @@ static void r300_update_ztop(struct r300_context* r300)
     r300->ztop_state.dirty = TRUE;
 }
 
+static void r300_merge_textures_and_samplers(struct r300_context* r300)
+{
+    struct r300_textures_state *state =
+        (struct r300_textures_state*)r300->textures_state.state;
+    struct r300_texture_sampler_state *texstate;
+    struct r300_sampler_state *sampler;
+    struct r300_texture *tex;
+    unsigned min_level, max_level, i, size;
+    unsigned count = MIN2(state->texture_count, state->sampler_count);
+
+    state->tx_enable = 0;
+    size = 2;
+
+    for (i = 0; i < count; i++) {
+        if (state->textures[i] && state->sampler_states[i]) {
+            state->tx_enable |= 1 << i;
+
+            tex = state->textures[i];
+            sampler = state->sampler_states[i];
+
+            texstate = &state->regs[i];
+            memcpy(texstate->format, &tex->state, sizeof(uint32_t)*3);
+            texstate->filter[0] = sampler->filter0;
+            texstate->filter[1] = sampler->filter1;
+            texstate->border_color = sampler->border_color;
+            texstate->tile_config = R300_TXO_MACRO_TILE(tex->macrotile) |
+                                    R300_TXO_MICRO_TILE(tex->microtile);
+
+            /* to emulate 1D textures through 2D ones correctly */
+            if (tex->tex.target == PIPE_TEXTURE_1D) {
+                texstate->filter[0] &= ~R300_TX_WRAP_T_MASK;
+                texstate->filter[0] |= R300_TX_WRAP_T(R300_TX_CLAMP_TO_EDGE);
+            }
+
+            if (tex->is_npot) {
+                /* NPOT textures don't support mip filter, unfortunately.
+                 * This prevents incorrect rendering. */
+                texstate->filter[0] &= ~R300_TX_MIN_FILTER_MIP_MASK;
+            } else {
+                /* determine min/max levels */
+                /* the MAX_MIP level is the largest (finest) one */
+                max_level = MIN2(sampler->max_lod, tex->tex.last_level);
+                min_level = MIN2(sampler->min_lod, max_level);
+                texstate->format[0] |= R300_TX_NUM_LEVELS(max_level);
+                texstate->filter[0] |= R300_TX_MAX_MIP_LEVEL(min_level);
+            }
+
+            texstate->filter[0] |= i << 28;
+
+            size += 16;
+            state->count = i+1;
+        }
+    }
+
+    r300->textures_state.size = size;
+}
+
 void r300_update_derived_state(struct r300_context* r300)
 {
-    /* XXX */
-    if (r300->dirty_state &
-        (R300_NEW_FRAGMENT_SHADER | R300_NEW_VERTEX_SHADER) ||
-        r300->vertex_format_state.dirty || r300->rs_state.dirty) {
+    if (r300->rs_block_state.dirty ||
+        r300->vertex_stream_state.dirty || /* XXX put updating this state out of this file */
+        r300->rs_state.dirty) {  /* XXX and remove this one (tcl_bypass dependency) */
         r300_update_derived_shader_state(r300);
     }
 
+    if (r300->textures_state.dirty) {
+        r300_merge_textures_and_samplers(r300);
+    }
+
     r300_update_ztop(r300);
 }
diff --git a/src/gallium/drivers/r300/r300_state_inlines.h b/src/gallium/drivers/r300/r300_state_inlines.h
index 0e1cb328d1..2f3a56e1fb 100644
--- a/src/gallium/drivers/r300/r300_state_inlines.h
+++ b/src/gallium/drivers/r300/r300_state_inlines.h
@@ -384,8 +384,7 @@ r300_translate_vertex_data_type(enum pipe_format format) {
 
     desc = util_format_description(format);
 
-    if (desc->layout != UTIL_FORMAT_LAYOUT_ARITH &&
-        desc->layout != UTIL_FORMAT_LAYOUT_ARRAY) {
+    if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) {
         debug_printf("r300: Bad format %s in %s:%d\n", util_format_name(format),
             __FUNCTION__, __LINE__);
         assert(0);
@@ -458,8 +457,7 @@ r300_translate_vertex_data_swizzle(enum pipe_format format) {
 
     assert(format);
 
-    if (desc->layout != UTIL_FORMAT_LAYOUT_ARITH &&
-        desc->layout != UTIL_FORMAT_LAYOUT_ARRAY) {
+    if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) {
         debug_printf("r300: Bad format %s in %s:%d\n",
             util_format_name(format), __FUNCTION__, __LINE__);
         return 0;
diff --git a/src/gallium/drivers/r300/r300_state_invariant.c b/src/gallium/drivers/r300/r300_state_invariant.c
index 97927acf1b..4a2c68269b 100644
--- a/src/gallium/drivers/r300/r300_state_invariant.c
+++ b/src/gallium/drivers/r300/r300_state_invariant.c
@@ -38,7 +38,8 @@ struct pipe_viewport_state r300_viewport_identity = {
  *
  * Note that eventually this should be empty, but it's useful for development
  * and general unduplication of code. */
-void r300_emit_invariant_state(struct r300_context* r300, void* state)
+void r300_emit_invariant_state(struct r300_context* r300,
+                               unsigned size, void* state)
 {
     struct r300_capabilities* caps = r300_screen(r300->context.screen)->caps;
     CS_LOCALS(r300);
diff --git a/src/gallium/drivers/r300/r300_state_invariant.h b/src/gallium/drivers/r300/r300_state_invariant.h
index 5d1a963654..83d031c7fe 100644
--- a/src/gallium/drivers/r300/r300_state_invariant.h
+++ b/src/gallium/drivers/r300/r300_state_invariant.h
@@ -25,6 +25,7 @@
 
 struct r300_context;
 
-void r300_emit_invariant_state(struct r300_context* r300, void* state);
+void r300_emit_invariant_state(struct r300_context* r300,
+                               unsigned size, void* state);
 
 #endif /* R300_STATE_INVARIANT_H */
diff --git a/src/gallium/drivers/r300/r300_texture.c b/src/gallium/drivers/r300/r300_texture.c
index ed2be06254..2246c75056 100644
--- a/src/gallium/drivers/r300/r300_texture.c
+++ b/src/gallium/drivers/r300/r300_texture.c
@@ -70,6 +70,12 @@ static uint32_t r300_translate_texformat(enum pipe_format format)
         R300_TX_FORMAT_B_SHIFT,
         R300_TX_FORMAT_A_SHIFT
     };
+    const uint32_t swizzle[4] = {
+        R300_TX_FORMAT_X,
+        R300_TX_FORMAT_Y,
+        R300_TX_FORMAT_Z,
+        R300_TX_FORMAT_W
+    };
     const uint32_t sign_bit[4] = {
         R300_TX_FORMAT_SIGNED_X,
         R300_TX_FORMAT_SIGNED_Y,
@@ -119,16 +125,16 @@ static uint32_t r300_translate_texformat(enum pipe_format format)
         switch (desc->swizzle[i]) {
             case UTIL_FORMAT_SWIZZLE_X:
             case UTIL_FORMAT_SWIZZLE_NONE:
-                result |= R300_TX_FORMAT_X << swizzle_shift[i];
+                result |= swizzle[0] << swizzle_shift[i];
                 break;
             case UTIL_FORMAT_SWIZZLE_Y:
-                result |= R300_TX_FORMAT_Y << swizzle_shift[i];
+                result |= swizzle[1] << swizzle_shift[i];
                 break;
             case UTIL_FORMAT_SWIZZLE_Z:
-                result |= R300_TX_FORMAT_Z << swizzle_shift[i];
+                result |= swizzle[2] << swizzle_shift[i];
                 break;
             case UTIL_FORMAT_SWIZZLE_W:
-                result |= R300_TX_FORMAT_W << swizzle_shift[i];
+                result |= swizzle[3] << swizzle_shift[i];
                 break;
             case UTIL_FORMAT_SWIZZLE_0:
                 result |= R300_TX_FORMAT_ZERO << swizzle_shift[i];
@@ -142,7 +148,7 @@ static uint32_t r300_translate_texformat(enum pipe_format format)
     }
 
     /* Compressed formats. */
-    if (desc->layout == UTIL_FORMAT_LAYOUT_DXT) {
+    if (desc->layout == UTIL_FORMAT_LAYOUT_COMPRESSED) {
         switch (format) {
             case PIPE_FORMAT_DXT1_RGB:
             case PIPE_FORMAT_DXT1_RGBA:
@@ -499,7 +505,7 @@ boolean r300_is_sampler_format_supported(enum pipe_format format)
 
 static void r300_setup_texture_state(struct r300_screen* screen, struct r300_texture* tex)
 {
-    struct r300_texture_state* state = &tex->state;
+    struct r300_texture_format_state* state = &tex->state;
     struct pipe_texture *pt = &tex->tex;
     unsigned i;
     boolean is_r500 = screen->caps->is_r500;
diff --git a/src/gallium/drivers/r300/r300_vs.c b/src/gallium/drivers/r300/r300_vs.c
index a6786c321c..60a04bbfed 100644
--- a/src/gallium/drivers/r300/r300_vs.c
+++ b/src/gallium/drivers/r300/r300_vs.c
@@ -368,8 +368,8 @@ void r300_translate_vertex_shader(struct r300_context* r300,
 
 boolean r300_vertex_shader_setup_wpos(struct r300_context* r300)
 {
-    struct r300_vertex_shader* vs = r300->vs;
-    int tex_output = r300->vs->wpos_tex_output;
+    struct r300_vertex_shader* vs = r300->vs_state.state;
+    int tex_output = vs->wpos_tex_output;
     uint32_t tex_fmt = R300_INPUT_CNTL_TC0 << tex_output;
     uint32_t* hwfmt = vs->hwfmt;
 
diff --git a/src/gallium/drivers/softpipe/sp_video_context.c b/src/gallium/drivers/softpipe/sp_video_context.c
index 8e4867a904..242aaac466 100644
--- a/src/gallium/drivers/softpipe/sp_video_context.c
+++ b/src/gallium/drivers/softpipe/sp_video_context.c
@@ -25,10 +25,12 @@
  * 
  **************************************************************************/
 
+#include "util/u_inlines.h"
+#include "util/u_memory.h"
+
 #include "sp_video_context.h"
-#include <util/u_inlines.h>
-#include <util/u_memory.h>
-#include "softpipe/sp_texture.h"
+#include "sp_texture.h"
+
 
 static void
 sp_mpeg12_destroy(struct pipe_video_context *vpipe)
diff --git a/src/gallium/drivers/softpipe/sp_winsys.c b/src/gallium/drivers/softpipe/sp_winsys.c
index f6598927d3..0a6245ed2c 100644
--- a/src/gallium/drivers/softpipe/sp_winsys.c
+++ b/src/gallium/drivers/softpipe/sp_winsys.c
@@ -37,13 +37,13 @@
 
 
 #include "util/u_simple_screen.h"/* port to just p_screen */
-#include "pipe/p_format.h"
-#include "pipe/p_context.h"
 #include "util/u_format.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
 #include "util/u_inlines.h"
-#include "softpipe/sp_winsys.h"
+#include "pipe/p_format.h"
+#include "pipe/p_context.h"
+#include "sp_winsys.h"
 
 
 struct st_softpipe_buffer
diff --git a/src/gallium/drivers/svga/svga_cmd.c b/src/gallium/drivers/svga/svga_cmd.c
index a0da7d7e5d..04307d17fe 100644
--- a/src/gallium/drivers/svga/svga_cmd.c
+++ b/src/gallium/drivers/svga/svga_cmd.c
@@ -478,7 +478,8 @@ SVGA3D_BufferDMA(struct svga_winsys_context *swc,
                  struct svga_winsys_surface *host,
                  SVGA3dTransferType transfer,      // IN
                  uint32 size,                      // IN
-                 uint32 offset,                    // IN
+                 uint32 guest_offset,              // IN
+                 uint32 host_offset,               // IN
                  SVGA3dSurfaceDMAFlags flags)      // IN
 {
    SVGA3dCmdSurfaceDMA *cmd;
@@ -517,19 +518,19 @@ SVGA3D_BufferDMA(struct svga_winsys_context *swc,
    cmd->transfer = transfer;
 
    box = (SVGA3dCopyBox *)&cmd[1];
-   box->x = offset;
+   box->x = host_offset;
    box->y = 0;
    box->z = 0;
    box->w = size;
    box->h = 1;
    box->d = 1;
-   box->srcx = offset;
+   box->srcx = guest_offset;
    box->srcy = 0;
    box->srcz = 0;
    
    pSuffix = (SVGA3dCmdSurfaceDMASuffix *)((uint8_t*)cmd + sizeof *cmd + sizeof *box);
    pSuffix->suffixSize = sizeof *pSuffix;
-   pSuffix->maximumOffset = offset + size;
+   pSuffix->maximumOffset = guest_offset + size;
    pSuffix->flags = flags;
 
    swc->commit(swc);
diff --git a/src/gallium/drivers/svga/svga_cmd.h b/src/gallium/drivers/svga/svga_cmd.h
index 8041054769..da9fc4355f 100644
--- a/src/gallium/drivers/svga/svga_cmd.h
+++ b/src/gallium/drivers/svga/svga_cmd.h
@@ -111,7 +111,8 @@ SVGA3D_BufferDMA(struct svga_winsys_context *swc,
                  struct svga_winsys_surface *host,
                  SVGA3dTransferType transfer,
                  uint32 size,
-                 uint32 offset,
+                 uint32 guest_offset,
+                 uint32 host_offset,
                  SVGA3dSurfaceDMAFlags flags);
 
 /*
diff --git a/src/gallium/drivers/svga/svga_pipe_blend.c b/src/gallium/drivers/svga/svga_pipe_blend.c
index b60117f090..594eec7166 100644
--- a/src/gallium/drivers/svga/svga_pipe_blend.c
+++ b/src/gallium/drivers/svga/svga_pipe_blend.c
@@ -92,6 +92,7 @@ svga_create_blend_state(struct pipe_context *pipe,
       if (templ->logicop_enable) {
          switch (templ->logicop_func) {
          case PIPE_LOGICOP_XOR:
+         case PIPE_LOGICOP_INVERT:
             blend->need_white_fragments = TRUE;
             blend->rt[i].blend_enable = TRUE;
             blend->rt[i].srcblend       = SVGA3D_BLENDOP_ONE;
@@ -125,12 +126,6 @@ svga_create_blend_state(struct pipe_context *pipe,
             blend->rt[i].dstblend       = SVGA3D_BLENDOP_ONE;
             blend->rt[i].blendeq        = SVGA3D_BLENDEQ_MAXIMUM;
             break;
-         case PIPE_LOGICOP_INVERT:
-            blend->rt[i].blend_enable = TRUE;
-            blend->rt[i].srcblend       = SVGA3D_BLENDOP_INVSRCCOLOR;
-            blend->rt[i].dstblend       = SVGA3D_BLENDOP_ZERO;
-            blend->rt[i].blendeq        = SVGA3D_BLENDEQ_ADD;
-            break;
          case PIPE_LOGICOP_AND:
             /* Approximate with minimum - works for the 0 & anything case: */
             blend->rt[i].blend_enable = TRUE;
diff --git a/src/gallium/drivers/svga/svga_pipe_vertex.c b/src/gallium/drivers/svga/svga_pipe_vertex.c
index ffc0f99565..836b8441da 100644
--- a/src/gallium/drivers/svga/svga_pipe_vertex.c
+++ b/src/gallium/drivers/svga/svga_pipe_vertex.c
@@ -49,7 +49,7 @@ static void svga_set_vertex_buffers(struct pipe_context *pipe,
    /* Adjust refcounts */
    for (i = 0; i < count; i++) {
       pipe_buffer_reference(&svga->curr.vb[i].buffer, buffers[i].buffer);
-      if (svga_buffer(buffers[i].buffer)->user)
+      if (svga_buffer_is_user_buffer(buffers[i].buffer))
          any_user_buffer = TRUE;
    }
 
diff --git a/src/gallium/drivers/svga/svga_screen_buffer.c b/src/gallium/drivers/svga/svga_screen_buffer.c
index c9e9bef540..1ff6a3a5b3 100644
--- a/src/gallium/drivers/svga/svga_screen_buffer.c
+++ b/src/gallium/drivers/svga/svga_screen_buffer.c
@@ -83,7 +83,7 @@ svga_buffer_create_host_surface(struct svga_screen *ss,
        * as svga_screen_surface_create might have passed a recycled host
        * buffer.
        */
-      sbuf->hw.flags.discard = TRUE;
+      sbuf->dma.flags.discard = TRUE;
 
       SVGA_DBG(DEBUG_DMA, "   --> got sid %p sz %d (buffer)\n", sbuf->handle, sbuf->base.size);
    }
@@ -109,10 +109,10 @@ svga_buffer_destroy_hw_storage(struct svga_screen *ss, struct svga_buffer *sbuf)
    struct svga_winsys_screen *sws = ss->sws;
 
    assert(!sbuf->map.count);
-   assert(sbuf->hw.buf);
-   if(sbuf->hw.buf) {
-      sws->buffer_destroy(sws, sbuf->hw.buf);
-      sbuf->hw.buf = NULL;
+   assert(sbuf->hwbuf);
+   if(sbuf->hwbuf) {
+      sws->buffer_destroy(sws, sbuf->hwbuf);
+      sbuf->hwbuf = NULL;
    }
 }
 
@@ -151,16 +151,18 @@ static INLINE enum pipe_error
 svga_buffer_create_hw_storage(struct svga_screen *ss,
                               struct svga_buffer *sbuf)
 {
-   if(!sbuf->hw.buf) {
+   assert(!sbuf->user);
+
+   if(!sbuf->hwbuf) {
       unsigned alignment = sbuf->base.alignment;
       unsigned usage = 0;
       unsigned size = sbuf->base.size;
       
-      sbuf->hw.buf = svga_winsys_buffer_create(ss, alignment, usage, size);
-      if(!sbuf->hw.buf)
+      sbuf->hwbuf = svga_winsys_buffer_create(ss, alignment, usage, size);
+      if(!sbuf->hwbuf)
          return PIPE_ERROR_OUT_OF_MEMORY;
       
-      assert(!sbuf->needs_flush);
+      assert(!sbuf->dma.pending);
    }
    
    return PIPE_OK;
@@ -175,12 +177,11 @@ svga_buffer_upload_command(struct svga_context *svga,
                            struct svga_buffer *sbuf)
 {
    struct svga_winsys_context *swc = svga->swc;
-   struct svga_winsys_buffer *guest = sbuf->hw.buf;
+   struct svga_winsys_buffer *guest = sbuf->hwbuf;
    struct svga_winsys_surface *host = sbuf->handle;
    SVGA3dTransferType transfer = SVGA3D_WRITE_HOST_VRAM;
-   SVGA3dSurfaceDMAFlags flags = sbuf->hw.flags;
    SVGA3dCmdSurfaceDMA *cmd;
-   uint32 numBoxes = sbuf->hw.num_ranges;
+   uint32 numBoxes = sbuf->map.num_ranges;
    SVGA3dCopyBox *boxes;
    SVGA3dCmdSurfaceDMASuffix *pSuffix;
    unsigned region_flags;
@@ -218,8 +219,8 @@ svga_buffer_upload_command(struct svga_context *svga,
 
    cmd->transfer = transfer;
 
-   sbuf->hw.boxes = (SVGA3dCopyBox *)&cmd[1];
-   sbuf->hw.svga = svga;
+   sbuf->dma.boxes = (SVGA3dCopyBox *)&cmd[1];
+   sbuf->dma.svga = svga;
 
    /* Increment reference count */
    dummy = NULL;
@@ -228,9 +229,11 @@ svga_buffer_upload_command(struct svga_context *svga,
    pSuffix = (SVGA3dCmdSurfaceDMASuffix *)((uint8_t*)cmd + sizeof *cmd + numBoxes * sizeof *boxes);
    pSuffix->suffixSize = sizeof *pSuffix;
    pSuffix->maximumOffset = sbuf->base.size;
-   pSuffix->flags = flags;
+   pSuffix->flags = sbuf->dma.flags;
+
+   SVGA_FIFOCommitAll(swc);
 
-   swc->commit(swc);
+   sbuf->dma.flags.discard = FALSE;
 
    return PIPE_OK;
 }
@@ -248,10 +251,10 @@ svga_buffer_upload_flush(struct svga_context *svga,
    unsigned i;
 
    assert(sbuf->handle); 
-   assert(sbuf->hw.buf);
-   assert(sbuf->hw.num_ranges);
-   assert(sbuf->hw.svga == svga);
-   assert(sbuf->hw.boxes);
+   assert(sbuf->hwbuf);
+   assert(sbuf->map.num_ranges);
+   assert(sbuf->dma.svga == svga);
+   assert(sbuf->dma.boxes);
    
    /*
     * Patch the DMA command with the final copy box.
@@ -259,36 +262,33 @@ svga_buffer_upload_flush(struct svga_context *svga,
 
    SVGA_DBG(DEBUG_DMA, "dma to sid %p\n", sbuf->handle);
 
-   boxes = sbuf->hw.boxes;
-   for(i = 0; i < sbuf->hw.num_ranges; ++i) {
+   boxes = sbuf->dma.boxes;
+   for(i = 0; i < sbuf->map.num_ranges; ++i) {
       SVGA_DBG(DEBUG_DMA, "  bytes %u - %u\n",
-               sbuf->hw.ranges[i].start, sbuf->hw.ranges[i].end);
+               sbuf->map.ranges[i].start, sbuf->map.ranges[i].end);
 
-      boxes[i].x = sbuf->hw.ranges[i].start;
+      boxes[i].x = sbuf->map.ranges[i].start;
       boxes[i].y = 0;
       boxes[i].z = 0;
-      boxes[i].w = sbuf->hw.ranges[i].end - sbuf->hw.ranges[i].start;
+      boxes[i].w = sbuf->map.ranges[i].end - sbuf->map.ranges[i].start;
       boxes[i].h = 1;
       boxes[i].d = 1;
-      boxes[i].srcx = sbuf->hw.ranges[i].start;
+      boxes[i].srcx = sbuf->map.ranges[i].start;
       boxes[i].srcy = 0;
       boxes[i].srcz = 0;
    }
 
-   sbuf->hw.num_ranges = 0;
-   memset(&sbuf->hw.flags, 0, sizeof sbuf->hw.flags);
+   sbuf->map.num_ranges = 0;
 
    assert(sbuf->head.prev && sbuf->head.next);
    LIST_DEL(&sbuf->head);
 #ifdef DEBUG
    sbuf->head.next = sbuf->head.prev = NULL; 
 #endif
-   sbuf->needs_flush = FALSE;
-
-   sbuf->hw.svga = NULL;
-   sbuf->hw.boxes = NULL;
+   sbuf->dma.pending = FALSE;
 
-   sbuf->host_written = TRUE;
+   sbuf->dma.svga = NULL;
+   sbuf->dma.boxes = NULL;
 
    /* Decrement reference count */
    pipe_reference(&(sbuf->base.reference), NULL);
@@ -297,7 +297,7 @@ svga_buffer_upload_flush(struct svga_context *svga,
 
 
 /**
- * Queue a DMA upload of a range of this buffer to the host.
+ * Note a dirty range.
  *
  * This function only notes the range down. It doesn't actually emit a DMA
  * upload command. That only happens when a context tries to refer to this
@@ -306,15 +306,24 @@ svga_buffer_upload_flush(struct svga_context *svga,
  * We try to lump as many contiguous DMA transfers together as possible.
  */
 static void
-svga_buffer_upload_queue(struct svga_buffer *sbuf,
-                         unsigned start,
-                         unsigned end)
+svga_buffer_add_range(struct svga_buffer *sbuf,
+                      unsigned start,
+                      unsigned end)
 {
    unsigned i;
+   unsigned nearest_range;
+   unsigned nearest_dist;
 
-   assert(sbuf->hw.buf);
    assert(end > start);
    
+   if (sbuf->map.num_ranges < SVGA_BUFFER_MAX_RANGES) {
+      nearest_range = sbuf->map.num_ranges;
+      nearest_dist = ~0;
+   } else {
+      nearest_range = SVGA_BUFFER_MAX_RANGES - 1;
+      nearest_dist = 0;
+   }
+
    /*
     * Try to grow one of the ranges.
     *
@@ -325,12 +334,34 @@ svga_buffer_upload_queue(struct svga_buffer *sbuf,
     * buffer should be flushed.
     */
 
-   for(i = 0; i < sbuf->hw.num_ranges; ++i) {
-      if(start <= sbuf->hw.ranges[i].end && sbuf->hw.ranges[i].start <= end) {
-         sbuf->hw.ranges[i].start = MIN2(sbuf->hw.ranges[i].start, start);
-         sbuf->hw.ranges[i].end   = MAX2(sbuf->hw.ranges[i].end,    end);
+   for(i = 0; i < sbuf->map.num_ranges; ++i) {
+      int left_dist;
+      int right_dist;
+      int dist;
+
+      left_dist = start - sbuf->map.ranges[i].end;
+      right_dist = sbuf->map.ranges[i].start - end;
+      dist = MAX2(left_dist, right_dist);
+
+      if (dist <= 0) {
+         /*
+          * Ranges are contiguous or overlapping -- extend this one and return.
+          */
+
+         sbuf->map.ranges[i].start = MIN2(sbuf->map.ranges[i].start, start);
+         sbuf->map.ranges[i].end   = MAX2(sbuf->map.ranges[i].end,   end);
          return;
       }
+      else {
+         /*
+          * Discontiguous ranges -- keep track of the nearest range.
+          */
+
+         if (dist < nearest_dist) {
+            nearest_range = i;
+            nearest_dist = dist;
+         }
+      }
    }
 
    /*
@@ -338,20 +369,34 @@ svga_buffer_upload_queue(struct svga_buffer *sbuf,
     * pending DMA upload and start clean.
     */
 
-   if(sbuf->needs_flush)
-      svga_buffer_upload_flush(sbuf->hw.svga, sbuf);
+   if(sbuf->dma.pending)
+      svga_buffer_upload_flush(sbuf->dma.svga, sbuf);
 
-   assert(!sbuf->needs_flush);
-   assert(!sbuf->hw.svga);
-   assert(!sbuf->hw.boxes);
+   assert(!sbuf->dma.pending);
+   assert(!sbuf->dma.svga);
+   assert(!sbuf->dma.boxes);
 
-   /*
-    * Add a new range.
-    */
+   if (sbuf->map.num_ranges < SVGA_BUFFER_MAX_RANGES) {
+      /*
+       * Add a new range.
+       */
+
+      sbuf->map.ranges[sbuf->map.num_ranges].start = start;
+      sbuf->map.ranges[sbuf->map.num_ranges].end = end;
+      ++sbuf->map.num_ranges;
+   } else {
+      /*
+       * Everything else failed, so just extend the nearest range.
+       *
+       * It is OK to do this because we always keep a local copy of the
+       * host buffer data, for SW TNL, and the host never modifies the buffer.
+       */
 
-   sbuf->hw.ranges[sbuf->hw.num_ranges].start = start;
-   sbuf->hw.ranges[sbuf->hw.num_ranges].end = end;
-   ++sbuf->hw.num_ranges;
+      assert(nearest_range < SVGA_BUFFER_MAX_RANGES);
+      assert(nearest_range < sbuf->map.num_ranges);
+      sbuf->map.ranges[nearest_range].start = MIN2(sbuf->map.ranges[nearest_range].start, start);
+      sbuf->map.ranges[nearest_range].end   = MAX2(sbuf->map.ranges[nearest_range].end,   end);
+   }
 }
 
 
@@ -366,55 +411,30 @@ svga_buffer_map_range( struct pipe_screen *screen,
    struct svga_buffer *sbuf = svga_buffer( buf );
    void *map;
 
-   if(sbuf->swbuf) {
+   if (!sbuf->swbuf && !sbuf->hwbuf) {
+      if (svga_buffer_create_hw_storage(ss, sbuf) != PIPE_OK) {
+         /*
+          * We can't create a hardware buffer big enough, so create a malloc
+          * buffer instead.
+          */
+
+         debug_printf("%s: failed to allocate %u KB of DMA, splitting DMA transfers\n",
+                      __FUNCTION__,
+                      (sbuf->base.size + 1023)/1024);
+
+         sbuf->swbuf = align_malloc(sbuf->base.size, sbuf->base.alignment);
+      }
+   }
+
+   if (sbuf->swbuf) {
       /* User/malloc buffer */
       map = sbuf->swbuf;
    }
+   else if (sbuf->hwbuf) {
+      map = sws->buffer_map(sws, sbuf->hwbuf, usage);
+   }
    else {
-      if(!sbuf->hw.buf) {
-         if(svga_buffer_create_hw_storage(ss, sbuf) != PIPE_OK)
-            return NULL;
-         
-         /* Populate the hardware storage if the host surface pre-existed */
-         if(sbuf->host_written) {
-            SVGA3dSurfaceDMAFlags flags;
-            enum pipe_error ret;
-            struct pipe_fence_handle *fence = NULL;
-            
-            assert(sbuf->handle);
-
-            SVGA_DBG(DEBUG_DMA|DEBUG_PERF, "dma from sid %p (buffer), bytes %u - %u\n", 
-                     sbuf->handle, 0, sbuf->base.size);
-
-            memset(&flags, 0, sizeof flags);
-            
-            ret = SVGA3D_BufferDMA(ss->swc,
-                                   sbuf->hw.buf,
-                                   sbuf->handle,
-                                   SVGA3D_READ_HOST_VRAM,
-                                   sbuf->base.size,
-                                   0,
-                                   flags);
-            if(ret != PIPE_OK) {
-               ss->swc->flush(ss->swc, NULL);
-               
-               ret = SVGA3D_BufferDMA(ss->swc,
-                                      sbuf->hw.buf,
-                                      sbuf->handle,
-                                      SVGA3D_READ_HOST_VRAM,
-                                      sbuf->base.size,
-                                      0,
-                                      flags);
-               assert(ret == PIPE_OK);
-            }
-            
-            ss->swc->flush(ss->swc, &fence);
-            sws->fence_finish(sws, fence, 0);
-            sws->fence_reference(sws, &fence, NULL);
-         }
-      }
-         
-      map = sws->buffer_map(sws, sbuf->hw.buf, usage);
+      map = NULL;
    }
 
    if(map) {
@@ -447,8 +467,7 @@ svga_buffer_flush_mapped_range( struct pipe_screen *screen,
    assert(sbuf->map.writing);
    if(sbuf->map.writing) {
       assert(sbuf->map.flush_explicit);
-      if(sbuf->hw.buf)
-         svga_buffer_upload_queue(sbuf, offset, offset + length);
+      svga_buffer_add_range(sbuf, offset, offset + length);
    }
    pipe_mutex_unlock(ss->swc_mutex);
 }
@@ -467,16 +486,15 @@ svga_buffer_unmap( struct pipe_screen *screen,
    if(sbuf->map.count)
       --sbuf->map.count;
 
-   if(sbuf->hw.buf)
-      sws->buffer_unmap(sws, sbuf->hw.buf);
+   if(sbuf->hwbuf)
+      sws->buffer_unmap(sws, sbuf->hwbuf);
 
    if(sbuf->map.writing) {
       if(!sbuf->map.flush_explicit) {
          /* No mapped range was flushed -- flush the whole buffer */
          SVGA_DBG(DEBUG_DMA, "flushing the whole buffer\n");
    
-         if(sbuf->hw.buf)
-            svga_buffer_upload_queue(sbuf, 0, sbuf->base.size);
+         svga_buffer_add_range(sbuf, 0, sbuf->base.size);
       }
       
       sbuf->map.writing = FALSE;
@@ -494,12 +512,15 @@ svga_buffer_destroy( struct pipe_buffer *buf )
 
    assert(!p_atomic_read(&buf->reference.count));
    
-   assert(!sbuf->needs_flush);
+   assert(!sbuf->dma.pending);
 
    if(sbuf->handle)
       svga_buffer_destroy_host_surface(ss, sbuf);
    
-   if(sbuf->hw.buf)
+   if(sbuf->uploaded.buffer)
+      pipe_buffer_reference(&sbuf->uploaded.buffer, NULL);
+
+   if(sbuf->hwbuf)
       svga_buffer_destroy_hw_storage(ss, sbuf);
    
    if(sbuf->swbuf && !sbuf->user)
@@ -596,13 +617,14 @@ svga_screen_init_buffer_functions(struct pipe_screen *screen)
 }
 
 
-/** 
- * Copy the contents of the user buffer / malloc buffer to a hardware buffer.
+/**
+ * Copy the contents of the malloc buffer to a hardware buffer.
  */
 static INLINE enum pipe_error
 svga_buffer_update_hw(struct svga_screen *ss, struct svga_buffer *sbuf)
 {
-   if(!sbuf->hw.buf) {
+   assert(!sbuf->user);
+   if(!sbuf->hwbuf) {
       enum pipe_error ret;
       void *map;
       
@@ -611,20 +633,20 @@ svga_buffer_update_hw(struct svga_screen *ss, struct svga_buffer *sbuf)
          return PIPE_ERROR;
       
       ret = svga_buffer_create_hw_storage(ss, sbuf);
-      assert(ret == PIPE_OK);
       if(ret != PIPE_OK)
          return ret;
 
       pipe_mutex_lock(ss->swc_mutex);
-      map = ss->sws->buffer_map(ss->sws, sbuf->hw.buf, PIPE_BUFFER_USAGE_CPU_WRITE);
+      map = ss->sws->buffer_map(ss->sws, sbuf->hwbuf, PIPE_BUFFER_USAGE_CPU_WRITE);
       assert(map);
       if(!map) {
 	 pipe_mutex_unlock(ss->swc_mutex);
-         return PIPE_ERROR_OUT_OF_MEMORY;
+         svga_buffer_destroy_hw_storage(ss, sbuf);
+         return PIPE_ERROR;
       }
 
       memcpy(map, sbuf->swbuf, sbuf->base.size);
-      ss->sws->buffer_unmap(ss->sws, sbuf->hw.buf);
+      ss->sws->buffer_unmap(ss->sws, sbuf->hwbuf);
 
       /* This user/malloc buffer is now indistinguishable from a gpu buffer */
       assert(!sbuf->map.count);
@@ -636,10 +658,89 @@ svga_buffer_update_hw(struct svga_screen *ss, struct svga_buffer *sbuf)
          sbuf->swbuf = NULL;
       }
       
-      svga_buffer_upload_queue(sbuf, 0, sbuf->base.size);
+      pipe_mutex_unlock(ss->swc_mutex);
    }
    
-   pipe_mutex_unlock(ss->swc_mutex);
+   return PIPE_OK;
+}
+
+
+/**
+ * Upload the buffer to the host in a piecewise fashion.
+ *
+ * Used when the buffer is too big to fit in the GMR aperture.
+ */
+static INLINE enum pipe_error
+svga_buffer_upload_piecewise(struct svga_screen *ss,
+                             struct svga_context *svga,
+                             struct svga_buffer *sbuf)
+{
+   struct svga_winsys_screen *sws = ss->sws;
+   const unsigned alignment = sizeof(void *);
+   const unsigned usage = 0;
+   unsigned i;
+
+   assert(sbuf->map.num_ranges);
+   assert(!sbuf->dma.pending);
+
+   SVGA_DBG(DEBUG_DMA, "dma to sid %p\n", sbuf->handle);
+
+   for (i = 0; i < sbuf->map.num_ranges; ++i) {
+      struct svga_buffer_range *range = &sbuf->map.ranges[i];
+      unsigned offset = range->start;
+      unsigned size = range->end - range->start;
+
+      while (offset < range->end) {
+         struct svga_winsys_buffer *hwbuf;
+         uint8_t *map;
+         enum pipe_error ret;
+
+         if (offset + size > range->end)
+            size = range->end - offset;
+
+         hwbuf = svga_winsys_buffer_create(ss, alignment, usage, size);
+         while (!hwbuf) {
+            size /= 2;
+            if (!size)
+               return PIPE_ERROR_OUT_OF_MEMORY;
+            hwbuf = svga_winsys_buffer_create(ss, alignment, usage, size);
+         }
+
+         SVGA_DBG(DEBUG_DMA, "  bytes %u - %u\n",
+                  offset, offset + size);
+
+         map = sws->buffer_map(sws, hwbuf,
+                               PIPE_BUFFER_USAGE_CPU_WRITE |
+                               PIPE_BUFFER_USAGE_DISCARD);
+         assert(map);
+         if (map) {
+            memcpy(map, sbuf->swbuf, size);
+            sws->buffer_unmap(sws, hwbuf);
+         }
+
+         ret = SVGA3D_BufferDMA(svga->swc,
+                                hwbuf, sbuf->handle,
+                                SVGA3D_WRITE_HOST_VRAM,
+                                size, 0, offset, sbuf->dma.flags);
+         if(ret != PIPE_OK) {
+            svga_context_flush(svga, NULL);
+            ret =  SVGA3D_BufferDMA(svga->swc,
+                                    hwbuf, sbuf->handle,
+                                    SVGA3D_WRITE_HOST_VRAM,
+                                    size, 0, offset, sbuf->dma.flags);
+            assert(ret == PIPE_OK);
+         }
+
+         sbuf->dma.flags.discard = FALSE;
+
+         sws->buffer_destroy(sws, hwbuf);
+
+         offset += size;
+      }
+   }
+
+   sbuf->map.num_ranges = 0;
+
    return PIPE_OK;
 }
 
@@ -659,34 +760,74 @@ svga_buffer_handle(struct svga_context *svga,
    sbuf = svga_buffer(buf);
    
    assert(!sbuf->map.count);
+   assert(!sbuf->user);
    
    if(!sbuf->handle) {
       ret = svga_buffer_create_host_surface(ss, sbuf);
       if(ret != PIPE_OK)
 	 return NULL;
-
-      ret = svga_buffer_update_hw(ss, sbuf);
-      if(ret != PIPE_OK)
-	 return NULL;
    }
 
-   if(!sbuf->needs_flush && sbuf->hw.num_ranges) {
-      /* Queue the buffer for flushing */
-      ret = svga_buffer_upload_command(svga, sbuf);
-      if(ret != PIPE_OK)
-         /* XXX: Should probably have a richer return value */
-         return NULL;
-
-      assert(sbuf->hw.svga == svga);
+   assert(sbuf->handle);
+
+   if (sbuf->map.num_ranges) {
+      if (!sbuf->dma.pending) {
+         /*
+          * No pending DMA upload yet, so insert a DMA upload command now.
+          */
+
+         /*
+          * Migrate the data from swbuf -> hwbuf if necessary.
+          */
+         ret = svga_buffer_update_hw(ss, sbuf);
+         if (ret == PIPE_OK) {
+            /*
+             * Queue a dma command.
+             */
+
+            ret = svga_buffer_upload_command(svga, sbuf);
+            if (ret == PIPE_ERROR_OUT_OF_MEMORY) {
+               svga_context_flush(svga, NULL);
+               ret = svga_buffer_upload_command(svga, sbuf);
+               assert(ret == PIPE_OK);
+            }
+            if (ret == PIPE_OK) {
+               sbuf->dma.pending = TRUE;
+               assert(!sbuf->head.prev && !sbuf->head.next);
+               LIST_ADDTAIL(&sbuf->head, &svga->dirty_buffers);
+            }
+         }
+         else if (ret == PIPE_ERROR_OUT_OF_MEMORY) {
+            /*
+             * The buffer is too big to fit in the GMR aperture, so break it in
+             * smaller pieces.
+             */
+            ret = svga_buffer_upload_piecewise(ss, svga, sbuf);
+         }
 
-      sbuf->needs_flush = TRUE;
-      assert(!sbuf->head.prev && !sbuf->head.next);
-      LIST_ADDTAIL(&sbuf->head, &svga->dirty_buffers);
+         if (ret != PIPE_OK) {
+            /*
+             * Something unexpected happened above. There is very little that
+             * we can do other than proceeding while ignoring the dirty ranges.
+             */
+            assert(0);
+            sbuf->map.num_ranges = 0;
+         }
+      }
+      else {
+         /*
+          * There a pending dma already. Make sure it is from this context.
+          */
+         assert(sbuf->dma.svga == svga);
+      }
    }
 
+   assert(!sbuf->map.num_ranges || sbuf->dma.pending);
+
    return sbuf->handle;
 }
 
+
 struct pipe_buffer *
 svga_screen_buffer_wrap_surface(struct pipe_screen *screen,
 				enum SVGA3dSurfaceFormat format,
@@ -739,7 +880,7 @@ svga_context_flush_buffers(struct svga_context *svga)
       sbuf = LIST_ENTRY(struct svga_buffer, curr, head);
 
       assert(p_atomic_read(&sbuf->base.reference.count) != 0);
-      assert(sbuf->needs_flush);
+      assert(sbuf->dma.pending);
       
       svga_buffer_upload_flush(svga, sbuf);
 
diff --git a/src/gallium/drivers/svga/svga_screen_buffer.h b/src/gallium/drivers/svga/svga_screen_buffer.h
index 448ac107c7..8c862fa62d 100644
--- a/src/gallium/drivers/svga/svga_screen_buffer.h
+++ b/src/gallium/drivers/svga/svga_screen_buffer.h
@@ -57,35 +57,6 @@ struct svga_buffer_range
 
 
 /**
- * Describe a
- *
- * This holds the information to emit a SVGA3dCmdSurfaceDMA.
- */
-struct svga_buffer_upload
-{
-   /**
-    * Guest memory region.
-    */
-   struct svga_winsys_buffer *buf;
-
-   struct svga_buffer_range ranges[SVGA_BUFFER_MAX_RANGES];
-   unsigned num_ranges;
-
-   SVGA3dSurfaceDMAFlags flags;
-
-   /**
-    * Pointer to the DMA copy box *inside* the command buffer.
-    */
-   SVGA3dCopyBox *boxes;
-
-   /**
-    * Context that has the pending DMA to this buffer.
-    */
-   struct svga_context *svga;
-};
-
-
-/**
  * SVGA pipe buffer.
  */
 struct svga_buffer 
@@ -111,14 +82,6 @@ struct svga_buffer
    boolean user;
    
    /**
-    * DMA'ble memory.
-    * 
-    * A piece of GMR memory. It is created when mapping the buffer, and will be
-    * used to upload/download vertex data from the host.
-    */
-   struct svga_buffer_upload hw;
-
-   /**
     * Creation key for the host surface handle.
     * 
     * This structure describes all the host surface characteristics so that it 
@@ -134,19 +97,94 @@ struct svga_buffer
     * trying to bind
     */
    struct svga_winsys_surface *handle;
-   
+
    /**
-    * Whether the host has been ever written.
+    * Information about ongoing and past map operations.
     */
-   boolean host_written;
-
    struct {
+      /**
+       * Number of concurrent mappings.
+       *
+       * XXX: It is impossible to guarantee concurrent maps work in all
+       * circumstances -- pipe_buffers really need transfer objects too.
+       */
       unsigned count;
+
+      /**
+       * Whether this buffer is currently mapped for writing.
+       */
       boolean writing;
+
+      /**
+       * Whether the application will tell us explicity which ranges it touched
+       * or not.
+       */
       boolean flush_explicit;
+
+      /**
+       * Dirty ranges.
+       *
+       * Ranges that were touched by the application and need to be uploaded to
+       * the host.
+       *
+       * This information will be copied into dma.boxes, when emiting the
+       * SVGA3dCmdSurfaceDMA command.
+       */
+      struct svga_buffer_range ranges[SVGA_BUFFER_MAX_RANGES];
+      unsigned num_ranges;
    } map;
-   
-   boolean needs_flush;
+
+   /**
+    * Information about uploaded version of user buffers.
+    */
+   struct {
+      struct pipe_buffer *buffer;
+
+      /**
+       * We combine multiple user buffers into the same hardware buffer. This
+       * is the relative offset within that buffer.
+       */
+      unsigned offset;
+   } uploaded;
+
+   /**
+    * DMA'ble memory.
+    *
+    * A piece of GMR memory, with the same size of the buffer. It is created
+    * when mapping the buffer, and will be used to upload vertex data to the
+    * host.
+    */
+   struct svga_winsys_buffer *hwbuf;
+
+   /**
+    * Information about pending DMA uploads.
+    *
+    */
+   struct {
+      /**
+       * Whether this buffer has an unfinished DMA upload command.
+       *
+       * If not set then the rest of the information is null.
+       */
+      boolean pending;
+
+      SVGA3dSurfaceDMAFlags flags;
+
+      /**
+       * Pointer to the DMA copy box *inside* the command buffer.
+       */
+      SVGA3dCopyBox *boxes;
+
+      /**
+       * Context that has the pending DMA to this buffer.
+       */
+      struct svga_context *svga;
+   } dma;
+
+   /**
+    * Linked list head, used to gather all buffers with pending dma uploads on
+    * a context. It is only valid if the dma.pending is set above.
+    */
    struct list_head head;
 };
 
@@ -176,6 +214,16 @@ svga_buffer_is_user_buffer( struct pipe_buffer *buffer )
 void
 svga_screen_init_buffer_functions(struct pipe_screen *screen);
 
+
+/**
+ * Get the host surface handle for this buffer.
+ *
+ * This will ensure the host surface is updated, issuing DMAs as needed.
+ *
+ * NOTE: This may insert new commands in the context, so it *must* be called
+ * before reserving command buffer space. And, in order to insert commands
+ * it may need to call svga_context_flush().
+ */
 struct svga_winsys_surface *
 svga_buffer_handle(struct svga_context *svga,
                    struct pipe_buffer *buf);
diff --git a/src/gallium/drivers/svga/svga_state_vdecl.c b/src/gallium/drivers/svga/svga_state_vdecl.c
index d1066ce13b..ded903170b 100644
--- a/src/gallium/drivers/svga/svga_state_vdecl.c
+++ b/src/gallium/drivers/svga/svga_state_vdecl.c
@@ -54,33 +54,30 @@ upload_user_buffers( struct svga_context *svga )
    {
       if (svga_buffer_is_user_buffer(svga->curr.vb[i].buffer))
       {
-         struct pipe_buffer *upload_buffer = NULL;
-         unsigned offset = /*svga->curr.vb[i].buffer_offset*/ 0;
-         unsigned size = svga->curr.vb[i].buffer->size /*- offset*/;
-         unsigned upload_offset;
-
-         ret = u_upload_buffer( svga->upload_vb,
-                                offset,
-                                size,
-                                svga->curr.vb[i].buffer,
-                                &upload_offset,
-                                &upload_buffer );
-         if (ret)
-            return ret;
-
-         if (0)
-            debug_printf("%s: %d: orig buf %p upl buf %p ofs %d sz %d\n", 
-                         __FUNCTION__, 
-                         i,
-                         svga->curr.vb[i].buffer,
-                         upload_buffer, upload_offset, size);
-
-         /* Make sure we release the old buffer and end up with the
-          * correct refcount on the uploaded buffer.
-          */
-         pipe_buffer_reference( &svga->curr.vb[i].buffer, NULL );
-         svga->curr.vb[i].buffer = upload_buffer;
-         svga->curr.vb[i].buffer_offset = upload_offset;
+         struct svga_buffer *buffer = svga_buffer(svga->curr.vb[i].buffer);
+
+         if (!buffer->uploaded.buffer) {
+            ret = u_upload_buffer( svga->upload_vb,
+                                   0,
+                                   buffer->base.size,
+                                   &buffer->base,
+                                   &buffer->uploaded.offset,
+                                   &buffer->uploaded.buffer );
+            if (ret)
+               return ret;
+
+            if (0)
+               debug_printf("%s: %d: orig buf %p upl buf %p ofs %d sz %d\n",
+                            __FUNCTION__,
+                            i,
+                            buffer,
+                            buffer->uploaded.buffer,
+                            buffer->uploaded.offset,
+                            buffer->base.size);
+         }
+
+         pipe_buffer_reference( &svga->curr.vb[i].buffer, buffer->uploaded.buffer );
+         svga->curr.vb[i].buffer_offset = buffer->uploaded.offset;
       }
    }
 
diff --git a/src/gallium/drivers/svga/svga_tgsi_emit.h b/src/gallium/drivers/svga/svga_tgsi_emit.h
index e8f75485d5..48eced2ece 100644
--- a/src/gallium/drivers/svga/svga_tgsi_emit.h
+++ b/src/gallium/drivers/svga/svga_tgsi_emit.h
@@ -138,6 +138,7 @@ static INLINE boolean emit_dst( struct svga_shader_emitter *emit,
                          SVGA3dShaderDestToken dest )
 {
    assert(dest.reserved0);
+   assert(dest.mask);
    return svga_shader_emit_dword( emit, dest.value );
 }
 
@@ -267,6 +268,7 @@ static INLINE SVGA3dShaderDestToken
 writemask( SVGA3dShaderDestToken dest,
            unsigned mask )
 {
+   assert(dest.mask & mask);
    dest.mask &= mask;
    return dest;
 }
diff --git a/src/gallium/drivers/svga/svga_tgsi_insn.c b/src/gallium/drivers/svga/svga_tgsi_insn.c
index 87aed39f78..3d4f56a67b 100644
--- a/src/gallium/drivers/svga/svga_tgsi_insn.c
+++ b/src/gallium/drivers/svga/svga_tgsi_insn.c
@@ -112,6 +112,7 @@ translate_dst_register( struct svga_shader_emitter *emit,
    }
 
    dest.mask = reg->Register.WriteMask;
+   assert(dest.mask);
 
    if (insn->Instruction.Saturate) 
       dest.dstMod = SVGA3DDSTMOD_SATURATE;
@@ -1410,34 +1411,42 @@ static boolean emit_tex(struct svga_shader_emitter *emit,
 
 
    if (compare) {
-      SVGA3dShaderDestToken src0_zdivw = get_temp( emit );
-      struct src_register tex_src_x = scalar(src(tex_result), TGSI_SWIZZLE_Y);
-      struct src_register one =
-         scalar( get_zero_immediate( emit ), TGSI_SWIZZLE_W );
-
-      /* Divide texcoord R by Q */
-      if (!submit_op1( emit, inst_token( SVGA3DOP_RCP ),
-                       src0_zdivw,
-                       scalar(src0, TGSI_SWIZZLE_W) ))
-         return FALSE;
+      if (dst.mask & TGSI_WRITEMASK_XYZ) {
+         SVGA3dShaderDestToken src0_zdivw = get_temp( emit );
+         struct src_register tex_src_x = scalar(src(tex_result), TGSI_SWIZZLE_Y);
+
+         /* Divide texcoord R by Q */
+         if (!submit_op1( emit, inst_token( SVGA3DOP_RCP ),
+                          writemask(src0_zdivw, TGSI_WRITEMASK_X),
+                          scalar(src0, TGSI_SWIZZLE_W) ))
+            return FALSE;
 
-      if (!submit_op2( emit, inst_token( SVGA3DOP_MUL ),
-                       src0_zdivw,
-                       scalar(src0, TGSI_SWIZZLE_Z),
-                       src(src0_zdivw) ))
-         return FALSE;
+         if (!submit_op2( emit, inst_token( SVGA3DOP_MUL ),
+                          writemask(src0_zdivw, TGSI_WRITEMASK_X),
+                          scalar(src0, TGSI_SWIZZLE_Z),
+                          scalar(src(src0_zdivw), TGSI_SWIZZLE_X) ))
+            return FALSE;
 
-      if (!emit_select(
-             emit,
-             emit->key.fkey.tex[src1.base.num].compare_func,
-             dst,
-             src(src0_zdivw),
-             tex_src_x))
-         return FALSE;
+         if (!emit_select(
+                emit,
+                emit->key.fkey.tex[src1.base.num].compare_func,
+                writemask( dst, TGSI_WRITEMASK_XYZ ),
+                scalar(src(src0_zdivw), TGSI_SWIZZLE_X),
+                tex_src_x))
+            return FALSE;
+      }
 
-      return submit_op1( emit, inst_token( SVGA3DOP_MOV ),
-                         writemask( dst, TGSI_WRITEMASK_W),
-                         one );
+      if (dst.mask & TGSI_WRITEMASK_W) {
+         struct src_register one =
+            scalar( get_zero_immediate( emit ), TGSI_SWIZZLE_W );
+
+        if (!submit_op1( emit, inst_token( SVGA3DOP_MOV ),
+                         writemask( dst, TGSI_WRITEMASK_W ),
+                         one ))
+           return FALSE;
+      }
+
+      return TRUE;
    }
    else if (!emit->use_sm30 && dst.mask != TGSI_WRITEMASK_XYZW) 
    {
@@ -1827,13 +1836,13 @@ static boolean emit_exp(struct svga_shader_emitter *emit,
     */
    if (dst.mask & TGSI_WRITEMASK_X) {
       if (!submit_op2( emit, inst_token( SVGA3DOP_ADD ),
-                       writemask( dst, dst.mask & TGSI_WRITEMASK_X ),
+                       writemask( dst, TGSI_WRITEMASK_X ),
                        src0,
                        scalar( negate( src( fraction ) ), TGSI_SWIZZLE_Y ) ) )
          return FALSE;
 
       if (!submit_op1( emit, inst_token( SVGA3DOP_EXP ),
-                       writemask( dst, dst.mask & TGSI_WRITEMASK_X ),
+                       writemask( dst, TGSI_WRITEMASK_X ),
                        scalar( src( dst ), TGSI_SWIZZLE_X ) ) )
          return FALSE;
 
@@ -1845,7 +1854,7 @@ static boolean emit_exp(struct svga_shader_emitter *emit,
     */
    if (dst.mask & TGSI_WRITEMASK_Z) {
       if (!submit_op1( emit, inst_token( SVGA3DOP_EXPP ),
-                       writemask( dst, dst.mask & TGSI_WRITEMASK_Z ),
+                       writemask( dst, TGSI_WRITEMASK_Z ),
                        src0 ) )
          return FALSE;
    }
diff --git a/src/gallium/drivers/trace/tr_drm.c b/src/gallium/drivers/trace/tr_drm.c
index b8adde77f1..2b4915003e 100644
--- a/src/gallium/drivers/trace/tr_drm.c
+++ b/src/gallium/drivers/trace/tr_drm.c
@@ -28,11 +28,11 @@
 #include "state_tracker/drm_api.h"
 
 #include "util/u_memory.h"
-#include "trace/tr_drm.h"
-#include "trace/tr_screen.h"
-#include "trace/tr_context.h"
-#include "trace/tr_buffer.h"
-#include "trace/tr_texture.h"
+#include "tr_drm.h"
+#include "tr_screen.h"
+#include "tr_context.h"
+#include "tr_buffer.h"
+#include "tr_texture.h"
 
 struct trace_drm_api
 {