140 files changed, 3452 insertions, 4018 deletions
diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index d5f5c7bbba..aa29dcb394 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -358,6 +358,7 @@ struct cell_spu_function_info
 
 
 /** This is the object passed to spe_create_thread() */
+PIPE_ALIGN_TYPE(16,
 struct cell_init_info
 {
    unsigned id;
@@ -370,7 +371,7 @@ struct cell_init_info
    uint *buffer_status;  /**< points at cell_context->buffer_status */
 
    struct cell_spu_function_info *spu_functions;
-} ALIGN16_ATTRIB;
+});
 
 
 #endif /* CELL_COMMON_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index 5c3188e7f9..3fb6a3227c 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -89,7 +89,7 @@ struct cell_buffer_node;
  */
 struct cell_buffer_list
 {
-   struct cell_fence fence ALIGN16_ATTRIB;
+   PIPE_ALIGN_VAR(16) struct cell_fence fence;
    struct cell_buffer_node *head;
 };
 
@@ -150,18 +150,18 @@ struct cell_context
    /** Mapped constant buffers */
    void *mapped_constants[PIPE_SHADER_TYPES];
 
-   struct cell_spu_function_info spu_functions ALIGN16_ATTRIB;
+   PIPE_ALIGN_VAR(16) struct cell_spu_function_info spu_functions;
 
    uint num_cells, num_spus;
 
    /** Buffers for command batches, vertex/index data */
    uint buffer_size[CELL_NUM_BUFFERS];
-   ubyte buffer[CELL_NUM_BUFFERS][CELL_BUFFER_SIZE] ALIGN16_ATTRIB;
+   PIPE_ALIGN_VAR(16) ubyte buffer[CELL_NUM_BUFFERS][CELL_BUFFER_SIZE];
 
    int cur_batch;  /**< which buffer is being filled w/ commands */
 
    /** [4] to ensure 16-byte alignment for each status word */
-   uint buffer_status[CELL_MAX_SPUS][CELL_NUM_BUFFERS][4] ALIGN16_ATTRIB;
+   PIPE_ALIGN_VAR(16) uint buffer_status[CELL_MAX_SPUS][CELL_NUM_BUFFERS][4];
 
 
    /** Associated with each command/batch buffer is a list of pipe_buffers
diff --git a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
index 01bea0f8cc..3fa8b975d3 100644
--- a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
+++ b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
@@ -85,7 +85,7 @@ cell_unmap_constant_buffers(struct cell_context *sp)
  *
  * XXX should the element buffer be specified/bound with a separate function?
  */
-static boolean
+static void
 cell_draw_range_elements(struct pipe_context *pipe,
                          struct pipe_buffer *indexBuffer,
                          unsigned indexSize,
@@ -145,29 +145,27 @@ cell_draw_range_elements(struct pipe_context *pipe,
 
    /* Note: leave drawing surfaces mapped */
    cell_unmap_constant_buffers(sp);
-
-   return TRUE;
 }
 
 
-static boolean
+static void
 cell_draw_elements(struct pipe_context *pipe,
                    struct pipe_buffer *indexBuffer,
                    unsigned indexSize,
                    unsigned mode, unsigned start, unsigned count)
 {
-   return cell_draw_range_elements( pipe, indexBuffer,
-                                    indexSize,
-                                    0, 0xffffffff,
-                                    mode, start, count );
+   cell_draw_range_elements( pipe, indexBuffer,
+                             indexSize,
+                             0, 0xffffffff,
+                             mode, start, count );
 }
 
 
-static boolean
+static void
 cell_draw_arrays(struct pipe_context *pipe, unsigned mode,
                      unsigned start, unsigned count)
 {
-   return cell_draw_elements(pipe, NULL, 0, mode, start, count);
+   cell_draw_elements(pipe, NULL, 0, mode, start, count);
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index 5c0179d954..55bd85bde2 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -53,8 +53,7 @@ struct spu_vs_context draw;
 /**
  * Buffers containing dynamically generated SPU code:
  */
-static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS]
-    ALIGN16_ATTRIB;
+PIPE_ALIGN_VAR(16) static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS];
 
 
 
@@ -405,8 +404,6 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
    case PIPE_TEX_FILTER_LINEAR:
       spu.min_sample_texture_2d[unit] = sample_texture_2d_bilinear;
       break;
-   case PIPE_TEX_FILTER_ANISO:
-      /* fall-through, for now */
    case PIPE_TEX_FILTER_NEAREST:
       spu.min_sample_texture_2d[unit] = sample_texture_2d_nearest;
       break;
@@ -418,8 +415,6 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
    case PIPE_TEX_FILTER_LINEAR:
       spu.mag_sample_texture_2d[unit] = sample_texture_2d_bilinear;
       break;
-   case PIPE_TEX_FILTER_ANISO:
-      /* fall-through, for now */
    case PIPE_TEX_FILTER_NEAREST:
       spu.mag_sample_texture_2d[unit] = sample_texture_2d_nearest;
       break;
@@ -547,7 +542,7 @@ cmd_batch(uint opcode)
 {
    const uint buf = (opcode >> 8) & 0xff;
    uint size = (opcode >> 16);
-   qword buffer[CELL_BUFFER_SIZE / 16] ALIGN16_ATTRIB;
+   PIPE_ALIGN_VAR(16) qword buffer[CELL_BUFFER_SIZE / 16];
    const unsigned usize = ROUNDUP16(size) / sizeof(buffer[0]);
    uint pos;
 
diff --git a/src/gallium/drivers/cell/spu/spu_exec.c b/src/gallium/drivers/cell/spu/spu_exec.c
index d86d8e09a5..d2166a4901 100644
--- a/src/gallium/drivers/cell/spu/spu_exec.c
+++ b/src/gallium/drivers/cell/spu/spu_exec.c
@@ -1839,10 +1839,11 @@ spu_exec_machine_run( struct spu_exec_machine *mach )
    /* execute declarations (interpolants) */
    if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) {
       for (i = 0; i < mach->NumDeclarations; i++) {
+         PIPE_ALIGN_VAR(16)
          union {
             struct tgsi_full_declaration decl;
             qword buffer[ROUNDUP16(sizeof(struct tgsi_full_declaration)) / 16];
-         } d ALIGN16_ATTRIB;
+         } d;
          unsigned ea = (unsigned) (mach->Declarations + pc);
 
          spu_dcache_fetch_unaligned(d.buffer, ea, sizeof(d.decl));
@@ -1853,10 +1854,11 @@ spu_exec_machine_run( struct spu_exec_machine *mach )
 
    /* execute instructions, until pc is set to -1 */
    while (pc != -1) {
+      PIPE_ALIGN_VAR(16)
       union {
          struct tgsi_full_instruction inst;
          qword buffer[ROUNDUP16(sizeof(struct tgsi_full_instruction)) / 16];
-      } i ALIGN16_ATTRIB;
+      } i;
       unsigned ea = (unsigned) (mach->Instructions + pc);
 
       spu_dcache_fetch_unaligned(i.buffer, ea, sizeof(i.inst));
diff --git a/src/gallium/drivers/cell/spu/spu_exec.h b/src/gallium/drivers/cell/spu/spu_exec.h
index 8605679940..0ca92af248 100644
--- a/src/gallium/drivers/cell/spu/spu_exec.h
+++ b/src/gallium/drivers/cell/spu/spu_exec.h
@@ -98,9 +98,9 @@ struct spu_exec_machine
     * 4  internal temporaries
     * 1  address
     */
+   PIPE_ALIGN_VAR(16)
    struct spu_exec_vector       Temps[TGSI_EXEC_NUM_TEMPS 
-                                      + TGSI_EXEC_NUM_TEMP_EXTRAS + 1]
-       ALIGN16_ATTRIB;
+                                      + TGSI_EXEC_NUM_TEMP_EXTRAS + 1];
 
    struct spu_exec_vector       *Addrs;
 
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index ff3d609d25..98919c43ff 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -144,7 +144,7 @@ export_func(struct cell_spu_function_info *spu_functions,
 void
 return_function_info(void)
 {
-   struct cell_spu_function_info funcs ALIGN16_ATTRIB;
+   PIPE_ALIGN_VAR(16) struct cell_spu_function_info funcs;
    int tag = TAG_MISC;
 
    ASSERT(sizeof(funcs) == 256); /* must be multiple of 16 bytes */
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 33767e7c51..b18f4c22ef 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -93,6 +93,7 @@ typedef vector unsigned int (*spu_fragment_program_func)(vector float *inputs,
                                                          vector float *constants);
 
 
+PIPE_ALIGN_TYPE(16,
 struct spu_framebuffer
 {
    void *color_start;              /**< addr of color surface in main memory */
@@ -107,10 +108,11 @@ struct spu_framebuffer
 
    uint zsize;                     /**< 0, 2 or 4 bytes per Z */
    float zscale;                   /**< 65535.0, 2^24-1 or 2^32-1 */
-} ALIGN16_ATTRIB;
+});
 
 
 /** per-texture level info */
+PIPE_ALIGN_TYPE(16,
 struct spu_texture_level
 {
    void *start;
@@ -123,20 +125,22 @@ struct spu_texture_level
    vector signed int mask_s, mask_t, mask_r;
    /** texcoord clamp limits */
    vector signed int max_s, max_t, max_r;
-} ALIGN16_ATTRIB;
+});
 
 
+PIPE_ALIGN_TYPE(16,
 struct spu_texture
 {
    struct spu_texture_level level[CELL_MAX_TEXTURE_LEVELS];
    uint max_level;
    uint target;  /**< PIPE_TEXTURE_x */
-} ALIGN16_ATTRIB;
+});
 
 
 /**
  * All SPU global/context state will be in a singleton object of this type:
  */
+PIPE_ALIGN_TYPE(16,
 struct spu_global
 {
    /** One-time init/constant info */
@@ -155,8 +159,8 @@ struct spu_global
    struct vertex_info vertex_info;
 
    /** Current color and Z tiles */
-   tile_t ctile ALIGN16_ATTRIB;
-   tile_t ztile ALIGN16_ATTRIB;
+   PIPE_ALIGN_VAR(16) tile_t ctile;
+   PIPE_ALIGN_VAR(16) tile_t ztile;
 
    /** Read depth/stencil tiles? */
    boolean read_depth_stencil;
@@ -165,8 +169,8 @@ struct spu_global
    ubyte cur_ctile_status, cur_ztile_status;
 
    /** Status of all tiles in framebuffer */
-   ubyte ctile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
-   ubyte ztile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
+   PIPE_ALIGN_VAR(16) ubyte ctile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE];
+   PIPE_ALIGN_VAR(16) ubyte ztile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE];
 
    /** Current fragment ops machine code, at 8-byte boundary */
    uint *fragment_ops_code;
@@ -175,7 +179,7 @@ struct spu_global
    spu_fragment_ops_func fragment_ops[2];
 
    /** Current fragment program machine code, at 8-byte boundary */
-   uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS] ALIGN8_ATTRIB;
+   PIPE_ALIGN_VAR(8) uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS];
    /** Current fragment ops function */
    spu_fragment_program_func fragment_program;
 
@@ -187,7 +191,7 @@ struct spu_global
    /** Fragment program constants */
    vector float constants[4 * CELL_MAX_CONSTANTS];
 
-} ALIGN16_ATTRIB;
+});
 
 
 extern struct spu_global spu;
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index 5ffb7073ab..14987e3c3a 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -169,7 +169,7 @@ void
 cmd_render(const struct cell_command_render *render, uint *pos_incr)
 {
    /* we'll DMA into these buffers */
-   ubyte vertex_data[CELL_BUFFER_SIZE] ALIGN16_ATTRIB;
+   PIPE_ALIGN_VAR(16) ubyte vertex_data[CELL_BUFFER_SIZE];
    const uint vertex_size = render->vertex_size; /* in bytes */
    /*const*/ uint total_vertex_bytes = render->num_verts * vertex_size;
    uint index_bytes;
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
index 03375d84a5..087963960d 100644
--- a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
+++ b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
@@ -43,7 +43,8 @@ typedef void (*spu_fetch_func)(qword *out, const qword *in,
 			       const qword *shuffle_data);
 
 
-static const qword fetch_shuffle_data[5] ALIGN16_ATTRIB = {
+PIPE_ALIGN_VAR(16) static const qword
+fetch_shuffle_data[5] = {
    /* Shuffle used by CVT_64_FLOAT
     */
    {
@@ -110,7 +111,7 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
       unsigned idx;
       const unsigned bytes_per_entry = draw->vertex_fetch.size[attr];
       const unsigned quads_per_entry = (bytes_per_entry + 15) / 16;
-      qword in[2 * 4] ALIGN16_ATTRIB;
+      PIPE_ALIGN_VAR(16) qword in[2 * 4];
 
 
       /* Fetch four attributes for four vertices.  
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_shader.c b/src/gallium/drivers/cell/spu/spu_vertex_shader.c
index fbe5b34d39..3e9804bf8e 100644
--- a/src/gallium/drivers/cell/spu/spu_vertex_shader.c
+++ b/src/gallium/drivers/cell/spu/spu_vertex_shader.c
@@ -107,8 +107,8 @@ run_vertex_program(struct spu_vs_context *draw,
    struct spu_exec_machine *machine = &draw->machine;
    unsigned int j;
 
-   ALIGN16_DECL(struct spu_exec_vector, inputs, PIPE_MAX_ATTRIBS);
-   ALIGN16_DECL(struct spu_exec_vector, outputs, PIPE_MAX_ATTRIBS);
+   PIPE_ALIGN_VAR(16) struct spu_exec_vector inputs[PIPE_MAX_ATTRIBS];
+   PIPE_ALIGN_VAR(16) struct spu_exec_vector outputs[PIPE_MAX_ATTRIBS];
    const float *scale = draw->viewport.scale;
    const float *trans = draw->viewport.translate;
 
@@ -119,8 +119,8 @@ run_vertex_program(struct spu_vs_context *draw,
    ASSERT_ALIGN16(draw->constants);
    machine->Consts = (float (*)[4]) draw->constants;
 
-   machine->Inputs = ALIGN16_ASSIGN(inputs);
-   machine->Outputs = ALIGN16_ASSIGN(outputs);
+   machine->Inputs = inputs;
+   machine->Outputs = outputs;
 
    spu_vertex_fetch( draw, machine, elts, count );
 
@@ -132,8 +132,9 @@ run_vertex_program(struct spu_vs_context *draw,
    for (j = 0; j < count; j++) {
       unsigned slot;
       float x, y, z, w;
+      PIPE_ALIGN_VAR(16)
       unsigned char buffer[sizeof(struct vertex_header)
-          + MAX_VERTEX_SIZE] ALIGN16_ATTRIB;
+          + MAX_VERTEX_SIZE];
       struct vertex_header *const tmpOut =
           (struct vertex_header *) buffer;
       const unsigned vert_size = ROUNDUP16(sizeof(struct vertex_header)
@@ -186,8 +187,8 @@ run_vertex_program(struct spu_vs_context *draw,
 }
 
 
-unsigned char immediates[(sizeof(float) * 4 * TGSI_EXEC_NUM_IMMEDIATES) + 32]
-    ALIGN16_ATTRIB;
+PIPE_ALIGN_VAR(16) unsigned char
+immediates[(sizeof(float) * 4 * TGSI_EXEC_NUM_IMMEDIATES) + 32];
 
 
 void
diff --git a/src/gallium/drivers/failover/fo_context.c b/src/gallium/drivers/failover/fo_context.c
index 37184eac7b..46e4338d98 100644
--- a/src/gallium/drivers/failover/fo_context.c
+++ b/src/gallium/drivers/failover/fo_context.c
@@ -44,11 +44,19 @@ static void failover_destroy( struct pipe_context *pipe )
 }
 
 
+void failover_fail_over( struct failover_context *failover )
+{
+   failover->dirty = TRUE;
+   failover->mode = FO_SW;
+}
+
 
-static boolean failover_draw_elements( struct pipe_context *pipe,
-				       struct pipe_buffer *indexBuffer,
-				       unsigned indexSize,
-				       unsigned prim, unsigned start, unsigned count)
+static void failover_draw_elements( struct pipe_context *pipe,
+                                    struct pipe_buffer *indexBuffer,
+                                    unsigned indexSize,
+                                    unsigned prim, 
+                                    unsigned start, 
+                                    unsigned count)
 {
    struct failover_context *failover = failover_context( pipe );
 
@@ -62,24 +70,22 @@ static boolean failover_draw_elements( struct pipe_context *pipe,
    /* Try hardware:
     */
    if (failover->mode == FO_HW) {
-      if (!failover->hw->draw_elements( failover->hw, 
-					indexBuffer, 
-					indexSize, 
-					prim, 
-					start, 
-					count )) {
-
-	 failover->hw->flush( failover->hw, ~0, NULL );
-	 failover->mode = FO_SW;
-      }
+      failover->hw->draw_elements( failover->hw, 
+                                   indexBuffer, 
+                                   indexSize, 
+                                   prim, 
+                                   start, 
+                                   count );
    }
 
    /* Possibly try software:
     */
    if (failover->mode == FO_SW) {
 
-      if (failover->dirty) 
+      if (failover->dirty) {
+         failover->hw->flush( failover->hw, ~0, NULL );
 	 failover_state_emit( failover );
+      }
 
       failover->sw->draw_elements( failover->sw, 
 				   indexBuffer, 
@@ -94,15 +100,13 @@ static boolean failover_draw_elements( struct pipe_context *pipe,
        */
       failover->sw->flush( failover->sw, ~0, NULL );
    }
-
-   return TRUE;
 }
 
 
-static boolean failover_draw_arrays( struct pipe_context *pipe,
+static void failover_draw_arrays( struct pipe_context *pipe,
 				     unsigned prim, unsigned start, unsigned count)
 {
-   return failover_draw_elements(pipe, NULL, 0, prim, start, count);
+   failover_draw_elements(pipe, NULL, 0, prim, start, count);
 }
 
 static unsigned int
diff --git a/src/gallium/drivers/failover/fo_winsys.h b/src/gallium/drivers/failover/fo_winsys.h
index a8ce997a1f..533122b69d 100644
--- a/src/gallium/drivers/failover/fo_winsys.h
+++ b/src/gallium/drivers/failover/fo_winsys.h
@@ -36,10 +36,13 @@
 
 
 struct pipe_context;
+struct failover_context;
 
 
 struct pipe_context *failover_create( struct pipe_context *hw,
 				      struct pipe_context *sw );
 
 
+void failover_fail_over( struct failover_context *failover );
+
 #endif /* FO_WINSYS_H */
diff --git a/src/gallium/drivers/i915/i915_context.c b/src/gallium/drivers/i915/i915_context.c
index 949f046350..89feeade75 100644
--- a/src/gallium/drivers/i915/i915_context.c
+++ b/src/gallium/drivers/i915/i915_context.c
@@ -45,7 +45,7 @@
  */
 
 
-static boolean
+static void
 i915_draw_range_elements(struct pipe_context *pipe,
                          struct pipe_buffer *indexBuffer,
                          unsigned indexSize,
@@ -106,27 +106,25 @@ i915_draw_range_elements(struct pipe_context *pipe,
       pipe_buffer_unmap(pipe->screen, indexBuffer);
       draw_set_mapped_element_buffer_range(draw, 0, start, start + count - 1, NULL);
    }
-
-   return TRUE;
 }
 
-static boolean
+static void
 i915_draw_elements(struct pipe_context *pipe,
                    struct pipe_buffer *indexBuffer,
                    unsigned indexSize,
                    unsigned prim, unsigned start, unsigned count)
 {
-   return i915_draw_range_elements(pipe, indexBuffer,
-                                   indexSize,
-                                   0, 0xffffffff,
-                                   prim, start, count);
+   i915_draw_range_elements(pipe, indexBuffer,
+                            indexSize,
+                            0, 0xffffffff,
+                            prim, start, count);
 }
 
-static boolean
+static void
 i915_draw_arrays(struct pipe_context *pipe,
                  unsigned prim, unsigned start, unsigned count)
 {
-   return i915_draw_elements(pipe, NULL, 0, prim, start, count);
+   i915_draw_elements(pipe, NULL, 0, prim, start, count);
 }
 
 
diff --git a/src/gallium/drivers/i915/i915_state.c b/src/gallium/drivers/i915/i915_state.c
index 1528afc859..5f5b6f8e18 100644
--- a/src/gallium/drivers/i915/i915_state.c
+++ b/src/gallium/drivers/i915/i915_state.c
@@ -74,8 +74,6 @@ static unsigned translate_img_filter( unsigned filter )
       return FILTER_NEAREST;
    case PIPE_TEX_FILTER_LINEAR:
       return FILTER_LINEAR;
-   case PIPE_TEX_FILTER_ANISO:
-      return FILTER_ANISOTROPIC;
    default:
       assert(0);
       return FILTER_NEAREST;
@@ -221,6 +219,9 @@ i915_create_sampler_state(struct pipe_context *pipe,
    minFilt = translate_img_filter( sampler->min_img_filter );
    magFilt = translate_img_filter( sampler->mag_img_filter );
    
+   if (sampler->max_anisotropy > 1.0)
+      minFilt = magFilt = FILTER_ANISOTROPIC;
+
    if (sampler->max_anisotropy > 2.0) {
       cso->state[0] |= SS2_MAX_ANISO_4;
    }
diff --git a/src/gallium/drivers/i965/brw_draw.c b/src/gallium/drivers/i965/brw_draw.c
index 852fd22982..ea8d39adaf 100644
--- a/src/gallium/drivers/i965/brw_draw.c
+++ b/src/gallium/drivers/i965/brw_draw.c
@@ -176,7 +176,7 @@ try_draw_range_elements(struct brw_context *brw,
 }
 
 
-static boolean
+static void
 brw_draw_range_elements(struct pipe_context *pipe,
 			struct pipe_buffer *index_buffer,
 			unsigned index_size,
@@ -228,29 +228,27 @@ brw_draw_range_elements(struct pipe_context *pipe,
       ret = try_draw_range_elements(brw, index_buffer, hw_prim, start, count );
       assert(ret == 0);
    }
-
-   return TRUE;
 }
 
-static boolean
+static void
 brw_draw_elements(struct pipe_context *pipe,
 		  struct pipe_buffer *index_buffer,
 		  unsigned index_size,
 		  unsigned mode, 
 		  unsigned start, unsigned count)
 {
-   return brw_draw_range_elements( pipe, index_buffer,
-				   index_size,
-				   0, 0xffffffff,
-				   mode, 
-				   start, count );
+   brw_draw_range_elements( pipe, index_buffer,
+                            index_size,
+                            0, 0xffffffff,
+                            mode, 
+                            start, count );
 }
 
-static boolean
+static void
 brw_draw_arrays(struct pipe_context *pipe, unsigned mode,
                      unsigned start, unsigned count)
 {
-   return brw_draw_elements(pipe, NULL, 0, mode, start, count);
+   brw_draw_elements(pipe, NULL, 0, mode, start, count);
 }
 
 
diff --git a/src/gallium/drivers/i965/brw_pipe_sampler.c b/src/gallium/drivers/i965/brw_pipe_sampler.c
index 5ddc63f57e..81712798a5 100644
--- a/src/gallium/drivers/i965/brw_pipe_sampler.c
+++ b/src/gallium/drivers/i965/brw_pipe_sampler.c
@@ -48,8 +48,6 @@ static GLuint translate_img_filter( unsigned filter )
       return BRW_MAPFILTER_NEAREST;
    case PIPE_TEX_FILTER_LINEAR:
       return BRW_MAPFILTER_LINEAR;
-   case PIPE_TEX_FILTER_ANISO:
-      return BRW_MAPFILTER_ANISOTROPIC;
    default:
       assert(0);
       return BRW_MAPFILTER_NEAREST;
diff --git a/src/gallium/drivers/i965/brw_wm_emit.c b/src/gallium/drivers/i965/brw_wm_emit.c
index 7e57d0306b..8f983a60ae 100644
--- a/src/gallium/drivers/i965/brw_wm_emit.c
+++ b/src/gallium/drivers/i965/brw_wm_emit.c
@@ -691,7 +691,7 @@ static void emit_xpd( struct brw_compile *p,
 {
    GLuint i;
 
-   assert(!(mask & BRW_WRITEMASK_W) == BRW_WRITEMASK_X);
+   assert((mask & BRW_WRITEMASK_W) != BRW_WRITEMASK_W);
    
    for (i = 0 ; i < 3; i++) {
       if (mask & (1<<i)) {
diff --git a/src/gallium/drivers/identity/id_context.c b/src/gallium/drivers/identity/id_context.c
index bdbaae5987..9f5b4e6323 100644
--- a/src/gallium/drivers/identity/id_context.c
+++ b/src/gallium/drivers/identity/id_context.c
@@ -45,7 +45,7 @@ identity_destroy(struct pipe_context *_pipe)
    free(id_pipe);
 }
 
-static boolean
+static void
 identity_draw_arrays(struct pipe_context *_pipe,
                      unsigned prim,
                      unsigned start,
@@ -54,13 +54,13 @@ identity_draw_arrays(struct pipe_context *_pipe,
    struct identity_context *id_pipe = identity_context(_pipe);
    struct pipe_context *pipe = id_pipe->pipe;
 
-   return pipe->draw_arrays(pipe,
-                            prim,
-                            start,
-                            count);
+   pipe->draw_arrays(pipe,
+                     prim,
+                     start,
+                     count);
 }
 
-static boolean
+static void
 identity_draw_elements(struct pipe_context *_pipe,
                        struct pipe_buffer *_indexBuffer,
                        unsigned indexSize,
@@ -73,15 +73,15 @@ identity_draw_elements(struct pipe_context *_pipe,
    struct pipe_context *pipe = id_pipe->pipe;
    struct pipe_buffer *indexBuffer = id_buffer->buffer;
 
-   return pipe->draw_elements(pipe,
-                              indexBuffer,
-                              indexSize,
-                              prim,
-                              start,
-                              count);
+   pipe->draw_elements(pipe,
+                       indexBuffer,
+                       indexSize,
+                       prim,
+                       start,
+                       count);
 }
 
-static boolean
+static void
 identity_draw_range_elements(struct pipe_context *_pipe,
                              struct pipe_buffer *_indexBuffer,
                              unsigned indexSize,
@@ -96,14 +96,14 @@ identity_draw_range_elements(struct pipe_context *_pipe,
    struct pipe_context *pipe = id_pipe->pipe;
    struct pipe_buffer *indexBuffer = id_buffer->buffer;
 
-   return pipe->draw_range_elements(pipe,
-                                    indexBuffer,
-                                    indexSize,
-                                    minIndex,
-                                    maxIndex,
-                                    mode,
-                                    start,
-                                    count);
+   pipe->draw_range_elements(pipe,
+                             indexBuffer,
+                             indexSize,
+                             minIndex,
+                             maxIndex,
+                             mode,
+                             start,
+                             count);
 }
 
 static struct pipe_query *
diff --git a/src/gallium/drivers/llvmpipe/Makefile b/src/gallium/drivers/llvmpipe/Makefile
index e038a5229e..7c6e46006b 100644
--- a/src/gallium/drivers/llvmpipe/Makefile
+++ b/src/gallium/drivers/llvmpipe/Makefile
@@ -50,7 +50,6 @@ C_SOURCES = \
 	lp_state_vs.c \
 	lp_surface.c \
 	lp_tex_cache.c \
-	lp_tex_sample_c.c \
 	lp_tex_sample_llvm.c \
 	lp_texture.c \
 	lp_tile_cache.c \
diff --git a/src/gallium/drivers/llvmpipe/README b/src/gallium/drivers/llvmpipe/README
index 0c3f00fd58..72d9f39658 100644
--- a/src/gallium/drivers/llvmpipe/README
+++ b/src/gallium/drivers/llvmpipe/README
@@ -59,27 +59,16 @@ Requirements
    
    See /proc/cpuinfo to know what your CPU supports.
  
- - LLVM 2.5 or greater. LLVM 2.6 is preferred.
+ - LLVM 2.6.
  
-   On Debian based distributions do:
+   For Linux, on a recent Debian based distribution do:
  
      aptitude install llvm-dev
 
-   There is a typo in one of the llvm 2.5 headers, that may cause compilation
-   errors. To fix it apply the change:
-
-     --- /usr/include/llvm-c/Core.h.orig	2009-08-10 15:38:54.000000000 +0100
-     +++ /usr/include/llvm-c/Core.h	2009-08-10 15:38:25.000000000 +0100
-     @@ -831,7 +831,7 @@
-        template<typename T>
-        inline T **unwrap(LLVMValueRef *Vals, unsigned Length) {
-          #if DEBUG
-     -    for (LLVMValueRef *I = Vals, E = Vals + Length; I != E; ++I)
-     +    for (LLVMValueRef *I = Vals, *E = Vals + Length; I != E; ++I)
-            cast<T>(*I);
-          #endif
-          return reinterpret_cast<T**>(Vals);
- 
+   For Windows download pre-built MSVC 9.0 or MinGW binaries from
+   http://people.freedesktop.org/~jrfonseca/llvm/ and set the LLVM environment
+   variable to the extracted path.
+
  - scons (optional)
 
  - udis86, http://udis86.sourceforge.net/ (optional):
@@ -95,9 +84,9 @@ Requirements
 Building
 ========
 
-To build everything invoke scons as:
+To build everything on Linux invoke scons as:
 
-  scons debug=yes statetrackers=mesa drivers=llvmpipe winsys=xlib dri=false -k
+  scons debug=yes statetrackers=mesa drivers=trace,llvmpipe winsys=xlib dri=false
 
 Alternatively, you can build it with GNU make, if you prefer, by invoking it as
 
@@ -105,12 +94,15 @@ Alternatively, you can build it with GNU make, if you prefer, by invoking it as
 
 but the rest of these instructions assume that scons is used.
 
+For Windows, everything is the same except the winsys:
+
+  scons debug=yes statetrackers=mesa drivers=trace,llvmpipe winsys=gdi dri=false
 
 Using
 =====
 
-Building will create a drop-in alternative for libGL.so. To use it set the
-environment variables:
+On Linux, building will create a drop-in alternative for libGL.so. To use it
+set the environment variables:
 
   export LD_LIBRARY_PATH=$PWD/build/linux-x86_64-debug/lib:$LD_LIBRARY_PATH
 
@@ -121,6 +113,11 @@ or
 For performance evaluation pass debug=no to scons, and use the corresponding
 lib directory without the "-debug" suffix.
 
+On Windows, building will create a drop-in alternative for opengl32.dll. To use
+it put it in the same directory as the application. It can also be used by
+replacing the native ICD driver, but it's quite an advanced usage, so if you
+need to ask, don't even try it.
+
 
 Unit testing
 ============
diff --git a/src/gallium/drivers/llvmpipe/SConscript b/src/gallium/drivers/llvmpipe/SConscript
index 3ca676647c..6bb545a501 100644
--- a/src/gallium/drivers/llvmpipe/SConscript
+++ b/src/gallium/drivers/llvmpipe/SConscript
@@ -66,7 +66,6 @@ llvmpipe = env.ConvenienceLibrary(
 		'lp_state_vs.c',
 		'lp_surface.c',
 		'lp_tex_cache.c',
-		'lp_tex_sample_c.c',
 		'lp_tex_sample_llvm.c',
 		'lp_texture.c',
 		'lp_tile_cache.c',
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_misc.cpp b/src/gallium/drivers/llvmpipe/lp_bld_misc.cpp
index d3f78c06d9..6e79438ead 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_misc.cpp
+++ b/src/gallium/drivers/llvmpipe/lp_bld_misc.cpp
@@ -59,3 +59,17 @@ LLVMInitializeNativeTarget(void)
 
 
 #endif
+
+
+/* 
+ * Hack to allow the linking of release LLVM static libraries on a debug build.
+ *
+ * See also:
+ * - http://social.msdn.microsoft.com/Forums/en-US/vclanguage/thread/7234ea2b-0042-42ed-b4e2-5d8644dfb57d
+ */
+#if defined(_MSC_VER) && defined(_DEBUG)
+#include <crtdefs.h>
+extern "C" {
+   _CRTIMP void __cdecl _invalid_parameter_noinfo(void) {}
+}
+#endif
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_sample.c b/src/gallium/drivers/llvmpipe/lp_bld_sample.c
index af70ddc6ab..9003e108c1 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_sample.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_sample.c
@@ -69,8 +69,8 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
    state->min_img_filter    = sampler->min_img_filter;
    state->min_mip_filter    = sampler->min_mip_filter;
    state->mag_img_filter    = sampler->mag_img_filter;
-   if(sampler->compare_mode) {
-      state->compare_mode      = sampler->compare_mode;
+   state->compare_mode      = sampler->compare_mode;
+   if(sampler->compare_mode != PIPE_TEX_COMPARE_NONE) {
       state->compare_func      = sampler->compare_func;
    }
    state->normalized_coords = sampler->normalized_coords;
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_sample_soa.c b/src/gallium/drivers/llvmpipe/lp_bld_sample_soa.c
index 47b68b71e2..5ee8d556a6 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_sample_soa.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_sample_soa.c
@@ -488,7 +488,7 @@ lp_build_sample_compare(struct lp_build_sample_context *bld,
    LLVMValueRef res;
    unsigned chan;
 
-   if(!bld->static_state->compare_mode)
+   if(bld->static_state->compare_mode == PIPE_TEX_COMPARE_NONE)
       return;
 
    /* TODO: Compare before swizzling, to avoid redundant computations */
@@ -577,7 +577,6 @@ lp_build_sample_soa(LLVMBuilderRef builder,
       lp_build_sample_2d_nearest_soa(&bld, s, t, width, height, stride, data_ptr, texel);
       break;
    case PIPE_TEX_FILTER_LINEAR:
-   case PIPE_TEX_FILTER_ANISO:
       if(lp_format_is_rgba8(bld.format_desc))
          lp_build_sample_2d_linear_aos(&bld, s, t, width, height, stride, data_ptr, texel);
       else
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c b/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c
index 61b033c9fc..fb1eda4423 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c
@@ -361,6 +361,9 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
       if (projected)
          coords[i] = lp_build_mul(&bld->base, coords[i], oow);
    }
+   for (i = num_coords; i < 3; i++) {
+      coords[i] = bld->base.undef;
+   }
 
    bld->sampler->emit_fetch_texel(bld->sampler,
                                   bld->base.builder,
diff --git a/src/gallium/drivers/llvmpipe/lp_context.c b/src/gallium/drivers/llvmpipe/lp_context.c
index 37587d4f79..1cc3c9227c 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.c
+++ b/src/gallium/drivers/llvmpipe/lp_context.c
@@ -256,22 +256,6 @@ llvmpipe_create( struct pipe_screen *screen )
       llvmpipe->vertex_tex_cache[i] = lp_create_tex_tile_cache(screen);
 
 
-   /* vertex shader samplers */
-   for (i = 0; i < PIPE_MAX_VERTEX_SAMPLERS; i++) {
-      llvmpipe->tgsi.vert_samplers[i].base.get_samples = lp_get_samples;
-      llvmpipe->tgsi.vert_samplers[i].processor = TGSI_PROCESSOR_VERTEX;
-      llvmpipe->tgsi.vert_samplers[i].cache = llvmpipe->vertex_tex_cache[i];
-      llvmpipe->tgsi.vert_samplers_list[i] = &llvmpipe->tgsi.vert_samplers[i];
-   }
-
-   /* fragment shader samplers */
-   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
-      llvmpipe->tgsi.frag_samplers[i].base.get_samples = lp_get_samples;
-      llvmpipe->tgsi.frag_samplers[i].processor = TGSI_PROCESSOR_FRAGMENT;
-      llvmpipe->tgsi.frag_samplers[i].cache = llvmpipe->tex_cache[i];
-      llvmpipe->tgsi.frag_samplers_list[i] = &llvmpipe->tgsi.frag_samplers[i];
-   }
-
    /*
     * Create drawing context and plug our rendering stage into it.
     */
@@ -279,10 +263,7 @@ llvmpipe_create( struct pipe_screen *screen )
    if (!llvmpipe->draw) 
       goto fail;
 
-   draw_texture_samplers(llvmpipe->draw,
-                         PIPE_MAX_VERTEX_SAMPLERS,
-                         (struct tgsi_sampler **)
-                            llvmpipe->tgsi.vert_samplers_list);
+   /* FIXME: devise alternative to draw_texture_samplers */
 
    if (debug_get_bool_option( "LP_NO_RAST", FALSE ))
       llvmpipe->no_rast = TRUE;
diff --git a/src/gallium/drivers/llvmpipe/lp_context.h b/src/gallium/drivers/llvmpipe/lp_context.h
index cc4d5ad5fd..6411797cf5 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_context.h
@@ -115,14 +115,6 @@ struct llvmpipe_context {
 
    unsigned line_stipple_counter;
 
-   /** TGSI exec things */
-   struct {
-      struct lp_shader_sampler vert_samplers[PIPE_MAX_SAMPLERS];
-      struct lp_shader_sampler *vert_samplers_list[PIPE_MAX_SAMPLERS];
-      struct lp_shader_sampler frag_samplers[PIPE_MAX_SAMPLERS];
-      struct lp_shader_sampler *frag_samplers_list[PIPE_MAX_SAMPLERS];
-   } tgsi;
-
    /** The primitive drawing context */
    struct draw_context *draw;
 
diff --git a/src/gallium/drivers/llvmpipe/lp_draw_arrays.c b/src/gallium/drivers/llvmpipe/lp_draw_arrays.c
index a96c2cad9d..c152b4413f 100644
--- a/src/gallium/drivers/llvmpipe/lp_draw_arrays.c
+++ b/src/gallium/drivers/llvmpipe/lp_draw_arrays.c
@@ -45,11 +45,11 @@
 
 
 
-boolean
+void
 llvmpipe_draw_arrays(struct pipe_context *pipe, unsigned mode,
                      unsigned start, unsigned count)
 {
-   return llvmpipe_draw_elements(pipe, NULL, 0, mode, start, count);
+   llvmpipe_draw_elements(pipe, NULL, 0, mode, start, count);
 }
 
 
@@ -58,7 +58,7 @@ llvmpipe_draw_arrays(struct pipe_context *pipe, unsigned mode,
  * Basically, map the vertex buffers (and drawing surfaces), then hand off
  * the drawing to the 'draw' module.
  */
-boolean
+void
 llvmpipe_draw_range_elements(struct pipe_context *pipe,
                              struct pipe_buffer *indexBuffer,
                              unsigned indexSize,
@@ -122,20 +122,18 @@ llvmpipe_draw_range_elements(struct pipe_context *pipe,
    /* Note: leave drawing surfaces mapped */
 
    lp->dirty_render_cache = TRUE;
-   
-   return TRUE;
 }
 
 
-boolean
+void
 llvmpipe_draw_elements(struct pipe_context *pipe,
                        struct pipe_buffer *indexBuffer,
                        unsigned indexSize,
                        unsigned mode, unsigned start, unsigned count)
 {
-   return llvmpipe_draw_range_elements( pipe, indexBuffer,
-                                        indexSize,
-                                        0, 0xffffffff,
-                                        mode, start, count );
+   llvmpipe_draw_range_elements( pipe, indexBuffer,
+                                 indexSize,
+                                 0, 0xffffffff,
+                                 mode, start, count );
 }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.c b/src/gallium/drivers/llvmpipe/lp_jit.c
index bce3baec16..4ef0783f3e 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.c
+++ b/src/gallium/drivers/llvmpipe/lp_jit.c
@@ -79,25 +79,22 @@ lp_jit_init_globals(struct llvmpipe_screen *screen)
 
    /* struct lp_jit_context */
    {
-      LLVMTypeRef elem_types[5];
+      LLVMTypeRef elem_types[4];
       LLVMTypeRef context_type;
 
       elem_types[0] = LLVMPointerType(LLVMFloatType(), 0); /* constants */
-      elem_types[1] = LLVMPointerType(LLVMInt8Type(), 0);  /* samplers */
-      elem_types[2] = LLVMFloatType();                     /* alpha_ref_value */
-      elem_types[3] = LLVMPointerType(LLVMInt8Type(), 0);  /* blend_color */
-      elem_types[4] = LLVMArrayType(texture_type, PIPE_MAX_SAMPLERS); /* textures */
+      elem_types[1] = LLVMFloatType();                     /* alpha_ref_value */
+      elem_types[2] = LLVMPointerType(LLVMInt8Type(), 0);  /* blend_color */
+      elem_types[3] = LLVMArrayType(texture_type, PIPE_MAX_SAMPLERS); /* textures */
 
       context_type = LLVMStructType(elem_types, Elements(elem_types), 0);
 
       LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, constants,
                              screen->target, context_type, 0);
-      LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, samplers,
-                             screen->target, context_type, 1);
       LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, alpha_ref_value,
-                             screen->target, context_type, 2);
+                             screen->target, context_type, 1);
       LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, blend_color,
-                             screen->target, context_type, 3);
+                             screen->target, context_type, 2);
       LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, textures,
                              screen->target, context_type,
                              LP_JIT_CONTEXT_TEXTURES_INDEX);
@@ -109,24 +106,6 @@ lp_jit_init_globals(struct llvmpipe_screen *screen)
       screen->context_ptr_type = LLVMPointerType(context_type, 0);
    }
 
-   /* fetch_texel
-    */
-   {
-      LLVMTypeRef ret_type;
-      LLVMTypeRef arg_types[3];
-      LLVMValueRef fetch_texel;
-
-      ret_type = LLVMVoidType();
-      arg_types[0] = LLVMPointerType(LLVMInt8Type(), 0);  /* samplers */
-      arg_types[1] = LLVMInt32Type();                     /* unit */
-      arg_types[2] = LLVMPointerType(LLVMVectorType(LLVMFloatType(), 4), 0); /* store */
-
-      fetch_texel = lp_declare_intrinsic(screen->module, "fetch_texel",
-                                         ret_type, arg_types, Elements(arg_types));
-
-      LLVMAddGlobalMapping(screen->engine, fetch_texel, lp_fetch_texel_soa);
-   }
-
 #ifdef DEBUG
    LLVMDumpModule(screen->module);
 #endif
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.h b/src/gallium/drivers/llvmpipe/lp_jit.h
index 58f716ede2..277b690c02 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.h
+++ b/src/gallium/drivers/llvmpipe/lp_jit.h
@@ -41,7 +41,6 @@
 #include "pipe/p_state.h"
 
 
-struct tgsi_sampler;
 struct llvmpipe_screen;
 
 
@@ -78,8 +77,6 @@ struct lp_jit_context
 {
    const float *constants;
 
-   struct tgsi_sampler **samplers;
-
    float alpha_ref_value;
 
    /* FIXME: store (also?) in floats */
@@ -92,16 +89,13 @@ struct lp_jit_context
 #define lp_jit_context_constants(_builder, _ptr) \
    lp_build_struct_get(_builder, _ptr, 0, "constants")
 
-#define lp_jit_context_samplers(_builder, _ptr) \
-   lp_build_struct_get(_builder, _ptr, 1, "samplers")
-
 #define lp_jit_context_alpha_ref_value(_builder, _ptr) \
-   lp_build_struct_get(_builder, _ptr, 2, "alpha_ref_value")
+   lp_build_struct_get(_builder, _ptr, 1, "alpha_ref_value")
 
 #define lp_jit_context_blend_color(_builder, _ptr) \
-   lp_build_struct_get(_builder, _ptr, 3, "blend_color")
+   lp_build_struct_get(_builder, _ptr, 2, "blend_color")
 
-#define LP_JIT_CONTEXT_TEXTURES_INDEX 4
+#define LP_JIT_CONTEXT_TEXTURES_INDEX 3
 
 #define lp_jit_context_textures(_builder, _ptr) \
    lp_build_struct_get_ptr(_builder, _ptr, LP_JIT_CONTEXT_TEXTURES_INDEX, "textures")
@@ -118,12 +112,6 @@ typedef void
                     void *color,
                     void *depth);
 
-void PIPE_CDECL
-lp_fetch_texel_soa( struct tgsi_sampler **samplers,
-                    uint32_t unit,
-                    float *store );
-
-
 void
 lp_jit_screen_cleanup(struct llvmpipe_screen *screen);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_quad.h b/src/gallium/drivers/llvmpipe/lp_quad.h
index 7eb05de77a..c3a48700a4 100644
--- a/src/gallium/drivers/llvmpipe/lp_quad.h
+++ b/src/gallium/drivers/llvmpipe/lp_quad.h
@@ -31,6 +31,7 @@
 #ifndef LP_QUAD_H
 #define LP_QUAD_H
 
+#include "pipe/p_compiler.h"
 #include "pipe/p_state.h"
 #include "tgsi/tgsi_exec.h"
 
@@ -83,7 +84,7 @@ struct quad_header_inout
 struct quad_header_output
 {
    /** colors in SOA format (rrrr, gggg, bbbb, aaaa) */
-   float ALIGN16_ATTRIB color[PIPE_MAX_COLOR_BUFS][NUM_CHANNELS][QUAD_SIZE];
+   PIPE_ALIGN_VAR(16) float color[PIPE_MAX_COLOR_BUFS][NUM_CHANNELS][QUAD_SIZE];
 };
 
 
@@ -92,9 +93,9 @@ struct quad_header_output
  */
 struct quad_interp_coef
 {
-   float ALIGN16_ATTRIB a0[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
-   float ALIGN16_ATTRIB dadx[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
-   float ALIGN16_ATTRIB dady[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
+   PIPE_ALIGN_VAR(16) float a0[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
+   PIPE_ALIGN_VAR(16) float dadx[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
+   PIPE_ALIGN_VAR(16) float dady[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
 };
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
index b18f17c0cd..0b2d3a2801 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -117,7 +117,7 @@ struct setup_context {
 /**
  * Execute fragment shader for the four fragments in the quad.
  */
-ALIGN_STACK
+PIPE_ALIGN_STACK
 static void
 shade_quads(struct llvmpipe_context *llvmpipe,
             struct quad_header *quads[],
@@ -130,7 +130,7 @@ shade_quads(struct llvmpipe_context *llvmpipe,
    uint8_t *tile;
    uint8_t *color;
    void *depth;
-   uint32_t ALIGN16_ATTRIB mask[4][NUM_CHANNELS];
+   PIPE_ALIGN_VAR(16) uint32_t mask[4][NUM_CHANNELS];
    unsigned chan_index;
    unsigned q;
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state.h b/src/gallium/drivers/llvmpipe/lp_state.h
index 5cee7bf74b..7020da145f 100644
--- a/src/gallium/drivers/llvmpipe/lp_state.h
+++ b/src/gallium/drivers/llvmpipe/lp_state.h
@@ -56,7 +56,6 @@
 #define LP_NEW_QUERY         0x4000
 
 
-struct tgsi_sampler;
 struct vertex_info;
 struct pipe_context;
 struct llvmpipe_context;
@@ -197,14 +196,14 @@ void llvmpipe_update_fs(struct llvmpipe_context *lp);
 void llvmpipe_update_derived( struct llvmpipe_context *llvmpipe );
 
 
-boolean llvmpipe_draw_arrays(struct pipe_context *pipe, unsigned mode,
+void llvmpipe_draw_arrays(struct pipe_context *pipe, unsigned mode,
 			     unsigned start, unsigned count);
 
-boolean llvmpipe_draw_elements(struct pipe_context *pipe,
+void llvmpipe_draw_elements(struct pipe_context *pipe,
 			       struct pipe_buffer *indexBuffer,
 			       unsigned indexSize,
 			       unsigned mode, unsigned start, unsigned count);
-boolean
+void
 llvmpipe_draw_range_elements(struct pipe_context *pipe,
                              struct pipe_buffer *indexBuffer,
                              unsigned indexSize,
diff --git a/src/gallium/drivers/llvmpipe/lp_state_derived.c b/src/gallium/drivers/llvmpipe/lp_state_derived.c
index acfd7be5f7..6c1ef6bc42 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_derived.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_derived.c
@@ -192,36 +192,6 @@ compute_cliprect(struct llvmpipe_context *lp)
 }
 
 
-static void
-update_tgsi_samplers( struct llvmpipe_context *llvmpipe )
-{
-   unsigned i;
-
-   /* vertex shader samplers */
-   for (i = 0; i < PIPE_MAX_VERTEX_SAMPLERS; i++) {
-      llvmpipe->tgsi.vert_samplers[i].sampler = llvmpipe->vertex_samplers[i];
-      llvmpipe->tgsi.vert_samplers[i].texture = llvmpipe->vertex_textures[i];
-      llvmpipe->tgsi.vert_samplers[i].base.get_samples = lp_get_samples;
-   }
-
-   for (i = 0; i < PIPE_MAX_VERTEX_SAMPLERS; i++) {
-      lp_tex_tile_cache_validate_texture( llvmpipe->vertex_tex_cache[i] );
-   }
-
-   /* fragment shader samplers */
-   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
-      llvmpipe->tgsi.frag_samplers[i].sampler = llvmpipe->sampler[i];
-      llvmpipe->tgsi.frag_samplers[i].texture = llvmpipe->texture[i];
-      llvmpipe->tgsi.frag_samplers[i].base.get_samples = lp_get_samples;
-   }
-
-   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
-      lp_tex_tile_cache_validate_texture( llvmpipe->tex_cache[i] );
-   }
-
-   llvmpipe->jit_context.samplers = (struct tgsi_sampler **)llvmpipe->tgsi.frag_samplers_list;
-}
-
 /* Hopefully this will remain quite simple, otherwise need to pull in
  * something like the state tracker mechanism.
  */
@@ -237,8 +207,9 @@ void llvmpipe_update_derived( struct llvmpipe_context *llvmpipe )
    }
       
    if (llvmpipe->dirty & (LP_NEW_SAMPLER |
-                          LP_NEW_TEXTURE))
-      update_tgsi_samplers( llvmpipe );
+                          LP_NEW_TEXTURE)) {
+      /* TODO: sampler/texture changes are ignored here now that update_tgsi_samplers() is gone */
+   }
 
    if (llvmpipe->dirty & (LP_NEW_RASTERIZER |
                           LP_NEW_FS |
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index f2b8c36264..b73ca2d41e 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -453,8 +453,8 @@ generate_fragment(struct llvmpipe_context *lp,
                          debug_dump_tex_mipfilter(key->sampler[i].min_mip_filter, TRUE));
             debug_printf("  .mag_img_filter = %s\n",
                          debug_dump_tex_filter(key->sampler[i].mag_img_filter, TRUE));
-            if(key->sampler[i].compare_mode)
-               debug_printf("  .compare_mode = %s\n", debug_dump_func(key->sampler[i].compare_func, TRUE));
+            if(key->sampler[i].compare_mode != PIPE_TEX_COMPARE_NONE)
+               debug_printf("  .compare_func = %s\n", debug_dump_func(key->sampler[i].compare_func, TRUE));
             debug_printf("  .normalized_coords = %u\n", key->sampler[i].normalized_coords);
             debug_printf("  .prefilter = %u\n", key->sampler[i].prefilter);
          }
@@ -550,13 +550,8 @@ generate_fragment(struct llvmpipe_context *lp,
                             a0_ptr, dadx_ptr, dady_ptr,
                             x0, y0, 2, 0);
 
-#if 0
-   /* C texture sampling */
-   sampler = lp_c_sampler_soa_create(context_ptr);
-#else
    /* code generated texture sampling */
    sampler = lp_llvm_sampler_soa_create(key->sampler, context_ptr);
-#endif
 
    for(i = 0; i < num_fs; ++i) {
       LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
diff --git a/src/gallium/drivers/llvmpipe/lp_test_blend.c b/src/gallium/drivers/llvmpipe/lp_test_blend.c
index 29fff91981..6c29e8d8ac 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_blend.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_blend.c
@@ -462,7 +462,7 @@ compute_blend_ref(const struct pipe_blend_state *blend,
 }
 
 
-ALIGN_STACK
+PIPE_ALIGN_STACK
 static boolean
 test_one(unsigned verbose,
          FILE *fp,
@@ -531,11 +531,11 @@ test_one(unsigned verbose,
    success = TRUE;
    for(i = 0; i < n && success; ++i) {
       if(mode == AoS) {
-         ALIGN16_ATTRIB uint8_t src[LP_NATIVE_VECTOR_WIDTH/8];
-         ALIGN16_ATTRIB uint8_t dst[LP_NATIVE_VECTOR_WIDTH/8];
-         ALIGN16_ATTRIB uint8_t con[LP_NATIVE_VECTOR_WIDTH/8];
-         ALIGN16_ATTRIB uint8_t res[LP_NATIVE_VECTOR_WIDTH/8];
-         ALIGN16_ATTRIB uint8_t ref[LP_NATIVE_VECTOR_WIDTH/8];
+         PIPE_ALIGN_VAR(16) uint8_t src[LP_NATIVE_VECTOR_WIDTH/8];
+         PIPE_ALIGN_VAR(16) uint8_t dst[LP_NATIVE_VECTOR_WIDTH/8];
+         PIPE_ALIGN_VAR(16) uint8_t con[LP_NATIVE_VECTOR_WIDTH/8];
+         PIPE_ALIGN_VAR(16) uint8_t res[LP_NATIVE_VECTOR_WIDTH/8];
+         PIPE_ALIGN_VAR(16) uint8_t ref[LP_NATIVE_VECTOR_WIDTH/8];
          int64_t start_counter = 0;
          int64_t end_counter = 0;
 
@@ -596,11 +596,11 @@ test_one(unsigned verbose,
 
       if(mode == SoA) {
          const unsigned stride = type.length*type.width/8;
-         ALIGN16_ATTRIB uint8_t src[4*LP_NATIVE_VECTOR_WIDTH/8];
-         ALIGN16_ATTRIB uint8_t dst[4*LP_NATIVE_VECTOR_WIDTH/8];
-         ALIGN16_ATTRIB uint8_t con[4*LP_NATIVE_VECTOR_WIDTH/8];
-         ALIGN16_ATTRIB uint8_t res[4*LP_NATIVE_VECTOR_WIDTH/8];
-         ALIGN16_ATTRIB uint8_t ref[4*LP_NATIVE_VECTOR_WIDTH/8];
+         PIPE_ALIGN_VAR(16) uint8_t src[4*LP_NATIVE_VECTOR_WIDTH/8];
+         PIPE_ALIGN_VAR(16) uint8_t dst[4*LP_NATIVE_VECTOR_WIDTH/8];
+         PIPE_ALIGN_VAR(16) uint8_t con[4*LP_NATIVE_VECTOR_WIDTH/8];
+         PIPE_ALIGN_VAR(16) uint8_t res[4*LP_NATIVE_VECTOR_WIDTH/8];
+         PIPE_ALIGN_VAR(16) uint8_t ref[4*LP_NATIVE_VECTOR_WIDTH/8];
          int64_t start_counter = 0;
          int64_t end_counter = 0;
          boolean mismatch;
diff --git a/src/gallium/drivers/llvmpipe/lp_test_conv.c b/src/gallium/drivers/llvmpipe/lp_test_conv.c
index faddfb9677..c1abee424c 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_conv.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_conv.c
@@ -142,7 +142,7 @@ add_conv_test(LLVMModuleRef module,
 }
 
 
-ALIGN_STACK
+PIPE_ALIGN_STACK
 static boolean
 test_one(unsigned verbose,
          FILE *fp,
@@ -230,8 +230,8 @@ test_one(unsigned verbose,
    for(i = 0; i < n && success; ++i) {
       unsigned src_stride = src_type.length*src_type.width/8;
       unsigned dst_stride = dst_type.length*dst_type.width/8;
-      ALIGN16_ATTRIB uint8_t src[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
-      ALIGN16_ATTRIB uint8_t dst[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
+      PIPE_ALIGN_VAR(16) uint8_t src[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
+      PIPE_ALIGN_VAR(16) uint8_t dst[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
       double fref[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
       uint8_t ref[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
       int64_t start_counter = 0;
diff --git a/src/gallium/drivers/llvmpipe/lp_test_format.c b/src/gallium/drivers/llvmpipe/lp_test_format.c
index 23ea9ebbe7..2b258f1052 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_format.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_format.c
@@ -199,7 +199,7 @@ add_store_rgba_test(LLVMModuleRef module,
 }
 
 
-ALIGN_STACK
+PIPE_ALIGN_STACK
 static boolean
 test_format(unsigned verbose, FILE *fp, const struct pixel_test_case *test)
 {
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_sample.h b/src/gallium/drivers/llvmpipe/lp_tex_sample.h
index 9ad1bde956..cb59a94464 100644
--- a/src/gallium/drivers/llvmpipe/lp_tex_sample.h
+++ b/src/gallium/drivers/llvmpipe/lp_tex_sample.h
@@ -31,64 +31,11 @@
 
 #include <llvm-c/Core.h>
 
-#include "tgsi/tgsi_exec.h"
 
-
-struct llvmpipe_tex_tile_cache;
 struct lp_sampler_static_state;
 
 
 /**
- * Subclass of tgsi_sampler
- */
-struct lp_shader_sampler
-{
-   struct tgsi_sampler base;  /**< base class */
-
-   unsigned processor;
-
-   /* For lp_get_samples_2d_linear_POT:
-    */
-   unsigned xpot;
-   unsigned ypot;
-   unsigned level;
-
-   const struct pipe_texture *texture;
-   const struct pipe_sampler_state *sampler;
-
-   struct llvmpipe_tex_tile_cache *cache;
-};
-
-
-
-static INLINE struct lp_shader_sampler *
-lp_shader_sampler(const struct tgsi_sampler *sampler)
-{
-   return (struct lp_shader_sampler *) sampler;
-}
-
-
-
-extern void
-lp_get_samples(struct tgsi_sampler *tgsi_sampler,
-               const float s[QUAD_SIZE],
-               const float t[QUAD_SIZE],
-               const float p[QUAD_SIZE],
-               float lodbias,
-               float rgba[NUM_CHANNELS][QUAD_SIZE]);
-
-
-/**
- * Texture sampling code generator that just calls lp_get_samples C function
- * for the actual sampling computation.
- *
- * @param context_ptr LLVM value with the pointer to the struct lp_jit_context.
- */
-struct lp_build_sampler_soa *
-lp_c_sampler_soa_create(LLVMValueRef context_ptr);
-
-
-/**
  * Pure-LLVM texture sampling code generator.
  *
  * @param context_ptr LLVM value with the pointer to the struct lp_jit_context.
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_sample_c.c b/src/gallium/drivers/llvmpipe/lp_tex_sample_c.c
deleted file mode 100644
index 68520fa4f0..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_tex_sample_c.c
+++ /dev/null
@@ -1,1713 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * Copyright 2008 VMware, Inc.  All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-/**
- * Texture sampling
- *
- * Authors:
- *   Brian Paul
- */
-
-#include "lp_context.h"
-#include "lp_quad.h"
-#include "lp_surface.h"
-#include "lp_texture.h"
-#include "lp_tex_sample.h"
-#include "lp_tex_cache.h"
-#include "pipe/p_context.h"
-#include "pipe/p_defines.h"
-#include "pipe/p_shader_tokens.h"
-#include "util/u_math.h"
-#include "util/u_memory.h"
-
-
-
-/*
- * Note, the FRAC macro has to work perfectly.  Otherwise you'll sometimes
- * see 1-pixel bands of improperly weighted linear-filtered textures.
- * The tests/texwrap.c demo is a good test.
- * Also note, FRAC(x) doesn't truly return the fractional part of x for x < 0.
- * Instead, if x < 0 then FRAC(x) = 1 - true_frac(x).
- */
-#define FRAC(f)  ((f) - util_ifloor(f))
-
-
-/**
- * Linear interpolation macro
- */
-static INLINE float
-lerp(float a, float v0, float v1)
-{
-   return v0 + a * (v1 - v0);
-}
-
-
-/**
- * Do 2D/biliner interpolation of float values.
- * v00, v10, v01 and v11 are typically four texture samples in a square/box.
- * a and b are the horizontal and vertical interpolants.
- * It's important that this function is inlined when compiled with
- * optimization!  If we find that's not true on some systems, convert
- * to a macro.
- */
-static INLINE float
-lerp_2d(float a, float b,
-        float v00, float v10, float v01, float v11)
-{
-   const float temp0 = lerp(a, v00, v10);
-   const float temp1 = lerp(a, v01, v11);
-   return lerp(b, temp0, temp1);
-}
-
-
-/**
- * As above, but 3D interpolation of 8 values.
- */
-static INLINE float
-lerp_3d(float a, float b, float c,
-        float v000, float v100, float v010, float v110,
-        float v001, float v101, float v011, float v111)
-{
-   const float temp0 = lerp_2d(a, b, v000, v100, v010, v110);
-   const float temp1 = lerp_2d(a, b, v001, v101, v011, v111);
-   return lerp(c, temp0, temp1);
-}
-
-
-
-/**
- * If A is a signed integer, A % B doesn't give the right value for A < 0
- * (in terms of texture repeat).  Just casting to unsigned fixes that.
- */
-#define REMAINDER(A, B) ((unsigned) (A) % (unsigned) (B))
-
-
-/**
- * Apply texture coord wrapping mode and return integer texture indexes
- * for a vector of four texcoords (S or T or P).
- * \param wrapMode  PIPE_TEX_WRAP_x
- * \param s  the incoming texcoords
- * \param size  the texture image size
- * \param icoord  returns the integer texcoords
- * \return  integer texture index
- */
-static INLINE void
-nearest_texcoord_4(unsigned wrapMode, const float s[4], unsigned size,
-                   int icoord[4])
-{
-   uint ch;
-   switch (wrapMode) {
-   case PIPE_TEX_WRAP_REPEAT:
-      /* s limited to [0,1) */
-      /* i limited to [0,size-1] */
-      for (ch = 0; ch < 4; ch++) {
-         int i = util_ifloor(s[ch] * size);
-         icoord[ch] = REMAINDER(i, size);
-      }
-      return;
-   case PIPE_TEX_WRAP_CLAMP:
-      /* s limited to [0,1] */
-      /* i limited to [0,size-1] */
-      for (ch = 0; ch < 4; ch++) {
-         if (s[ch] <= 0.0F)
-            icoord[ch] = 0;
-         else if (s[ch] >= 1.0F)
-            icoord[ch] = size - 1;
-         else
-            icoord[ch] = util_ifloor(s[ch] * size);
-      }
-      return;
-   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-      {
-         /* s limited to [min,max] */
-         /* i limited to [0, size-1] */
-         const float min = 1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            if (s[ch] < min)
-               icoord[ch] = 0;
-            else if (s[ch] > max)
-               icoord[ch] = size - 1;
-            else
-               icoord[ch] = util_ifloor(s[ch] * size);
-         }
-      }
-      return;
-   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
-      {
-         /* s limited to [min,max] */
-         /* i limited to [-1, size] */
-         const float min = -1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            if (s[ch] <= min)
-               icoord[ch] = -1;
-            else if (s[ch] >= max)
-               icoord[ch] = size;
-            else
-               icoord[ch] = util_ifloor(s[ch] * size);
-         }
-      }
-      return;
-   case PIPE_TEX_WRAP_MIRROR_REPEAT:
-      {
-         const float min = 1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            const int flr = util_ifloor(s[ch]);
-            float u;
-            if (flr & 1)
-               u = 1.0F - (s[ch] - (float) flr);
-            else
-               u = s[ch] - (float) flr;
-            if (u < min)
-               icoord[ch] = 0;
-            else if (u > max)
-               icoord[ch] = size - 1;
-            else
-               icoord[ch] = util_ifloor(u * size);
-         }
-      }
-      return;
-   case PIPE_TEX_WRAP_MIRROR_CLAMP:
-      for (ch = 0; ch < 4; ch++) {
-         /* s limited to [0,1] */
-         /* i limited to [0,size-1] */
-         const float u = fabsf(s[ch]);
-         if (u <= 0.0F)
-            icoord[ch] = 0;
-         else if (u >= 1.0F)
-            icoord[ch] = size - 1;
-         else
-            icoord[ch] = util_ifloor(u * size);
-      }
-      return;
-   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
-      {
-         /* s limited to [min,max] */
-         /* i limited to [0, size-1] */
-         const float min = 1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            const float u = fabsf(s[ch]);
-            if (u < min)
-               icoord[ch] = 0;
-            else if (u > max)
-               icoord[ch] = size - 1;
-            else
-               icoord[ch] = util_ifloor(u * size);
-         }
-      }
-      return;
-   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
-      {
-         /* s limited to [min,max] */
-         /* i limited to [0, size-1] */
-         const float min = -1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            const float u = fabsf(s[ch]);
-            if (u < min)
-               icoord[ch] = -1;
-            else if (u > max)
-               icoord[ch] = size;
-            else
-               icoord[ch] = util_ifloor(u * size);
-         }
-      }
-      return;
-   default:
-      assert(0);
-   }
-}
-
-
-/**
- * Used to compute texel locations for linear sampling for four texcoords.
- * \param wrapMode  PIPE_TEX_WRAP_x
- * \param s  the texcoords
- * \param size  the texture image size
- * \param icoord0  returns first texture indexes
- * \param icoord1  returns second texture indexes (usually icoord0 + 1)
- * \param w  returns blend factor/weight between texture indexes
- * \param icoord  returns the computed integer texture coords
- */
-static INLINE void
-linear_texcoord_4(unsigned wrapMode, const float s[4], unsigned size,
-                  int icoord0[4], int icoord1[4], float w[4])
-{
-   uint ch;
-
-   switch (wrapMode) {
-   case PIPE_TEX_WRAP_REPEAT:
-      for (ch = 0; ch < 4; ch++) {
-         float u = s[ch] * size - 0.5F;
-         icoord0[ch] = REMAINDER(util_ifloor(u), size);
-         icoord1[ch] = REMAINDER(icoord0[ch] + 1, size);
-         w[ch] = FRAC(u);
-      }
-      break;;
-   case PIPE_TEX_WRAP_CLAMP:
-      for (ch = 0; ch < 4; ch++) {
-         float u = CLAMP(s[ch], 0.0F, 1.0F);
-         u = u * size - 0.5f;
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         w[ch] = FRAC(u);
-      }
-      break;;
-   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-      for (ch = 0; ch < 4; ch++) {
-         float u = CLAMP(s[ch], 0.0F, 1.0F);
-         u = u * size - 0.5f;
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         if (icoord0[ch] < 0)
-            icoord0[ch] = 0;
-         if (icoord1[ch] >= (int) size)
-            icoord1[ch] = size - 1;
-         w[ch] = FRAC(u);
-      }
-      break;;
-   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
-      {
-         const float min = -1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            float u = CLAMP(s[ch], min, max);
-            u = u * size - 0.5f;
-            icoord0[ch] = util_ifloor(u);
-            icoord1[ch] = icoord0[ch] + 1;
-            w[ch] = FRAC(u);
-         }
-      }
-      break;;
-   case PIPE_TEX_WRAP_MIRROR_REPEAT:
-      for (ch = 0; ch < 4; ch++) {
-         const int flr = util_ifloor(s[ch]);
-         float u;
-         if (flr & 1)
-            u = 1.0F - (s[ch] - (float) flr);
-         else
-            u = s[ch] - (float) flr;
-         u = u * size - 0.5F;
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         if (icoord0[ch] < 0)
-            icoord0[ch] = 0;
-         if (icoord1[ch] >= (int) size)
-            icoord1[ch] = size - 1;
-         w[ch] = FRAC(u);
-      }
-      break;;
-   case PIPE_TEX_WRAP_MIRROR_CLAMP:
-      for (ch = 0; ch < 4; ch++) {
-         float u = fabsf(s[ch]);
-         if (u >= 1.0F)
-            u = (float) size;
-         else
-            u *= size;
-         u -= 0.5F;
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         w[ch] = FRAC(u);
-      }
-      break;;
-   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
-      for (ch = 0; ch < 4; ch++) {
-         float u = fabsf(s[ch]);
-         if (u >= 1.0F)
-            u = (float) size;
-         else
-            u *= size;
-         u -= 0.5F;
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         if (icoord0[ch] < 0)
-            icoord0[ch] = 0;
-         if (icoord1[ch] >= (int) size)
-            icoord1[ch] = size - 1;
-         w[ch] = FRAC(u);
-      }
-      break;;
-   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
-      {
-         const float min = -1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            float u = fabsf(s[ch]);
-            if (u <= min)
-               u = min * size;
-            else if (u >= max)
-               u = max * size;
-            else
-               u *= size;
-            u -= 0.5F;
-            icoord0[ch] = util_ifloor(u);
-            icoord1[ch] = icoord0[ch] + 1;
-            w[ch] = FRAC(u);
-         }
-      }
-      break;;
-   default:
-      assert(0);
-   }
-}
-
-
-/**
- * For RECT textures / unnormalized texcoords
- * Only a subset of wrap modes supported.
- */
-static INLINE void
-nearest_texcoord_unnorm_4(unsigned wrapMode, const float s[4], unsigned size,
-                          int icoord[4])
-{
-   uint ch;
-   switch (wrapMode) {
-   case PIPE_TEX_WRAP_CLAMP:
-      for (ch = 0; ch < 4; ch++) {
-         int i = util_ifloor(s[ch]);
-         icoord[ch]= CLAMP(i, 0, (int) size-1);
-      }
-      return;
-   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-      /* fall-through */
-   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
-      for (ch = 0; ch < 4; ch++) {
-         icoord[ch]= util_ifloor( CLAMP(s[ch], 0.5F, (float) size - 0.5F) );
-      }
-      return;
-   default:
-      assert(0);
-   }
-}
-
-
-/**
- * For RECT textures / unnormalized texcoords.
- * Only a subset of wrap modes supported.
- */
-static INLINE void
-linear_texcoord_unnorm_4(unsigned wrapMode, const float s[4], unsigned size,
-                         int icoord0[4], int icoord1[4], float w[4])
-{
-   uint ch;
-   switch (wrapMode) {
-   case PIPE_TEX_WRAP_CLAMP:
-      for (ch = 0; ch < 4; ch++) {
-         /* Not exactly what the spec says, but it matches NVIDIA output */
-         float u = CLAMP(s[ch] - 0.5F, 0.0f, (float) size - 1.0f);
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         w[ch] = FRAC(u);
-      }
-      return;
-   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-      /* fall-through */
-   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
-      for (ch = 0; ch < 4; ch++) {
-         float u = CLAMP(s[ch], 0.5F, (float) size - 0.5F);
-         u -= 0.5F;
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         if (icoord1[ch] > (int) size - 1)
-            icoord1[ch] = size - 1;
-         w[ch] = FRAC(u);
-      }
-      break;
-   default:
-      assert(0);
-   }
-}
-
-
-static unsigned
-choose_cube_face(float rx, float ry, float rz, float *newS, float *newT)
-{
-   /*
-      major axis
-      direction     target                             sc     tc    ma
-      ----------    -------------------------------    ---    ---   ---
-       +rx          TEXTURE_CUBE_MAP_POSITIVE_X_EXT    -rz    -ry   rx
-       -rx          TEXTURE_CUBE_MAP_NEGATIVE_X_EXT    +rz    -ry   rx
-       +ry          TEXTURE_CUBE_MAP_POSITIVE_Y_EXT    +rx    +rz   ry
-       -ry          TEXTURE_CUBE_MAP_NEGATIVE_Y_EXT    +rx    -rz   ry
-       +rz          TEXTURE_CUBE_MAP_POSITIVE_Z_EXT    +rx    -ry   rz
-       -rz          TEXTURE_CUBE_MAP_NEGATIVE_Z_EXT    -rx    -ry   rz
-   */
-   const float arx = fabsf(rx), ary = fabsf(ry), arz = fabsf(rz);
-   unsigned face;
-   float sc, tc, ma;
-
-   if (arx > ary && arx > arz) {
-      if (rx >= 0.0F) {
-         face = PIPE_TEX_FACE_POS_X;
-         sc = -rz;
-         tc = -ry;
-         ma = arx;
-      }
-      else {
-         face = PIPE_TEX_FACE_NEG_X;
-         sc = rz;
-         tc = -ry;
-         ma = arx;
-      }
-   }
-   else if (ary > arx && ary > arz) {
-      if (ry >= 0.0F) {
-         face = PIPE_TEX_FACE_POS_Y;
-         sc = rx;
-         tc = rz;
-         ma = ary;
-      }
-      else {
-         face = PIPE_TEX_FACE_NEG_Y;
-         sc = rx;
-         tc = -rz;
-         ma = ary;
-      }
-   }
-   else {
-      if (rz > 0.0F) {
-         face = PIPE_TEX_FACE_POS_Z;
-         sc = rx;
-         tc = -ry;
-         ma = arz;
-      }
-      else {
-         face = PIPE_TEX_FACE_NEG_Z;
-         sc = -rx;
-         tc = -ry;
-         ma = arz;
-      }
-   }
-
-   *newS = ( sc / ma + 1.0F ) * 0.5F;
-   *newT = ( tc / ma + 1.0F ) * 0.5F;
-
-   return face;
-}
-
-
-/**
- * Examine the quad's texture coordinates to compute the partial
- * derivatives w.r.t X and Y, then compute lambda (level of detail).
- *
- * This is only done for fragment shaders, not vertex shaders.
- */
-static float
-compute_lambda(struct tgsi_sampler *tgsi_sampler,
-               const float s[QUAD_SIZE],
-               const float t[QUAD_SIZE],
-               const float p[QUAD_SIZE],
-               float lodbias)
-{
-   const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
-   const struct pipe_texture *texture = samp->texture;
-   const struct pipe_sampler_state *sampler = samp->sampler;
-   float rho, lambda;
-
-   if (samp->processor == TGSI_PROCESSOR_VERTEX)
-      return lodbias;
-
-   assert(sampler->normalized_coords);
-
-   assert(s);
-   {
-      float dsdx = s[QUAD_BOTTOM_RIGHT] - s[QUAD_BOTTOM_LEFT];
-      float dsdy = s[QUAD_TOP_LEFT]     - s[QUAD_BOTTOM_LEFT];
-      dsdx = fabsf(dsdx);
-      dsdy = fabsf(dsdy);
-      rho = MAX2(dsdx, dsdy) * texture->width0;
-   }
-   if (t) {
-      float dtdx = t[QUAD_BOTTOM_RIGHT] - t[QUAD_BOTTOM_LEFT];
-      float dtdy = t[QUAD_TOP_LEFT]     - t[QUAD_BOTTOM_LEFT];
-      float max;
-      dtdx = fabsf(dtdx);
-      dtdy = fabsf(dtdy);
-      max = MAX2(dtdx, dtdy) * texture->height0;
-      rho = MAX2(rho, max);
-   }
-   if (p) {
-      float dpdx = p[QUAD_BOTTOM_RIGHT] - p[QUAD_BOTTOM_LEFT];
-      float dpdy = p[QUAD_TOP_LEFT]     - p[QUAD_BOTTOM_LEFT];
-      float max;
-      dpdx = fabsf(dpdx);
-      dpdy = fabsf(dpdy);
-      max = MAX2(dpdx, dpdy) * texture->depth0;
-      rho = MAX2(rho, max);
-   }
-
-   lambda = util_fast_log2(rho);
-   lambda += lodbias + sampler->lod_bias;
-   lambda = CLAMP(lambda, sampler->min_lod, sampler->max_lod);
-
-   return lambda;
-}
-
-
-/**
- * Do several things here:
- * 1. Compute lambda from the texcoords, if needed
- * 2. Determine if we're minifying or magnifying
- * 3. If minifying, choose mipmap levels
- * 4. Return image filter to use within mipmap images
- * \param level0  Returns first mipmap level to sample from
- * \param level1  Returns second mipmap level to sample from
- * \param levelBlend  Returns blend factor between levels, in [0,1]
- * \param imgFilter  Returns either the min or mag filter, depending on lambda
- */
-static void
-choose_mipmap_levels(struct tgsi_sampler *tgsi_sampler,
-                     const float s[QUAD_SIZE],
-                     const float t[QUAD_SIZE],
-                     const float p[QUAD_SIZE],
-                     float lodbias,
-                     unsigned *level0, unsigned *level1, float *levelBlend,
-                     unsigned *imgFilter)
-{
-   const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
-   const struct pipe_texture *texture = samp->texture;
-   const struct pipe_sampler_state *sampler = samp->sampler;
-
-   if (sampler->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) {
-      /* no mipmap selection needed */
-      *level0 = *level1 = CLAMP((int) sampler->min_lod,
-                                0, (int) texture->last_level);
-
-      if (sampler->min_img_filter != sampler->mag_img_filter) {
-         /* non-mipmapped texture, but still need to determine if doing
-          * minification or magnification.
-          */
-         float lambda = compute_lambda(tgsi_sampler, s, t, p, lodbias);
-         if (lambda <= 0.0) {
-            *imgFilter = sampler->mag_img_filter;
-         }
-         else {
-            *imgFilter = sampler->min_img_filter;
-         }
-      }
-      else {
-         *imgFilter = sampler->mag_img_filter;
-      }
-   }
-   else {
-      float lambda = compute_lambda(tgsi_sampler, s, t, p, lodbias);
-
-      if (lambda <= 0.0) { /* XXX threshold depends on the filter */
-         /* magnifying */
-         *imgFilter = sampler->mag_img_filter;
-         *level0 = *level1 = 0;
-      }
-      else {
-         /* minifying */
-         *imgFilter = sampler->min_img_filter;
-
-         /* choose mipmap level(s) and compute the blend factor between them */
-         if (sampler->min_mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
-            /* Nearest mipmap level */
-            const int lvl = (int) (lambda + 0.5);
-            *level0 =
-            *level1 = CLAMP(lvl, 0, (int) texture->last_level);
-         }
-         else {
-            /* Linear interpolation between mipmap levels */
-            const int lvl = (int) lambda;
-            *level0 = CLAMP(lvl,     0, (int) texture->last_level);
-            *level1 = CLAMP(lvl + 1, 0, (int) texture->last_level);
-            *levelBlend = FRAC(lambda);  /* blending weight between levels */
-         }
-      }
-   }
-}
-
-
-/**
- * Get a texel from a texture, using the texture tile cache.
- *
- * \param face  the cube face in 0..5
- * \param level  the mipmap level
- * \param x  the x coord of texel within 2D image
- * \param y  the y coord of texel within 2D image
- * \param z  which slice of a 3D texture
- * \param rgba  the quad to put the texel/color into
- * \param j  which element of the rgba quad to write to
- *
- * XXX maybe move this into lp_tile_cache.c and merge with the
- * lp_get_cached_tile_tex() function.  Also, get 4 texels instead of 1...
- */
-static void
-get_texel_quad_2d(const struct tgsi_sampler *tgsi_sampler,
-                  unsigned face, unsigned level, int x, int y, 
-                  const uint8_t *out[4])
-{
-   const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
-
-   const struct llvmpipe_cached_tex_tile *tile
-      = lp_get_cached_tex_tile(samp->cache,
-                               tex_tile_address(x, y, 0, face, level));
-
-   y %= TEX_TILE_SIZE;
-   x %= TEX_TILE_SIZE;
-      
-   out[0] = &tile->color[y  ][x  ][0];
-   out[1] = &tile->color[y  ][x+1][0];
-   out[2] = &tile->color[y+1][x  ][0];
-   out[3] = &tile->color[y+1][x+1][0];
-}
-
-static INLINE const uint8_t *
-get_texel_2d_ptr(const struct tgsi_sampler *tgsi_sampler,
-                 unsigned face, unsigned level, int x, int y)
-{
-   const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
-
-   const struct llvmpipe_cached_tex_tile *tile
-      = lp_get_cached_tex_tile(samp->cache,
-                               tex_tile_address(x, y, 0, face, level));
-
-   y %= TEX_TILE_SIZE;
-   x %= TEX_TILE_SIZE;
-
-   return &tile->color[y][x][0];
-}
-
-
-static void
-get_texel_quad_2d_mt(const struct tgsi_sampler *tgsi_sampler,
-                     unsigned face, unsigned level, 
-                     int x0, int y0, 
-                     int x1, int y1,
-                     const uint8_t *out[4])
-{
-   unsigned i;
-
-   for (i = 0; i < 4; i++) {
-      unsigned tx = (i & 1) ? x1 : x0;
-      unsigned ty = (i >> 1) ? y1 : y0;
-
-      out[i] = get_texel_2d_ptr( tgsi_sampler, face, level, tx, ty );
-   }
-}
-
-static void
-get_texel(const struct tgsi_sampler *tgsi_sampler,
-                 unsigned face, unsigned level, int x, int y, int z,
-                 float rgba[NUM_CHANNELS][QUAD_SIZE], unsigned j)
-{
-   const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
-   const struct pipe_texture *texture = samp->texture;
-   const struct pipe_sampler_state *sampler = samp->sampler;
-
-   if (x < 0 || x >= (int) u_minify(texture->width0, level) ||
-       y < 0 || y >= (int) u_minify(texture->height0, level) ||
-       z < 0 || z >= (int) u_minify(texture->depth0, level)) {
-      rgba[0][j] = sampler->border_color[0];
-      rgba[1][j] = sampler->border_color[1];
-      rgba[2][j] = sampler->border_color[2];
-      rgba[3][j] = sampler->border_color[3];
-   }
-   else {
-      const unsigned tx = x % TEX_TILE_SIZE;
-      const unsigned ty = y % TEX_TILE_SIZE;
-      const struct llvmpipe_cached_tex_tile *tile;
-
-      tile = lp_get_cached_tex_tile(samp->cache,
-                                    tex_tile_address(x, y, z, face, level));
-
-      rgba[0][j] = ubyte_to_float(tile->color[ty][tx][0]);
-      rgba[1][j] = ubyte_to_float(tile->color[ty][tx][1]);
-      rgba[2][j] = ubyte_to_float(tile->color[ty][tx][2]);
-      rgba[3][j] = ubyte_to_float(tile->color[ty][tx][3]);
-      if (0)
-      {
-         debug_printf("Get texel %f %f %f %f from %s\n",
-                      rgba[0][j], rgba[1][j], rgba[2][j], rgba[3][j],
-                      pf_name(texture->format));
-      }
-   }
-}
-
-
-/**
- * Compare texcoord 'p' (aka R) against texture value 'rgba[0]'
- * When we sampled the depth texture, the depth value was put into all
- * RGBA channels.  We look at the red channel here.
- * \param rgba  quad of (depth) texel values
- * \param p  texture 'P' components for four pixels in quad
- * \param j  which pixel in the quad to test [0..3]
- */
-static INLINE void
-shadow_compare(const struct pipe_sampler_state *sampler,
-               float rgba[NUM_CHANNELS][QUAD_SIZE],
-               const float p[QUAD_SIZE],
-               uint j)
-{
-   int k;
-   switch (sampler->compare_func) {
-   case PIPE_FUNC_LESS:
-      k = p[j] < rgba[0][j];
-      break;
-   case PIPE_FUNC_LEQUAL:
-      k = p[j] <= rgba[0][j];
-      break;
-   case PIPE_FUNC_GREATER:
-      k = p[j] > rgba[0][j];
-      break;
-   case PIPE_FUNC_GEQUAL:
-      k = p[j] >= rgba[0][j];
-      break;
-   case PIPE_FUNC_EQUAL:
-      k = p[j] == rgba[0][j];
-      break;
-   case PIPE_FUNC_NOTEQUAL:
-      k = p[j] != rgba[0][j];
-      break;
-   case PIPE_FUNC_ALWAYS:
-      k = 1;
-      break;
-   case PIPE_FUNC_NEVER:
-      k = 0;
-      break;
-   default:
-      k = 0;
-      assert(0);
-      break;
-   }
-
-   /* XXX returning result for default GL_DEPTH_TEXTURE_MODE = GL_LUMINANCE */
-   rgba[0][j] = rgba[1][j] = rgba[2][j] = (float) k;
-   rgba[3][j] = 1.0F;
-}
-
-
-/**
- * As above, but do four z/texture comparisons.
- */
-static INLINE void
-shadow_compare4(const struct pipe_sampler_state *sampler,
-                float rgba[NUM_CHANNELS][QUAD_SIZE],
-                const float p[QUAD_SIZE])
-{
-   int j, k0, k1, k2, k3;
-   float val;
-
-   /* compare four texcoords vs. four texture samples */
-   switch (sampler->compare_func) {
-   case PIPE_FUNC_LESS:
-      k0 = p[0] < rgba[0][0];
-      k1 = p[1] < rgba[0][1];
-      k2 = p[2] < rgba[0][2];
-      k3 = p[3] < rgba[0][3];
-      break;
-   case PIPE_FUNC_LEQUAL:
-      k0 = p[0] <= rgba[0][0];
-      k1 = p[1] <= rgba[0][1];
-      k2 = p[2] <= rgba[0][2];
-      k3 = p[3] <= rgba[0][3];
-      break;
-   case PIPE_FUNC_GREATER:
-      k0 = p[0] > rgba[0][0];
-      k1 = p[1] > rgba[0][1];
-      k2 = p[2] > rgba[0][2];
-      k3 = p[3] > rgba[0][3];
-      break;
-   case PIPE_FUNC_GEQUAL:
-      k0 = p[0] >= rgba[0][0];
-      k1 = p[1] >= rgba[0][1];
-      k2 = p[2] >= rgba[0][2];
-      k3 = p[3] >= rgba[0][3];
-      break;
-   case PIPE_FUNC_EQUAL:
-      k0 = p[0] == rgba[0][0];
-      k1 = p[1] == rgba[0][1];
-      k2 = p[2] == rgba[0][2];
-      k3 = p[3] == rgba[0][3];
-      break;
-   case PIPE_FUNC_NOTEQUAL:
-      k0 = p[0] != rgba[0][0];
-      k1 = p[1] != rgba[0][1];
-      k2 = p[2] != rgba[0][2];
-      k3 = p[3] != rgba[0][3];
-      break;
-   case PIPE_FUNC_ALWAYS:
-      k0 = k1 = k2 = k3 = 1;
-      break;
-   case PIPE_FUNC_NEVER:
-      k0 = k1 = k2 = k3 = 0;
-      break;
-   default:
-      k0 = k1 = k2 = k3 = 0;
-      assert(0);
-      break;
-   }
-
-   /* convert four pass/fail values to an intensity in [0,1] */
-   val = 0.25F * (k0 + k1 + k2 + k3);
-
-   /* XXX returning result for default GL_DEPTH_TEXTURE_MODE = GL_LUMINANCE */
-   for (j = 0; j < 4; j++) {
-      rgba[0][j] = rgba[1][j] = rgba[2][j] = val;
-      rgba[3][j] = 1.0F;
-   }
-}
-
-
-
-static void
-lp_get_samples_2d_linear_repeat_POT(struct tgsi_sampler *tgsi_sampler,
-                                    const float s[QUAD_SIZE],
-                                    const float t[QUAD_SIZE],
-                                    const float p[QUAD_SIZE],
-                                    float lodbias,
-                                    float rgba[NUM_CHANNELS][QUAD_SIZE])
-{
-   const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
-   unsigned  j;
-   unsigned level = samp->level;
-   unsigned xpot = 1 << (samp->xpot - level);
-   unsigned ypot = 1 << (samp->ypot - level);
-   unsigned xmax = (xpot - 1) & (TEX_TILE_SIZE - 1); /* MIN2(TEX_TILE_SIZE, xpot) - 1; */
-   unsigned ymax = (ypot - 1) & (TEX_TILE_SIZE - 1); /* MIN2(TEX_TILE_SIZE, ypot) - 1; */
-      
-   for (j = 0; j < QUAD_SIZE; j++) {
-      int c;
-
-      float u = s[j] * xpot - 0.5F;
-      float v = t[j] * ypot - 0.5F;
-
-      int uflr = util_ifloor(u);
-      int vflr = util_ifloor(v);
-
-      float xw = u - (float)uflr;
-      float yw = v - (float)vflr;
-
-      int x0 = uflr & (xpot - 1);
-      int y0 = vflr & (ypot - 1);
-
-      const uint8_t *tx[4];
-      
-
-      /* Can we fetch all four at once:
-       */
-      if (x0 < xmax && y0 < ymax)
-      {
-         get_texel_quad_2d(tgsi_sampler, 0, level, x0, y0, tx);
-      }
-      else 
-      {
-         unsigned x1 = (x0 + 1) & (xpot - 1);
-         unsigned y1 = (y0 + 1) & (ypot - 1);
-         get_texel_quad_2d_mt(tgsi_sampler, 0, level, 
-                              x0, y0, x1, y1, tx);
-      }
-
-
-      /* interpolate R, G, B, A */
-      for (c = 0; c < 4; c++) {
-         rgba[c][j] = lerp_2d(xw, yw, 
-                              ubyte_to_float(tx[0][c]), ubyte_to_float(tx[1][c]),
-                              ubyte_to_float(tx[2][c]), ubyte_to_float(tx[3][c]));
-      }
-   }
-}
-
-
-static void
-lp_get_samples_2d_nearest_repeat_POT(struct tgsi_sampler *tgsi_sampler,
-                                     const float s[QUAD_SIZE],
-                                     const float t[QUAD_SIZE],
-                                     const float p[QUAD_SIZE],
-                                     float lodbias,
-                                     float rgba[NUM_CHANNELS][QUAD_SIZE])
-{
-   const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
-   unsigned  j;
-   unsigned level = samp->level;
-   unsigned xpot = 1 << (samp->xpot - level);
-   unsigned ypot = 1 << (samp->ypot - level);
-
-   for (j = 0; j < QUAD_SIZE; j++) {
-      int c;
-
-      float u = s[j] * xpot;
-      float v = t[j] * ypot;
-
-      int uflr = util_ifloor(u);
-      int vflr = util_ifloor(v);
-
-      int x0 = uflr & (xpot - 1);
-      int y0 = vflr & (ypot - 1);
-
-      const uint8_t *out = get_texel_2d_ptr(tgsi_sampler, 0, level, x0, y0);
-
-      for (c = 0; c < 4; c++) {
-         rgba[c][j] = ubyte_to_float(out[c]);
-      }
-   }
-}
-
-
-static void
-lp_get_samples_2d_nearest_clamp_POT(struct tgsi_sampler *tgsi_sampler,
-                                     const float s[QUAD_SIZE],
-                                     const float t[QUAD_SIZE],
-                                     const float p[QUAD_SIZE],
-                                     float lodbias,
-                                     float rgba[NUM_CHANNELS][QUAD_SIZE])
-{
-   const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
-   unsigned  j;
-   unsigned level = samp->level;
-   unsigned xpot = 1 << (samp->xpot - level);
-   unsigned ypot = 1 << (samp->ypot - level);
-
-   for (j = 0; j < QUAD_SIZE; j++) {
-      int c;
-
-      float u = s[j] * xpot;
-      float v = t[j] * ypot;
-
-      int x0, y0;
-      const uint8_t *out;
-
-      x0 = util_ifloor(u);
-      if (x0 < 0) 
-         x0 = 0;
-      else if (x0 > xpot - 1)
-         x0 = xpot - 1;
-
-      y0 = util_ifloor(v);
-      if (y0 < 0) 
-         y0 = 0;
-      else if (y0 > ypot - 1)
-         y0 = ypot - 1;
-      
-      out = get_texel_2d_ptr(tgsi_sampler, 0, level, x0, y0);
-
-      for (c = 0; c < 4; c++) {
-         rgba[c][j] = ubyte_to_float(out[c]);
-      }
-   }
-}
-
-
-static void
-lp_get_samples_2d_linear_mip_linear_repeat_POT(struct tgsi_sampler *tgsi_sampler,
-                                               const float s[QUAD_SIZE],
-                                               const float t[QUAD_SIZE],
-                                               const float p[QUAD_SIZE],
-                                               float lodbias,
-                                               float rgba[NUM_CHANNELS][QUAD_SIZE])
-{
-   struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
-   const struct pipe_texture *texture = samp->texture;
-   int level0;
-   float lambda;
-
-   lambda = compute_lambda(tgsi_sampler, s, t, p, lodbias);
-   level0 = (int)lambda;
-
-   if (lambda < 0.0) { 
-      samp->level = 0;
-      lp_get_samples_2d_linear_repeat_POT( tgsi_sampler,
-                                           s, t, p, 0, rgba );
-   }
-   else if (level0 >= texture->last_level) {
-      samp->level = texture->last_level;
-      lp_get_samples_2d_linear_repeat_POT( tgsi_sampler,
-                                           s, t, p, 0, rgba );
-   }
-   else {
-      float levelBlend = lambda - level0;
-      float rgba0[4][4];
-      float rgba1[4][4];
-      int c,j;
-
-      samp->level = level0;
-      lp_get_samples_2d_linear_repeat_POT( tgsi_sampler,
-                                           s, t, p, 0, rgba0 );
-
-      samp->level = level0+1;
-      lp_get_samples_2d_linear_repeat_POT( tgsi_sampler,
-                                           s, t, p, 0, rgba1 );
-
-      for (j = 0; j < QUAD_SIZE; j++) {
-         for (c = 0; c < 4; c++) {
-            rgba[c][j] = lerp(levelBlend, rgba0[c][j], rgba1[c][j]);
-         }
-      }
-   }
-}
-
-/**
- * Common code for sampling 1D/2D/cube textures.
- * Could probably extend for 3D...
- */
-static void
-lp_get_samples_2d_common(struct tgsi_sampler *tgsi_sampler,
-                         const float s[QUAD_SIZE],
-                         const float t[QUAD_SIZE],
-                         const float p[QUAD_SIZE],
-                         float lodbias,
-                         float rgba[NUM_CHANNELS][QUAD_SIZE],
-                         const unsigned faces[4])
-{
-   const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
-   const struct pipe_texture *texture = samp->texture;
-   const struct pipe_sampler_state *sampler = samp->sampler;
-   unsigned level0, level1, j, imgFilter;
-   int width, height;
-   float levelBlend = 0.0f;
-
-   choose_mipmap_levels(tgsi_sampler, s, t, p, 
-                        lodbias,
-                        &level0, &level1, &levelBlend, &imgFilter);
-
-   assert(sampler->normalized_coords);
-
-   width = u_minify(texture->width0, level0);
-   height = u_minify(texture->height0, level0);
-
-   assert(width > 0);
-
-   switch (imgFilter) {
-   case PIPE_TEX_FILTER_NEAREST:
-      {
-         int x[4], y[4];
-         nearest_texcoord_4(sampler->wrap_s, s, width, x);
-         nearest_texcoord_4(sampler->wrap_t, t, height, y);
-
-         for (j = 0; j < QUAD_SIZE; j++) {
-            get_texel(tgsi_sampler, faces[j], level0, x[j], y[j], 0, rgba, j);
-            if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
-               shadow_compare(sampler, rgba, p, j);
-            }
-
-            if (level0 != level1) {
-               /* get texels from second mipmap level and blend */
-               float rgba2[4][4];
-               unsigned c;
-               x[j] /= 2;
-               y[j] /= 2;
-               get_texel(tgsi_sampler, faces[j], level1, x[j], y[j], 0,
-                         rgba2, j);
-               if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE){
-                  shadow_compare(sampler, rgba2, p, j);
-               }
-
-               for (c = 0; c < NUM_CHANNELS; c++) {
-                  rgba[c][j] = lerp(levelBlend, rgba[c][j], rgba2[c][j]);
-               }
-            }
-         }
-      }
-      break;
-   case PIPE_TEX_FILTER_LINEAR:
-   case PIPE_TEX_FILTER_ANISO:
-      {
-         int x0[4], y0[4], x1[4], y1[4];
-         float xw[4], yw[4]; /* weights */
-
-         linear_texcoord_4(sampler->wrap_s, s, width, x0, x1, xw);
-         linear_texcoord_4(sampler->wrap_t, t, height, y0, y1, yw);
-
-         for (j = 0; j < QUAD_SIZE; j++) {
-            float tx[4][4]; /* texels */
-            int c;
-            get_texel(tgsi_sampler, faces[j], level0, x0[j], y0[j], 0, tx, 0);
-            get_texel(tgsi_sampler, faces[j], level0, x1[j], y0[j], 0, tx, 1);
-            get_texel(tgsi_sampler, faces[j], level0, x0[j], y1[j], 0, tx, 2);
-            get_texel(tgsi_sampler, faces[j], level0, x1[j], y1[j], 0, tx, 3);
-            if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
-               shadow_compare4(sampler, tx, p);
-            }
-
-            /* interpolate R, G, B, A */
-            for (c = 0; c < 4; c++) {
-               rgba[c][j] = lerp_2d(xw[j], yw[j],
-                                    tx[c][0], tx[c][1],
-                                    tx[c][2], tx[c][3]);
-            }
-
-            if (level0 != level1) {
-               /* get texels from second mipmap level and blend */
-               float rgba2[4][4];
-
-               /* XXX: This is incorrect -- will often end up with (x0
-                *  == x1 && y0 == y1), meaning that we fetch the same
-                *  texel four times and linearly interpolate between
-                *  identical values.  The correct approach would be to
-                *  call linear_texcoord again for the second level.
-                */
-               x0[j] /= 2;
-               y0[j] /= 2;
-               x1[j] /= 2;
-               y1[j] /= 2;
-               get_texel(tgsi_sampler, faces[j], level1, x0[j], y0[j], 0, tx, 0);
-               get_texel(tgsi_sampler, faces[j], level1, x1[j], y0[j], 0, tx, 1);
-               get_texel(tgsi_sampler, faces[j], level1, x0[j], y1[j], 0, tx, 2);
-               get_texel(tgsi_sampler, faces[j], level1, x1[j], y1[j], 0, tx, 3);
-               if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE){
-                  shadow_compare4(sampler, tx, p);
-               }
-
-               /* interpolate R, G, B, A */
-               for (c = 0; c < 4; c++) {
-                  rgba2[c][j] = lerp_2d(xw[j], yw[j],
-                                        tx[c][0], tx[c][1], tx[c][2], tx[c][3]);
-               }
-
-               for (c = 0; c < NUM_CHANNELS; c++) {
-                  rgba[c][j] = lerp(levelBlend, rgba[c][j], rgba2[c][j]);
-               }
-            }
-         }
-      }
-      break;
-   default:
-      assert(0);
-   }
-}
-
-
-static INLINE void
-lp_get_samples_1d(struct tgsi_sampler *sampler,
-                  const float s[QUAD_SIZE],
-                  const float t[QUAD_SIZE],
-                  const float p[QUAD_SIZE],
-                  float lodbias,
-                  float rgba[NUM_CHANNELS][QUAD_SIZE])
-{
-   static const unsigned faces[4] = {0, 0, 0, 0};
-   static const float tzero[4] = {0, 0, 0, 0};
-   lp_get_samples_2d_common(sampler, s, tzero, NULL,
-                            lodbias, rgba, faces);
-}
-
-
-static INLINE void
-lp_get_samples_2d(struct tgsi_sampler *sampler,
-                  const float s[QUAD_SIZE],
-                  const float t[QUAD_SIZE],
-                  const float p[QUAD_SIZE],
-                  float lodbias,
-                  float rgba[NUM_CHANNELS][QUAD_SIZE])
-{
-   static const unsigned faces[4] = {0, 0, 0, 0};
-   lp_get_samples_2d_common(sampler, s, t, p,
-                            lodbias, rgba, faces);
-}
-
-
-static INLINE void
-lp_get_samples_3d(struct tgsi_sampler *tgsi_sampler,
-                  const float s[QUAD_SIZE],
-                  const float t[QUAD_SIZE],
-                  const float p[QUAD_SIZE],
-                  float lodbias,
-                  float rgba[NUM_CHANNELS][QUAD_SIZE])
-{
-   const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
-   const struct pipe_texture *texture = samp->texture;
-   const struct pipe_sampler_state *sampler = samp->sampler;
-   /* get/map pipe_surfaces corresponding to 3D tex slices */
-   unsigned level0, level1, j, imgFilter;
-   int width, height, depth;
-   float levelBlend = 0.0f;
-   const uint face = 0;
-
-   choose_mipmap_levels(tgsi_sampler, s, t, p, 
-                        lodbias,
-                        &level0, &level1, &levelBlend, &imgFilter);
-
-   assert(sampler->normalized_coords);
-
-   width = u_minify(texture->width0, level0);
-   height = u_minify(texture->height0, level0);
-   depth = u_minify(texture->depth0, level0);
-
-   assert(width > 0);
-   assert(height > 0);
-   assert(depth > 0);
-
-   switch (imgFilter) {
-   case PIPE_TEX_FILTER_NEAREST:
-      {
-         int x[4], y[4], z[4];
-         nearest_texcoord_4(sampler->wrap_s, s, width, x);
-         nearest_texcoord_4(sampler->wrap_t, t, height, y);
-         nearest_texcoord_4(sampler->wrap_r, p, depth, z);
-         for (j = 0; j < QUAD_SIZE; j++) {
-            get_texel(tgsi_sampler, face, level0, x[j], y[j], z[j], rgba, j);
-            if (level0 != level1) {
-               /* get texels from second mipmap level and blend */
-               float rgba2[4][4];
-               unsigned c;
-               x[j] /= 2;
-               y[j] /= 2;
-               z[j] /= 2;
-               get_texel(tgsi_sampler, face, level1, x[j], y[j], z[j], rgba2, j);
-               for (c = 0; c < NUM_CHANNELS; c++) {
-                  rgba[c][j] = lerp(levelBlend, rgba2[c][j], rgba[c][j]);
-               }
-            }
-         }
-      }
-      break;
-   case PIPE_TEX_FILTER_LINEAR:
-   case PIPE_TEX_FILTER_ANISO:
-      {
-         int x0[4], x1[4], y0[4], y1[4], z0[4], z1[4];
-         float xw[4], yw[4], zw[4]; /* interpolation weights */
-         linear_texcoord_4(sampler->wrap_s, s, width,  x0, x1, xw);
-         linear_texcoord_4(sampler->wrap_t, t, height, y0, y1, yw);
-         linear_texcoord_4(sampler->wrap_r, p, depth,  z0, z1, zw);
-
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int c;
-            float tx0[4][4], tx1[4][4];
-            get_texel(tgsi_sampler, face, level0, x0[j], y0[j], z0[j], tx0, 0);
-            get_texel(tgsi_sampler, face, level0, x1[j], y0[j], z0[j], tx0, 1);
-            get_texel(tgsi_sampler, face, level0, x0[j], y1[j], z0[j], tx0, 2);
-            get_texel(tgsi_sampler, face, level0, x1[j], y1[j], z0[j], tx0, 3);
-            get_texel(tgsi_sampler, face, level0, x0[j], y0[j], z1[j], tx1, 0);
-            get_texel(tgsi_sampler, face, level0, x1[j], y0[j], z1[j], tx1, 1);
-            get_texel(tgsi_sampler, face, level0, x0[j], y1[j], z1[j], tx1, 2);
-            get_texel(tgsi_sampler, face, level0, x1[j], y1[j], z1[j], tx1, 3);
-
-            /* interpolate R, G, B, A */
-            for (c = 0; c < 4; c++) {
-               rgba[c][j] = lerp_3d(xw[j], yw[j], zw[j],
-                                    tx0[c][0], tx0[c][1],
-                                    tx0[c][2], tx0[c][3],
-                                    tx1[c][0], tx1[c][1],
-                                    tx1[c][2], tx1[c][3]);
-            }
-
-            if (level0 != level1) {
-               /* get texels from second mipmap level and blend */
-               float rgba2[4][4];
-               x0[j] /= 2;
-               y0[j] /= 2;
-               z0[j] /= 2;
-               x1[j] /= 2;
-               y1[j] /= 2;
-               z1[j] /= 2;
-               get_texel(tgsi_sampler, face, level1, x0[j], y0[j], z0[j], tx0, 0);
-               get_texel(tgsi_sampler, face, level1, x1[j], y0[j], z0[j], tx0, 1);
-               get_texel(tgsi_sampler, face, level1, x0[j], y1[j], z0[j], tx0, 2);
-               get_texel(tgsi_sampler, face, level1, x1[j], y1[j], z0[j], tx0, 3);
-               get_texel(tgsi_sampler, face, level1, x0[j], y0[j], z1[j], tx1, 0);
-               get_texel(tgsi_sampler, face, level1, x1[j], y0[j], z1[j], tx1, 1);
-               get_texel(tgsi_sampler, face, level1, x0[j], y1[j], z1[j], tx1, 2);
-               get_texel(tgsi_sampler, face, level1, x1[j], y1[j], z1[j], tx1, 3);
-
-               /* interpolate R, G, B, A */
-               for (c = 0; c < 4; c++) {
-                  rgba2[c][j] = lerp_3d(xw[j], yw[j], zw[j],
-                                        tx0[c][0], tx0[c][1],
-                                        tx0[c][2], tx0[c][3],
-                                        tx1[c][0], tx1[c][1],
-                                        tx1[c][2], tx1[c][3]);
-               }
-
-               /* blend mipmap levels */
-               for (c = 0; c < NUM_CHANNELS; c++) {
-                  rgba[c][j] = lerp(levelBlend, rgba[c][j], rgba2[c][j]);
-               }
-            }
-         }
-      }
-      break;
-   default:
-      assert(0);
-   }
-}
-
-
-static void
-lp_get_samples_cube(struct tgsi_sampler *sampler,
-                    const float s[QUAD_SIZE],
-                    const float t[QUAD_SIZE],
-                    const float p[QUAD_SIZE],
-                    float lodbias,
-                    float rgba[NUM_CHANNELS][QUAD_SIZE])
-{
-   unsigned faces[QUAD_SIZE], j;
-   float ssss[4], tttt[4];
-   for (j = 0; j < QUAD_SIZE; j++) {
-      faces[j] = choose_cube_face(s[j], t[j], p[j], ssss + j, tttt + j);
-   }
-   lp_get_samples_2d_common(sampler, ssss, tttt, NULL,
-                            lodbias, rgba, faces);
-}
-
-
-static void
-lp_get_samples_rect(struct tgsi_sampler *tgsi_sampler,
-                    const float s[QUAD_SIZE],
-                    const float t[QUAD_SIZE],
-                    const float p[QUAD_SIZE],
-                    float lodbias,
-                    float rgba[NUM_CHANNELS][QUAD_SIZE])
-{
-   const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
-   const struct pipe_texture *texture = samp->texture;
-   const struct pipe_sampler_state *sampler = samp->sampler;
-   const uint face = 0;
-   unsigned level0, level1, j, imgFilter;
-   int width, height;
-   float levelBlend;
-
-   choose_mipmap_levels(tgsi_sampler, s, t, p, 
-                        lodbias,
-                        &level0, &level1, &levelBlend, &imgFilter);
-
-   /* texture RECTS cannot be mipmapped */
-   assert(level0 == level1);
-
-   width = u_minify(texture->width0, level0);
-   height = u_minify(texture->height0, level0);
-
-   assert(width > 0);
-
-   switch (imgFilter) {
-   case PIPE_TEX_FILTER_NEAREST:
-      {
-         int x[4], y[4];
-         nearest_texcoord_unnorm_4(sampler->wrap_s, s, width, x);
-         nearest_texcoord_unnorm_4(sampler->wrap_t, t, height, y);
-         for (j = 0; j < QUAD_SIZE; j++) {
-            get_texel(tgsi_sampler, face, level0, x[j], y[j], 0, rgba, j);
-            if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
-               shadow_compare(sampler, rgba, p, j);
-            }
-         }
-      }
-      break;
-   case PIPE_TEX_FILTER_LINEAR:
-   case PIPE_TEX_FILTER_ANISO:
-      {
-         int x0[4], y0[4], x1[4], y1[4];
-         float xw[4], yw[4]; /* weights */
-         linear_texcoord_unnorm_4(sampler->wrap_s, s, width,  x0, x1, xw);
-         linear_texcoord_unnorm_4(sampler->wrap_t, t, height, y0, y1, yw);
-         for (j = 0; j < QUAD_SIZE; j++) {
-            float tx[4][4]; /* texels */
-            int c;
-            get_texel(tgsi_sampler, face, level0, x0[j], y0[j], 0, tx, 0);
-            get_texel(tgsi_sampler, face, level0, x1[j], y0[j], 0, tx, 1);
-            get_texel(tgsi_sampler, face, level0, x0[j], y1[j], 0, tx, 2);
-            get_texel(tgsi_sampler, face, level0, x1[j], y1[j], 0, tx, 3);
-            if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
-               shadow_compare4(sampler, tx, p);
-            }
-            for (c = 0; c < 4; c++) {
-               rgba[c][j] = lerp_2d(xw[j], yw[j],
-                                    tx[c][0], tx[c][1], tx[c][2], tx[c][3]);
-            }
-         }
-      }
-      break;
-   default:
-      assert(0);
-   }
-}
-
-
-/**
- * Error condition handler
- */
-static INLINE void
-lp_get_samples_null(struct tgsi_sampler *tgsi_sampler,
-                    const float s[QUAD_SIZE],
-                    const float t[QUAD_SIZE],
-                    const float p[QUAD_SIZE],
-                    float lodbias,
-                    float rgba[NUM_CHANNELS][QUAD_SIZE])
-{
-   int i,j;
-
-   for (i = 0; i < 4; i++)
-      for (j = 0; j < 4; j++)
-         rgba[i][j] = 1.0;
-}
-
-/**
- * Called via tgsi_sampler::get_samples() when using a sampler for the
- * first time.  Determine the actual sampler function, link it in and
- * call it.
- */
-void
-lp_get_samples(struct tgsi_sampler *tgsi_sampler,
-               const float s[QUAD_SIZE],
-               const float t[QUAD_SIZE],
-               const float p[QUAD_SIZE],
-               float lodbias,
-               float rgba[NUM_CHANNELS][QUAD_SIZE])
-{
-   struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
-   const struct pipe_texture *texture = samp->texture;
-   const struct pipe_sampler_state *sampler = samp->sampler;
-
-   /* Default to the 'undefined' case:
-    */
-   tgsi_sampler->get_samples = lp_get_samples_null;
-
-   if (!texture) {
-      assert(0);                /* is this legal?? */
-      goto out;
-   }
-
-   if (!sampler->normalized_coords) {
-      assert (texture->target == PIPE_TEXTURE_2D);
-      tgsi_sampler->get_samples = lp_get_samples_rect;
-      goto out;
-   }
-
-   switch (texture->target) {
-   case PIPE_TEXTURE_1D:
-      tgsi_sampler->get_samples = lp_get_samples_1d;
-      break;
-   case PIPE_TEXTURE_2D:
-      tgsi_sampler->get_samples = lp_get_samples_2d;
-      break;
-   case PIPE_TEXTURE_3D:
-      tgsi_sampler->get_samples = lp_get_samples_3d;
-      break;
-   case PIPE_TEXTURE_CUBE:
-      tgsi_sampler->get_samples = lp_get_samples_cube;
-      break;
-   default:
-      assert(0);
-      break;
-   }
-
-   /* Do this elsewhere: 
-    */
-   samp->xpot = util_unsigned_logbase2( samp->texture->width0 );
-   samp->ypot = util_unsigned_logbase2( samp->texture->height0 );
-
-   /* Try to hook in a faster sampler.  Ultimately we'll have to
-    * code-generate these.  Luckily most of this looks like it is
-    * orthogonal state within the sampler.
-    */
-   if (texture->target == PIPE_TEXTURE_2D &&
-       sampler->min_img_filter == sampler->mag_img_filter &&
-       sampler->wrap_s == sampler->wrap_t &&
-       sampler->compare_mode == FALSE &&
-       sampler->normalized_coords) 
-   {
-      if (sampler->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) {
-         samp->level = CLAMP((int) sampler->min_lod,
-                             0, (int) texture->last_level);
-
-         if (sampler->wrap_s == PIPE_TEX_WRAP_REPEAT) {
-            switch (sampler->min_img_filter) {
-            case PIPE_TEX_FILTER_NEAREST:
-               tgsi_sampler->get_samples = lp_get_samples_2d_nearest_repeat_POT;
-               break;
-            case PIPE_TEX_FILTER_LINEAR:
-               tgsi_sampler->get_samples = lp_get_samples_2d_linear_repeat_POT;
-               break;
-            default:
-               break;
-            }
-         } 
-         else if (sampler->wrap_s == PIPE_TEX_WRAP_CLAMP) {
-            switch (sampler->min_img_filter) {
-            case PIPE_TEX_FILTER_NEAREST:
-               tgsi_sampler->get_samples = lp_get_samples_2d_nearest_clamp_POT;
-               break;
-            default:
-               break;
-            }
-         }
-      }
-      else if (sampler->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
-         if (sampler->wrap_s == PIPE_TEX_WRAP_REPEAT) {
-            switch (sampler->min_img_filter) {
-            case PIPE_TEX_FILTER_LINEAR:
-               tgsi_sampler->get_samples = lp_get_samples_2d_linear_mip_linear_repeat_POT;
-               break;
-            default:
-               break;
-            }
-         } 
-      }
-   }
-   else if (0) {
-      _debug_printf("target %d/%d min_mip %d/%d min_img %d/%d wrap %d/%d compare %d/%d norm %d/%d\n",
-                    texture->target, PIPE_TEXTURE_2D,
-                    sampler->min_mip_filter, PIPE_TEX_MIPFILTER_NONE,
-                    sampler->min_img_filter, sampler->mag_img_filter,
-                    sampler->wrap_s, sampler->wrap_t,
-                    sampler->compare_mode, FALSE,
-                    sampler->normalized_coords, TRUE);
-   }
-
-out:
-   tgsi_sampler->get_samples( tgsi_sampler, s, t, p, lodbias, rgba );
-}
-
-
-void PIPE_CDECL
-lp_fetch_texel_soa( struct tgsi_sampler **samplers,
-                    uint32_t unit,
-                    float *store )
-{
-   struct tgsi_sampler *sampler = samplers[unit];
-
-#if 0
-   uint j;
-
-   debug_printf("%s sampler: %p (%p) store: %p\n",
-                __FUNCTION__,
-                sampler, *sampler,
-                store );
-
-   debug_printf("lodbias %f\n", store[12]);
-
-   for (j = 0; j < 4; j++)
-      debug_printf("sample %d texcoord %f %f\n",
-                   j,
-                   store[0+j],
-                   store[4+j]);
-#endif
-
-   {
-      float rgba[NUM_CHANNELS][QUAD_SIZE];
-      sampler->get_samples(sampler,
-                           &store[0],
-                           &store[4],
-                           &store[8],
-                           0.0f, /*store[12],  lodbias */
-                           rgba);
-      memcpy(store, rgba, sizeof rgba);
-   }
-
-#if 0
-   for (j = 0; j < 4; j++)
-      debug_printf("sample %d result %f %f %f %f\n",
-                   j,
-                   store[0+j],
-                   store[4+j],
-                   store[8+j],
-                   store[12+j]);
-#endif
-}
-
-
-#include "lp_bld_type.h"
-#include "lp_bld_intr.h"
-#include "lp_bld_tgsi.h"
-
-
-struct lp_c_sampler_soa
-{
-   struct lp_build_sampler_soa base;
-
-   LLVMValueRef context_ptr;
-
-   LLVMValueRef samplers_ptr;
-
-   /** Coords/texels store */
-   LLVMValueRef store_ptr;
-};
-
-
-static void
-lp_c_sampler_soa_destroy(struct lp_build_sampler_soa *sampler)
-{
-   FREE(sampler);
-}
-
-
-static void
-lp_c_sampler_soa_emit_fetch_texel(struct lp_build_sampler_soa *_sampler,
-                                  LLVMBuilderRef builder,
-                                  struct lp_type type,
-                                  unsigned unit,
-                                  unsigned num_coords,
-                                  const LLVMValueRef *coords,
-                                  LLVMValueRef lodbias,
-                                  LLVMValueRef *texel)
-{
-   struct lp_c_sampler_soa *sampler = (struct lp_c_sampler_soa *)_sampler;
-   LLVMTypeRef vec_type = LLVMTypeOf(coords[0]);
-   LLVMValueRef args[3];
-   unsigned i;
-
-   if(!sampler->samplers_ptr)
-      sampler->samplers_ptr = lp_jit_context_samplers(builder, sampler->context_ptr);
-
-   if(!sampler->store_ptr)
-      sampler->store_ptr = LLVMBuildArrayAlloca(builder,
-                                            vec_type,
-                                            LLVMConstInt(LLVMInt32Type(), 4, 0),
-                                            "texel_store");
-
-   for (i = 0; i < num_coords; i++) {
-      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
-      LLVMValueRef coord_ptr = LLVMBuildGEP(builder, sampler->store_ptr, &index, 1, "");
-      LLVMBuildStore(builder, coords[i], coord_ptr);
-   }
-
-   args[0] = sampler->samplers_ptr;
-   args[1] = LLVMConstInt(LLVMInt32Type(), unit, 0);
-   args[2] = sampler->store_ptr;
-
-   lp_build_intrinsic(builder, "fetch_texel", LLVMVoidType(), args, 3);
-
-   for (i = 0; i < NUM_CHANNELS; ++i) {
-      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
-      LLVMValueRef texel_ptr = LLVMBuildGEP(builder, sampler->store_ptr, &index, 1, "");
-      texel[i] = LLVMBuildLoad(builder, texel_ptr, "");
-   }
-}
-
-
-struct lp_build_sampler_soa *
-lp_c_sampler_soa_create(LLVMValueRef context_ptr)
-{
-   struct lp_c_sampler_soa *sampler;
-
-   sampler = CALLOC_STRUCT(lp_c_sampler_soa);
-   if(!sampler)
-      return NULL;
-
-   sampler->base.destroy = lp_c_sampler_soa_destroy;
-   sampler->base.emit_fetch_texel = lp_c_sampler_soa_emit_fetch_texel;
-   sampler->context_ptr = context_ptr;
-
-   return &sampler->base;
-}
-
diff --git a/src/gallium/drivers/nouveau/nouveau_push.h b/src/gallium/drivers/nouveau/nouveau_push.h
deleted file mode 100644
index 9c235080a5..0000000000
--- a/src/gallium/drivers/nouveau/nouveau_push.h
+++ /dev/null
@@ -1,93 +0,0 @@
-#ifndef __NOUVEAU_PUSH_H__
-#define __NOUVEAU_PUSH_H__
-
-#include "nouveau/nouveau_winsys.h"
-
-#ifndef NOUVEAU_PUSH_CONTEXT
-#error undefined push context
-#endif
-
-#define OUT_RING(data) do {                                                    \
-	NOUVEAU_PUSH_CONTEXT(pc);                                              \
-	(*pc->base.channel->pushbuf->cur++) = (data);                          \
-} while(0)
-
-#define OUT_RINGp(src,size) do {                                               \
-	NOUVEAU_PUSH_CONTEXT(pc);                                              \
-	memcpy(pc->base.channel->pushbuf->cur, (src), (size) * 4);             \
-	pc->base.channel->pushbuf->cur += (size);                              \
-} while(0)
-
-#define OUT_RINGf(data) do {                                                   \
-	union { float v; uint32_t u; } c;                                      \
-	c.v = (data);                                                          \
-	OUT_RING(c.u);                                                         \
-} while(0)
-
-#define BEGIN_RING(obj,mthd,size) do {                                         \
-	NOUVEAU_PUSH_CONTEXT(pc);                                              \
-	struct nouveau_channel *chan = pc->base.channel;                       \
-	if (chan->pushbuf->remaining < ((size) + 1))                           \
-		nouveau_pushbuf_flush(chan, ((size) + 1));                     \
-	OUT_RING((pc->obj->subc << 13) | ((size) << 18) | (mthd));             \
-	chan->pushbuf->remaining -= ((size) + 1);                              \
-} while(0)
-
-#define BEGIN_RING_NI(obj,mthd,size) do {                                      \
-	BEGIN_RING(obj, (mthd) | 0x40000000, (size));                          \
-} while(0)
-
-static inline void
-DO_FIRE_RING(struct nouveau_channel *chan, struct pipe_fence_handle **fence)
-{
-	nouveau_pushbuf_flush(chan, 0);
-	if (fence)
-		*fence = NULL;
-}
-
-#define FIRE_RING(fence) do {                                                  \
-	NOUVEAU_PUSH_CONTEXT(pc);                                              \
-	DO_FIRE_RING(pc->base.channel, fence);                                 \
-} while(0)
-
-#define OUT_RELOC(bo,data,flags,vor,tor) do {                                  \
-	NOUVEAU_PUSH_CONTEXT(pc);                                              \
-	struct nouveau_channel *chan = pc->base.channel;                       \
-	nouveau_pushbuf_emit_reloc(chan, chan->pushbuf->cur++, nouveau_bo(bo), \
-				   (data), 0, (flags), (vor), (tor));          \
-} while(0)
-
-/* Raw data + flags depending on FB/TT buffer */
-#define OUT_RELOCd(bo,data,flags,vor,tor) do {                                 \
-	OUT_RELOC((bo), (data), (flags) | NOUVEAU_BO_OR, (vor), (tor));        \
-} while(0)
-
-/* FB/TT object handle */
-#define OUT_RELOCo(bo,flags) do {                                              \
-	OUT_RELOC((bo), 0, (flags) | NOUVEAU_BO_OR,                            \
-		  pc->base.channel->vram->handle,                              \
-		  pc->base.channel->gart->handle);                             \
-} while(0)
-
-/* Low 32-bits of offset */
-#define OUT_RELOCl(bo,delta,flags) do {                                        \
-	OUT_RELOC((bo), (delta), (flags) | NOUVEAU_BO_LOW, 0, 0);              \
-} while(0)
-
-/* High 32-bits of offset */
-#define OUT_RELOCh(bo,delta,flags) do {                                        \
-	OUT_RELOC((bo), (delta), (flags) | NOUVEAU_BO_HIGH, 0, 0);             \
-} while(0)
-
-/* A reloc which'll recombine into a NV_DMA_METHOD packet header */
-#define OUT_RELOCm(bo, flags, obj, mthd, size) do {                            \
-	NOUVEAU_PUSH_CONTEXT(pc);                                              \
-	struct nouveau_channel *chan = pc->base.channel;                       \
-	if (chan->pushbuf->remaining < ((size) + 1))                           \
-		nouveau_pushbuf_flush(chan, ((size) + 1));                     \
-	OUT_RELOCd((bo), (pc->obj->subc << 13) | ((size) << 18) | (mthd),      \
-		   (flags), 0, 0);                                             \
-	chan->pushbuf->remaining -= ((size) + 1);                              \
-} while(0)
-
-#endif
diff --git a/src/gallium/drivers/nouveau/nouveau_screen.c b/src/gallium/drivers/nouveau/nouveau_screen.c
index 0437af3725..7ebc94ed6c 100644
--- a/src/gallium/drivers/nouveau/nouveau_screen.c
+++ b/src/gallium/drivers/nouveau/nouveau_screen.c
@@ -127,8 +127,18 @@ nouveau_screen_bo_map(struct pipe_screen *pscreen, struct pipe_buffer *pb,
 		      unsigned usage)
 {
 	struct nouveau_bo *bo = nouveau_bo(pb);
+	struct nouveau_screen *nscreen = nouveau_screen(pscreen);
 	int ret;
 
+	if (nscreen->pre_pipebuffer_map_callback) {
+		ret = nscreen->pre_pipebuffer_map_callback(pscreen, pb, usage);
+		if (ret) {
+			debug_printf("pre_pipebuffer_map_callback failed %d\n",
+				ret);
+			return NULL;
+		}
+	}
+
 	ret = nouveau_bo_map(bo, nouveau_screen_map_flags(usage));
 	if (ret) {
 		debug_printf("map failed: %d\n", ret);
@@ -143,11 +153,22 @@ nouveau_screen_bo_map_range(struct pipe_screen *pscreen, struct pipe_buffer *pb,
 			    unsigned offset, unsigned length, unsigned usage)
 {
 	struct nouveau_bo *bo = nouveau_bo(pb);
+	struct nouveau_screen *nscreen = nouveau_screen(pscreen);
 	uint32_t flags = nouveau_screen_map_flags(usage);
 	int ret;
 
+	if (nscreen->pre_pipebuffer_map_callback) {
+		ret = nscreen->pre_pipebuffer_map_callback(pscreen, pb, usage);
+		if (ret) {
+			debug_printf("pre_pipebuffer_map_callback failed %d\n",
+				ret);
+			return NULL;
+		}
+	}
+
 	ret = nouveau_bo_map_range(bo, offset, length, flags);
 	if (ret) {
+		nouveau_bo_unmap(bo);
 		if (!(flags & NOUVEAU_BO_NOWAIT) || ret != -EBUSY)
 			debug_printf("map_range failed: %d\n", ret);
 		return NULL;
diff --git a/src/gallium/drivers/nouveau/nouveau_screen.h b/src/gallium/drivers/nouveau/nouveau_screen.h
index ebfc67ad1c..a7927d88df 100644
--- a/src/gallium/drivers/nouveau/nouveau_screen.h
+++ b/src/gallium/drivers/nouveau/nouveau_screen.h
@@ -5,6 +5,9 @@ struct nouveau_screen {
 	struct pipe_screen base;
 	struct nouveau_device *device;
 	struct nouveau_channel *channel;
+
+	int (*pre_pipebuffer_map_callback) (struct pipe_screen *pscreen,
+		struct pipe_buffer *pb, unsigned usage);
 };
 
 static inline struct nouveau_screen *
diff --git a/src/gallium/drivers/nouveau/nouveau_stateobj.h b/src/gallium/drivers/nouveau/nouveau_stateobj.h
index 9aee9e4956..e844f6abb3 100644
--- a/src/gallium/drivers/nouveau/nouveau_stateobj.h
+++ b/src/gallium/drivers/nouveau/nouveau_stateobj.h
@@ -3,41 +3,95 @@
 
 #include "util/u_debug.h"
 
+#ifdef DEBUG
+#define DEBUG_NOUVEAU_STATEOBJ
+#endif /* DEBUG */
+
 struct nouveau_stateobj_reloc {
 	struct nouveau_bo *bo;
 
-	unsigned offset;
-	unsigned packet;
+	struct nouveau_grobj *gr;
+	uint32_t push_offset;
+	uint32_t mthd;
 
-	unsigned data;
+	uint32_t data;
 	unsigned flags;
 	unsigned vor;
 	unsigned tor;
 };
 
+struct nouveau_stateobj_start {
+	struct nouveau_grobj *gr;
+	uint32_t mthd;
+	uint32_t size;
+	unsigned offset;
+};
+
 struct nouveau_stateobj {
 	struct pipe_reference reference;
 
-	unsigned *push;
+	struct nouveau_stateobj_start *start;
 	struct nouveau_stateobj_reloc *reloc;
 
-	unsigned *cur;
-	unsigned cur_packet;
+	/* Common memory pool for data. */
+	uint32_t *pool;
+	unsigned pool_cur;
+
+#ifdef DEBUG_NOUVEAU_STATEOBJ
+	unsigned start_alloc;
+	unsigned reloc_alloc;
+	unsigned pool_alloc;
+#endif  /* DEBUG_NOUVEAU_STATEOBJ */
+
+	unsigned total; /* includes begin_ring */
+	unsigned cur; /* excludes begin_ring, offset from "cur_start" */
+	unsigned cur_start;
 	unsigned cur_reloc;
 };
 
+static INLINE void
+so_dump(struct nouveau_stateobj *so)
+{
+	unsigned i, nr, total = 0;
+
+	for (i = 0; i < so->cur_start; i++) {
+		if (so->start[i].gr->subc > -1)
+			debug_printf("+0x%04x: 0x%08x\n", total++,
+				(so->start[i].size << 18) | (so->start[i].gr->subc << 13)
+				| so->start[i].mthd);
+		else
+			debug_printf("+0x%04x: 0x%08x\n", total++,
+				(so->start[i].size << 18) | so->start[i].mthd);
+		for (nr = 0; nr < so->start[i].size; nr++, total++)
+			debug_printf("+0x%04x: 0x%08x\n", total,
+				so->pool[so->start[i].offset + nr]);
+	}
+}
+
 static INLINE struct nouveau_stateobj *
-so_new(unsigned push, unsigned reloc)
+so_new(unsigned start, unsigned push, unsigned reloc)
 {
 	struct nouveau_stateobj *so;
 
 	so = MALLOC(sizeof(struct nouveau_stateobj));
 	pipe_reference_init(&so->reference, 1);
-	so->push = MALLOC(sizeof(unsigned) * push);
-	so->reloc = MALLOC(sizeof(struct nouveau_stateobj_reloc) * reloc);
+	so->total = so->cur = so->cur_start = so->cur_reloc = 0;
 
-	so->cur = so->push;
-	so->cur_reloc = so->cur_packet = 0;
+#ifdef DEBUG_NOUVEAU_STATEOBJ
+	so->start_alloc = start;
+	so->reloc_alloc = reloc;
+	so->pool_alloc = push;
+#endif /* DEBUG_NOUVEAU_STATEOBJ */
+
+	so->start = MALLOC(start * sizeof(struct nouveau_stateobj_start));
+	so->reloc = MALLOC(reloc * sizeof(struct nouveau_stateobj_reloc));
+	so->pool = MALLOC(push * sizeof(uint32_t));
+	so->pool_cur = 0;
+
+	if (!so->start || !so->reloc || !so->pool) {
+		debug_printf("malloc failed\n");
+		assert(0);
+	}
 
 	return so;
 }
@@ -48,63 +102,128 @@ so_ref(struct nouveau_stateobj *ref, struct nouveau_stateobj **pso)
 	struct nouveau_stateobj *so = *pso;
 	int i;
 
-        if (pipe_reference(&(*pso)->reference, &ref->reference)) {
-		free(so->push);
+	if (pipe_reference(&(*pso)->reference, &ref->reference)) {
+		FREE(so->start);
 		for (i = 0; i < so->cur_reloc; i++)
 			nouveau_bo_ref(NULL, &so->reloc[i].bo);
-		free(so->reloc);
-		free(so);
+		FREE(so->reloc);
+		FREE(so->pool);
+		FREE(so);
 	}
 	*pso = ref;
 }
 
 static INLINE void
-so_data(struct nouveau_stateobj *so, unsigned data)
+so_data(struct nouveau_stateobj *so, uint32_t data)
 {
-	(*so->cur++) = (data);
-	so->cur_packet += 4;
+#ifdef DEBUG_NOUVEAU_STATEOBJ
+	if (so->cur >= so->start[so->cur_start - 1].size) {
+		debug_printf("exceeding specified size\n");
+		assert(0);
+	}
+#endif /* DEBUG_NOUVEAU_STATEOBJ */
+
+	so->pool[so->start[so->cur_start - 1].offset + so->cur++] = data;
 }
 
 static INLINE void
-so_datap(struct nouveau_stateobj *so, unsigned *data, unsigned size)
+so_datap(struct nouveau_stateobj *so, uint32_t *data, unsigned size)
 {
-	so->cur_packet += (4 * size);
+#ifdef DEBUG_NOUVEAU_STATEOBJ
+	if ((so->cur + size) > so->start[so->cur_start - 1].size) {
+		debug_printf("exceeding specified size\n");
+		assert(0);
+	}
+#endif /* DEBUG_NOUVEAU_STATEOBJ */
+
 	while (size--)
-		(*so->cur++) = (*data++);
+		so->pool[so->start[so->cur_start - 1].offset + so->cur++] =
+			*data++;
 }
 
 static INLINE void
 so_method(struct nouveau_stateobj *so, struct nouveau_grobj *gr,
 	  unsigned mthd, unsigned size)
 {
-	so->cur_packet = (gr->subc << 13) | (1 << 18) | (mthd - 4);
-	so_data(so, (gr->subc << 13) | (size << 18) | mthd);
+	struct nouveau_stateobj_start *start;
+
+#ifdef DEBUG_NOUVEAU_STATEOBJ
+	if (so->start_alloc <= so->cur_start) {
+		debug_printf("exceeding num_start size\n");
+		assert(0);
+	} else
+#endif /* DEBUG_NOUVEAU_STATEOBJ */
+		start = so->start;
+
+#ifdef DEBUG_NOUVEAU_STATEOBJ
+	if (so->cur_start > 0 && start[so->cur_start - 1].size > so->cur) {
+		debug_printf("previous so_method was not filled\n");
+		assert(0);
+	}
+#endif /* DEBUG_NOUVEAU_STATEOBJ */
+
+	so->start = start;
+	start[so->cur_start].gr = gr;
+	start[so->cur_start].mthd = mthd;
+	start[so->cur_start].size = size;
+
+#ifdef DEBUG_NOUVEAU_STATEOBJ
+	if (so->pool_alloc < (size + so->pool_cur)) {
+		debug_printf("exceeding num_pool size\n");
+		assert(0);
+	}
+#endif /* DEBUG_NOUVEAU_STATEOBJ */
+
+	start[so->cur_start].offset = so->pool_cur;
+	so->pool_cur += size;
+
+	so->cur_start++;
+	/* The 1 is for *this* begin_ring. */
+	so->total += so->cur + 1;
+	so->cur = 0;
 }
 
 static INLINE void
 so_reloc(struct nouveau_stateobj *so, struct nouveau_bo *bo,
 	 unsigned data, unsigned flags, unsigned vor, unsigned tor)
 {
-	struct nouveau_stateobj_reloc *r = &so->reloc[so->cur_reloc++];
-	
-	r->bo = NULL;
-	nouveau_bo_ref(bo, &r->bo);
-	r->offset = so->cur - so->push;
-	r->packet = so->cur_packet;
-	r->data = data;
-	r->flags = flags;
-	r->vor = vor;
-	r->tor = tor;
+	struct nouveau_stateobj_reloc *r;
+
+#ifdef DEBUG_NOUVEAU_STATEOBJ
+	if (so->reloc_alloc <= so->cur_reloc) {
+		debug_printf("exceeding num_reloc size\n");
+		assert(0);
+	} else
+#endif /* DEBUG_NOUVEAU_STATEOBJ */
+		r = so->reloc;
+
+	so->reloc = r;
+	r[so->cur_reloc].bo = NULL;
+	nouveau_bo_ref(bo, &(r[so->cur_reloc].bo));
+	r[so->cur_reloc].gr = so->start[so->cur_start-1].gr;
+	r[so->cur_reloc].push_offset = so->total + so->cur;
+	r[so->cur_reloc].data = data;
+	r[so->cur_reloc].flags = flags;
+	r[so->cur_reloc].mthd = so->start[so->cur_start-1].mthd +
+							(so->cur << 2);
+	r[so->cur_reloc].vor = vor;
+	r[so->cur_reloc].tor = tor;
+
 	so_data(so, data);
+	so->cur_reloc++;
 }
 
-static INLINE void
-so_dump(struct nouveau_stateobj *so)
+/* Determine if this buffer object is referenced by this state object. */
+static INLINE boolean
+so_bo_is_reloc(struct nouveau_stateobj *so, struct nouveau_bo *bo)
 {
-	unsigned i, nr = so->cur - so->push;
+	int i;
+
+	for (i = 0; i < so->cur_reloc; i++)
+		if (so->reloc[i].bo == bo)
+			return true;
 
-	for (i = 0; i < nr; i++)
-		debug_printf("+0x%04x: 0x%08x\n", i, so->push[i]);
+	return false;
 }
 
 static INLINE void
@@ -114,75 +233,93 @@ so_emit(struct nouveau_channel *chan, struct nouveau_stateobj *so)
 	unsigned nr, i;
 	int ret = 0;
 
-	nr = so->cur - so->push;
+#ifdef DEBUG_NOUVEAU_STATEOBJ
+	if (so->start[so->cur_start - 1].size > so->cur) {
+		debug_printf("emit: previous so_method was not filled\n");
+		assert(0);
+	}
+#endif /* DEBUG_NOUVEAU_STATEOBJ */
+
+	/* We cannot update total in case we so_emit again. */
+	nr = so->total + so->cur;
+
 	/* This will flush if we need space.
 	 * We don't actually need the marker.
 	 */
 	if ((ret = nouveau_pushbuf_marker_emit(chan, nr, so->cur_reloc))) {
 		debug_printf("so_emit failed marker emit with error %d\n", ret);
-		return;
+		assert(0);
+	}
+
+	/* Submit data. This will ensure proper binding of objects. */
+	for (i = 0; i < so->cur_start; i++) {
+		BEGIN_RING(chan, so->start[i].gr, so->start[i].mthd, so->start[i].size);
+		OUT_RINGp(chan, &(so->pool[so->start[i].offset]), so->start[i].size);
 	}
-	pb->remaining -= nr;
 
-	memcpy(pb->cur, so->push, nr * 4);
 	for (i = 0; i < so->cur_reloc; i++) {
 		struct nouveau_stateobj_reloc *r = &so->reloc[i];
 
-		if ((ret = nouveau_pushbuf_emit_reloc(chan, pb->cur + r->offset,
-					   r->bo, r->data, 0, r->flags,
-					   r->vor, r->tor))) {
+		if ((ret = nouveau_pushbuf_emit_reloc(chan, pb->cur - nr +
+						r->push_offset, r->bo, r->data,
+						0, r->flags, r->vor, r->tor))) {
 			debug_printf("so_emit failed reloc with error %d\n", ret);
-			goto out;
+			assert(0);
 		}
 	}
-out:
-	pb->cur += nr;
 }
 
 static INLINE void
 so_emit_reloc_markers(struct nouveau_channel *chan, struct nouveau_stateobj *so)
 {
 	struct nouveau_pushbuf *pb = chan->pushbuf;
+	struct nouveau_grobj *gr = NULL;
 	unsigned i;
 	int ret = 0;
 
 	if (!so)
 		return;
 
-	i = so->cur_reloc << 1;
-	/* This will flush if we need space.
-	 * We don't actually need the marker.
-	 */
-	if ((ret = nouveau_pushbuf_marker_emit(chan, i, i))) {
-		debug_printf("so_emit_reloc_markers failed marker emit with" \
-			"error %d\n", ret);
-		return;
-	}
-	pb->remaining -= i;
-
+	/* If we need to flush in flush notify, then we have a problem anyway. */
 	for (i = 0; i < so->cur_reloc; i++) {
 		struct nouveau_stateobj_reloc *r = &so->reloc[i];
 
-		if ((ret = nouveau_pushbuf_emit_reloc(chan, pb->cur++, r->bo,
-					   r->packet, 0,
-					   (r->flags & (NOUVEAU_BO_VRAM |
-							NOUVEAU_BO_GART |
-							NOUVEAU_BO_RDWR)) |
-					   NOUVEAU_BO_DUMMY, 0, 0))) {
-			debug_printf("so_emit_reloc_markers failed reloc" \
-						"with error %d\n", ret);
-			pb->remaining += ((so->cur_reloc - i) << 1);
-			return;
+#ifdef DEBUG_NOUVEAU_STATEOBJ
+		if (r->mthd & 0x40000000) {
+			debug_printf("error: NI mthd 0x%08X\n", r->mthd);
+			continue;
 		}
-		if ((ret = nouveau_pushbuf_emit_reloc(chan, pb->cur++, r->bo,
-					   r->data, 0,
-					   r->flags | NOUVEAU_BO_DUMMY,
-					   r->vor, r->tor))) {
-			debug_printf("so_emit_reloc_markers failed reloc" \
-						"with error %d\n", ret);
-			pb->remaining += ((so->cur_reloc - i) << 1) - 1;
-			return;
+#endif /* DEBUG_NOUVEAU_STATEOBJ */
+
+		/* The object needs to be bound and the system must know the
+		 * subchannel is being used. Otherwise it will discard it.
+		 */
+		if (gr != r->gr) {
+			BEGIN_RING(chan, r->gr, 0x100, 1);
+			OUT_RING(chan, 0);
+			gr = r->gr;
+		}
+
+		/* Some relocs really don't like to be hammered,
+		 * NOUVEAU_BO_DUMMY makes sure it only
+		 * happens when needed.
+		 */
+		ret = OUT_RELOC(chan, r->bo, (r->gr->subc << 13) | (1<< 18) |
+			r->mthd, (r->flags & (NOUVEAU_BO_VRAM | NOUVEAU_BO_GART
+				| NOUVEAU_BO_RDWR)) | NOUVEAU_BO_DUMMY, 0, 0);
+		if (ret) {
+			debug_printf("OUT_RELOC failed %d\n", ret);
+			assert(0);
 		}
+
+		ret = OUT_RELOC(chan, r->bo, r->data, r->flags |
+			NOUVEAU_BO_DUMMY, r->vor, r->tor);
+		if (ret) {
+			debug_printf("OUT_RELOC failed %d\n", ret);
+			assert(0);
+		}
+
+		pb->remaining -= 2;
 	}
 }
 
diff --git a/src/gallium/drivers/nv04/nv04_context.c b/src/gallium/drivers/nv04/nv04_context.c
index 770733a4a1..edd96859cf 100644
--- a/src/gallium/drivers/nv04/nv04_context.c
+++ b/src/gallium/drivers/nv04/nv04_context.c
@@ -10,10 +10,14 @@ nv04_flush(struct pipe_context *pipe, unsigned flags,
 	   struct pipe_fence_handle **fence)
 {
 	struct nv04_context *nv04 = nv04_context(pipe);
+	struct nv04_screen *screen = nv04->screen;
+	struct nouveau_channel *chan = screen->base.channel;
 
 	draw_flush(nv04->draw);
 
-	FIRE_RING(fence);
+	FIRE_RING(chan);
+	if (fence)
+		*fence = NULL;
 }
 
 static void
@@ -30,32 +34,36 @@ nv04_destroy(struct pipe_context *pipe)
 static boolean
 nv04_init_hwctx(struct nv04_context *nv04)
 {
+	struct nv04_screen *screen = nv04->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *fahrenheit = screen->fahrenheit;
+
 	// requires a valid handle
-//	BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_NOTIFY, 1);
+//	BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_NOTIFY, 1);
 //	OUT_RING(0);
-	BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_NOP, 1);
-	OUT_RING(0);
+	BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_NOP, 1);
+	OUT_RING(chan, 0);
 
-	BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_CONTROL, 1);
-	OUT_RING(0x40182800);
+	BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_CONTROL, 1);
+	OUT_RING(chan, 0x40182800);
 //	OUT_RING(1<<20/*no cull*/);
-	BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_BLEND, 1);
+	BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_BLEND, 1);
 //	OUT_RING(0x24|(1<<6)|(1<<8));
-	OUT_RING(0x120001a4);
-	BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_FORMAT, 1);
-	OUT_RING(0x332213a1);
-	BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_FILTER, 1);
-	OUT_RING(0x11001010);
-	BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_COLORKEY, 1);
-	OUT_RING(0x0);
-//	BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_OFFSET, 1);
+	OUT_RING(chan, 0x120001a4);
+	BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_FORMAT, 1);
+	OUT_RING(chan, 0x332213a1);
+	BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_FILTER, 1);
+	OUT_RING(chan, 0x11001010);
+	BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_COLORKEY, 1);
+	OUT_RING(chan, 0x0);
+//	BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_OFFSET, 1);
 //	OUT_RING(SCREEN_OFFSET);
-	BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_FOGCOLOR, 1);
-	OUT_RING(0xff000000);
+	BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_FOGCOLOR, 1);
+	OUT_RING(chan, 0xff000000);
 
 
 
-	FIRE_RING (NULL);
+	FIRE_RING (chan);
 	return TRUE;
 }
 
diff --git a/src/gallium/drivers/nv04/nv04_context.h b/src/gallium/drivers/nv04/nv04_context.h
index 55326c787a..fe3b527423 100644
--- a/src/gallium/drivers/nv04/nv04_context.h
+++ b/src/gallium/drivers/nv04/nv04_context.h
@@ -15,10 +15,6 @@
 #include "nouveau/nouveau_gldefs.h"
 #include "nouveau/nouveau_context.h"
 
-#define NOUVEAU_PUSH_CONTEXT(ctx)                                              \
-	struct nv04_screen *ctx = nv04->screen
-#include "nouveau/nouveau_push.h"
-
 #include "nv04_state.h"
 
 #define NOUVEAU_ERR(fmt, args...) \
@@ -141,9 +137,9 @@ extern void nv04_emit_hw_state(struct nv04_context *nv04);
 extern void nv04_state_tex_update(struct nv04_context *nv04);
 
 /* nv04_vbo.c */
-extern boolean nv04_draw_arrays(struct pipe_context *, unsigned mode,
+extern void nv04_draw_arrays(struct pipe_context *, unsigned mode,
 				unsigned start, unsigned count);
-extern boolean nv04_draw_elements( struct pipe_context *pipe,
+extern void nv04_draw_elements( struct pipe_context *pipe,
                     struct pipe_buffer *indexBuffer,
                     unsigned indexSize,
                     unsigned prim, unsigned start, unsigned count);
diff --git a/src/gallium/drivers/nv04/nv04_prim_vbuf.c b/src/gallium/drivers/nv04/nv04_prim_vbuf.c
index 25395edfd7..0b795ea243 100644
--- a/src/gallium/drivers/nv04/nv04_prim_vbuf.c
+++ b/src/gallium/drivers/nv04/nv04_prim_vbuf.c
@@ -93,33 +93,45 @@ nv04_vbuf_render_set_primitive( struct vbuf_render *render,
 
 static INLINE void nv04_2triangles(struct nv04_context* nv04, unsigned char* buffer, ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5)
 {
-	BEGIN_RING(fahrenheit,NV04_TEXTURED_TRIANGLE_TLVERTEX_SX(0xA),49);
-	OUT_RINGp(buffer + VERTEX_SIZE * v0,8);
-	OUT_RINGp(buffer + VERTEX_SIZE * v1,8);
-	OUT_RINGp(buffer + VERTEX_SIZE * v2,8);
-	OUT_RINGp(buffer + VERTEX_SIZE * v3,8);
-	OUT_RINGp(buffer + VERTEX_SIZE * v4,8);
-	OUT_RINGp(buffer + VERTEX_SIZE * v5,8);
-	OUT_RING(0xFEDCBA);
+	struct nv04_screen *screen = nv04->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *fahrenheit = screen->fahrenheit;
+
+	BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_TLVERTEX_SX(0xA), 49);
+	OUT_RINGp(chan, buffer + VERTEX_SIZE * v0,8);
+	OUT_RINGp(chan, buffer + VERTEX_SIZE * v1,8);
+	OUT_RINGp(chan, buffer + VERTEX_SIZE * v2,8);
+	OUT_RINGp(chan, buffer + VERTEX_SIZE * v3,8);
+	OUT_RINGp(chan, buffer + VERTEX_SIZE * v4,8);
+	OUT_RINGp(chan, buffer + VERTEX_SIZE * v5,8);
+	OUT_RING(chan, 0xFEDCBA);
 }
 
 static INLINE void nv04_1triangle(struct nv04_context* nv04, unsigned char* buffer, ushort v0, ushort v1, ushort v2)
 {
-	BEGIN_RING(fahrenheit,NV04_TEXTURED_TRIANGLE_TLVERTEX_SX(0xD),25);
-	OUT_RINGp(buffer + VERTEX_SIZE * v0,8);
-	OUT_RINGp(buffer + VERTEX_SIZE * v1,8);
-	OUT_RINGp(buffer + VERTEX_SIZE * v2,8);
-	OUT_RING(0xFED);
+	struct nv04_screen *screen = nv04->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *fahrenheit = screen->fahrenheit;
+
+	BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_TLVERTEX_SX(0xD), 25);
+	OUT_RINGp(chan, buffer + VERTEX_SIZE * v0,8);
+	OUT_RINGp(chan, buffer + VERTEX_SIZE * v1,8);
+	OUT_RINGp(chan, buffer + VERTEX_SIZE * v2,8);
+	OUT_RING(chan, 0xFED);
 }
 
 static INLINE void nv04_1quad(struct nv04_context* nv04, unsigned char* buffer, ushort v0, ushort v1, ushort v2, ushort v3)
 {
-	BEGIN_RING(fahrenheit,NV04_TEXTURED_TRIANGLE_TLVERTEX_SX(0xC),33);
-	OUT_RINGp(buffer + VERTEX_SIZE * v0,8);
-	OUT_RINGp(buffer + VERTEX_SIZE * v1,8);
-	OUT_RINGp(buffer + VERTEX_SIZE * v2,8);
-	OUT_RINGp(buffer + VERTEX_SIZE * v3,8);
-	OUT_RING(0xFECEDC);
+	struct nv04_screen *screen = nv04->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *fahrenheit = screen->fahrenheit;
+
+	BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_TLVERTEX_SX(0xC), 33);
+	OUT_RINGp(chan, buffer + VERTEX_SIZE * v0,8);
+	OUT_RINGp(chan, buffer + VERTEX_SIZE * v1,8);
+	OUT_RINGp(chan, buffer + VERTEX_SIZE * v2,8);
+	OUT_RINGp(chan, buffer + VERTEX_SIZE * v3,8);
+	OUT_RING(chan, 0xFECEDC);
 }
 
 static void nv04_vbuf_render_triangles_elts(struct nv04_vbuf_render * render, const ushort * indices, uint nr_indices)
@@ -156,7 +168,10 @@ static void nv04_vbuf_render_tri_strip_elts(struct nv04_vbuf_render* render, con
 {
 	const uint32_t striptbl[]={0x321210,0x543432,0x765654,0x987876,0xBA9A98,0xDCBCBA,0xFEDEDC};
 	unsigned char* buffer = render->buffer;
-	struct nv04_context* nv04 = render->nv04;
+	struct nv04_context *nv04 = render->nv04;
+	struct nv04_screen *screen = nv04->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *fahrenheit = screen->fahrenheit;
 	int i,j;
 
 	for(i = 0; i<nr_indices; i+=14) 
@@ -166,15 +181,15 @@ static void nv04_vbuf_render_tri_strip_elts(struct nv04_vbuf_render* render, con
 		if (numvert<3)
 			break;
 
-		BEGIN_RING( fahrenheit, NV04_TEXTURED_TRIANGLE_TLVERTEX_SX(0x0), numvert*8 );
+		BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_TLVERTEX_SX(0x0), numvert*8);
 		for(j = 0; j<numvert; j++)
-			OUT_RINGp( buffer + VERTEX_SIZE * indices [i+j], 8 );
+			OUT_RINGp(chan, buffer + VERTEX_SIZE * indices [i+j], 8 );
 
-		BEGIN_RING_NI( fahrenheit, NV04_TEXTURED_TRIANGLE_DRAWPRIMITIVE(0), (numtri+1)/2 );
+		BEGIN_RING_NI(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_DRAWPRIMITIVE(0), (numtri+1)/2 );
 		for(j = 0; j<numtri/2; j++ )
-			OUT_RING(striptbl[j]);
+			OUT_RING(chan, striptbl[j]);
 		if (numtri%2)
-			OUT_RING(striptbl[numtri/2]&0xFFF);
+			OUT_RING(chan, striptbl[numtri/2]&0xFFF);
 	}
 }
 
@@ -182,11 +197,14 @@ static void nv04_vbuf_render_tri_fan_elts(struct nv04_vbuf_render* render, const
 {
 	const uint32_t fantbl[]={0x320210,0x540430,0x760650,0x980870,0xBA0A90,0xDC0CB0,0xFE0ED0};
 	unsigned char* buffer = render->buffer;
-	struct nv04_context* nv04 = render->nv04;
+	struct nv04_context *nv04 = render->nv04;
+	struct nv04_screen *screen = nv04->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *fahrenheit = screen->fahrenheit;
 	int i,j;
 
-	BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_TLVERTEX_SX(0x0), 8);
-	OUT_RINGp(buffer + VERTEX_SIZE * indices[0], 8);
+	BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_TLVERTEX_SX(0x0), 8);
+	OUT_RINGp(chan, buffer + VERTEX_SIZE * indices[0], 8);
 
 	for(i = 1; i<nr_indices; i+=14)
 	{
@@ -195,16 +213,16 @@ static void nv04_vbuf_render_tri_fan_elts(struct nv04_vbuf_render* render, const
 		if (numvert < 3)
 			break;
 
-		BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_TLVERTEX_SX(0x1), numvert*8);
+		BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_TLVERTEX_SX(0x1), numvert*8);
 
 		for(j=0;j<numvert;j++)
-			OUT_RINGp( buffer + VERTEX_SIZE * indices[ i+j ], 8 );
+			OUT_RINGp(chan, buffer + VERTEX_SIZE * indices[ i+j ], 8 );
 
-		BEGIN_RING_NI(fahrenheit, NV04_TEXTURED_TRIANGLE_DRAWPRIMITIVE(0), (numtri+1)/2);
+		BEGIN_RING_NI(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_DRAWPRIMITIVE(0), (numtri+1)/2);
 		for(j = 0; j<numtri/2; j++)
-			OUT_RING(fantbl[j]);
+			OUT_RING(chan, fantbl[j]);
 		if (numtri%2)
-			OUT_RING(fantbl[numtri/2]&0xFFF);
+			OUT_RING(chan, fantbl[numtri/2]&0xFFF);
 	}
 }
 
diff --git a/src/gallium/drivers/nv04/nv04_state_emit.c b/src/gallium/drivers/nv04/nv04_state_emit.c
index bd98ae091f..b8d6dc560f 100644
--- a/src/gallium/drivers/nv04/nv04_state_emit.c
+++ b/src/gallium/drivers/nv04/nv04_state_emit.c
@@ -57,13 +57,19 @@ static uint32_t nv04_blend_func(uint32_t f)
 static void nv04_emit_control(struct nv04_context* nv04)
 {
 	uint32_t control = nv04->dsa->control;
+	struct nv04_screen *screen = nv04->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *fahrenheit = screen->fahrenheit;
 
-	BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_CONTROL, 1);
-	OUT_RING(control);
+	BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_CONTROL, 1);
+	OUT_RING(chan, control);
 }
 
 static void nv04_emit_blend(struct nv04_context* nv04)
 {
+	struct nv04_screen *screen = nv04->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *fahrenheit = screen->fahrenheit;
 	uint32_t blend;
 
 	blend=0x4; // texture MODULATE_ALPHA
@@ -75,19 +81,23 @@ static void nv04_emit_blend(struct nv04_context* nv04)
 	blend|=(nv04_blend_func(nv04->blend->b_src)<<24);
 	blend|=(nv04_blend_func(nv04->blend->b_dst)<<28);
 
-	BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_BLEND, 1);
-	OUT_RING(blend);
+	BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_BLEND, 1);
+	OUT_RING(chan, blend);
 }
 
 static void nv04_emit_sampler(struct nv04_context *nv04, int unit)
 {
 	struct nv04_miptree *nv04mt = nv04->tex_miptree[unit];
 	struct pipe_texture *pt = &nv04mt->base;
-
-	BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_OFFSET, 3);
-	OUT_RELOCl(nv04mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
-	OUT_RELOCd(nv04mt->buffer, (nv04->fragtex.format | nv04->sampler[unit]->format), NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/);
-	OUT_RING(nv04->sampler[unit]->filter);
+	struct nv04_screen *screen = nv04->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *fahrenheit = screen->fahrenheit;
+	struct nouveau_bo *bo = nouveau_bo(nv04mt->buffer);
+
+	BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_OFFSET, 3);
+	OUT_RELOCl(chan, bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+	OUT_RELOCd(chan, bo, (nv04->fragtex.format | nv04->sampler[unit]->format), NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/);
+	OUT_RING(chan, nv04->sampler[unit]->filter);
 }
 
 static void nv04_state_emit_framebuffer(struct nv04_context* nv04)
@@ -97,6 +107,10 @@ static void nv04_state_emit_framebuffer(struct nv04_context* nv04)
 	uint32_t rt_format, w, h;
 	int colour_format = 0, zeta_format = 0;
 	struct nv04_miptree *nv04mt = 0;
+	struct nv04_screen *screen = nv04->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *context_surfaces_3d = screen->context_surfaces_3d;
+	struct nouveau_bo *bo;
 
 	w = fb->cbufs[0]->width;
 	h = fb->cbufs[0]->height;
@@ -128,24 +142,29 @@ static void nv04_state_emit_framebuffer(struct nv04_context* nv04)
 		assert(0);
 	}
 
-	BEGIN_RING(context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_FORMAT, 1);
-	OUT_RING(rt_format);
+	BEGIN_RING(chan, context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_FORMAT, 1);
+	OUT_RING(chan, rt_format);
 
 	nv04mt = (struct nv04_miptree *)rt->base.texture;
+	bo = nouveau_bo(nv04mt->buffer);
 	/* FIXME pitches have to be aligned ! */
-	BEGIN_RING(context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_PITCH, 2);
-	OUT_RING(rt->pitch|(zeta->pitch<<16));
-	OUT_RELOCl(nv04mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	BEGIN_RING(chan, context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_PITCH, 2);
+	OUT_RING(chan, rt->pitch|(zeta->pitch<<16));
+	OUT_RELOCl(chan, bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 	if (fb->zsbuf) {
 		nv04mt = (struct nv04_miptree *)zeta->base.texture;
-		BEGIN_RING(context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_OFFSET_ZETA, 1);
-		OUT_RELOCl(nv04mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+		BEGIN_RING(chan, context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_OFFSET_ZETA, 1);
+		OUT_RELOCl(chan, nouveau_bo(nv04mt->buffer), 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); /* zeta's own bo; 'bo' still refers to the colour buffer */
 	}
 }
 
 void
 nv04_emit_hw_state(struct nv04_context *nv04)
 {
+	struct nv04_screen *screen = nv04->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *fahrenheit = screen->fahrenheit;
+	struct nouveau_grobj *context_surfaces_3d = screen->context_surfaces_3d;
 	int i;
 
 	if (nv04->dirty & NV04_NEW_VERTPROG) {
@@ -163,8 +182,8 @@ nv04_emit_hw_state(struct nv04_context *nv04)
 	if (nv04->dirty & NV04_NEW_CONTROL) {
 		nv04->dirty &= ~NV04_NEW_CONTROL;
 
-		BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_CONTROL, 1);
-		OUT_RING(nv04->dsa->control);
+		BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_CONTROL, 1);
+		OUT_RING(chan, nv04->dsa->control);
 	}
 
 	if (nv04->dirty & NV04_NEW_BLEND) {
@@ -205,12 +224,12 @@ nv04_emit_hw_state(struct nv04_context *nv04)
 	unsigned rt_pitch = ((struct nv04_surface *)nv04->rt)->pitch;
 	unsigned zeta_pitch = ((struct nv04_surface *)nv04->zeta)->pitch;
 
-	BEGIN_RING(context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_PITCH, 2);
-	OUT_RING(rt_pitch|(zeta_pitch<<16));
-	OUT_RELOCl(nv04->rt, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	BEGIN_RING(chan, context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_PITCH, 2);
+	OUT_RING(chan, rt_pitch|(zeta_pitch<<16));
+	OUT_RELOCl(chan, nouveau_bo(nv04->rt), 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 	if (nv04->zeta) {
-		BEGIN_RING(context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_OFFSET_ZETA, 1);
-		OUT_RELOCl(nv04->zeta, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+		BEGIN_RING(chan, context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_OFFSET_ZETA, 1);
+		OUT_RELOCl(chan, nouveau_bo(nv04->zeta), 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 	}
 
 	/* Texture images */
@@ -218,9 +237,10 @@ nv04_emit_hw_state(struct nv04_context *nv04)
 		if (!(nv04->fp_samplers & (1 << i)))
 			continue;
 		struct nv04_miptree *nv04mt = nv04->tex_miptree[i];
-		BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_OFFSET, 2);
-		OUT_RELOCl(nv04mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
-		OUT_RELOCd(nv04mt->buffer, (nv04->fragtex.format | nv04->sampler[i]->format), NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/);
+		struct nouveau_bo *bo = nouveau_bo(nv04mt->buffer);
+		BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_OFFSET, 2);
+		OUT_RELOCl(chan, bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+		OUT_RELOCd(chan, bo, (nv04->fragtex.format | nv04->sampler[i]->format), NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/);
 	}
 }
 
diff --git a/src/gallium/drivers/nv04/nv04_vbo.c b/src/gallium/drivers/nv04/nv04_vbo.c
index 099ab10043..3484771814 100644
--- a/src/gallium/drivers/nv04/nv04_vbo.c
+++ b/src/gallium/drivers/nv04/nv04_vbo.c
@@ -9,7 +9,7 @@
 #include "nouveau/nouveau_channel.h"
 #include "nouveau/nouveau_pushbuf.h"
 
-boolean nv04_draw_elements( struct pipe_context *pipe,
+void nv04_draw_elements( struct pipe_context *pipe,
                     struct pipe_buffer *indexBuffer,
                     unsigned indexSize,
                     unsigned prim, unsigned start, unsigned count)
@@ -65,15 +65,13 @@ boolean nv04_draw_elements( struct pipe_context *pipe,
 		pipe_buffer_unmap(pscreen, indexBuffer);
 		draw_set_mapped_element_buffer(draw, 0, NULL);
 	}
-
-	return TRUE;
 }
 
-boolean nv04_draw_arrays( struct pipe_context *pipe,
-				 unsigned prim, unsigned start, unsigned count)
+void nv04_draw_arrays( struct pipe_context *pipe,
+                       unsigned prim, unsigned start, unsigned count)
 {
 	printf("coucou in draw arrays\n");
-	return nv04_draw_elements(pipe, NULL, 0, prim, start, count);
+	nv04_draw_elements(pipe, NULL, 0, prim, start, count);
 }
 
 
diff --git a/src/gallium/drivers/nv10/nv10_context.c b/src/gallium/drivers/nv10/nv10_context.c
index 0dadeb03dd..1ecb73d06e 100644
--- a/src/gallium/drivers/nv10/nv10_context.c
+++ b/src/gallium/drivers/nv10/nv10_context.c
@@ -10,10 +10,14 @@ nv10_flush(struct pipe_context *pipe, unsigned flags,
 	   struct pipe_fence_handle **fence)
 {
 	struct nv10_context *nv10 = nv10_context(pipe);
+	struct nv10_screen *screen = nv10->screen;
+	struct nouveau_channel *chan = screen->base.channel;
 
 	draw_flush(nv10->draw);
 
-	FIRE_RING(fence);
+	FIRE_RING(chan);
+	if (fence)
+		*fence = NULL;
 }
 
 static void
@@ -31,225 +35,226 @@ static void nv10_init_hwctx(struct nv10_context *nv10)
 {
 	struct nv10_screen *screen = nv10->screen;
 	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *celsius = screen->celsius;
 	int i;
 	float projectionmatrix[16];
 
-	BEGIN_RING(celsius, NV10TCL_DMA_NOTIFY, 1);
-	OUT_RING  (screen->sync->handle);
-	BEGIN_RING(celsius, NV10TCL_DMA_IN_MEMORY0, 2);
-	OUT_RING  (chan->vram->handle);
-	OUT_RING  (chan->gart->handle);
-	BEGIN_RING(celsius, NV10TCL_DMA_IN_MEMORY2, 2);
-	OUT_RING  (chan->vram->handle);
-	OUT_RING  (chan->vram->handle);
+	BEGIN_RING(chan, celsius, NV10TCL_DMA_NOTIFY, 1);
+	OUT_RING  (chan, screen->sync->handle);
+	BEGIN_RING(chan, celsius, NV10TCL_DMA_IN_MEMORY0, 2);
+	OUT_RING  (chan, chan->vram->handle);
+	OUT_RING  (chan, chan->gart->handle);
+	BEGIN_RING(chan, celsius, NV10TCL_DMA_IN_MEMORY2, 2);
+	OUT_RING  (chan, chan->vram->handle);
+	OUT_RING  (chan, chan->vram->handle);
 
-	BEGIN_RING(celsius, NV10TCL_NOP, 1);
-	OUT_RING  (0);
+	BEGIN_RING(chan, celsius, NV10TCL_NOP, 1);
+	OUT_RING  (chan, 0);
 
-	BEGIN_RING(celsius, NV10TCL_RT_HORIZ, 2);
-	OUT_RING  (0);
-	OUT_RING  (0);
+	BEGIN_RING(chan, celsius, NV10TCL_RT_HORIZ, 2);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
 
-	BEGIN_RING(celsius, NV10TCL_VIEWPORT_CLIP_HORIZ(0), 1);
-	OUT_RING  ((0x7ff<<16)|0x800);
-	BEGIN_RING(celsius, NV10TCL_VIEWPORT_CLIP_VERT(0), 1);
-	OUT_RING  ((0x7ff<<16)|0x800);
+	BEGIN_RING(chan, celsius, NV10TCL_VIEWPORT_CLIP_HORIZ(0), 1);
+	OUT_RING  (chan, (0x7ff<<16)|0x800);
+	BEGIN_RING(chan, celsius, NV10TCL_VIEWPORT_CLIP_VERT(0), 1);
+	OUT_RING  (chan, (0x7ff<<16)|0x800);
 
 	for (i=1;i<8;i++) {
-		BEGIN_RING(celsius, NV10TCL_VIEWPORT_CLIP_HORIZ(i), 1);
-		OUT_RING  (0);
-		BEGIN_RING(celsius, NV10TCL_VIEWPORT_CLIP_VERT(i), 1);
-		OUT_RING  (0);
+		BEGIN_RING(chan, celsius, NV10TCL_VIEWPORT_CLIP_HORIZ(i), 1);
+		OUT_RING  (chan, 0);
+		BEGIN_RING(chan, celsius, NV10TCL_VIEWPORT_CLIP_VERT(i), 1);
+		OUT_RING  (chan, 0);
 	}
 
-	BEGIN_RING(celsius, 0x290, 1);
-	OUT_RING  ((0x10<<16)|1);
-	BEGIN_RING(celsius, 0x3f4, 1);
-	OUT_RING  (0);
+	BEGIN_RING(chan, celsius, 0x290, 1);
+	OUT_RING  (chan, (0x10<<16)|1);
+	BEGIN_RING(chan, celsius, 0x3f4, 1);
+	OUT_RING  (chan, 0);
 
-	BEGIN_RING(celsius, NV10TCL_NOP, 1);
-	OUT_RING  (0);
+	BEGIN_RING(chan, celsius, NV10TCL_NOP, 1);
+	OUT_RING  (chan, 0);
 
 	if (nv10->screen->celsius->grclass != NV10TCL) {
 		/* For nv11, nv17 */
-		BEGIN_RING(celsius, 0x120, 3);
-		OUT_RING  (0);
-		OUT_RING  (1);
-		OUT_RING  (2);
+		BEGIN_RING(chan, celsius, 0x120, 3);
+		OUT_RING  (chan, 0);
+		OUT_RING  (chan, 1);
+		OUT_RING  (chan, 2);
 
-		BEGIN_RING(celsius, NV10TCL_NOP, 1);
-		OUT_RING  (0);
+		BEGIN_RING(chan, celsius, NV10TCL_NOP, 1);
+		OUT_RING  (chan, 0);
 	}
 
-	BEGIN_RING(celsius, NV10TCL_NOP, 1);
-	OUT_RING  (0);
+	BEGIN_RING(chan, celsius, NV10TCL_NOP, 1);
+	OUT_RING  (chan, 0);
 
 	/* Set state */
-	BEGIN_RING(celsius, NV10TCL_FOG_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_ALPHA_FUNC_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_ALPHA_FUNC_FUNC, 2);
-	OUT_RING  (0x207);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_TX_ENABLE(0), 2);
-	OUT_RING  (0);
-	OUT_RING  (0);
-
-	BEGIN_RING(celsius, NV10TCL_RC_IN_ALPHA(0), 12);
-	OUT_RING  (0x30141010);
-	OUT_RING  (0);
-	OUT_RING  (0x20040000);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	OUT_RING  (0x00000c00);
-	OUT_RING  (0);
-	OUT_RING  (0x00000c00);
-	OUT_RING  (0x18000000);
-	OUT_RING  (0x300e0300);
-	OUT_RING  (0x0c091c80);
-
-	BEGIN_RING(celsius, NV10TCL_BLEND_FUNC_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_DITHER_ENABLE, 2);
-	OUT_RING  (1);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_LINE_SMOOTH_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_VERTEX_WEIGHT_ENABLE, 2);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_BLEND_FUNC_SRC, 4);
-	OUT_RING  (1);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	OUT_RING  (0x8006);
-	BEGIN_RING(celsius, NV10TCL_STENCIL_MASK, 8);
-	OUT_RING  (0xff);
-	OUT_RING  (0x207);
-	OUT_RING  (0);
-	OUT_RING  (0xff);
-	OUT_RING  (0x1e00);
-	OUT_RING  (0x1e00);
-	OUT_RING  (0x1e00);
-	OUT_RING  (0x1d01);
-	BEGIN_RING(celsius, NV10TCL_NORMALIZE_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_FOG_ENABLE, 2);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_LIGHT_MODEL, 1);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_COLOR_CONTROL, 1);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_ENABLED_LIGHTS, 1);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_POLYGON_OFFSET_POINT_ENABLE, 3);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_DEPTH_FUNC, 1);
-	OUT_RING  (0x201);
-	BEGIN_RING(celsius, NV10TCL_DEPTH_WRITE_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_DEPTH_TEST_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_POLYGON_OFFSET_FACTOR, 2);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_POINT_SIZE, 1);
-	OUT_RING  (8);
-	BEGIN_RING(celsius, NV10TCL_POINT_PARAMETERS_ENABLE, 2);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_LINE_WIDTH, 1);
-	OUT_RING  (8);
-	BEGIN_RING(celsius, NV10TCL_LINE_SMOOTH_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_POLYGON_MODE_FRONT, 2);
-	OUT_RING  (0x1b02);
-	OUT_RING  (0x1b02);
-	BEGIN_RING(celsius, NV10TCL_CULL_FACE, 2);
-	OUT_RING  (0x405);
-	OUT_RING  (0x901);
-	BEGIN_RING(celsius, NV10TCL_POLYGON_SMOOTH_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_CULL_FACE_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_TX_GEN_S(0), 8);
+	BEGIN_RING(chan, celsius, NV10TCL_FOG_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_ALPHA_FUNC_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_ALPHA_FUNC_FUNC, 2);
+	OUT_RING  (chan, 0x207);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_TX_ENABLE(0), 2);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+
+	BEGIN_RING(chan, celsius, NV10TCL_RC_IN_ALPHA(0), 12);
+	OUT_RING  (chan, 0x30141010);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0x20040000);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0x00000c00);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0x00000c00);
+	OUT_RING  (chan, 0x18000000);
+	OUT_RING  (chan, 0x300e0300);
+	OUT_RING  (chan, 0x0c091c80);
+
+	BEGIN_RING(chan, celsius, NV10TCL_BLEND_FUNC_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_DITHER_ENABLE, 2);
+	OUT_RING  (chan, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_LINE_SMOOTH_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_VERTEX_WEIGHT_ENABLE, 2);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_BLEND_FUNC_SRC, 4);
+	OUT_RING  (chan, 1);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0x8006);
+	BEGIN_RING(chan, celsius, NV10TCL_STENCIL_MASK, 8);
+	OUT_RING  (chan, 0xff);
+	OUT_RING  (chan, 0x207);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0xff);
+	OUT_RING  (chan, 0x1e00);
+	OUT_RING  (chan, 0x1e00);
+	OUT_RING  (chan, 0x1e00);
+	OUT_RING  (chan, 0x1d01);
+	BEGIN_RING(chan, celsius, NV10TCL_NORMALIZE_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_FOG_ENABLE, 2);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_LIGHT_MODEL, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_COLOR_CONTROL, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_ENABLED_LIGHTS, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_POLYGON_OFFSET_POINT_ENABLE, 3);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_DEPTH_FUNC, 1);
+	OUT_RING  (chan, 0x201);
+	BEGIN_RING(chan, celsius, NV10TCL_DEPTH_WRITE_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_DEPTH_TEST_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_POLYGON_OFFSET_FACTOR, 2);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_POINT_SIZE, 1);
+	OUT_RING  (chan, 8);
+	BEGIN_RING(chan, celsius, NV10TCL_POINT_PARAMETERS_ENABLE, 2);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_LINE_WIDTH, 1);
+	OUT_RING  (chan, 8);
+	BEGIN_RING(chan, celsius, NV10TCL_LINE_SMOOTH_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_POLYGON_MODE_FRONT, 2);
+	OUT_RING  (chan, 0x1b02);
+	OUT_RING  (chan, 0x1b02);
+	BEGIN_RING(chan, celsius, NV10TCL_CULL_FACE, 2);
+	OUT_RING  (chan, 0x405);
+	OUT_RING  (chan, 0x901);
+	BEGIN_RING(chan, celsius, NV10TCL_POLYGON_SMOOTH_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_CULL_FACE_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_TX_GEN_S(0), 8);
 	for (i=0;i<8;i++) {
-		OUT_RING  (0);
+		OUT_RING  (chan, 0);
 	}
-	BEGIN_RING(celsius, NV10TCL_FOG_EQUATION_CONSTANT, 3);
-	OUT_RING  (0x3fc00000);	/* -1.50 */
-	OUT_RING  (0xbdb8aa0a);	/* -0.09 */
-	OUT_RING  (0);		/*  0.00 */
+	BEGIN_RING(chan, celsius, NV10TCL_FOG_EQUATION_CONSTANT, 3);
+	OUT_RING  (chan, 0x3fc00000);	/*  1.50 */
+	OUT_RING  (chan, 0xbdb8aa0a);	/* -0.09 */
+	OUT_RING  (chan, 0);		/*  0.00 */
 
-	BEGIN_RING(celsius, NV10TCL_NOP, 1);
-	OUT_RING  (0);
+	BEGIN_RING(chan, celsius, NV10TCL_NOP, 1);
+	OUT_RING  (chan, 0);
 
-	BEGIN_RING(celsius, NV10TCL_FOG_MODE, 2);
-	OUT_RING  (0x802);
-	OUT_RING  (2);
+	BEGIN_RING(chan, celsius, NV10TCL_FOG_MODE, 2);
+	OUT_RING  (chan, 0x802);
+	OUT_RING  (chan, 2);
 	/* for some reason VIEW_MATRIX_ENABLE need to be 6 instead of 4 when
 	 * using texturing, except when using the texture matrix
 	 */
-	BEGIN_RING(celsius, NV10TCL_VIEW_MATRIX_ENABLE, 1);
-	OUT_RING  (6);
-	BEGIN_RING(celsius, NV10TCL_COLOR_MASK, 1);
-	OUT_RING  (0x01010101);
+	BEGIN_RING(chan, celsius, NV10TCL_VIEW_MATRIX_ENABLE, 1);
+	OUT_RING  (chan, 6);
+	BEGIN_RING(chan, celsius, NV10TCL_COLOR_MASK, 1);
+	OUT_RING  (chan, 0x01010101);
 
 	/* Set vertex component */
-	BEGIN_RING(celsius, NV10TCL_VERTEX_COL_4F_R, 4);
-	OUT_RINGf (1.0);
-	OUT_RINGf (1.0);
-	OUT_RINGf (1.0);
-	OUT_RINGf (1.0);
-	BEGIN_RING(celsius, NV10TCL_VERTEX_COL2_3F_R, 3);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	BEGIN_RING(celsius, NV10TCL_VERTEX_NOR_3F_X, 3);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	OUT_RINGf (1.0);
-	BEGIN_RING(celsius, NV10TCL_VERTEX_TX0_4F_S, 4);
-	OUT_RINGf (0.0);
-	OUT_RINGf (0.0);
-	OUT_RINGf (0.0);
-	OUT_RINGf (1.0);
-	BEGIN_RING(celsius, NV10TCL_VERTEX_TX1_4F_S, 4);
-	OUT_RINGf (0.0);
-	OUT_RINGf (0.0);
-	OUT_RINGf (0.0);
-	OUT_RINGf (1.0);
-	BEGIN_RING(celsius, NV10TCL_VERTEX_FOG_1F, 1);
-	OUT_RINGf (0.0);
-	BEGIN_RING(celsius, NV10TCL_EDGEFLAG_ENABLE, 1);
-	OUT_RING  (1);
+	BEGIN_RING(chan, celsius, NV10TCL_VERTEX_COL_4F_R, 4);
+	OUT_RINGf (chan, 1.0);
+	OUT_RINGf (chan, 1.0);
+	OUT_RINGf (chan, 1.0);
+	OUT_RINGf (chan, 1.0);
+	BEGIN_RING(chan, celsius, NV10TCL_VERTEX_COL2_3F_R, 3);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, celsius, NV10TCL_VERTEX_NOR_3F_X, 3);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	OUT_RINGf (chan, 1.0);
+	BEGIN_RING(chan, celsius, NV10TCL_VERTEX_TX0_4F_S, 4);
+	OUT_RINGf (chan, 0.0);
+	OUT_RINGf (chan, 0.0);
+	OUT_RINGf (chan, 0.0);
+	OUT_RINGf (chan, 1.0);
+	BEGIN_RING(chan, celsius, NV10TCL_VERTEX_TX1_4F_S, 4);
+	OUT_RINGf (chan, 0.0);
+	OUT_RINGf (chan, 0.0);
+	OUT_RINGf (chan, 0.0);
+	OUT_RINGf (chan, 1.0);
+	BEGIN_RING(chan, celsius, NV10TCL_VERTEX_FOG_1F, 1);
+	OUT_RINGf (chan, 0.0);
+	BEGIN_RING(chan, celsius, NV10TCL_EDGEFLAG_ENABLE, 1);
+	OUT_RING  (chan, 1);
 
 	memset(projectionmatrix, 0, sizeof(projectionmatrix));
-	BEGIN_RING(celsius, NV10TCL_PROJECTION_MATRIX(0), 16);
+	BEGIN_RING(chan, celsius, NV10TCL_PROJECTION_MATRIX(0), 16);
 	projectionmatrix[0*4+0] = 1.0;
 	projectionmatrix[1*4+1] = 1.0;
 	projectionmatrix[2*4+2] = 1.0;
 	projectionmatrix[3*4+3] = 1.0;
 	for (i=0;i<16;i++) {
-		OUT_RINGf  (projectionmatrix[i]);
+		OUT_RINGf  (chan, projectionmatrix[i]);
 	}
 
-	BEGIN_RING(celsius, NV10TCL_DEPTH_RANGE_NEAR, 2);
-	OUT_RING  (0.0);
-	OUT_RINGf  (16777216.0);
+	BEGIN_RING(chan, celsius, NV10TCL_DEPTH_RANGE_NEAR, 2);
+	OUT_RINGf  (chan, 0.0);	/* emit as float bits, matching the next word */
+	OUT_RINGf  (chan, 16777216.0);
 
-	BEGIN_RING(celsius, NV10TCL_VIEWPORT_TRANSLATE_X, 4);
-	OUT_RINGf  (-2048.0);
-	OUT_RINGf  (-2048.0);
-	OUT_RINGf  (16777215.0 * 0.5);
-	OUT_RING  (0);
+	BEGIN_RING(chan, celsius, NV10TCL_VIEWPORT_TRANSLATE_X, 4);
+	OUT_RINGf  (chan, -2048.0);
+	OUT_RINGf  (chan, -2048.0);
+	OUT_RINGf  (chan, 16777215.0 * 0.5);
+	OUT_RING  (chan, 0);
 
-	FIRE_RING (NULL);
+	FIRE_RING (chan);
 }
 
 struct pipe_context *
diff --git a/src/gallium/drivers/nv10/nv10_context.h b/src/gallium/drivers/nv10/nv10_context.h
index 36a6aa7a74..ab4b825487 100644
--- a/src/gallium/drivers/nv10/nv10_context.h
+++ b/src/gallium/drivers/nv10/nv10_context.h
@@ -15,10 +15,6 @@
 #include "nouveau/nouveau_gldefs.h"
 #include "nouveau/nouveau_context.h"
 
-#define NOUVEAU_PUSH_CONTEXT(ctx)                                              \
-	struct nv10_screen *ctx = nv10->screen
-#include "nouveau/nouveau_push.h"
-
 #include "nv10_state.h"
 
 #define NOUVEAU_ERR(fmt, args...) \
@@ -144,9 +140,9 @@ extern void nv10_emit_hw_state(struct nv10_context *nv10);
 extern void nv10_state_tex_update(struct nv10_context *nv10);
 
 /* nv10_vbo.c */
-extern boolean nv10_draw_arrays(struct pipe_context *, unsigned mode,
+extern void nv10_draw_arrays(struct pipe_context *, unsigned mode,
 				unsigned start, unsigned count);
-extern boolean nv10_draw_elements( struct pipe_context *pipe,
+extern void nv10_draw_elements( struct pipe_context *pipe,
                     struct pipe_buffer *indexBuffer,
                     unsigned indexSize,
                     unsigned prim, unsigned start, unsigned count);
diff --git a/src/gallium/drivers/nv10/nv10_fragtex.c b/src/gallium/drivers/nv10/nv10_fragtex.c
index 906fdfeeb9..c1f7ccb9ab 100644
--- a/src/gallium/drivers/nv10/nv10_fragtex.c
+++ b/src/gallium/drivers/nv10/nv10_fragtex.c
@@ -52,6 +52,9 @@ nv10_fragtex_build(struct nv10_context *nv10, int unit)
 	struct nv10_miptree *nv10mt = nv10->tex_miptree[unit];
 	struct pipe_texture *pt = &nv10mt->base;
 	struct nv10_texture_format *tf;
+	struct nv10_screen *screen = nv10->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *celsius = screen->celsius;
 	uint32_t txf, txs, txp;
 
 	tf = nv10_fragtex_format(pt->format);
@@ -82,15 +85,15 @@ nv10_fragtex_build(struct nv10_context *nv10, int unit)
 		return;
 	}
 
-	BEGIN_RING(celsius, NV10TCL_TX_OFFSET(unit), 8);
-	OUT_RELOCl(nv10mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
-	OUT_RELOCd(nv10mt->buffer,txf,NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/);
-	OUT_RING  (ps->wrap);
-	OUT_RING  (0x40000000); /* enable */
-	OUT_RING  (txs);
-	OUT_RING  (ps->filt | 0x2000 /* magic */);
-	OUT_RING  ((pt->width0 << 16) | pt->height0);
-	OUT_RING  (ps->bcol);
+	BEGIN_RING(chan, celsius, NV10TCL_TX_OFFSET(unit), 8);
+	OUT_RELOCl(chan, nouveau_bo(nv10mt->buffer), 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+	OUT_RELOCd(chan, nouveau_bo(nv10mt->buffer),txf,NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/);
+	OUT_RING  (chan, ps->wrap);
+	OUT_RING  (chan, 0x40000000); /* enable */
+	OUT_RING  (chan, txs);
+	OUT_RING  (chan, ps->filt | 0x2000 /* magic */);
+	OUT_RING  (chan, (pt->width0 << 16) | pt->height0);
+	OUT_RING  (chan, ps->bcol);
 #endif
 }
 
@@ -99,6 +102,9 @@ nv10_fragtex_bind(struct nv10_context *nv10)
 {
 #if 0
 	struct nv10_fragment_program *fp = nv10->fragprog.active;
+	struct nv10_screen *screen = nv10->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *celsius = screen->celsius;
 	unsigned samplers, unit;
 
 	samplers = nv10->fp_samplers & ~fp->samplers;
@@ -106,8 +112,8 @@ nv10_fragtex_bind(struct nv10_context *nv10)
 		unit = ffs(samplers) - 1;
 		samplers &= ~(1 << unit);
 
-		BEGIN_RING(celsius, NV10TCL_TX_ENABLE(unit), 1);
-		OUT_RING  (0);
+		BEGIN_RING(chan, celsius, NV10TCL_TX_ENABLE(unit), 1);
+		OUT_RING  (chan, 0);
 	}
 
 	samplers = nv10->dirty_samplers & fp->samplers;
diff --git a/src/gallium/drivers/nv10/nv10_prim_vbuf.c b/src/gallium/drivers/nv10/nv10_prim_vbuf.c
index 7ba9777a22..c5dbe43dbc 100644
--- a/src/gallium/drivers/nv10/nv10_prim_vbuf.c
+++ b/src/gallium/drivers/nv10/nv10_prim_vbuf.c
@@ -67,12 +67,15 @@ struct nv10_vbuf_render {
 
 void nv10_vtxbuf_bind( struct nv10_context* nv10 )
 {
+	struct nv10_screen *screen = nv10->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *celsius = screen->celsius;
 	int i;
 	for(i = 0; i < 8; i++) {
-		BEGIN_RING(celsius, NV10TCL_VTXBUF_ADDRESS(i), 1);
-		OUT_RING(0/*nv10->vtxbuf*/);
-		BEGIN_RING(celsius, NV10TCL_VTXFMT(i), 1);
-		OUT_RING(0/*XXX*/);
+		BEGIN_RING(chan, celsius, NV10TCL_VTXBUF_ADDRESS(i), 1);
+		OUT_RING(chan, 0/*nv10->vtxbuf*/);
+		BEGIN_RING(chan, celsius, NV10TCL_VTXFMT(i), 1);
+		OUT_RING(chan, 0/*XXX*/);
 	}
 }
 
@@ -163,19 +166,22 @@ nv10_vbuf_render_draw( struct vbuf_render *render,
 {
 	struct nv10_vbuf_render *nv10_render = nv10_vbuf_render(render);
 	struct nv10_context *nv10 = nv10_render->nv10;
+	struct nv10_screen *screen = nv10->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *celsius = screen->celsius;
 	int push, i;
 
 	nv10_emit_hw_state(nv10);
 
-	BEGIN_RING(celsius, NV10TCL_VERTEX_ARRAY_OFFSET_POS, 1);
-	OUT_RELOCl(nv10_render->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+	BEGIN_RING(chan, celsius, NV10TCL_VERTEX_ARRAY_OFFSET_POS, 1);
+	OUT_RELOCl(chan, nouveau_bo(nv10_render->buffer), 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
 
-	BEGIN_RING(celsius, NV10TCL_VERTEX_BUFFER_BEGIN_END, 1);
-	OUT_RING(nv10_render->hwprim);
+	BEGIN_RING(chan, celsius, NV10TCL_VERTEX_BUFFER_BEGIN_END, 1);
+	OUT_RING(chan, nv10_render->hwprim);
 
 	if (nr_indices & 1) {
-		BEGIN_RING(celsius, NV10TCL_VB_ELEMENT_U32, 1);
-		OUT_RING  (indices[0]);
+		BEGIN_RING(chan, celsius, NV10TCL_VB_ELEMENT_U32, 1);
+		OUT_RING  (chan, indices[0]);
 		indices++; nr_indices--;
 	}
 
@@ -183,16 +189,16 @@ nv10_vbuf_render_draw( struct vbuf_render *render,
 		// XXX too big/small ? check the size
 		push = MIN2(nr_indices, 1200 * 2);
 
-		BEGIN_RING_NI(celsius, NV10TCL_VB_ELEMENT_U16, push >> 1);
+		BEGIN_RING_NI(chan, celsius, NV10TCL_VB_ELEMENT_U16, push >> 1);
 		for (i = 0; i < push; i+=2)
-			OUT_RING((indices[i+1] << 16) | indices[i]);
+			OUT_RING(chan, (indices[i+1] << 16) | indices[i]);
 
 		nr_indices -= push;
 		indices  += push;
 	}
 
-	BEGIN_RING(celsius, NV10TCL_VERTEX_BUFFER_BEGIN_END, 1);
-	OUT_RING  (0);
+	BEGIN_RING(chan, celsius, NV10TCL_VERTEX_BUFFER_BEGIN_END, 1);
+	OUT_RING  (chan, 0);
 }
 
 
diff --git a/src/gallium/drivers/nv10/nv10_screen.c b/src/gallium/drivers/nv10/nv10_screen.c
index 6a39ddeaac..69a6dab866 100644
--- a/src/gallium/drivers/nv10/nv10_screen.c
+++ b/src/gallium/drivers/nv10/nv10_screen.c
@@ -180,7 +180,6 @@ nv10_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 		NOUVEAU_ERR("Error creating 3D object: %d\n", ret);
 		return FALSE;
 	}
-	BIND_RING(chan, screen->celsius, 7);
 
 	/* 2D engine setup */
 	screen->eng2d = nv04_surface_2d_init(&screen->base);
diff --git a/src/gallium/drivers/nv10/nv10_state_emit.c b/src/gallium/drivers/nv10/nv10_state_emit.c
index 2577ab73b5..30a596ca60 100644
--- a/src/gallium/drivers/nv10/nv10_state_emit.c
+++ b/src/gallium/drivers/nv10/nv10_state_emit.c
@@ -4,25 +4,32 @@
 static void nv10_state_emit_blend(struct nv10_context* nv10)
 {
 	struct nv10_blend_state *b = nv10->blend;
+	struct nv10_screen *screen = nv10->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *celsius = screen->celsius;
 
-	BEGIN_RING(celsius, NV10TCL_DITHER_ENABLE, 1);
-	OUT_RING  (b->d_enable);
+	BEGIN_RING(chan, celsius, NV10TCL_DITHER_ENABLE, 1);
+	OUT_RING  (chan, b->d_enable);
 
-	BEGIN_RING(celsius, NV10TCL_BLEND_FUNC_ENABLE, 3);
-	OUT_RING  (b->b_enable);
-	OUT_RING  (b->b_srcfunc);
-	OUT_RING  (b->b_dstfunc);
+	BEGIN_RING(chan, celsius, NV10TCL_BLEND_FUNC_ENABLE, 3);
+	OUT_RING  (chan, b->b_enable);
+	OUT_RING  (chan, b->b_srcfunc);
+	OUT_RING  (chan, b->b_dstfunc);
 
-	BEGIN_RING(celsius, NV10TCL_COLOR_MASK, 1);
-	OUT_RING  (b->c_mask);
+	BEGIN_RING(chan, celsius, NV10TCL_COLOR_MASK, 1);
+	OUT_RING  (chan, b->c_mask);
 }
 
 static void nv10_state_emit_blend_color(struct nv10_context* nv10)
 {
 	struct pipe_blend_color *c = nv10->blend_color;
+	struct nv10_screen *screen = nv10->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *celsius = screen->celsius;
 
-	BEGIN_RING(celsius, NV10TCL_BLEND_COLOR, 1);
-	OUT_RING  ((float_to_ubyte(c->color[3]) << 24)|
+	BEGIN_RING(chan, celsius, NV10TCL_BLEND_COLOR, 1);
+	OUT_RING  (chan,
+		   (float_to_ubyte(c->color[3]) << 24)|
 		   (float_to_ubyte(c->color[0]) << 16)|
 		   (float_to_ubyte(c->color[1]) << 8) |
 		   (float_to_ubyte(c->color[2]) << 0));
@@ -31,60 +38,66 @@ static void nv10_state_emit_blend_color(struct nv10_context* nv10)
 static void nv10_state_emit_rast(struct nv10_context* nv10)
 {
 	struct nv10_rasterizer_state *r = nv10->rast;
+	struct nv10_screen *screen = nv10->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *celsius = screen->celsius;
 
-	BEGIN_RING(celsius, NV10TCL_SHADE_MODEL, 2);
-	OUT_RING  (r->shade_model);
-	OUT_RING  (r->line_width);
+	BEGIN_RING(chan, celsius, NV10TCL_SHADE_MODEL, 2);
+	OUT_RING  (chan, r->shade_model);
+	OUT_RING  (chan, r->line_width);
 
 
-	BEGIN_RING(celsius, NV10TCL_POINT_SIZE, 1);
-	OUT_RING  (r->point_size);
+	BEGIN_RING(chan, celsius, NV10TCL_POINT_SIZE, 1);
+	OUT_RING  (chan, r->point_size);
 
-	BEGIN_RING(celsius, NV10TCL_POLYGON_MODE_FRONT, 2);
-	OUT_RING  (r->poly_mode_front);
-	OUT_RING  (r->poly_mode_back);
+	BEGIN_RING(chan, celsius, NV10TCL_POLYGON_MODE_FRONT, 2);
+	OUT_RING  (chan, r->poly_mode_front);
+	OUT_RING  (chan, r->poly_mode_back);
 
 
-	BEGIN_RING(celsius, NV10TCL_CULL_FACE, 2);
-	OUT_RING  (r->cull_face);
-	OUT_RING  (r->front_face);
+	BEGIN_RING(chan, celsius, NV10TCL_CULL_FACE, 2);
+	OUT_RING  (chan, r->cull_face);
+	OUT_RING  (chan, r->front_face);
 
-	BEGIN_RING(celsius, NV10TCL_LINE_SMOOTH_ENABLE, 2);
-	OUT_RING  (r->line_smooth_en);
-	OUT_RING  (r->poly_smooth_en);
+	BEGIN_RING(chan, celsius, NV10TCL_LINE_SMOOTH_ENABLE, 2);
+	OUT_RING  (chan, r->line_smooth_en);
+	OUT_RING  (chan, r->poly_smooth_en);
 
-	BEGIN_RING(celsius, NV10TCL_CULL_FACE_ENABLE, 1);
-	OUT_RING  (r->cull_face_en);
+	BEGIN_RING(chan, celsius, NV10TCL_CULL_FACE_ENABLE, 1);
+	OUT_RING  (chan, r->cull_face_en);
 }
 
 static void nv10_state_emit_dsa(struct nv10_context* nv10)
 {
 	struct nv10_depth_stencil_alpha_state *d = nv10->dsa;
+	struct nv10_screen *screen = nv10->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *celsius = screen->celsius;
 
-	BEGIN_RING(celsius, NV10TCL_DEPTH_FUNC, 1);
-	OUT_RING (d->depth.func);
+	BEGIN_RING(chan, celsius, NV10TCL_DEPTH_FUNC, 1);
+	OUT_RING (chan, d->depth.func);
 
-	BEGIN_RING(celsius, NV10TCL_DEPTH_WRITE_ENABLE, 1);
-	OUT_RING (d->depth.write_enable);
+	BEGIN_RING(chan, celsius, NV10TCL_DEPTH_WRITE_ENABLE, 1);
+	OUT_RING (chan, d->depth.write_enable);
 
-	BEGIN_RING(celsius, NV10TCL_DEPTH_TEST_ENABLE, 1);
-	OUT_RING (d->depth.test_enable);
+	BEGIN_RING(chan, celsius, NV10TCL_DEPTH_TEST_ENABLE, 1);
+	OUT_RING (chan, d->depth.test_enable);
 
 #if 0
-	BEGIN_RING(celsius, NV10TCL_STENCIL_ENABLE, 1);
-	OUT_RING (d->stencil.enable);
-	BEGIN_RING(celsius, NV10TCL_STENCIL_MASK, 7);
-	OUT_RINGp ((uint32_t *)&(d->stencil.wmask), 7);
+	BEGIN_RING(chan, celsius, NV10TCL_STENCIL_ENABLE, 1);
+	OUT_RING (chan, d->stencil.enable);
+	BEGIN_RING(chan, celsius, NV10TCL_STENCIL_MASK, 7);
+	OUT_RINGp (chan, (uint32_t *)&(d->stencil.wmask), 7);
 #endif
 
-	BEGIN_RING(celsius, NV10TCL_ALPHA_FUNC_ENABLE, 1);
-	OUT_RING (d->alpha.enabled);
+	BEGIN_RING(chan, celsius, NV10TCL_ALPHA_FUNC_ENABLE, 1);
+	OUT_RING (chan, d->alpha.enabled);
 
-	BEGIN_RING(celsius, NV10TCL_ALPHA_FUNC_FUNC, 1);
-	OUT_RING (d->alpha.func);
+	BEGIN_RING(chan, celsius, NV10TCL_ALPHA_FUNC_FUNC, 1);
+	OUT_RING (chan, d->alpha.func);
 
-	BEGIN_RING(celsius, NV10TCL_ALPHA_FUNC_REF, 1);
-	OUT_RING (d->alpha.ref);
+	BEGIN_RING(chan, celsius, NV10TCL_ALPHA_FUNC_REF, 1);
+	OUT_RING (chan, d->alpha.ref);
 }
 
 static void nv10_state_emit_viewport(struct nv10_context* nv10)
@@ -108,6 +121,10 @@ static void nv10_state_emit_framebuffer(struct nv10_context* nv10)
 	int colour_format = 0, zeta_format = 0;
         struct nv10_miptree *nv10mt = 0;
 
+	struct nv10_screen *screen = nv10->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *celsius = screen->celsius;
+
 	w = fb->cbufs[0]->width;
 	h = fb->cbufs[0]->height;
 	colour_format = fb->cbufs[0]->format;
@@ -144,11 +161,11 @@ static void nv10_state_emit_framebuffer(struct nv10_context* nv10)
 	}
 
 	if (zeta) {
-		BEGIN_RING(celsius, NV10TCL_RT_PITCH, 1);
-		OUT_RING  (rt->pitch | (zeta->pitch << 16));
+		BEGIN_RING(chan, celsius, NV10TCL_RT_PITCH, 1);
+		OUT_RING  (chan, rt->pitch | (zeta->pitch << 16));
 	} else {
-		BEGIN_RING(celsius, NV10TCL_RT_PITCH, 1);
-		OUT_RING  (rt->pitch | (rt->pitch << 16));
+		BEGIN_RING(chan, celsius, NV10TCL_RT_PITCH, 1);
+		OUT_RING  (chan, rt->pitch | (rt->pitch << 16));
 	}
 
 	nv10mt = (struct nv10_miptree *)rt->base.texture;
@@ -160,13 +177,13 @@ static void nv10_state_emit_framebuffer(struct nv10_context* nv10)
 		nv10->zeta = nv10mt->buffer;
 	}
 
-	BEGIN_RING(celsius, NV10TCL_RT_HORIZ, 3);
-	OUT_RING  ((w << 16) | 0);
-	OUT_RING  ((h << 16) | 0);
-	OUT_RING  (rt_format);
-	BEGIN_RING(celsius, NV10TCL_VIEWPORT_CLIP_HORIZ(0), 2);
-	OUT_RING  (((w - 1) << 16) | 0 | 0x08000800);
-	OUT_RING  (((h - 1) << 16) | 0 | 0x08000800);
+	BEGIN_RING(chan, celsius, NV10TCL_RT_HORIZ, 3);
+	OUT_RING  (chan, (w << 16) | 0);
+	OUT_RING  (chan, (h << 16) | 0);
+	OUT_RING  (chan, rt_format);
+	BEGIN_RING(chan, celsius, NV10TCL_VIEWPORT_CLIP_HORIZ(0), 2);
+	OUT_RING  (chan, ((w - 1) << 16) | 0 | 0x08000800);
+	OUT_RING  (chan, ((h - 1) << 16) | 0 | 0x08000800);
 }
 
 static void nv10_vertex_layout(struct nv10_context *nv10)
@@ -201,6 +218,10 @@ static void nv10_vertex_layout(struct nv10_context *nv10)
 void
 nv10_emit_hw_state(struct nv10_context *nv10)
 {
+	struct nv10_screen *screen = nv10->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *celsius = screen->celsius;
+	struct nouveau_bo *rt_bo;
 	int i;
 
 	if (nv10->dirty & NV10_NEW_VERTPROG) {
@@ -269,38 +290,41 @@ nv10_emit_hw_state(struct nv10_context *nv10)
 	 */
 
 	/* Render target */
+	rt_bo = nouveau_bo(nv10->rt[0]);
 // XXX figre out who's who for NV10TCL_DMA_* and fill accordingly
-//	BEGIN_RING(celsius, NV10TCL_DMA_COLOR0, 1);
-//	OUT_RELOCo(nv10->rt[0], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-	BEGIN_RING(celsius, NV10TCL_COLOR_OFFSET, 1);
-	OUT_RELOCl(nv10->rt[0], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+//	BEGIN_RING(chan, celsius, NV10TCL_DMA_COLOR0, 1);
+//	OUT_RELOCo(chan, rt_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	BEGIN_RING(chan, celsius, NV10TCL_COLOR_OFFSET, 1);
+	OUT_RELOCl(chan, rt_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 
 	if (nv10->zeta) {
+		struct nouveau_bo *zeta_bo = nouveau_bo(nv10->zeta);
 // XXX
-//		BEGIN_RING(celsius, NV10TCL_DMA_ZETA, 1);
-//		OUT_RELOCo(nv10->zeta, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-		BEGIN_RING(celsius, NV10TCL_ZETA_OFFSET, 1);
-		OUT_RELOCl(nv10->zeta, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+//		BEGIN_RING(chan, celsius, NV10TCL_DMA_ZETA, 1);
+//		OUT_RELOCo(chan, zeta_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+		BEGIN_RING(chan, celsius, NV10TCL_ZETA_OFFSET, 1);
+		OUT_RELOCl(chan, zeta_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 		/* XXX for when we allocate LMA on nv17 */
-/*		BEGIN_RING(celsius, NV10TCL_LMA_DEPTH_BUFFER_OFFSET, 1);
-		OUT_RELOCl(nv10->zeta + lma_offset);*/
+/*		BEGIN_RING(chan, celsius, NV10TCL_LMA_DEPTH_BUFFER_OFFSET, 1);
+		OUT_RELOCl(chan, nouveau_bo(nv10->zeta + lma_offset));*/
 	}
 
 	/* Vertex buffer */
-	BEGIN_RING(celsius, NV10TCL_DMA_VTXBUF0, 1);
-	OUT_RELOCo(nv10->rt[0], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-	BEGIN_RING(celsius, NV10TCL_COLOR_OFFSET, 1);
-	OUT_RELOCl(nv10->rt[0], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	BEGIN_RING(chan, celsius, NV10TCL_DMA_VTXBUF0, 1);
+	OUT_RELOCo(chan, rt_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	BEGIN_RING(chan, celsius, NV10TCL_COLOR_OFFSET, 1);
+	OUT_RELOCl(chan, rt_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 
 	/* Texture images */
 	for (i = 0; i < 2; i++) {
 		if (!(nv10->fp_samplers & (1 << i)))
 			continue;
-		BEGIN_RING(celsius, NV10TCL_TX_OFFSET(i), 1);
-		OUT_RELOCl(nv10->tex[i].buffer, 0, NOUVEAU_BO_VRAM |
+		struct nouveau_bo *bo = nouveau_bo(nv10->tex[i].buffer);
+		BEGIN_RING(chan, celsius, NV10TCL_TX_OFFSET(i), 1);
+		OUT_RELOCl(chan, bo, 0, NOUVEAU_BO_VRAM |
 			   NOUVEAU_BO_GART | NOUVEAU_BO_RD);
-		BEGIN_RING(celsius, NV10TCL_TX_FORMAT(i), 1);
-		OUT_RELOCd(nv10->tex[i].buffer, nv10->tex[i].format,
+		BEGIN_RING(chan, celsius, NV10TCL_TX_FORMAT(i), 1);
+		OUT_RELOCd(chan, bo, nv10->tex[i].format,
 			   NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD |
 			   NOUVEAU_BO_OR, NV10TCL_TX_FORMAT_DMA0,
 			   NV10TCL_TX_FORMAT_DMA1);
diff --git a/src/gallium/drivers/nv10/nv10_vbo.c b/src/gallium/drivers/nv10/nv10_vbo.c
index 0d26141248..9180c72c9b 100644
--- a/src/gallium/drivers/nv10/nv10_vbo.c
+++ b/src/gallium/drivers/nv10/nv10_vbo.c
@@ -9,7 +9,7 @@
 #include "nouveau/nouveau_channel.h"
 #include "nouveau/nouveau_pushbuf.h"
 
-boolean nv10_draw_elements( struct pipe_context *pipe,
+void nv10_draw_elements( struct pipe_context *pipe,
                     struct pipe_buffer *indexBuffer,
                     unsigned indexSize,
                     unsigned prim, unsigned start, unsigned count)
@@ -65,14 +65,12 @@ boolean nv10_draw_elements( struct pipe_context *pipe,
 		pipe_buffer_unmap(pscreen, indexBuffer);
 		draw_set_mapped_element_buffer(draw, 0, NULL);
 	}
-
-	return TRUE;
 }
 
-boolean nv10_draw_arrays( struct pipe_context *pipe,
-				 unsigned prim, unsigned start, unsigned count)
+void nv10_draw_arrays( struct pipe_context *pipe,
+                       unsigned prim, unsigned start, unsigned count)
 {
-	return nv10_draw_elements(pipe, NULL, 0, prim, start, count);
+	nv10_draw_elements(pipe, NULL, 0, prim, start, count);
 }
 
 
diff --git a/src/gallium/drivers/nv20/nv20_context.c b/src/gallium/drivers/nv20/nv20_context.c
index 6a147a4159..5b80af2d22 100644
--- a/src/gallium/drivers/nv20/nv20_context.c
+++ b/src/gallium/drivers/nv20/nv20_context.c
@@ -10,10 +10,14 @@ nv20_flush(struct pipe_context *pipe, unsigned flags,
 	   struct pipe_fence_handle **fence)
 {
 	struct nv20_context *nv20 = nv20_context(pipe);
+	struct nv20_screen *screen = nv20->screen;
+	struct nouveau_channel *chan = screen->base.channel;
 
 	draw_flush(nv20->draw);
 
-	FIRE_RING(fence);
+	FIRE_RING(chan);
+	if (fence)
+		*fence = NULL;
 }
 
 static void
@@ -31,348 +35,352 @@ static void nv20_init_hwctx(struct nv20_context *nv20)
 {
 	struct nv20_screen *screen = nv20->screen;
 	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *kelvin = screen->kelvin;
 	int i;
 	float projectionmatrix[16];
-	const boolean is_nv25tcl = (nv20->screen->kelvin->grclass == NV25TCL);
+	const boolean is_nv25tcl = (kelvin->grclass == NV25TCL);
 
-	BEGIN_RING(kelvin, NV20TCL_DMA_NOTIFY, 1);
-	OUT_RING  (screen->sync->handle);
-	BEGIN_RING(kelvin, NV20TCL_DMA_TEXTURE0, 2);
-	OUT_RING  (chan->vram->handle);
-	OUT_RING  (chan->gart->handle); /* TEXTURE1 */
-	BEGIN_RING(kelvin, NV20TCL_DMA_COLOR, 2);
-	OUT_RING  (chan->vram->handle);
-	OUT_RING  (chan->vram->handle); /* ZETA */
+	BEGIN_RING(chan, kelvin, NV20TCL_DMA_NOTIFY, 1);
+	OUT_RING  (chan, screen->sync->handle);
+	BEGIN_RING(chan, kelvin, NV20TCL_DMA_TEXTURE0, 2);
+	OUT_RING  (chan, chan->vram->handle);
+	OUT_RING  (chan, chan->gart->handle); /* TEXTURE1 */
+	BEGIN_RING(chan, kelvin, NV20TCL_DMA_COLOR, 2);
+	OUT_RING  (chan, chan->vram->handle);
+	OUT_RING  (chan, chan->vram->handle); /* ZETA */
 
-	BEGIN_RING(kelvin, NV20TCL_DMA_QUERY, 1);
-	OUT_RING  (0); /* renouveau: beef0351, unique */
+	BEGIN_RING(chan, kelvin, NV20TCL_DMA_QUERY, 1);
+	OUT_RING  (chan, 0); /* renouveau: beef0351, unique */
 
-	BEGIN_RING(kelvin, NV20TCL_RT_HORIZ, 2);
-	OUT_RING  (0);
-	OUT_RING  (0);
+	BEGIN_RING(chan, kelvin, NV20TCL_RT_HORIZ, 2);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
 
-	BEGIN_RING(kelvin, NV20TCL_VIEWPORT_CLIP_HORIZ(0), 1);
-	OUT_RING  ((0xfff << 16) | 0x0);
-	BEGIN_RING(kelvin, NV20TCL_VIEWPORT_CLIP_VERT(0), 1);
-	OUT_RING  ((0xfff << 16) | 0x0);
+	BEGIN_RING(chan, kelvin, NV20TCL_VIEWPORT_CLIP_HORIZ(0), 1);
+	OUT_RING  (chan, (0xfff << 16) | 0x0);
+	BEGIN_RING(chan, kelvin, NV20TCL_VIEWPORT_CLIP_VERT(0), 1);
+	OUT_RING  (chan, (0xfff << 16) | 0x0);
 
 	for (i = 1; i < NV20TCL_VIEWPORT_CLIP_HORIZ__SIZE; i++) {
-		BEGIN_RING(kelvin, NV20TCL_VIEWPORT_CLIP_HORIZ(i), 1);
-		OUT_RING  (0);
-		BEGIN_RING(kelvin, NV20TCL_VIEWPORT_CLIP_VERT(i), 1);
-		OUT_RING  (0);
+		BEGIN_RING(chan, kelvin, NV20TCL_VIEWPORT_CLIP_HORIZ(i), 1);
+		OUT_RING  (chan, 0);
+		BEGIN_RING(chan, kelvin, NV20TCL_VIEWPORT_CLIP_VERT(i), 1);
+		OUT_RING  (chan, 0);
 	}
 
-	BEGIN_RING(kelvin, NV20TCL_VIEWPORT_CLIP_MODE, 1);
-	OUT_RING  (0);
+	BEGIN_RING(chan, kelvin, NV20TCL_VIEWPORT_CLIP_MODE, 1);
+	OUT_RING  (chan, 0);
 
-	BEGIN_RING(kelvin, 0x17e0, 3);
-	OUT_RINGf (0.0);
-	OUT_RINGf (0.0);
-	OUT_RINGf (1.0);
+	BEGIN_RING(chan, kelvin, 0x17e0, 3);
+	OUT_RINGf (chan, 0.0);
+	OUT_RINGf (chan, 0.0);
+	OUT_RINGf (chan, 1.0);
 
 	if (is_nv25tcl) {
-		BEGIN_RING(kelvin, NV20TCL_TX_RCOMP, 1);
-		OUT_RING  (NV20TCL_TX_RCOMP_LEQUAL | 0xdb0);
+		BEGIN_RING(chan, kelvin, NV20TCL_TX_RCOMP, 1);
+		OUT_RING  (chan, NV20TCL_TX_RCOMP_LEQUAL | 0xdb0);
 	} else {
-		BEGIN_RING(kelvin, 0x1e68, 1);
-		OUT_RING  (0x4b800000); /* 16777216.000000 */
-		BEGIN_RING(kelvin, NV20TCL_TX_RCOMP, 1);
-		OUT_RING  (NV20TCL_TX_RCOMP_LEQUAL);
+		BEGIN_RING(chan, kelvin, 0x1e68, 1);
+		OUT_RING  (chan, 0x4b800000); /* 16777216.000000 */
+		BEGIN_RING(chan, kelvin, NV20TCL_TX_RCOMP, 1);
+		OUT_RING  (chan, NV20TCL_TX_RCOMP_LEQUAL);
 	}
 
-	BEGIN_RING(kelvin, 0x290, 1);
-	OUT_RING  ((0x10 << 16) | 1);
-	BEGIN_RING(kelvin, 0x9fc, 1);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, 0x1d80, 1);
-	OUT_RING  (1);
-	BEGIN_RING(kelvin, 0x9f8, 1);
-	OUT_RING  (4);
-	BEGIN_RING(kelvin, 0x17ec, 3);
-	OUT_RINGf (0.0);
-	OUT_RINGf (1.0);
-	OUT_RINGf (0.0);
+	BEGIN_RING(chan, kelvin, 0x290, 1);
+	OUT_RING  (chan, (0x10 << 16) | 1);
+	BEGIN_RING(chan, kelvin, 0x9fc, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, 0x1d80, 1);
+	OUT_RING  (chan, 1);
+	BEGIN_RING(chan, kelvin, 0x9f8, 1);
+	OUT_RING  (chan, 4);
+	BEGIN_RING(chan, kelvin, 0x17ec, 3);
+	OUT_RINGf (chan, 0.0);
+	OUT_RINGf (chan, 1.0);
+	OUT_RINGf (chan, 0.0);
 
 	if (is_nv25tcl) {
-		BEGIN_RING(kelvin, 0x1d88, 1);
-		OUT_RING  (3);
+		BEGIN_RING(chan, kelvin, 0x1d88, 1);
+		OUT_RING  (chan, 3);
 
-		BEGIN_RING(kelvin, NV25TCL_DMA_IN_MEMORY9, 1);
-		OUT_RING  (chan->vram->handle);
-		BEGIN_RING(kelvin, NV25TCL_DMA_IN_MEMORY8, 1);
-		OUT_RING  (chan->vram->handle);
+		BEGIN_RING(chan, kelvin, NV25TCL_DMA_IN_MEMORY9, 1);
+		OUT_RING  (chan, chan->vram->handle);
+		BEGIN_RING(chan, kelvin, NV25TCL_DMA_IN_MEMORY8, 1);
+		OUT_RING  (chan, chan->vram->handle);
 	}
-	BEGIN_RING(kelvin, NV20TCL_DMA_FENCE, 1);
-	OUT_RING  (0);	/* renouveau: beef1e10 */
+	BEGIN_RING(chan, kelvin, NV20TCL_DMA_FENCE, 1);
+	OUT_RING  (chan, 0);	/* renouveau: beef1e10 */
 
-	BEGIN_RING(kelvin, 0x1e98, 1);
-	OUT_RING  (0);
+	BEGIN_RING(chan, kelvin, 0x1e98, 1);
+	OUT_RING  (chan, 0);
 #if 0
 	if (is_nv25tcl) {
-		BEGIN_RING(NvSub3D, NV25TCL_DMA_IN_MEMORY4, 2);
-		OUT_RING  (NvDmaTT);	/* renouveau: beef0202 */
-		OUT_RING  (NvDmaFB);	/* renouveau: beef0201 */
+		BEGIN_RING(chan, NvSub3D, NV25TCL_DMA_IN_MEMORY4, 2);
+		OUT_RING  (chan, NvDmaTT);	/* renouveau: beef0202 */
+		OUT_RING  (chan, NvDmaFB);	/* renouveau: beef0201 */
 
-		BEGIN_RING(NvSub3D, NV20TCL_DMA_TEXTURE1, 1);
-		OUT_RING  (NvDmaTT);	/* renouveau: beef0202 */
+		BEGIN_RING(chan, NvSub3D, NV20TCL_DMA_TEXTURE1, 1);
+		OUT_RING  (chan, NvDmaTT);	/* renouveau: beef0202 */
 	}
 #endif
-	BEGIN_RING(kelvin, NV20TCL_NOTIFY, 1);
-	OUT_RING  (0);
+	BEGIN_RING(chan, kelvin, NV20TCL_NOTIFY, 1);
+	OUT_RING  (chan, 0);
 
-	BEGIN_RING(kelvin, 0x120, 3);
-	OUT_RING  (0);
-	OUT_RING  (1);
-	OUT_RING  (2);
+	BEGIN_RING(chan, kelvin, 0x120, 3);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 1);
+	OUT_RING  (chan, 2);
 
 /* error: ILLEGAL_MTHD, PROTECTION_FAULT
-	BEGIN_RING(kelvin, NV20TCL_VIEWPORT_TRANSLATE_X, 4);
-	OUT_RINGf (0.0);
-	OUT_RINGf (512.0);
-	OUT_RINGf (0.0);
-	OUT_RINGf (0.0);
+	BEGIN_RING(chan, kelvin, NV20TCL_VIEWPORT_TRANSLATE_X, 4);
+	OUT_RINGf (chan, 0.0);
+	OUT_RINGf (chan, 512.0);
+	OUT_RINGf (chan, 0.0);
+	OUT_RINGf (chan, 0.0);
 */
 
 	if (is_nv25tcl) {
-		BEGIN_RING(kelvin, 0x022c, 2);
-		OUT_RING  (0x280);
-		OUT_RING  (0x07d28000);
+		BEGIN_RING(chan, kelvin, 0x022c, 2);
+		OUT_RING  (chan, 0x280);
+		OUT_RING  (chan, 0x07d28000);
 	}
 
 /* * illegal method, protection fault
-	BEGIN_RING(NvSub3D, 0x1c2c, 1);
-	OUT_RING  (0); */
+	BEGIN_RING(chan, NvSub3D, 0x1c2c, 1);
+	OUT_RING  (chan, 0); */
 
 	if (is_nv25tcl) {
-		BEGIN_RING(kelvin, 0x1da4, 1);
-		OUT_RING  (0);
+		BEGIN_RING(chan, kelvin, 0x1da4, 1);
+		OUT_RING  (chan, 0);
 	}
 
 /* * crashes with illegal method, protection fault
-	BEGIN_RING(NvSub3D, 0x1c18, 1);
-	OUT_RING  (0x200); */
+	BEGIN_RING(chan, NvSub3D, 0x1c18, 1);
+	OUT_RING  (chan, 0x200); */
 
-	BEGIN_RING(kelvin, NV20TCL_RT_HORIZ, 2);
-	OUT_RING  ((0 << 16) | 0);
-	OUT_RING  ((0 << 16) | 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_RT_HORIZ, 2);
+	OUT_RING  (chan, (0 << 16) | 0);
+	OUT_RING  (chan, (0 << 16) | 0);
 
 	/* *** Set state *** */
 
-	BEGIN_RING(kelvin, NV20TCL_ALPHA_FUNC_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_ALPHA_FUNC_FUNC, 2);
-	OUT_RING  (NV20TCL_ALPHA_FUNC_FUNC_ALWAYS);
-	OUT_RING  (0);			/* NV20TCL_ALPHA_FUNC_REF */
+	BEGIN_RING(chan, kelvin, NV20TCL_ALPHA_FUNC_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_ALPHA_FUNC_FUNC, 2);
+	OUT_RING  (chan, NV20TCL_ALPHA_FUNC_FUNC_ALWAYS);
+	OUT_RING  (chan, 0);			/* NV20TCL_ALPHA_FUNC_REF */
 
 	for (i = 0; i < NV20TCL_TX_ENABLE__SIZE; ++i) {
-		BEGIN_RING(kelvin, NV20TCL_TX_ENABLE(i), 1);
-		OUT_RING  (0);
+		BEGIN_RING(chan, kelvin, NV20TCL_TX_ENABLE(i), 1);
+		OUT_RING  (chan, 0);
 	}
-	BEGIN_RING(kelvin, NV20TCL_TX_SHADER_OP, 1);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_TX_SHADER_CULL_MODE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_RC_IN_ALPHA(0), 4);
-	OUT_RING  (0x30d410d0);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_RC_OUT_RGB(0), 4);
-	OUT_RING  (0x00000c00);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_RC_ENABLE, 1);
-	OUT_RING  (0x00011101);
-	BEGIN_RING(kelvin, NV20TCL_RC_FINAL0, 2);
-	OUT_RING  (0x130e0300);
-	OUT_RING  (0x0c091c80);
-	BEGIN_RING(kelvin, NV20TCL_RC_OUT_ALPHA(0), 4);
-	OUT_RING  (0x00000c00);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_RC_IN_RGB(0), 4);
-	OUT_RING  (0x20c400c0);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_RC_COLOR0, 2);
-	OUT_RING  (0);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_RC_CONSTANT_COLOR0(0), 4);
-	OUT_RING  (0x035125a0);
-	OUT_RING  (0);
-	OUT_RING  (0x40002000);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_MULTISAMPLE_CONTROL, 1);
-	OUT_RING  (0xffff0000);
-
-	BEGIN_RING(kelvin, NV20TCL_BLEND_FUNC_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_DITHER_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_STENCIL_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_BLEND_FUNC_SRC, 4);
-	OUT_RING  (NV20TCL_BLEND_FUNC_SRC_ONE);
-	OUT_RING  (NV20TCL_BLEND_FUNC_DST_ZERO);
-	OUT_RING  (0);			/* NV20TCL_BLEND_COLOR */
-	OUT_RING  (NV20TCL_BLEND_EQUATION_FUNC_ADD);
-	BEGIN_RING(kelvin, NV20TCL_STENCIL_MASK, 7);
-	OUT_RING  (0xff);
-	OUT_RING  (NV20TCL_STENCIL_FUNC_FUNC_ALWAYS);
-	OUT_RING  (0);			/* NV20TCL_STENCIL_FUNC_REF */
-	OUT_RING  (0xff);		/* NV20TCL_STENCIL_FUNC_MASK */
-	OUT_RING  (NV20TCL_STENCIL_OP_FAIL_KEEP);
-	OUT_RING  (NV20TCL_STENCIL_OP_ZFAIL_KEEP);
-	OUT_RING  (NV20TCL_STENCIL_OP_ZPASS_KEEP);
-
-	BEGIN_RING(kelvin, NV20TCL_COLOR_LOGIC_OP_ENABLE, 2);
-	OUT_RING  (0);
-	OUT_RING  (NV20TCL_COLOR_LOGIC_OP_OP_COPY);
-	BEGIN_RING(kelvin, 0x17cc, 1);
-	OUT_RING  (0);
+	BEGIN_RING(chan, kelvin, NV20TCL_TX_SHADER_OP, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_TX_SHADER_CULL_MODE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_RC_IN_ALPHA(0), 4);
+	OUT_RING  (chan, 0x30d410d0);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_RC_OUT_RGB(0), 4);
+	OUT_RING  (chan, 0x00000c00);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_RC_ENABLE, 1);
+	OUT_RING  (chan, 0x00011101);
+	BEGIN_RING(chan, kelvin, NV20TCL_RC_FINAL0, 2);
+	OUT_RING  (chan, 0x130e0300);
+	OUT_RING  (chan, 0x0c091c80);
+	BEGIN_RING(chan, kelvin, NV20TCL_RC_OUT_ALPHA(0), 4);
+	OUT_RING  (chan, 0x00000c00);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_RC_IN_RGB(0), 4);
+	OUT_RING  (chan, 0x20c400c0);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_RC_COLOR0, 2);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_RC_CONSTANT_COLOR0(0), 4);
+	OUT_RING  (chan, 0x035125a0);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0x40002000);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_MULTISAMPLE_CONTROL, 1);
+	OUT_RING  (chan, 0xffff0000);
+
+	BEGIN_RING(chan, kelvin, NV20TCL_BLEND_FUNC_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_DITHER_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_STENCIL_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_BLEND_FUNC_SRC, 4);
+	OUT_RING  (chan, NV20TCL_BLEND_FUNC_SRC_ONE);
+	OUT_RING  (chan, NV20TCL_BLEND_FUNC_DST_ZERO);
+	OUT_RING  (chan, 0);			/* NV20TCL_BLEND_COLOR */
+	OUT_RING  (chan, NV20TCL_BLEND_EQUATION_FUNC_ADD);
+	BEGIN_RING(chan, kelvin, NV20TCL_STENCIL_MASK, 7);
+	OUT_RING  (chan, 0xff);
+	OUT_RING  (chan, NV20TCL_STENCIL_FUNC_FUNC_ALWAYS);
+	OUT_RING  (chan, 0);			/* NV20TCL_STENCIL_FUNC_REF */
+	OUT_RING  (chan, 0xff);		/* NV20TCL_STENCIL_FUNC_MASK */
+	OUT_RING  (chan, NV20TCL_STENCIL_OP_FAIL_KEEP);
+	OUT_RING  (chan, NV20TCL_STENCIL_OP_ZFAIL_KEEP);
+	OUT_RING  (chan, NV20TCL_STENCIL_OP_ZPASS_KEEP);
+
+	BEGIN_RING(chan, kelvin, NV20TCL_COLOR_LOGIC_OP_ENABLE, 2);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, NV20TCL_COLOR_LOGIC_OP_OP_COPY);
+	BEGIN_RING(chan, kelvin, 0x17cc, 1);
+	OUT_RING  (chan, 0);
 	if (is_nv25tcl) {
-		BEGIN_RING(kelvin, 0x1d84, 1);
-		OUT_RING  (1);
+		BEGIN_RING(chan, kelvin, 0x1d84, 1);
+		OUT_RING  (chan, 1);
 	}
-	BEGIN_RING(kelvin, NV20TCL_LIGHTING_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_LIGHT_CONTROL, 1);
-	OUT_RING  (0x00020000);
-	BEGIN_RING(kelvin, NV20TCL_SEPARATE_SPECULAR_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_LIGHT_MODEL_TWO_SIDE_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_ENABLED_LIGHTS, 1);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_NORMALIZE_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_POLYGON_STIPPLE_PATTERN(0),
+	BEGIN_RING(chan, kelvin, NV20TCL_LIGHTING_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_LIGHT_CONTROL, 1);
+	OUT_RING  (chan, 0x00020000);
+	BEGIN_RING(chan, kelvin, NV20TCL_SEPARATE_SPECULAR_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_LIGHT_MODEL_TWO_SIDE_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_ENABLED_LIGHTS, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_NORMALIZE_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_POLYGON_STIPPLE_PATTERN(0),
 					NV20TCL_POLYGON_STIPPLE_PATTERN__SIZE);
 	for (i = 0; i < NV20TCL_POLYGON_STIPPLE_PATTERN__SIZE; ++i) {
-		OUT_RING(0xffffffff);
+		OUT_RING(chan, 0xffffffff);
 	}
 
-	BEGIN_RING(kelvin, NV20TCL_POLYGON_OFFSET_POINT_ENABLE, 3);
-	OUT_RING  (0);
-	OUT_RING  (0);		/* NV20TCL.POLYGON_OFFSET_LINE_ENABLE */
-	OUT_RING  (0);		/* NV20TCL.POLYGON_OFFSET_FILL_ENABLE */
-	BEGIN_RING(kelvin, NV20TCL_DEPTH_FUNC, 1);
-	OUT_RING  (NV20TCL_DEPTH_FUNC_LESS);
-	BEGIN_RING(kelvin, NV20TCL_DEPTH_WRITE_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_DEPTH_TEST_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_POLYGON_OFFSET_FACTOR, 2);
-	OUT_RINGf (0.0);
-	OUT_RINGf (0.0);	/* NV20TCL.POLYGON_OFFSET_UNITS */
-	BEGIN_RING(kelvin, NV20TCL_DEPTH_UNK17D8, 1);
-	OUT_RING  (1);
+	BEGIN_RING(chan, kelvin, NV20TCL_POLYGON_OFFSET_POINT_ENABLE, 3);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);		/* NV20TCL.POLYGON_OFFSET_LINE_ENABLE */
+	OUT_RING  (chan, 0);		/* NV20TCL.POLYGON_OFFSET_FILL_ENABLE */
+	BEGIN_RING(chan, kelvin, NV20TCL_DEPTH_FUNC, 1);
+	OUT_RING  (chan, NV20TCL_DEPTH_FUNC_LESS);
+	BEGIN_RING(chan, kelvin, NV20TCL_DEPTH_WRITE_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_DEPTH_TEST_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_POLYGON_OFFSET_FACTOR, 2);
+	OUT_RINGf (chan, 0.0);
+	OUT_RINGf (chan, 0.0);	/* NV20TCL.POLYGON_OFFSET_UNITS */
+	BEGIN_RING(chan, kelvin, NV20TCL_DEPTH_UNK17D8, 1);
+	OUT_RING  (chan, 1);
 	if (!is_nv25tcl) {
-		BEGIN_RING(kelvin, 0x1d84, 1);
-		OUT_RING  (3);
+		BEGIN_RING(chan, kelvin, 0x1d84, 1);
+		OUT_RING  (chan, 3);
 	}
-	BEGIN_RING(kelvin, NV20TCL_POINT_SIZE, 1);
+	BEGIN_RING(chan, kelvin, NV20TCL_POINT_SIZE, 1);
 	if (!is_nv25tcl) {
-		OUT_RING  (8);
+		OUT_RING  (chan, 8);
 	} else {
-		OUT_RINGf (1.0);
+		OUT_RINGf (chan, 1.0);
 	}
 	if (!is_nv25tcl) {
-		BEGIN_RING(kelvin, NV20TCL_POINT_PARAMETERS_ENABLE, 2);
-		OUT_RING  (0);
-		OUT_RING  (0);		/* NV20TCL.POINT_SMOOTH_ENABLE */
+		BEGIN_RING(chan, kelvin, NV20TCL_POINT_PARAMETERS_ENABLE, 2);
+		OUT_RING  (chan, 0);
+		OUT_RING  (chan, 0);		/* NV20TCL.POINT_SMOOTH_ENABLE */
 	} else {
-		BEGIN_RING(kelvin, NV20TCL_POINT_PARAMETERS_ENABLE, 1);
-		OUT_RING  (0);
-		BEGIN_RING(kelvin, 0x0a1c, 1);
-		OUT_RING  (0x800);
+		BEGIN_RING(chan, kelvin, NV20TCL_POINT_PARAMETERS_ENABLE, 1);
+		OUT_RING  (chan, 0);
+		BEGIN_RING(chan, kelvin, 0x0a1c, 1);
+		OUT_RING  (chan, 0x800);
 	}
-	BEGIN_RING(kelvin, NV20TCL_LINE_WIDTH, 1);
-	OUT_RING  (8);
-	BEGIN_RING(kelvin, NV20TCL_LINE_SMOOTH_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_POLYGON_MODE_FRONT, 2);
-	OUT_RING  (NV20TCL_POLYGON_MODE_FRONT_FILL);
-	OUT_RING  (NV20TCL_POLYGON_MODE_BACK_FILL);
-	BEGIN_RING(kelvin, NV20TCL_CULL_FACE, 2);
-	OUT_RING  (NV20TCL_CULL_FACE_BACK);
-	OUT_RING  (NV20TCL_FRONT_FACE_CCW);
-	BEGIN_RING(kelvin, NV20TCL_POLYGON_SMOOTH_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_CULL_FACE_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_SHADE_MODEL, 1);
-	OUT_RING  (NV20TCL_SHADE_MODEL_SMOOTH);
-	BEGIN_RING(kelvin, NV20TCL_POLYGON_STIPPLE_ENABLE, 1);
-	OUT_RING  (0);
-	BEGIN_RING(kelvin, NV20TCL_TX_GEN_S(0), 4 * NV20TCL_TX_GEN_S__SIZE);
+	BEGIN_RING(chan, kelvin, NV20TCL_LINE_WIDTH, 1);
+	OUT_RING  (chan, 8);
+	BEGIN_RING(chan, kelvin, NV20TCL_LINE_SMOOTH_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_POLYGON_MODE_FRONT, 2);
+	OUT_RING  (chan, NV20TCL_POLYGON_MODE_FRONT_FILL);
+	OUT_RING  (chan, NV20TCL_POLYGON_MODE_BACK_FILL);
+	BEGIN_RING(chan, kelvin, NV20TCL_CULL_FACE, 2);
+	OUT_RING  (chan, NV20TCL_CULL_FACE_BACK);
+	OUT_RING  (chan, NV20TCL_FRONT_FACE_CCW);
+	BEGIN_RING(chan, kelvin, NV20TCL_POLYGON_SMOOTH_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_CULL_FACE_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_SHADE_MODEL, 1);
+	OUT_RING  (chan, NV20TCL_SHADE_MODEL_SMOOTH);
+	BEGIN_RING(chan, kelvin, NV20TCL_POLYGON_STIPPLE_ENABLE, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_TX_GEN_S(0), 4 * NV20TCL_TX_GEN_S__SIZE);
 	for (i=0; i < 4 * NV20TCL_TX_GEN_S__SIZE; ++i) {
-		OUT_RING(0);
+		OUT_RING(chan, 0);
 	}
-	BEGIN_RING(kelvin, NV20TCL_FOG_EQUATION_CONSTANT, 3);
-	OUT_RINGf (1.5);
-	OUT_RINGf (-0.090168);		/* NV20TCL.FOG_EQUATION_LINEAR */
-	OUT_RINGf (0.0);		/* NV20TCL.FOG_EQUATION_QUADRATIC */
-	BEGIN_RING(kelvin, NV20TCL_FOG_MODE, 2);
-	OUT_RING  (NV20TCL_FOG_MODE_EXP_2);
-	OUT_RING  (NV20TCL_FOG_COORD_DIST_COORD_FOG);
-	BEGIN_RING(kelvin, NV20TCL_FOG_ENABLE, 2);
-	OUT_RING  (0);
-	OUT_RING  (0);			/* NV20TCL.FOG_COLOR */
-	BEGIN_RING(kelvin, NV20TCL_ENGINE, 1);
-	OUT_RING  (NV20TCL_ENGINE_FIXED);
+	BEGIN_RING(chan, kelvin, NV20TCL_FOG_EQUATION_CONSTANT, 3);
+	OUT_RINGf (chan, 1.5);
+	OUT_RINGf (chan, -0.090168);		/* NV20TCL.FOG_EQUATION_LINEAR */
+	OUT_RINGf (chan, 0.0);		/* NV20TCL.FOG_EQUATION_QUADRATIC */
+	BEGIN_RING(chan, kelvin, NV20TCL_FOG_MODE, 2);
+	OUT_RING  (chan, NV20TCL_FOG_MODE_EXP_SIGNED);
+	OUT_RING  (chan, NV20TCL_FOG_COORD_FOG);
+	BEGIN_RING(chan, kelvin, NV20TCL_FOG_ENABLE, 2);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 0);			/* NV20TCL.FOG_COLOR */
+	BEGIN_RING(chan, kelvin, NV20TCL_ENGINE, 1);
+	OUT_RING  (chan, NV20TCL_ENGINE_FIXED);
 
 	for (i = 0; i < NV20TCL_TX_MATRIX_ENABLE__SIZE; ++i) {
-		BEGIN_RING(kelvin, NV20TCL_TX_MATRIX_ENABLE(i), 1);
-		OUT_RING  (0);
+		BEGIN_RING(chan, kelvin, NV20TCL_TX_MATRIX_ENABLE(i), 1);
+		OUT_RING  (chan, 0);
 	}
 
-	BEGIN_RING(kelvin, NV20TCL_VTX_ATTR_4F_X(1), 4 * 15);
-	OUT_RINGf(1.0); OUT_RINGf(0.0); OUT_RINGf(0.0); OUT_RINGf(1.0);
-	OUT_RINGf(0.0); OUT_RINGf(0.0); OUT_RINGf(1.0); OUT_RINGf(1.0);
-	OUT_RINGf(1.0); OUT_RINGf(1.0); OUT_RINGf(1.0); OUT_RINGf(1.0);
+	BEGIN_RING(chan, kelvin, NV20TCL_VTX_ATTR_4F_X(1), 4 * 15);
+	OUT_RINGf(chan, 1.0); OUT_RINGf(chan, 0.0); OUT_RINGf(chan, 0.0); OUT_RINGf(chan, 1.0);
+	OUT_RINGf(chan, 0.0); OUT_RINGf(chan, 0.0); OUT_RINGf(chan, 1.0); OUT_RINGf(chan, 1.0);
+	OUT_RINGf(chan, 1.0); OUT_RINGf(chan, 1.0); OUT_RINGf(chan, 1.0); OUT_RINGf(chan, 1.0);
 	for (i = 4; i < 16; ++i) {
-		OUT_RINGf(0.0); OUT_RINGf(0.0); OUT_RINGf(0.0);	OUT_RINGf(1.0);
+		OUT_RINGf(chan, 0.0);
+		OUT_RINGf(chan, 0.0);
+		OUT_RINGf(chan, 0.0);
+		OUT_RINGf(chan, 1.0);
 	}
 
-	BEGIN_RING(kelvin, NV20TCL_EDGEFLAG_ENABLE, 1);
-	OUT_RING  (1);
-	BEGIN_RING(kelvin, NV20TCL_COLOR_MASK, 1);
-	OUT_RING (0x00010101);
-	BEGIN_RING(kelvin, NV20TCL_CLEAR_VALUE, 1);
-	OUT_RING (0);
+	BEGIN_RING(chan, kelvin, NV20TCL_EDGEFLAG_ENABLE, 1);
+	OUT_RING  (chan, 1);
+	BEGIN_RING(chan, kelvin, NV20TCL_COLOR_MASK, 1);
+	OUT_RING (chan, 0x00010101);
+	BEGIN_RING(chan, kelvin, NV20TCL_CLEAR_VALUE, 1);
+	OUT_RING (chan, 0);
 
 	memset(projectionmatrix, 0, sizeof(projectionmatrix));
 	projectionmatrix[0*4+0] = 1.0;
 	projectionmatrix[1*4+1] = 1.0;
 	projectionmatrix[2*4+2] = 16777215.0;
 	projectionmatrix[3*4+3] = 1.0;
-	BEGIN_RING(kelvin, NV20TCL_PROJECTION_MATRIX(0), 16);
+	BEGIN_RING(chan, kelvin, NV20TCL_PROJECTION_MATRIX(0), 16);
 	for (i = 0; i < 16; i++) {
-		OUT_RINGf  (projectionmatrix[i]);
+		OUT_RINGf  (chan, projectionmatrix[i]);
 	}
 
-	BEGIN_RING(kelvin, NV20TCL_DEPTH_RANGE_NEAR, 2);
-	OUT_RINGf (0.0);
-	OUT_RINGf (16777216.0); /* [0, 1] scaled approx to [0, 2^24] */
+	BEGIN_RING(chan, kelvin, NV20TCL_DEPTH_RANGE_NEAR, 2);
+	OUT_RINGf (chan, 0.0);
+	OUT_RINGf (chan, 16777216.0); /* [0, 1] scaled approx to [0, 2^24] */
 
-	BEGIN_RING(kelvin, NV20TCL_VIEWPORT_TRANSLATE_X, 4);
-	OUT_RINGf (0.0); /* x-offset, w/2 + 1.031250 */
-	OUT_RINGf (0.0); /* y-offset, h/2 + 0.030762 */
-	OUT_RINGf (0.0);
-	OUT_RINGf (16777215.0);
+	BEGIN_RING(chan, kelvin, NV20TCL_VIEWPORT_TRANSLATE_X, 4);
+	OUT_RINGf (chan, 0.0); /* x-offset, w/2 + 1.031250 */
+	OUT_RINGf (chan, 0.0); /* y-offset, h/2 + 0.030762 */
+	OUT_RINGf (chan, 0.0);
+	OUT_RINGf (chan, 16777215.0);
 
-	BEGIN_RING(kelvin, NV20TCL_VIEWPORT_SCALE_X, 4);
-	OUT_RINGf (0.0); /* no effect?, w/2 */
-	OUT_RINGf (0.0); /* no effect?, h/2 */
-	OUT_RINGf (16777215.0 * 0.5);
-	OUT_RINGf (65535.0);
+	BEGIN_RING(chan, kelvin, NV20TCL_VIEWPORT_SCALE_X, 4);
+	OUT_RINGf (chan, 0.0); /* no effect?, w/2 */
+	OUT_RINGf (chan, 0.0); /* no effect?, h/2 */
+	OUT_RINGf (chan, 16777215.0 * 0.5);
+	OUT_RINGf (chan, 65535.0);
 
-	FIRE_RING (NULL);
+	FIRE_RING (chan);
 }
 
 struct pipe_context *
diff --git a/src/gallium/drivers/nv20/nv20_context.h b/src/gallium/drivers/nv20/nv20_context.h
index a4eaa95660..c7dfadaa31 100644
--- a/src/gallium/drivers/nv20/nv20_context.h
+++ b/src/gallium/drivers/nv20/nv20_context.h
@@ -15,10 +15,6 @@
 #include "nouveau/nouveau_gldefs.h"
 #include "nouveau/nouveau_context.h"
 
-#define NOUVEAU_PUSH_CONTEXT(ctx)                                              \
-	struct nv20_screen *ctx = nv20->screen
-#include "nouveau/nouveau_push.h"
-
 #include "nv20_state.h"
 
 #define NOUVEAU_ERR(fmt, args...) \
@@ -143,9 +139,9 @@ extern void nv20_emit_hw_state(struct nv20_context *nv20);
 extern void nv20_state_tex_update(struct nv20_context *nv20);
 
 /* nv20_vbo.c */
-extern boolean nv20_draw_arrays(struct pipe_context *, unsigned mode,
+extern void nv20_draw_arrays(struct pipe_context *, unsigned mode,
 				unsigned start, unsigned count);
-extern boolean nv20_draw_elements( struct pipe_context *pipe,
+extern void nv20_draw_elements( struct pipe_context *pipe,
                     struct pipe_buffer *indexBuffer,
                     unsigned indexSize,
                     unsigned prim, unsigned start, unsigned count);
diff --git a/src/gallium/drivers/nv20/nv20_fragtex.c b/src/gallium/drivers/nv20/nv20_fragtex.c
index 2db4a4015a..dedbec73f3 100644
--- a/src/gallium/drivers/nv20/nv20_fragtex.c
+++ b/src/gallium/drivers/nv20/nv20_fragtex.c
@@ -52,6 +52,9 @@ nv20_fragtex_build(struct nv20_context *nv20, int unit)
 	struct nv20_miptree *nv20mt = nv20->tex_miptree[unit];
 	struct pipe_texture *pt = &nv20mt->base;
 	struct nv20_texture_format *tf;
+	struct nv20_screen *screen = nv20->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *kelvin = screen->kelvin;
 	uint32_t txf, txs, txp;
 
 	tf = nv20_fragtex_format(pt->format);
@@ -82,15 +85,15 @@ nv20_fragtex_build(struct nv20_context *nv20, int unit)
 		return;
 	}
 
-	BEGIN_RING(kelvin, NV10TCL_TX_OFFSET(unit), 8);
-	OUT_RELOCl(nv20mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
-	OUT_RELOCd(nv20mt->buffer,txf,NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/);
-	OUT_RING  (ps->wrap);
-	OUT_RING  (0x40000000); /* enable */
-	OUT_RING  (txs);
-	OUT_RING  (ps->filt | 0x2000 /* magic */);
-	OUT_RING  ((pt->width0 << 16) | pt->height0);
-	OUT_RING  (ps->bcol);
+	BEGIN_RING(chan, kelvin, NV10TCL_TX_OFFSET(unit), 8);
+	OUT_RELOCl(chan, nouveau_bo(nv20mt->buffer), 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+	OUT_RELOCd(chan, nouveau_bo(nv20mt->buffer),txf,NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/);
+	OUT_RING  (chan, ps->wrap);
+	OUT_RING  (chan, 0x40000000); /* enable */
+	OUT_RING  (chan, txs);
+	OUT_RING  (chan, ps->filt | 0x2000 /* magic */);
+	OUT_RING  (chan, (pt->width0 << 16) | pt->height0);
+	OUT_RING  (chan, ps->bcol);
 #endif
 }
 
@@ -99,6 +102,9 @@ nv20_fragtex_bind(struct nv20_context *nv20)
 {
 #if 0
 	struct nv20_fragment_program *fp = nv20->fragprog.active;
+	struct nv20_screen *screen = nv20->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *kelvin = screen->kelvin;
 	unsigned samplers, unit;
 
 	samplers = nv20->fp_samplers & ~fp->samplers;
@@ -106,8 +112,8 @@ nv20_fragtex_bind(struct nv20_context *nv20)
 		unit = ffs(samplers) - 1;
 		samplers &= ~(1 << unit);
 
-		BEGIN_RING(kelvin, NV10TCL_TX_ENABLE(unit), 1);
-		OUT_RING  (0);
+		BEGIN_RING(chan, kelvin, NV10TCL_TX_ENABLE(unit), 1);
+		OUT_RING  (chan, 0);
 	}
 
 	samplers = nv20->dirty_samplers & fp->samplers;
diff --git a/src/gallium/drivers/nv20/nv20_prim_vbuf.c b/src/gallium/drivers/nv20/nv20_prim_vbuf.c
index ddfcdb8057..2e145672da 100644
--- a/src/gallium/drivers/nv20/nv20_prim_vbuf.c
+++ b/src/gallium/drivers/nv20/nv20_prim_vbuf.c
@@ -81,12 +81,15 @@ nv20_vbuf_render(struct vbuf_render *render)
 void nv20_vtxbuf_bind( struct nv20_context* nv20 )
 {
 #if 0
+	struct nv20_screen *screen = nv20->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *kelvin = screen->kelvin;
 	int i;
 	for(i = 0; i < NV20TCL_VTXBUF_ADDRESS__SIZE; i++) {
-		BEGIN_RING(kelvin, NV20TCL_VTXBUF_ADDRESS(i), 1);
-		OUT_RING(0/*nv20->vtxbuf*/);
-		BEGIN_RING(kelvin, NV20TCL_VTXFMT(i) ,1);
-		OUT_RING(0/*XXX*/);
+		BEGIN_RING(chan, kelvin, NV20TCL_VTXBUF_ADDRESS(i), 1);
+		OUT_RING(chan, 0/*nv20->vtxbuf*/);
+		BEGIN_RING(chan, kelvin, NV20TCL_VTXFMT(i) ,1);
+		OUT_RING(chan, 0/*XXX*/);
 	}
 #endif
 }
@@ -202,6 +205,9 @@ nv20__vtxhwformat(unsigned stride, unsigned fields, unsigned type)
 static unsigned
 nv20__emit_format(struct nv20_context *nv20, enum attrib_emit type, int hwattr)
 {
+	struct nv20_screen *screen = nv20->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *kelvin = screen->kelvin;
 	uint32_t hwfmt = 0;
 	unsigned fields;
 
@@ -231,8 +237,8 @@ nv20__emit_format(struct nv20_context *nv20, enum attrib_emit type, int hwattr)
 		return 0;
 	}
 
-	BEGIN_RING(kelvin, NV20TCL_VTXFMT(hwattr), 1);
-	OUT_RING(hwfmt);
+	BEGIN_RING(chan, kelvin, NV20TCL_VTXFMT(hwattr), 1);
+	OUT_RING(chan, hwfmt);
 	return fields;
 }
 
@@ -262,6 +268,9 @@ nv20__draw_mbuffer(struct nv20_vbuf_render *nv20_render,
 		uint nr_indices)
 {
 	struct nv20_context *nv20 = nv20_render->nv20;
+	struct nv20_screen *screen = nv20->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *kelvin = screen->kelvin;
 	struct vertex_info *vinfo = &nv20->vertex_info;
 	unsigned nr_fields;
 	int max_push;
@@ -270,29 +279,29 @@ nv20__draw_mbuffer(struct nv20_vbuf_render *nv20_render,
 
 	nr_fields = nv20__emit_vertex_array_format(nv20);
 
-	BEGIN_RING(kelvin, NV20TCL_VERTEX_BEGIN_END, 1);
-	OUT_RING(nv20_render->hwprim);
+	BEGIN_RING(chan, kelvin, NV20TCL_VERTEX_BEGIN_END, 1);
+	OUT_RING(chan, nv20_render->hwprim);
 
 	max_push = 1200 / nr_fields;
 	while (nr_indices) {
 		int i;
 		int push = MIN2(nr_indices, max_push);
 
-		BEGIN_RING_NI(kelvin, NV20TCL_VERTEX_DATA, push * nr_fields);
+		BEGIN_RING_NI(chan, kelvin, NV20TCL_VERTEX_DATA, push * nr_fields);
 		for (i = 0; i < push; i++) {
 			/* XXX: fixme to handle other than floats? */
 			int f = nr_fields;
 			float *attrv = (float*)&data[indices[i] * vsz];
 			while (f-- > 0)
-				OUT_RINGf(*attrv++);
+				OUT_RINGf(chan, *attrv++);
 		}
 
 		nr_indices -= push;
 		indices += push;
 	}
 
-	BEGIN_RING(kelvin, NV20TCL_VERTEX_BEGIN_END, 1);
-	OUT_RING(NV20TCL_VERTEX_BEGIN_END_STOP);
+	BEGIN_RING(chan, kelvin, NV20TCL_VERTEX_BEGIN_END, 1);
+	OUT_RING(chan, NV20TCL_VERTEX_BEGIN_END_STOP);
 }
 
 static void
@@ -301,20 +310,23 @@ nv20__draw_pbuffer(struct nv20_vbuf_render *nv20_render,
 		uint nr_indices)
 {
 	struct nv20_context *nv20 = nv20_render->nv20;
+	struct nv20_screen *screen = nv20->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *kelvin = screen->kelvin;
 	int push, i;
 
 	NOUVEAU_ERR("nv20__draw_pbuffer: this path is broken.\n");
 
-	BEGIN_RING(kelvin, NV10TCL_VERTEX_ARRAY_OFFSET_POS, 1);
-	OUT_RELOCl(nv20_render->pbuffer, 0,
+	BEGIN_RING(chan, kelvin, NV10TCL_VERTEX_ARRAY_OFFSET_POS, 1);
+	OUT_RELOCl(chan, nouveau_bo(nv20_render->pbuffer), 0,
 			NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
 
-	BEGIN_RING(kelvin, NV10TCL_VERTEX_BUFFER_BEGIN_END, 1);
-	OUT_RING(nv20_render->hwprim);
+	BEGIN_RING(chan, kelvin, NV10TCL_VERTEX_BUFFER_BEGIN_END, 1);
+	OUT_RING(chan, nv20_render->hwprim);
 
 	if (nr_indices & 1) {
-		BEGIN_RING(kelvin, NV10TCL_VB_ELEMENT_U32, 1);
-		OUT_RING  (indices[0]);
+		BEGIN_RING(chan, kelvin, NV10TCL_VB_ELEMENT_U32, 1);
+		OUT_RING  (chan, indices[0]);
 		indices++; nr_indices--;
 	}
 
@@ -322,16 +334,16 @@ nv20__draw_pbuffer(struct nv20_vbuf_render *nv20_render,
 		// XXX too big/small ? check the size
 		push = MIN2(nr_indices, 1200 * 2);
 
-		BEGIN_RING_NI(kelvin, NV10TCL_VB_ELEMENT_U16, push >> 1);
+		BEGIN_RING_NI(chan, kelvin, NV10TCL_VB_ELEMENT_U16, push >> 1);
 		for (i = 0; i < push; i+=2)
-			OUT_RING((indices[i+1] << 16) | indices[i]);
+			OUT_RING(chan, (indices[i+1] << 16) | indices[i]);
 
 		nr_indices -= push;
 		indices  += push;
 	}
 
-	BEGIN_RING(kelvin, NV10TCL_VERTEX_BUFFER_BEGIN_END, 1);
-	OUT_RING  (0);
+	BEGIN_RING(chan, kelvin, NV10TCL_VERTEX_BUFFER_BEGIN_END, 1);
+	OUT_RING  (chan, 0);
 }
 
 static void
diff --git a/src/gallium/drivers/nv20/nv20_screen.c b/src/gallium/drivers/nv20/nv20_screen.c
index a0973f1ebd..d091335063 100644
--- a/src/gallium/drivers/nv20/nv20_screen.c
+++ b/src/gallium/drivers/nv20/nv20_screen.c
@@ -176,7 +176,6 @@ nv20_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 		NOUVEAU_ERR("Error creating 3D object: %d\n", ret);
 		return FALSE;
 	}
-	BIND_RING(chan, screen->kelvin, 7);
 
 	/* 2D engine setup */
 	screen->eng2d = nv04_surface_2d_init(&screen->base);
diff --git a/src/gallium/drivers/nv20/nv20_state_emit.c b/src/gallium/drivers/nv20/nv20_state_emit.c
index 63cba1f412..6bbd1fdae9 100644
--- a/src/gallium/drivers/nv20/nv20_state_emit.c
+++ b/src/gallium/drivers/nv20/nv20_state_emit.c
@@ -5,27 +5,34 @@
 static void nv20_state_emit_blend(struct nv20_context* nv20)
 {
 	struct nv20_blend_state *b = nv20->blend;
+	struct nv20_screen *screen = nv20->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *kelvin = screen->kelvin;
 
-	BEGIN_RING(kelvin, NV20TCL_DITHER_ENABLE, 1);
-	OUT_RING  (b->d_enable);
+	BEGIN_RING(chan, kelvin, NV20TCL_DITHER_ENABLE, 1);
+	OUT_RING  (chan, b->d_enable);
 
-	BEGIN_RING(kelvin, NV20TCL_BLEND_FUNC_ENABLE, 1);
-	OUT_RING  (b->b_enable);
+	BEGIN_RING(chan, kelvin, NV20TCL_BLEND_FUNC_ENABLE, 1);
+	OUT_RING  (chan, b->b_enable);
 
-	BEGIN_RING(kelvin, NV20TCL_BLEND_FUNC_SRC, 2);
-	OUT_RING  (b->b_srcfunc);
-	OUT_RING  (b->b_dstfunc);
+	BEGIN_RING(chan, kelvin, NV20TCL_BLEND_FUNC_SRC, 2);
+	OUT_RING  (chan, b->b_srcfunc);
+	OUT_RING  (chan, b->b_dstfunc);
 
-	BEGIN_RING(kelvin, NV20TCL_COLOR_MASK, 1);
-	OUT_RING  (b->c_mask);
+	BEGIN_RING(chan, kelvin, NV20TCL_COLOR_MASK, 1);
+	OUT_RING  (chan, b->c_mask);
 }
 
 static void nv20_state_emit_blend_color(struct nv20_context* nv20)
 {
 	struct pipe_blend_color *c = nv20->blend_color;
+	struct nv20_screen *screen = nv20->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *kelvin = screen->kelvin;
 
-	BEGIN_RING(kelvin, NV20TCL_BLEND_COLOR, 1);
-	OUT_RING  ((float_to_ubyte(c->color[3]) << 24)|
+	BEGIN_RING(chan, kelvin, NV20TCL_BLEND_COLOR, 1);
+	OUT_RING  (chan,
+		   (float_to_ubyte(c->color[3]) << 24)|
 		   (float_to_ubyte(c->color[0]) << 16)|
 		   (float_to_ubyte(c->color[1]) << 8) |
 		   (float_to_ubyte(c->color[2]) << 0));
@@ -34,63 +41,69 @@ static void nv20_state_emit_blend_color(struct nv20_context* nv20)
 static void nv20_state_emit_rast(struct nv20_context* nv20)
 {
 	struct nv20_rasterizer_state *r = nv20->rast;
+	struct nv20_screen *screen = nv20->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *kelvin = screen->kelvin;
 
-	BEGIN_RING(kelvin, NV20TCL_SHADE_MODEL, 2);
-	OUT_RING  (r->shade_model);
-	OUT_RING  (r->line_width);
+	BEGIN_RING(chan, kelvin, NV20TCL_SHADE_MODEL, 2);
+	OUT_RING  (chan, r->shade_model);
+	OUT_RING  (chan, r->line_width);
 
 
-	BEGIN_RING(kelvin, NV20TCL_POINT_SIZE, 1);
-	OUT_RING  (r->point_size);
+	BEGIN_RING(chan, kelvin, NV20TCL_POINT_SIZE, 1);
+	OUT_RING  (chan, r->point_size);
 
-	BEGIN_RING(kelvin, NV20TCL_POLYGON_MODE_FRONT, 2);
-	OUT_RING  (r->poly_mode_front);
-	OUT_RING  (r->poly_mode_back);
+	BEGIN_RING(chan, kelvin, NV20TCL_POLYGON_MODE_FRONT, 2);
+	OUT_RING  (chan, r->poly_mode_front);
+	OUT_RING  (chan, r->poly_mode_back);
 
 
-	BEGIN_RING(kelvin, NV20TCL_CULL_FACE, 2);
-	OUT_RING  (r->cull_face);
-	OUT_RING  (r->front_face);
+	BEGIN_RING(chan, kelvin, NV20TCL_CULL_FACE, 2);
+	OUT_RING  (chan, r->cull_face);
+	OUT_RING  (chan, r->front_face);
 
-	BEGIN_RING(kelvin, NV20TCL_LINE_SMOOTH_ENABLE, 2);
-	OUT_RING  (r->line_smooth_en);
-	OUT_RING  (r->poly_smooth_en);
+	BEGIN_RING(chan, kelvin, NV20TCL_LINE_SMOOTH_ENABLE, 2);
+	OUT_RING  (chan, r->line_smooth_en);
+	OUT_RING  (chan, r->poly_smooth_en);
 
-	BEGIN_RING(kelvin, NV20TCL_CULL_FACE_ENABLE, 1);
-	OUT_RING  (r->cull_face_en);
+	BEGIN_RING(chan, kelvin, NV20TCL_CULL_FACE_ENABLE, 1);
+	OUT_RING  (chan, r->cull_face_en);
 }
 
 static void nv20_state_emit_dsa(struct nv20_context* nv20)
 {
 	struct nv20_depth_stencil_alpha_state *d = nv20->dsa;
+	struct nv20_screen *screen = nv20->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *kelvin = screen->kelvin;
 
-	BEGIN_RING(kelvin, NV20TCL_DEPTH_FUNC, 1);
-	OUT_RING (d->depth.func);
+	BEGIN_RING(chan, kelvin, NV20TCL_DEPTH_FUNC, 1);
+	OUT_RING (chan, d->depth.func);
 
-	BEGIN_RING(kelvin, NV20TCL_DEPTH_WRITE_ENABLE, 1);
-	OUT_RING (d->depth.write_enable);
+	BEGIN_RING(chan, kelvin, NV20TCL_DEPTH_WRITE_ENABLE, 1);
+	OUT_RING (chan, d->depth.write_enable);
 
-	BEGIN_RING(kelvin, NV20TCL_DEPTH_TEST_ENABLE, 1);
-	OUT_RING (d->depth.test_enable);
+	BEGIN_RING(chan, kelvin, NV20TCL_DEPTH_TEST_ENABLE, 1);
+	OUT_RING (chan, d->depth.test_enable);
 
-	BEGIN_RING(kelvin, NV20TCL_DEPTH_UNK17D8, 1);
-	OUT_RING (1);
+	BEGIN_RING(chan, kelvin, NV20TCL_DEPTH_UNK17D8, 1);
+	OUT_RING (chan, 1);
 
 #if 0
-	BEGIN_RING(kelvin, NV20TCL_STENCIL_ENABLE, 1);
-	OUT_RING (d->stencil.enable);
-	BEGIN_RING(kelvin, NV20TCL_STENCIL_MASK, 7);
-	OUT_RINGp ((uint32_t *)&(d->stencil.wmask), 7);
+	BEGIN_RING(chan, kelvin, NV20TCL_STENCIL_ENABLE, 1);
+	OUT_RING (chan, d->stencil.enable);
+	BEGIN_RING(chan, kelvin, NV20TCL_STENCIL_MASK, 7);
+	OUT_RINGp (chan, (uint32_t *)&(d->stencil.wmask), 7);
 #endif
 
-	BEGIN_RING(kelvin, NV20TCL_ALPHA_FUNC_ENABLE, 1);
-	OUT_RING (d->alpha.enabled);
+	BEGIN_RING(chan, kelvin, NV20TCL_ALPHA_FUNC_ENABLE, 1);
+	OUT_RING (chan, d->alpha.enabled);
 
-	BEGIN_RING(kelvin, NV20TCL_ALPHA_FUNC_FUNC, 1);
-	OUT_RING (d->alpha.func);
+	BEGIN_RING(chan, kelvin, NV20TCL_ALPHA_FUNC_FUNC, 1);
+	OUT_RING (chan, d->alpha.func);
 
-	BEGIN_RING(kelvin, NV20TCL_ALPHA_FUNC_REF, 1);
-	OUT_RING (d->alpha.ref);
+	BEGIN_RING(chan, kelvin, NV20TCL_ALPHA_FUNC_REF, 1);
+	OUT_RING (chan, d->alpha.ref);
 }
 
 static void nv20_state_emit_viewport(struct nv20_context* nv20)
@@ -101,9 +114,13 @@ static void nv20_state_emit_scissor(struct nv20_context* nv20)
 {
 	/* NV20TCL_SCISSOR_* is probably a software method */
 /*	struct pipe_scissor_state *s = nv20->scissor;
-	BEGIN_RING(kelvin, NV20TCL_SCISSOR_HORIZ, 2);
-	OUT_RING  (((s->maxx - s->minx) << 16) | s->minx);
-	OUT_RING  (((s->maxy - s->miny) << 16) | s->miny);*/
+	struct nv20_screen *screen = nv20->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *kelvin = screen->kelvin;
+
+	BEGIN_RING(chan, kelvin, NV20TCL_SCISSOR_HORIZ, 2);
+	OUT_RING  (chan, ((s->maxx - s->minx) << 16) | s->minx);
+	OUT_RING  (chan, ((s->maxy - s->miny) << 16) | s->miny);*/
 }
 
 static void nv20_state_emit_framebuffer(struct nv20_context* nv20)
@@ -113,6 +130,9 @@ static void nv20_state_emit_framebuffer(struct nv20_context* nv20)
 	uint32_t rt_format, w, h;
 	int colour_format = 0, zeta_format = 0;
 	struct nv20_miptree *nv20mt = 0;
+	struct nv20_screen *screen = nv20->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *kelvin = screen->kelvin;
 
 	w = fb->cbufs[0]->width;
 	h = fb->cbufs[0]->height;
@@ -150,11 +170,11 @@ static void nv20_state_emit_framebuffer(struct nv20_context* nv20)
 	}
 
 	if (zeta) {
-		BEGIN_RING(kelvin, NV20TCL_RT_PITCH, 1);
-		OUT_RING  (rt->pitch | (zeta->pitch << 16));
+		BEGIN_RING(chan, kelvin, NV20TCL_RT_PITCH, 1);
+		OUT_RING  (chan, rt->pitch | (zeta->pitch << 16));
 	} else {
-		BEGIN_RING(kelvin, NV20TCL_RT_PITCH, 1);
-		OUT_RING  (rt->pitch | (rt->pitch << 16));
+		BEGIN_RING(chan, kelvin, NV20TCL_RT_PITCH, 1);
+		OUT_RING  (chan, rt->pitch | (rt->pitch << 16));
 	}
 
 	nv20mt = (struct nv20_miptree *)rt->base.texture;
@@ -166,13 +186,13 @@ static void nv20_state_emit_framebuffer(struct nv20_context* nv20)
 		nv20->zeta = nv20mt->buffer;
 	}
 
-	BEGIN_RING(kelvin, NV20TCL_RT_HORIZ, 3);
-	OUT_RING  ((w << 16) | 0);
-	OUT_RING  ((h << 16) | 0); /*NV20TCL_RT_VERT */
-	OUT_RING  (rt_format); /* NV20TCL_RT_FORMAT */
-	BEGIN_RING(kelvin, NV20TCL_VIEWPORT_CLIP_HORIZ(0), 2);
-	OUT_RING  (((w - 1) << 16) | 0);
-	OUT_RING  (((h - 1) << 16) | 0);
+	BEGIN_RING(chan, kelvin, NV20TCL_RT_HORIZ, 3);
+	OUT_RING  (chan, (w << 16) | 0);
+	OUT_RING  (chan, (h << 16) | 0); /*NV20TCL_RT_VERT */
+	OUT_RING  (chan, rt_format); /* NV20TCL_RT_FORMAT */
+	BEGIN_RING(chan, kelvin, NV20TCL_VIEWPORT_CLIP_HORIZ(0), 2);
+	OUT_RING  (chan, ((w - 1) << 16) | 0);
+	OUT_RING  (chan, ((h - 1) << 16) | 0);
 }
 
 static void nv20_vertex_layout(struct nv20_context *nv20)
@@ -293,6 +313,10 @@ static void nv20_vertex_layout(struct nv20_context *nv20)
 void
 nv20_emit_hw_state(struct nv20_context *nv20)
 {
+	struct nv20_screen *screen = nv20->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *kelvin = screen->kelvin;
+	struct nouveau_bo *rt_bo;
 	int i;
 
 	if (nv20->dirty & NV20_NEW_VERTPROG) {
@@ -361,36 +385,39 @@ nv20_emit_hw_state(struct nv20_context *nv20)
 	 */
 
 	/* Render target */
-	BEGIN_RING(kelvin, NV20TCL_DMA_COLOR, 1);
-	OUT_RELOCo(nv20->rt[0], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-	BEGIN_RING(kelvin, NV20TCL_COLOR_OFFSET, 1);
-	OUT_RELOCl(nv20->rt[0], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	rt_bo = nouveau_bo(nv20->rt[0]);
+	BEGIN_RING(chan, kelvin, NV20TCL_DMA_COLOR, 1);
+	OUT_RELOCo(chan, rt_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	BEGIN_RING(chan, kelvin, NV20TCL_COLOR_OFFSET, 1);
+	OUT_RELOCl(chan, rt_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 
 	if (nv20->zeta) {
-		BEGIN_RING(kelvin, NV20TCL_DMA_ZETA, 1);
-		OUT_RELOCo(nv20->zeta, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-		BEGIN_RING(kelvin, NV20TCL_ZETA_OFFSET, 1);
-		OUT_RELOCl(nv20->zeta, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+		struct nouveau_bo *zeta_bo = nouveau_bo(nv20->zeta);
+		BEGIN_RING(chan, kelvin, NV20TCL_DMA_ZETA, 1);
+		OUT_RELOCo(chan, zeta_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+		BEGIN_RING(chan, kelvin, NV20TCL_ZETA_OFFSET, 1);
+		OUT_RELOCl(chan, zeta_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 		/* XXX for when we allocate LMA on nv17 */
-/*		BEGIN_RING(kelvin, NV10TCL_LMA_DEPTH_BUFFER_OFFSET, 1);
-		OUT_RELOCl(nv20->zeta + lma_offset);*/
+/*		BEGIN_RING(chan, kelvin, NV10TCL_LMA_DEPTH_BUFFER_OFFSET, 1);
+		OUT_RELOCl(chan, nouveau_bo(nv20->zeta + lma_offset));*/
 	}
 
 	/* Vertex buffer */
-	BEGIN_RING(kelvin, NV20TCL_DMA_VTXBUF0, 1);
-	OUT_RELOCo(nv20->rt[0], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-	BEGIN_RING(kelvin, NV20TCL_COLOR_OFFSET, 1);
-	OUT_RELOCl(nv20->rt[0], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	BEGIN_RING(chan, kelvin, NV20TCL_DMA_VTXBUF0, 1);
+	OUT_RELOCo(chan, rt_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	BEGIN_RING(chan, kelvin, NV20TCL_COLOR_OFFSET, 1);
+	OUT_RELOCl(chan, rt_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 
 	/* Texture images */
 	for (i = 0; i < 2; i++) {
 		if (!(nv20->fp_samplers & (1 << i)))
 			continue;
-		BEGIN_RING(kelvin, NV20TCL_TX_OFFSET(i), 1);
-		OUT_RELOCl(nv20->tex[i].buffer, 0, NOUVEAU_BO_VRAM |
+		struct nouveau_bo *bo = nouveau_bo(nv20->tex[i].buffer);
+		BEGIN_RING(chan, kelvin, NV20TCL_TX_OFFSET(i), 1);
+		OUT_RELOCl(chan, bo, 0, NOUVEAU_BO_VRAM |
 			   NOUVEAU_BO_GART | NOUVEAU_BO_RD);
-		BEGIN_RING(kelvin, NV20TCL_TX_FORMAT(i), 1);
-		OUT_RELOCd(nv20->tex[i].buffer, nv20->tex[i].format,
+		BEGIN_RING(chan, kelvin, NV20TCL_TX_FORMAT(i), 1);
+		OUT_RELOCd(chan, bo, nv20->tex[i].format,
 			   NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD |
 			   NOUVEAU_BO_OR, NV20TCL_TX_FORMAT_DMA0,
 			   NV20TCL_TX_FORMAT_DMA1);
diff --git a/src/gallium/drivers/nv20/nv20_vbo.c b/src/gallium/drivers/nv20/nv20_vbo.c
index 4bf461eba9..52991a0d85 100644
--- a/src/gallium/drivers/nv20/nv20_vbo.c
+++ b/src/gallium/drivers/nv20/nv20_vbo.c
@@ -9,7 +9,7 @@
 #include "nouveau/nouveau_channel.h"
 #include "nouveau/nouveau_pushbuf.h"
 
-boolean nv20_draw_elements( struct pipe_context *pipe,
+void nv20_draw_elements( struct pipe_context *pipe,
                     struct pipe_buffer *indexBuffer,
                     unsigned indexSize,
                     unsigned prim, unsigned start, unsigned count)
@@ -67,13 +67,12 @@ boolean nv20_draw_elements( struct pipe_context *pipe,
 	}
 
 	draw_flush(nv20->draw);
-	return TRUE;
 }
 
-boolean nv20_draw_arrays( struct pipe_context *pipe,
+void nv20_draw_arrays( struct pipe_context *pipe,
 				 unsigned prim, unsigned start, unsigned count)
 {
-	return nv20_draw_elements(pipe, NULL, 0, prim, start, count);
+	nv20_draw_elements(pipe, NULL, 0, prim, start, count);
 }
 
 
diff --git a/src/gallium/drivers/nv30/nv30_context.c b/src/gallium/drivers/nv30/nv30_context.c
index 38b39159f1..54572e9ab3 100644
--- a/src/gallium/drivers/nv30/nv30_context.c
+++ b/src/gallium/drivers/nv30/nv30_context.c
@@ -10,15 +10,20 @@ nv30_flush(struct pipe_context *pipe, unsigned flags,
 	   struct pipe_fence_handle **fence)
 {
 	struct nv30_context *nv30 = nv30_context(pipe);
+	struct nv30_screen *screen = nv30->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *rankine = screen->rankine;
 
 	if (flags & PIPE_FLUSH_TEXTURE_CACHE) {
-		BEGIN_RING(rankine, 0x1fd8, 1);
-		OUT_RING  (2);
-		BEGIN_RING(rankine, 0x1fd8, 1);
-		OUT_RING  (1);
+		BEGIN_RING(chan, rankine, 0x1fd8, 1);
+		OUT_RING  (chan, 2);
+		BEGIN_RING(chan, rankine, 0x1fd8, 1);
+		OUT_RING  (chan, 1);
 	}
 
-	FIRE_RING(fence);
+	FIRE_RING(chan);
+	if (fence)
+		*fence = NULL;
 }
 
 static void
diff --git a/src/gallium/drivers/nv30/nv30_context.h b/src/gallium/drivers/nv30/nv30_context.h
index 864ddaeb59..e59449287b 100644
--- a/src/gallium/drivers/nv30/nv30_context.h
+++ b/src/gallium/drivers/nv30/nv30_context.h
@@ -14,10 +14,6 @@
 #include "nouveau/nouveau_winsys.h"
 #include "nouveau/nouveau_gldefs.h"
 #include "nouveau/nouveau_context.h"
-
-#define NOUVEAU_PUSH_CONTEXT(ctx)                                              \
-	struct nv30_screen *ctx = nv30->screen
-#include "nouveau/nouveau_push.h"
 #include "nouveau/nouveau_stateobj.h"
 
 #include "nv30_state.h"
@@ -198,9 +194,9 @@ extern struct nv30_state_entry nv30_state_fragtex;
 extern struct nv30_state_entry nv30_state_vbo;
 
 /* nv30_vbo.c */
-extern boolean nv30_draw_arrays(struct pipe_context *, unsigned mode,
+extern void nv30_draw_arrays(struct pipe_context *, unsigned mode,
 				unsigned start, unsigned count);
-extern boolean nv30_draw_elements(struct pipe_context *pipe,
+extern void nv30_draw_elements(struct pipe_context *pipe,
 				  struct pipe_buffer *indexBuffer,
 				  unsigned indexSize,
 				  unsigned mode, unsigned start,
diff --git a/src/gallium/drivers/nv30/nv30_fragprog.c b/src/gallium/drivers/nv30/nv30_fragprog.c
index d1ff18e2df..2d565cb631 100644
--- a/src/gallium/drivers/nv30/nv30_fragprog.c
+++ b/src/gallium/drivers/nv30/nv30_fragprog.c
@@ -837,7 +837,7 @@ nv30_fragprog_validate(struct nv30_context *nv30)
 	fp->buffer = pscreen->buffer_create(pscreen, 0x100, 0, fp->insn_len * 4);
 	nv30_fragprog_upload(nv30, fp);
 
-	so = so_new(8, 1);
+	so = so_new(4, 4, 1);
 	so_method(so, nv30->screen->rankine, NV34TCL_FP_ACTIVE_PROGRAM, 1);
 	so_reloc (so, nouveau_bo(fp->buffer), 0, NOUVEAU_BO_VRAM |
 		      NOUVEAU_BO_GART | NOUVEAU_BO_RD | NOUVEAU_BO_LOW |
diff --git a/src/gallium/drivers/nv30/nv30_fragtex.c b/src/gallium/drivers/nv30/nv30_fragtex.c
index b3293ee700..9893567891 100644
--- a/src/gallium/drivers/nv30/nv30_fragtex.c
+++ b/src/gallium/drivers/nv30/nv30_fragtex.c
@@ -106,7 +106,7 @@ nv30_fragtex_build(struct nv30_context *nv30, int unit)
 
 	txs = tf->swizzle;
 
-	so = so_new(16, 2);
+	so = so_new(1, 8, 2);
 	so_method(so, nv30->screen->rankine, NV34TCL_TX_OFFSET(unit), 8);
 	so_reloc (so, bo, 0, tex_flags | NOUVEAU_BO_LOW, 0, 0);
 	so_reloc (so, bo, txf, tex_flags | NOUVEAU_BO_OR,
@@ -135,7 +135,7 @@ nv30_fragtex_validate(struct nv30_context *nv30)
 		unit = ffs(samplers) - 1;
 		samplers &= ~(1 << unit);
 
-		so = so_new(2, 0);
+		so = so_new(1, 1, 0);
 		so_method(so, nv30->screen->rankine, NV34TCL_TX_ENABLE(unit), 1);
 		so_data  (so, 0);
 		so_ref(so, &nv30->state.hw[NV30_STATE_FRAGTEX0 + unit]);
diff --git a/src/gallium/drivers/nv30/nv30_query.c b/src/gallium/drivers/nv30/nv30_query.c
index 1d1c8a484e..e27e9ccbf6 100644
--- a/src/gallium/drivers/nv30/nv30_query.c
+++ b/src/gallium/drivers/nv30/nv30_query.c
@@ -41,6 +41,9 @@ nv30_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
 {
 	struct nv30_context *nv30 = nv30_context(pipe);
 	struct nv30_query *q = nv30_query(pq);
+	struct nv30_screen *screen = nv30->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *rankine = screen->rankine;
 
 	assert(q->type == PIPE_QUERY_OCCLUSION_COUNTER);
 
@@ -57,10 +60,10 @@ nv30_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
 		assert(0);
 	nouveau_notifier_reset(nv30->screen->query, q->object->start);
 
-	BEGIN_RING(rankine, NV34TCL_QUERY_RESET, 1);
-	OUT_RING  (1);
-	BEGIN_RING(rankine, NV34TCL_QUERY_UNK17CC, 1);
-	OUT_RING  (1);
+	BEGIN_RING(chan, rankine, NV34TCL_QUERY_RESET, 1);
+	OUT_RING  (chan, 1);
+	BEGIN_RING(chan, rankine, NV34TCL_QUERY_UNK17CC, 1);
+	OUT_RING  (chan, 1);
 
 	q->ready = FALSE;
 }
@@ -69,12 +72,15 @@ static void
 nv30_query_end(struct pipe_context *pipe, struct pipe_query *pq)
 {
 	struct nv30_context *nv30 = nv30_context(pipe);
+	struct nv30_screen *screen = nv30->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *rankine = screen->rankine;
 	struct nv30_query *q = nv30_query(pq);
 
-	BEGIN_RING(rankine, NV34TCL_QUERY_GET, 1);
-	OUT_RING  ((0x01 << NV34TCL_QUERY_GET_UNK24_SHIFT) |
+	BEGIN_RING(chan, rankine, NV34TCL_QUERY_GET, 1);
+	OUT_RING  (chan, (0x01 << NV34TCL_QUERY_GET_UNK24_SHIFT) |
 		   ((q->object->start * 32) << NV34TCL_QUERY_GET_OFFSET_SHIFT));
-	FIRE_RING(NULL);
+	FIRE_RING(chan);
 }
 
 static boolean
diff --git a/src/gallium/drivers/nv30/nv30_screen.c b/src/gallium/drivers/nv30/nv30_screen.c
index 760467f736..9ed48178dc 100644
--- a/src/gallium/drivers/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nv30/nv30_screen.c
@@ -233,7 +233,6 @@ nv30_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 		NOUVEAU_ERR("Error creating 3D object: %d\n", ret);
 		return FALSE;
 	}
-	BIND_RING(chan, screen->rankine, 7);
 
 	/* 2D engine setup */
 	screen->eng2d = nv04_surface_2d_init(&screen->base);
@@ -270,7 +269,7 @@ nv30_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 	}
 
 	/* Static rankine initialisation */
-	so = so_new(128, 0);
+	so = so_new(36, 60, 0);
 	so_method(so, screen->rankine, NV34TCL_DMA_NOTIFY, 1);
 	so_data  (so, screen->sync->handle);
 	so_method(so, screen->rankine, NV34TCL_DMA_TEXTURE0, 2);
diff --git a/src/gallium/drivers/nv30/nv30_state.c b/src/gallium/drivers/nv30/nv30_state.c
index e6321b480f..a80dfb0488 100644
--- a/src/gallium/drivers/nv30/nv30_state.c
+++ b/src/gallium/drivers/nv30/nv30_state.c
@@ -14,7 +14,7 @@ nv30_blend_state_create(struct pipe_context *pipe,
 	struct nv30_context *nv30 = nv30_context(pipe);
 	struct nouveau_grobj *rankine = nv30->screen->rankine;
 	struct nv30_blend_state *bso = CALLOC(1, sizeof(*bso));
-	struct nouveau_stateobj *so = so_new(16, 0);
+	struct nouveau_stateobj *so = so_new(5, 8, 0);
 
 	if (cso->blend_enable) {
 		so_method(so, rankine, NV34TCL_BLEND_FUNC_ENABLE, 3);
@@ -300,7 +300,7 @@ nv30_rasterizer_state_create(struct pipe_context *pipe,
 {
 	struct nv30_context *nv30 = nv30_context(pipe);
 	struct nv30_rasterizer_state *rsso = CALLOC(1, sizeof(*rsso));
-	struct nouveau_stateobj *so = so_new(32, 0);
+	struct nouveau_stateobj *so = so_new(9, 19, 0);
 	struct nouveau_grobj *rankine = nv30->screen->rankine;
 
 	/*XXX: ignored:
@@ -435,7 +435,7 @@ nv30_depth_stencil_alpha_state_create(struct pipe_context *pipe,
 {
 	struct nv30_context *nv30 = nv30_context(pipe);
 	struct nv30_zsa_state *zsaso = CALLOC(1, sizeof(*zsaso));
-	struct nouveau_stateobj *so = so_new(32, 0);
+	struct nouveau_stateobj *so = so_new(5, 21, 0);
 	struct nouveau_grobj *rankine = nv30->screen->rankine;
 
 	so_method(so, rankine, NV34TCL_DEPTH_FUNC, 3);
diff --git a/src/gallium/drivers/nv30/nv30_state_blend.c b/src/gallium/drivers/nv30/nv30_state_blend.c
index 64cf9ae93a..c36d58c040 100644
--- a/src/gallium/drivers/nv30/nv30_state_blend.c
+++ b/src/gallium/drivers/nv30/nv30_state_blend.c
@@ -18,7 +18,7 @@ struct nv30_state_entry nv30_state_blend = {
 static boolean
 nv30_state_blend_colour_validate(struct nv30_context *nv30)
 {
-	struct nouveau_stateobj *so = so_new(2, 0);
+	struct nouveau_stateobj *so = so_new(1, 1, 0);
 	struct pipe_blend_color *bcol = &nv30->blend_colour;
 
 	so_method(so, nv30->screen->rankine, NV34TCL_BLEND_COLOR, 1);
diff --git a/src/gallium/drivers/nv30/nv30_state_fb.c b/src/gallium/drivers/nv30/nv30_state_fb.c
index 6f6d1740d6..2ed2ea55e8 100644
--- a/src/gallium/drivers/nv30/nv30_state_fb.c
+++ b/src/gallium/drivers/nv30/nv30_state_fb.c
@@ -10,7 +10,7 @@ nv30_state_framebuffer_validate(struct nv30_context *nv30)
 	struct nv04_surface *rt[2], *zeta = NULL;
 	uint32_t rt_enable = 0, rt_format = 0;
 	int i, colour_format = 0, zeta_format = 0, depth_only = 0;
-	struct nouveau_stateobj *so = so_new(64, 10);
+	struct nouveau_stateobj *so = so_new(12, 18, 10);
 	unsigned rt_flags = NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM;
 	unsigned w = fb->width;
 	unsigned h = fb->height;
diff --git a/src/gallium/drivers/nv30/nv30_state_scissor.c b/src/gallium/drivers/nv30/nv30_state_scissor.c
index 3ac7a8471e..ba61a9e24a 100644
--- a/src/gallium/drivers/nv30/nv30_state_scissor.c
+++ b/src/gallium/drivers/nv30/nv30_state_scissor.c
@@ -12,7 +12,7 @@ nv30_state_scissor_validate(struct nv30_context *nv30)
 		return FALSE;
 	nv30->state.scissor_enabled = rast->scissor;
 
-	so = so_new(3, 0);
+	so = so_new(1, 2, 0);
 	so_method(so, nv30->screen->rankine, NV34TCL_SCISSOR_HORIZ, 2);
 	if (nv30->state.scissor_enabled) {
 		so_data  (so, ((s->maxx - s->minx) << 16) | s->minx);
diff --git a/src/gallium/drivers/nv30/nv30_state_stipple.c b/src/gallium/drivers/nv30/nv30_state_stipple.c
index d0c791ac08..ed520a4f43 100644
--- a/src/gallium/drivers/nv30/nv30_state_stipple.c
+++ b/src/gallium/drivers/nv30/nv30_state_stipple.c
@@ -14,14 +14,14 @@ nv30_state_stipple_validate(struct nv30_context *nv30)
 	if (rast->poly_stipple_enable) {
 		unsigned i;
 
-		so = so_new(35, 0);
+		so = so_new(2, 33, 0);
 		so_method(so, rankine, NV34TCL_POLYGON_STIPPLE_ENABLE, 1);
 		so_data  (so, 1);
 		so_method(so, rankine, NV34TCL_POLYGON_STIPPLE_PATTERN(0), 32);
 		for (i = 0; i < 32; i++)
 			so_data(so, nv30->stipple[i]);
 	} else {
-		so = so_new(2, 0);
+		so = so_new(1, 1, 0);
 		so_method(so, rankine, NV34TCL_POLYGON_STIPPLE_ENABLE, 1);
 		so_data  (so, 0);
 	}
diff --git a/src/gallium/drivers/nv30/nv30_state_viewport.c b/src/gallium/drivers/nv30/nv30_state_viewport.c
index c3eb413dac..2d7781292b 100644
--- a/src/gallium/drivers/nv30/nv30_state_viewport.c
+++ b/src/gallium/drivers/nv30/nv30_state_viewport.c
@@ -19,7 +19,7 @@ nv30_state_viewport_validate(struct nv30_context *nv30)
 		return FALSE;
 	nv30->state.viewport_bypass = bypass;
 
-	so = so_new(11, 0);
+	so = so_new(3, 10, 0);
 	if (!bypass) {
 		so_method(so, nv30->screen->rankine,
 			  NV34TCL_VIEWPORT_TRANSLATE_X, 8);
diff --git a/src/gallium/drivers/nv30/nv30_vbo.c b/src/gallium/drivers/nv30/nv30_vbo.c
index e32b8141af..1c5db03ea2 100644
--- a/src/gallium/drivers/nv30/nv30_vbo.c
+++ b/src/gallium/drivers/nv30/nv30_vbo.c
@@ -163,19 +163,21 @@ nv30_vbo_static_attrib(struct nv30_context *nv30, struct nouveau_stateobj *so,
 	return TRUE;
 }
 
-boolean
+void
 nv30_draw_arrays(struct pipe_context *pipe,
 		 unsigned mode, unsigned start, unsigned count)
 {
 	struct nv30_context *nv30 = nv30_context(pipe);
-	struct nouveau_channel *chan = nv30->screen->base.channel;
+	struct nv30_screen *screen = nv30->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *rankine = screen->rankine;
 	unsigned restart = 0;
 
 	nv30_vbo_set_idxbuf(nv30, NULL, 0);
 	if (FORCE_SWTNL || !nv30_state_validate(nv30)) {
 		/*return nv30_draw_elements_swtnl(pipe, NULL, 0,
 						mode, start, count);*/
-		return FALSE;
+		return;
 	}
 
 	while (count) {
@@ -186,17 +188,17 @@ nv30_draw_arrays(struct pipe_context *pipe,
 		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 256,
 					mode, start, count, &restart);
 		if (!vc) {
-			FIRE_RING(NULL);
+			FIRE_RING(chan);
 			continue;
 		}
 
-		BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
-		OUT_RING  (nvgl_primitive(mode));
+		BEGIN_RING(chan, rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (chan, nvgl_primitive(mode));
 
 		nr = (vc & 0xff);
 		if (nr) {
-			BEGIN_RING(rankine, NV34TCL_VB_VERTEX_BATCH, 1);
-			OUT_RING  (((nr - 1) << 24) | start);
+			BEGIN_RING(chan, rankine, NV34TCL_VB_VERTEX_BATCH, 1);
+			OUT_RING  (chan, ((nr - 1) << 24) | start);
 			start += nr;
 		}
 
@@ -206,15 +208,15 @@ nv30_draw_arrays(struct pipe_context *pipe,
 
 			nr -= push;
 
-			BEGIN_RING_NI(rankine, NV34TCL_VB_VERTEX_BATCH, push);
+			BEGIN_RING_NI(chan, rankine, NV34TCL_VB_VERTEX_BATCH, push);
 			while (push--) {
-				OUT_RING(((0x100 - 1) << 24) | start);
+				OUT_RING(chan, ((0x100 - 1) << 24) | start);
 				start += 0x100;
 			}
 		}
 
-		BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
-		OUT_RING  (0);
+		BEGIN_RING(chan, rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (chan, 0);
 
 		count -= vc;
 		start = restart;
@@ -228,7 +230,9 @@ static INLINE void
 nv30_draw_elements_u08(struct nv30_context *nv30, void *ib,
 		       unsigned mode, unsigned start, unsigned count)
 {
-	struct nouveau_channel *chan = nv30->screen->base.channel;
+	struct nv30_screen *screen = nv30->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *rankine = screen->rankine;
 
 	while (count) {
 		uint8_t *elts = (uint8_t *)ib + start;
@@ -239,17 +243,17 @@ nv30_draw_elements_u08(struct nv30_context *nv30, void *ib,
 		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 2,
 					mode, start, count, &restart);
 		if (vc == 0) {
-			FIRE_RING(NULL);
+			FIRE_RING(chan);
 			continue;
 		}
 		count -= vc;
 
-		BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
-		OUT_RING  (nvgl_primitive(mode));
+		BEGIN_RING(chan, rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (chan, nvgl_primitive(mode));
 
 		if (vc & 1) {
-			BEGIN_RING(rankine, NV34TCL_VB_ELEMENT_U32, 1);
-			OUT_RING  (elts[0]);
+			BEGIN_RING(chan, rankine, NV34TCL_VB_ELEMENT_U32, 1);
+			OUT_RING  (chan, elts[0]);
 			elts++; vc--;
 		}
 
@@ -258,16 +262,16 @@ nv30_draw_elements_u08(struct nv30_context *nv30, void *ib,
 
 			push = MIN2(vc, 2047 * 2);
 
-			BEGIN_RING_NI(rankine, NV34TCL_VB_ELEMENT_U16, push >> 1);
+			BEGIN_RING_NI(chan, rankine, NV34TCL_VB_ELEMENT_U16, push >> 1);
 			for (i = 0; i < push; i+=2)
-				OUT_RING((elts[i+1] << 16) | elts[i]);
+				OUT_RING(chan, (elts[i+1] << 16) | elts[i]);
 
 			vc -= push;
 			elts += push;
 		}
 
-		BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
-		OUT_RING  (0);
+		BEGIN_RING(chan, rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (chan, 0);
 
 		start = restart;
 	}
@@ -277,7 +281,9 @@ static INLINE void
 nv30_draw_elements_u16(struct nv30_context *nv30, void *ib,
 		       unsigned mode, unsigned start, unsigned count)
 {
-	struct nouveau_channel *chan = nv30->screen->base.channel;
+	struct nv30_screen *screen = nv30->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *rankine = screen->rankine;
 
 	while (count) {
 		uint16_t *elts = (uint16_t *)ib + start;
@@ -288,17 +294,17 @@ nv30_draw_elements_u16(struct nv30_context *nv30, void *ib,
 		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 2,
 					mode, start, count, &restart);
 		if (vc == 0) {
-			FIRE_RING(NULL);
+			FIRE_RING(chan);
 			continue;
 		}
 		count -= vc;
 
-		BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
-		OUT_RING  (nvgl_primitive(mode));
+		BEGIN_RING(chan, rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (chan, nvgl_primitive(mode));
 
 		if (vc & 1) {
-			BEGIN_RING(rankine, NV34TCL_VB_ELEMENT_U32, 1);
-			OUT_RING  (elts[0]);
+			BEGIN_RING(chan, rankine, NV34TCL_VB_ELEMENT_U32, 1);
+			OUT_RING  (chan, elts[0]);
 			elts++; vc--;
 		}
 
@@ -307,16 +313,16 @@ nv30_draw_elements_u16(struct nv30_context *nv30, void *ib,
 
 			push = MIN2(vc, 2047 * 2);
 
-			BEGIN_RING_NI(rankine, NV34TCL_VB_ELEMENT_U16, push >> 1);
+			BEGIN_RING_NI(chan, rankine, NV34TCL_VB_ELEMENT_U16, push >> 1);
 			for (i = 0; i < push; i+=2)
-				OUT_RING((elts[i+1] << 16) | elts[i]);
+				OUT_RING(chan, (elts[i+1] << 16) | elts[i]);
 
 			vc -= push;
 			elts += push;
 		}
 
-		BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
-		OUT_RING  (0);
+		BEGIN_RING(chan, rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (chan, 0);
 
 		start = restart;
 	}
@@ -326,7 +332,9 @@ static INLINE void
 nv30_draw_elements_u32(struct nv30_context *nv30, void *ib,
 		       unsigned mode, unsigned start, unsigned count)
 {
-	struct nouveau_channel *chan = nv30->screen->base.channel;
+	struct nv30_screen *screen = nv30->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *rankine = screen->rankine;
 
 	while (count) {
 		uint32_t *elts = (uint32_t *)ib + start;
@@ -337,32 +345,32 @@ nv30_draw_elements_u32(struct nv30_context *nv30, void *ib,
 		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 5, 1,
 					mode, start, count, &restart);
 		if (vc == 0) {
-			FIRE_RING(NULL);
+			FIRE_RING(chan);
 			continue;
 		}
 		count -= vc;
 
-		BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
-		OUT_RING  (nvgl_primitive(mode));
+		BEGIN_RING(chan, rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (chan, nvgl_primitive(mode));
 
 		while (vc) {
 			push = MIN2(vc, 2047);
 
-			BEGIN_RING_NI(rankine, NV34TCL_VB_ELEMENT_U32, push);
-			OUT_RINGp    (elts, push);
+			BEGIN_RING_NI(chan, rankine, NV34TCL_VB_ELEMENT_U32, push);
+			OUT_RINGp    (chan, elts, push);
 
 			vc -= push;
 			elts += push;
 		}
 
-		BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
-		OUT_RING  (0);
+		BEGIN_RING(chan, rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (chan, 0);
 
 		start = restart;
 	}
 }
 
-static boolean
+static void
 nv30_draw_elements_inline(struct pipe_context *pipe,
 			  struct pipe_buffer *ib, unsigned ib_size,
 			  unsigned mode, unsigned start, unsigned count)
@@ -393,15 +401,16 @@ nv30_draw_elements_inline(struct pipe_context *pipe,
 	}
 
 	pipe_buffer_unmap(pscreen, ib);
-	return TRUE;
 }
 
-static boolean
+static void
 nv30_draw_elements_vbo(struct pipe_context *pipe,
 		       unsigned mode, unsigned start, unsigned count)
 {
 	struct nv30_context *nv30 = nv30_context(pipe);
-	struct nouveau_channel *chan = nv30->screen->base.channel;
+	struct nv30_screen *screen = nv30->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *rankine = screen->rankine;
 	unsigned restart = 0;
 
 	while (count) {
@@ -412,17 +421,17 @@ nv30_draw_elements_vbo(struct pipe_context *pipe,
 		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 256,
 					mode, start, count, &restart);
 		if (!vc) {
-			FIRE_RING(NULL);
+			FIRE_RING(chan);
 			continue;
 		}
 		
-		BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
-		OUT_RING  (nvgl_primitive(mode));
+		BEGIN_RING(chan, rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (chan, nvgl_primitive(mode));
 
 		nr = (vc & 0xff);
 		if (nr) {
-			BEGIN_RING(rankine, NV34TCL_VB_INDEX_BATCH, 1);
-			OUT_RING  (((nr - 1) << 24) | start);
+			BEGIN_RING(chan, rankine, NV34TCL_VB_INDEX_BATCH, 1);
+			OUT_RING  (chan, ((nr - 1) << 24) | start);
 			start += nr;
 		}
 
@@ -432,24 +441,22 @@ nv30_draw_elements_vbo(struct pipe_context *pipe,
 
 			nr -= push;
 
-			BEGIN_RING_NI(rankine, NV34TCL_VB_INDEX_BATCH, push);
+			BEGIN_RING_NI(chan, rankine, NV34TCL_VB_INDEX_BATCH, push);
 			while (push--) {
-				OUT_RING(((0x100 - 1) << 24) | start);
+				OUT_RING(chan, ((0x100 - 1) << 24) | start);
 				start += 0x100;
 			}
 		}
 
-		BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
-		OUT_RING  (0);
+		BEGIN_RING(chan, rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (chan, 0);
 
 		count -= vc;
 		start = restart;
 	}
-
-	return TRUE;
 }
 
-boolean
+void
 nv30_draw_elements(struct pipe_context *pipe,
 		   struct pipe_buffer *indexBuffer, unsigned indexSize,
 		   unsigned mode, unsigned start, unsigned count)
@@ -461,7 +468,7 @@ nv30_draw_elements(struct pipe_context *pipe,
 	if (FORCE_SWTNL || !nv30_state_validate(nv30)) {
 		/*return nv30_draw_elements_swtnl(pipe, NULL, 0,
 						mode, start, count);*/
-		return FALSE;	
+		return;	
 	}
 
 	if (idxbuf) {
@@ -472,7 +479,6 @@ nv30_draw_elements(struct pipe_context *pipe,
 	}
 
 	pipe->flush(pipe, 0, NULL);
-	return TRUE;
 }
 
 static boolean
@@ -485,9 +491,9 @@ nv30_vbo_validate(struct nv30_context *nv30)
 	unsigned vb_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD;
 	int hw;
 
-	vtxbuf = so_new(20, 18);
+	vtxbuf = so_new(3, 17, 18);
 	so_method(vtxbuf, rankine, NV34TCL_VTXBUF_ADDRESS(0), nv30->vtxelt_nr);
-	vtxfmt = so_new(17, 0);
+	vtxfmt = so_new(1, 16, 0);
 	so_method(vtxfmt, rankine, NV34TCL_VTXFMT(0), nv30->vtxelt_nr);
 
 	for (hw = 0; hw < nv30->vtxelt_nr; hw++) {
@@ -500,7 +506,7 @@ nv30_vbo_validate(struct nv30_context *nv30)
 
 		if (!vb->stride) {
 			if (!sattr)
-				sattr = so_new(16 * 5, 0);
+				sattr = so_new(16, 16 * 4, 0);
 
 			if (nv30_vbo_static_attrib(nv30, sattr, hw, ve, vb)) {
 				so_data(vtxbuf, 0);
diff --git a/src/gallium/drivers/nv30/nv30_vertprog.c b/src/gallium/drivers/nv30/nv30_vertprog.c
index 5d60984622..e77a5be3f2 100644
--- a/src/gallium/drivers/nv30/nv30_vertprog.c
+++ b/src/gallium/drivers/nv30/nv30_vertprog.c
@@ -650,7 +650,9 @@ static boolean
 nv30_vertprog_validate(struct nv30_context *nv30)
 { 
 	struct pipe_screen *pscreen = nv30->pipe.screen;
-	struct nouveau_grobj *rankine = nv30->screen->rankine;
+	struct nv30_screen *screen = nv30->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *rankine = screen->rankine;
 	struct nv30_vertex_program *vp;
 	struct pipe_buffer *constbuf;
 	boolean upload_code = FALSE, upload_data = FALSE;
@@ -684,7 +686,7 @@ nv30_vertprog_validate(struct nv30_context *nv30)
 				assert(0);
 		}
 
-		so = so_new(2, 0);
+		so = so_new(1, 1, 0);
 		so_method(so, rankine, NV34TCL_VP_START_FROM_ID, 1);
 		so_data  (so, vp->exec->start);
 		so_ref(so, &vp->so);
@@ -770,9 +772,9 @@ nv30_vertprog_validate(struct nv30_context *nv30)
 				       4 * sizeof(float));
 			}
 
-			BEGIN_RING(rankine, NV34TCL_VP_UPLOAD_CONST_ID, 5);
-			OUT_RING  (i + vp->data->start);
-			OUT_RINGp ((uint32_t *)vpd->value, 4);
+			BEGIN_RING(chan, rankine, NV34TCL_VP_UPLOAD_CONST_ID, 5);
+			OUT_RING  (chan, i + vp->data->start);
+			OUT_RINGp (chan, (uint32_t *)vpd->value, 4);
 		}
 
 		if (constbuf)
@@ -788,11 +790,11 @@ nv30_vertprog_validate(struct nv30_context *nv30)
 				vp->insns[i].data[2], vp->insns[i].data[3]);
 		}
 #endif
-		BEGIN_RING(rankine, NV34TCL_VP_UPLOAD_FROM_ID, 1);
-		OUT_RING  (vp->exec->start);
+		BEGIN_RING(chan, rankine, NV34TCL_VP_UPLOAD_FROM_ID, 1);
+		OUT_RING  (chan, vp->exec->start);
 		for (i = 0; i < vp->nr_insns; i++) {
-			BEGIN_RING(rankine, NV34TCL_VP_UPLOAD_INST(0), 4);
-			OUT_RINGp (vp->insns[i].data, 4);
+			BEGIN_RING(chan, rankine, NV34TCL_VP_UPLOAD_INST(0), 4);
+			OUT_RINGp (chan, vp->insns[i].data, 4);
 		}
 	}
 
diff --git a/src/gallium/drivers/nv40/nv40_context.c b/src/gallium/drivers/nv40/nv40_context.c
index d56c7a6b49..f79ae4db84 100644
--- a/src/gallium/drivers/nv40/nv40_context.c
+++ b/src/gallium/drivers/nv40/nv40_context.c
@@ -10,15 +10,20 @@ nv40_flush(struct pipe_context *pipe, unsigned flags,
 	   struct pipe_fence_handle **fence)
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
+	struct nv40_screen *screen = nv40->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *curie = screen->curie;
 
 	if (flags & PIPE_FLUSH_TEXTURE_CACHE) {
-		BEGIN_RING(curie, 0x1fd8, 1);
-		OUT_RING  (2);
-		BEGIN_RING(curie, 0x1fd8, 1);
-		OUT_RING  (1);
+		BEGIN_RING(chan, curie, 0x1fd8, 1);
+		OUT_RING  (chan, 2);
+		BEGIN_RING(chan, curie, 0x1fd8, 1);
+		OUT_RING  (chan, 1);
 	}
 
-	FIRE_RING(fence);
+	FIRE_RING(chan);
+	if (fence)
+		*fence = NULL;
 }
 
 static void
diff --git a/src/gallium/drivers/nv40/nv40_context.h b/src/gallium/drivers/nv40/nv40_context.h
index 83fcf1785d..e219bb537a 100644
--- a/src/gallium/drivers/nv40/nv40_context.h
+++ b/src/gallium/drivers/nv40/nv40_context.h
@@ -14,10 +14,6 @@
 #include "nouveau/nouveau_winsys.h"
 #include "nouveau/nouveau_gldefs.h"
 #include "nouveau/nouveau_context.h"
-
-#define NOUVEAU_PUSH_CONTEXT(ctx)                                              \
-	struct nv40_screen *ctx = nv40->screen
-#include "nouveau/nouveau_push.h"
 #include "nouveau/nouveau_stateobj.h"
 
 #include "nv40_state.h"
@@ -183,7 +179,7 @@ extern void nv40_screen_init_miptree_functions(struct pipe_screen *pscreen);
 
 /* nv40_draw.c */
 extern struct draw_stage *nv40_draw_render_stage(struct nv40_context *nv40);
-extern boolean nv40_draw_elements_swtnl(struct pipe_context *pipe,
+extern void nv40_draw_elements_swtnl(struct pipe_context *pipe,
 					struct pipe_buffer *idxbuf,
 					unsigned ib_size, unsigned mode,
 					unsigned start, unsigned count);
@@ -219,9 +215,9 @@ extern struct nv40_state_entry nv40_state_vbo;
 extern struct nv40_state_entry nv40_state_vtxfmt;
 
 /* nv40_vbo.c */
-extern boolean nv40_draw_arrays(struct pipe_context *, unsigned mode,
+extern void nv40_draw_arrays(struct pipe_context *, unsigned mode,
 				unsigned start, unsigned count);
-extern boolean nv40_draw_elements(struct pipe_context *pipe,
+extern void nv40_draw_elements(struct pipe_context *pipe,
 				  struct pipe_buffer *indexBuffer,
 				  unsigned indexSize,
 				  unsigned mode, unsigned start,
diff --git a/src/gallium/drivers/nv40/nv40_draw.c b/src/gallium/drivers/nv40/nv40_draw.c
index 3875bc3545..d826f8c2f5 100644
--- a/src/gallium/drivers/nv40/nv40_draw.c
+++ b/src/gallium/drivers/nv40/nv40_draw.c
@@ -31,6 +31,9 @@ nv40_render_stage(struct draw_stage *stage)
 static INLINE void
 nv40_render_vertex(struct nv40_context *nv40, const struct vertex_header *v)
 {
+	struct nv40_screen *screen = nv40->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *curie = screen->curie;
 	unsigned i;
 
 	for (i = 0; i < nv40->swtnl.nr_attribs; i++) {
@@ -41,30 +44,30 @@ nv40_render_vertex(struct nv40_context *nv40, const struct vertex_header *v)
 		case EMIT_OMIT:
 			break;
 		case EMIT_1F:
-			BEGIN_RING(curie, NV40TCL_VTX_ATTR_1F(hw), 1);
-			OUT_RING  (fui(v->data[idx][0]));
+			BEGIN_RING(chan, curie, NV40TCL_VTX_ATTR_1F(hw), 1);
+			OUT_RING  (chan, fui(v->data[idx][0]));
 			break;
 		case EMIT_2F:
-			BEGIN_RING(curie, NV40TCL_VTX_ATTR_2F_X(hw), 2);
-			OUT_RING  (fui(v->data[idx][0]));
-			OUT_RING  (fui(v->data[idx][1]));
+			BEGIN_RING(chan, curie, NV40TCL_VTX_ATTR_2F_X(hw), 2);
+			OUT_RING  (chan, fui(v->data[idx][0]));
+			OUT_RING  (chan, fui(v->data[idx][1]));
 			break;
 		case EMIT_3F:
-			BEGIN_RING(curie, NV40TCL_VTX_ATTR_3F_X(hw), 3);
-			OUT_RING  (fui(v->data[idx][0]));
-			OUT_RING  (fui(v->data[idx][1]));
-			OUT_RING  (fui(v->data[idx][2]));
+			BEGIN_RING(chan, curie, NV40TCL_VTX_ATTR_3F_X(hw), 3);
+			OUT_RING  (chan, fui(v->data[idx][0]));
+			OUT_RING  (chan, fui(v->data[idx][1]));
+			OUT_RING  (chan, fui(v->data[idx][2]));
 			break;
 		case EMIT_4F:
-			BEGIN_RING(curie, NV40TCL_VTX_ATTR_4F_X(hw), 4);
-			OUT_RING  (fui(v->data[idx][0]));
-			OUT_RING  (fui(v->data[idx][1]));
-			OUT_RING  (fui(v->data[idx][2]));
-			OUT_RING  (fui(v->data[idx][3]));
+			BEGIN_RING(chan, curie, NV40TCL_VTX_ATTR_4F_X(hw), 4);
+			OUT_RING  (chan, fui(v->data[idx][0]));
+			OUT_RING  (chan, fui(v->data[idx][1]));
+			OUT_RING  (chan, fui(v->data[idx][2]));
+			OUT_RING  (chan, fui(v->data[idx][3]));
 			break;
 		case EMIT_4UB:
-			BEGIN_RING(curie, NV40TCL_VTX_ATTR_4UB(hw), 1);
-			OUT_RING  (pack_ub4(float_to_ubyte(v->data[idx][0]),
+			BEGIN_RING(chan, curie, NV40TCL_VTX_ATTR_4UB(hw), 1);
+			OUT_RING  (chan, pack_ub4(float_to_ubyte(v->data[idx][0]),
 					    float_to_ubyte(v->data[idx][1]),
 					    float_to_ubyte(v->data[idx][2]),
 					    float_to_ubyte(v->data[idx][3])));
@@ -82,7 +85,11 @@ nv40_render_prim(struct draw_stage *stage, struct prim_header *prim,
 {
 	struct nv40_render_stage *rs = nv40_render_stage(stage);
 	struct nv40_context *nv40 = rs->nv40;
-	struct nouveau_pushbuf *pb = nv40->screen->base.channel->pushbuf;
+
+	struct nv40_screen *screen = nv40->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_pushbuf *pb = chan->pushbuf;
+	struct nouveau_grobj *curie = screen->curie;
 	unsigned i;
 
 	/* Ensure there's room for 4xfloat32 + potentially 3 begin/end */
@@ -91,19 +98,19 @@ nv40_render_prim(struct draw_stage *stage, struct prim_header *prim,
 			NOUVEAU_ERR("AIII, missed flush\n");
 			assert(0);
 		}
-		FIRE_RING(NULL);
+		FIRE_RING(chan);
 		nv40_state_emit(nv40);
 	}
 
 	/* Switch primitive modes if necessary */
 	if (rs->prim != mode) {
 		if (rs->prim != NV40TCL_BEGIN_END_STOP) {
-			BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
-			OUT_RING  (NV40TCL_BEGIN_END_STOP);	
+			BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+			OUT_RING  (chan, NV40TCL_BEGIN_END_STOP);
 		}
 
-		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
-		OUT_RING  (mode);
+		BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (chan, mode);
 		rs->prim = mode;
 	}
 
@@ -115,8 +122,8 @@ nv40_render_prim(struct draw_stage *stage, struct prim_header *prim,
 	 * off the primitive now.
 	 */
 	if (pb->remaining < ((count * 20) + 6)) {
-		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
-		OUT_RING  (NV40TCL_BEGIN_END_STOP);
+		BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (chan, NV40TCL_BEGIN_END_STOP);
 		rs->prim = NV40TCL_BEGIN_END_STOP;
 	}
 }
@@ -144,10 +151,13 @@ nv40_render_flush(struct draw_stage *draw, unsigned flags)
 {
 	struct nv40_render_stage *rs = nv40_render_stage(draw);
 	struct nv40_context *nv40 = rs->nv40;
+	struct nv40_screen *screen = nv40->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *curie = screen->curie;
 
 	if (rs->prim != NV40TCL_BEGIN_END_STOP) {
-		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
-		OUT_RING  (NV40TCL_BEGIN_END_STOP);
+		BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (chan, NV40TCL_BEGIN_END_STOP);
 		rs->prim = NV40TCL_BEGIN_END_STOP;
 	}
 }
@@ -226,7 +236,7 @@ nv40_draw_render_stage(struct nv40_context *nv40)
 	return &render->stage;
 }
 
-boolean
+void
 nv40_draw_elements_swtnl(struct pipe_context *pipe,
 			 struct pipe_buffer *idxbuf, unsigned idxbuf_size,
 			 unsigned mode, unsigned start, unsigned count)
@@ -237,7 +247,7 @@ nv40_draw_elements_swtnl(struct pipe_context *pipe,
 	void *map;
 
 	if (!nv40_state_validate_swtnl(nv40))
-		return FALSE;
+		return;
 	nv40->state.dirty &= ~(1ULL << NV40_STATE_VTXBUF);
 	nv40_state_emit(nv40);
 
@@ -278,8 +288,6 @@ nv40_draw_elements_swtnl(struct pipe_context *pipe,
 
 	draw_flush(nv40->draw);
 	pipe->flush(pipe, 0, NULL);
-
-	return TRUE;
 }
 
 static INLINE void
diff --git a/src/gallium/drivers/nv40/nv40_fragprog.c b/src/gallium/drivers/nv40/nv40_fragprog.c
index bb9c85cc43..1237066c39 100644
--- a/src/gallium/drivers/nv40/nv40_fragprog.c
+++ b/src/gallium/drivers/nv40/nv40_fragprog.c
@@ -919,7 +919,7 @@ nv40_fragprog_validate(struct nv40_context *nv40)
 	fp->buffer = pscreen->buffer_create(pscreen, 0x100, 0, fp->insn_len * 4);
 	nv40_fragprog_upload(nv40, fp);
 
-	so = so_new(4, 1);
+	so = so_new(2, 2, 1);
 	so_method(so, nv40->screen->curie, NV40TCL_FP_ADDRESS, 1);
 	so_reloc (so, nouveau_bo(fp->buffer), 0, NOUVEAU_BO_VRAM |
 		      NOUVEAU_BO_GART | NOUVEAU_BO_RD | NOUVEAU_BO_LOW |
diff --git a/src/gallium/drivers/nv40/nv40_fragtex.c b/src/gallium/drivers/nv40/nv40_fragtex.c
index 44abc84596..aad9198210 100644
--- a/src/gallium/drivers/nv40/nv40_fragtex.c
+++ b/src/gallium/drivers/nv40/nv40_fragtex.c
@@ -108,7 +108,7 @@ nv40_fragtex_build(struct nv40_context *nv40, int unit)
 
 	txs = tf->swizzle;
 
-	so = so_new(16, 2);
+	so = so_new(2, 9, 2);
 	so_method(so, nv40->screen->curie, NV40TCL_TEX_OFFSET(unit), 8);
 	so_reloc (so, bo, 0, tex_flags | NOUVEAU_BO_LOW, 0, 0);
 	so_reloc (so, bo, txf, tex_flags | NOUVEAU_BO_OR,
@@ -139,7 +139,7 @@ nv40_fragtex_validate(struct nv40_context *nv40)
 		unit = ffs(samplers) - 1;
 		samplers &= ~(1 << unit);
 
-		so = so_new(2, 0);
+		so = so_new(1, 1, 0);
 		so_method(so, nv40->screen->curie, NV40TCL_TEX_ENABLE(unit), 1);
 		so_data  (so, 0);
 		so_ref(so, &nv40->state.hw[NV40_STATE_FRAGTEX0 + unit]);
diff --git a/src/gallium/drivers/nv40/nv40_query.c b/src/gallium/drivers/nv40/nv40_query.c
index 7874aedd42..8ed4a67dd0 100644
--- a/src/gallium/drivers/nv40/nv40_query.c
+++ b/src/gallium/drivers/nv40/nv40_query.c
@@ -41,6 +41,9 @@ nv40_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
 	struct nv40_query *q = nv40_query(pq);
+	struct nv40_screen *screen = nv40->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *curie = screen->curie;
 
 	assert(q->type == PIPE_QUERY_OCCLUSION_COUNTER);
 
@@ -57,10 +60,10 @@ nv40_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
 		assert(0);
 	nouveau_notifier_reset(nv40->screen->query, q->object->start);
 
-	BEGIN_RING(curie, NV40TCL_QUERY_RESET, 1);
-	OUT_RING  (1);
-	BEGIN_RING(curie, NV40TCL_QUERY_UNK17CC, 1);
-	OUT_RING  (1);
+	BEGIN_RING(chan, curie, NV40TCL_QUERY_RESET, 1);
+	OUT_RING  (chan, 1);
+	BEGIN_RING(chan, curie, NV40TCL_QUERY_UNK17CC, 1);
+	OUT_RING  (chan, 1);
 
 	q->ready = FALSE;
 }
@@ -70,11 +73,14 @@ nv40_query_end(struct pipe_context *pipe, struct pipe_query *pq)
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
 	struct nv40_query *q = nv40_query(pq);
+	struct nv40_screen *screen = nv40->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *curie = screen->curie;
 
-	BEGIN_RING(curie, NV40TCL_QUERY_GET, 1);
-	OUT_RING  ((0x01 << NV40TCL_QUERY_GET_UNK24_SHIFT) |
+	BEGIN_RING(chan, curie, NV40TCL_QUERY_GET, 1);
+	OUT_RING  (chan, (0x01 << NV40TCL_QUERY_GET_UNK24_SHIFT) |
 		   ((q->object->start * 32) << NV40TCL_QUERY_GET_OFFSET_SHIFT));
-	FIRE_RING(NULL);
+	FIRE_RING(chan);
 }
 
 static boolean
diff --git a/src/gallium/drivers/nv40/nv40_screen.c b/src/gallium/drivers/nv40/nv40_screen.c
index d01e712805..9e55e5a089 100644
--- a/src/gallium/drivers/nv40/nv40_screen.c
+++ b/src/gallium/drivers/nv40/nv40_screen.c
@@ -215,7 +215,6 @@ nv40_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 		NOUVEAU_ERR("Error creating 3D object: %d\n", ret);
 		return FALSE;
 	}
-	BIND_RING(chan, screen->curie, 7);
 
 	/* 2D engine setup */
 	screen->eng2d = nv04_surface_2d_init(&screen->base);
@@ -252,7 +251,7 @@ nv40_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 	}
 
 	/* Static curie initialisation */
-	so = so_new(128, 0);
+	so = so_new(16, 25, 0);
 	so_method(so, screen->curie, NV40TCL_DMA_NOTIFY, 1);
 	so_data  (so, screen->sync->handle);
 	so_method(so, screen->curie, NV40TCL_DMA_TEXTURE0, 2);
diff --git a/src/gallium/drivers/nv40/nv40_state.c b/src/gallium/drivers/nv40/nv40_state.c
index ed55d29aff..ed0ca9e02c 100644
--- a/src/gallium/drivers/nv40/nv40_state.c
+++ b/src/gallium/drivers/nv40/nv40_state.c
@@ -16,7 +16,7 @@ nv40_blend_state_create(struct pipe_context *pipe,
 	struct nv40_context *nv40 = nv40_context(pipe);
 	struct nouveau_grobj *curie = nv40->screen->curie;
 	struct nv40_blend_state *bso = CALLOC(1, sizeof(*bso));
-	struct nouveau_stateobj *so = so_new(16, 0);
+	struct nouveau_stateobj *so = so_new(5, 8, 0);
 
 	if (cso->blend_enable) {
 		so_method(so, curie, NV40TCL_BLEND_ENABLE, 3);
@@ -310,7 +310,7 @@ nv40_rasterizer_state_create(struct pipe_context *pipe,
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
 	struct nv40_rasterizer_state *rsso = CALLOC(1, sizeof(*rsso));
-	struct nouveau_stateobj *so = so_new(32, 0);
+	struct nouveau_stateobj *so = so_new(8, 18, 0);
 	struct nouveau_grobj *curie = nv40->screen->curie;
 
 	/*XXX: ignored:
@@ -445,7 +445,7 @@ nv40_depth_stencil_alpha_state_create(struct pipe_context *pipe,
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
 	struct nv40_zsa_state *zsaso = CALLOC(1, sizeof(*zsaso));
-	struct nouveau_stateobj *so = so_new(32, 0);
+	struct nouveau_stateobj *so = so_new(4, 21, 0);
 	struct nouveau_grobj *curie = nv40->screen->curie;
 
 	so_method(so, curie, NV40TCL_DEPTH_FUNC, 3);
diff --git a/src/gallium/drivers/nv40/nv40_state_blend.c b/src/gallium/drivers/nv40/nv40_state_blend.c
index 8cd05ce66e..3ff00a37f6 100644
--- a/src/gallium/drivers/nv40/nv40_state_blend.c
+++ b/src/gallium/drivers/nv40/nv40_state_blend.c
@@ -18,7 +18,7 @@ struct nv40_state_entry nv40_state_blend = {
 static boolean
 nv40_state_blend_colour_validate(struct nv40_context *nv40)
 {
-	struct nouveau_stateobj *so = so_new(2, 0);
+	struct nouveau_stateobj *so = so_new(1, 1, 0);
 	struct pipe_blend_color *bcol = &nv40->blend_colour;
 
 	so_method(so, nv40->screen->curie, NV40TCL_BLEND_COLOR, 1);
diff --git a/src/gallium/drivers/nv40/nv40_state_emit.c b/src/gallium/drivers/nv40/nv40_state_emit.c
index 789ed16126..13fe854915 100644
--- a/src/gallium/drivers/nv40/nv40_state_emit.c
+++ b/src/gallium/drivers/nv40/nv40_state_emit.c
@@ -54,9 +54,10 @@ nv40_state_do_validate(struct nv40_context *nv40,
 void
 nv40_state_emit(struct nv40_context *nv40)
 {
-	struct nouveau_channel *chan = nv40->screen->base.channel;
 	struct nv40_state *state = &nv40->state;
 	struct nv40_screen *screen = nv40->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *curie = screen->curie;
 	unsigned i;
 	uint64_t states;
 
@@ -80,10 +81,10 @@ nv40_state_emit(struct nv40_context *nv40)
 
 	if (state->dirty & ((1ULL << NV40_STATE_FRAGPROG) |
 			    (1ULL << NV40_STATE_FRAGTEX0))) {
-		BEGIN_RING(curie, NV40TCL_TEX_CACHE_CTL, 1);
-		OUT_RING  (2);
-		BEGIN_RING(curie, NV40TCL_TEX_CACHE_CTL, 1);
-		OUT_RING  (1);
+		BEGIN_RING(chan, curie, NV40TCL_TEX_CACHE_CTL, 1);
+		OUT_RING  (chan, 2);
+		BEGIN_RING(chan, curie, NV40TCL_TEX_CACHE_CTL, 1);
+		OUT_RING  (chan, 1);
 	}
 
 	state->dirty = 0;
diff --git a/src/gallium/drivers/nv40/nv40_state_fb.c b/src/gallium/drivers/nv40/nv40_state_fb.c
index 1c7a7cd64f..a58fe9ddb1 100644
--- a/src/gallium/drivers/nv40/nv40_state_fb.c
+++ b/src/gallium/drivers/nv40/nv40_state_fb.c
@@ -19,7 +19,7 @@ nv40_state_framebuffer_validate(struct nv40_context *nv40)
 	struct nv04_surface *rt[4], *zeta;
 	uint32_t rt_enable, rt_format;
 	int i, colour_format = 0, zeta_format = 0;
-	struct nouveau_stateobj *so = so_new(64, 10);
+	struct nouveau_stateobj *so = so_new(18, 24, 10);
 	unsigned rt_flags = NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM;
 	unsigned w = fb->width;
 	unsigned h = fb->height;
diff --git a/src/gallium/drivers/nv40/nv40_state_scissor.c b/src/gallium/drivers/nv40/nv40_state_scissor.c
index cf58d33906..753a505e93 100644
--- a/src/gallium/drivers/nv40/nv40_state_scissor.c
+++ b/src/gallium/drivers/nv40/nv40_state_scissor.c
@@ -12,7 +12,7 @@ nv40_state_scissor_validate(struct nv40_context *nv40)
 		return FALSE;
 	nv40->state.scissor_enabled = rast->scissor;
 
-	so = so_new(3, 0);
+	so = so_new(1, 2, 0);
 	so_method(so, nv40->screen->curie, NV40TCL_SCISSOR_HORIZ, 2);
 	if (nv40->state.scissor_enabled) {
 		so_data  (so, ((s->maxx - s->minx) << 16) | s->minx);
diff --git a/src/gallium/drivers/nv40/nv40_state_stipple.c b/src/gallium/drivers/nv40/nv40_state_stipple.c
index b51024ad9b..2b371ebfec 100644
--- a/src/gallium/drivers/nv40/nv40_state_stipple.c
+++ b/src/gallium/drivers/nv40/nv40_state_stipple.c
@@ -14,14 +14,14 @@ nv40_state_stipple_validate(struct nv40_context *nv40)
 	if (rast->poly_stipple_enable) {
 		unsigned i;
 
-		so = so_new(35, 0);
+		so = so_new(2, 33, 0);
 		so_method(so, curie, NV40TCL_POLYGON_STIPPLE_ENABLE, 1);
 		so_data  (so, 1);
 		so_method(so, curie, NV40TCL_POLYGON_STIPPLE_PATTERN(0), 32);
 		for (i = 0; i < 32; i++)
 			so_data(so, nv40->stipple[i]);
 	} else {
-		so = so_new(2, 0);
+		so = so_new(1, 1, 0);
 		so_method(so, curie, NV40TCL_POLYGON_STIPPLE_ENABLE, 1);
 		so_data  (so, 0);
 	}
diff --git a/src/gallium/drivers/nv40/nv40_state_viewport.c b/src/gallium/drivers/nv40/nv40_state_viewport.c
index 665d2d5fca..9919ba1d0b 100644
--- a/src/gallium/drivers/nv40/nv40_state_viewport.c
+++ b/src/gallium/drivers/nv40/nv40_state_viewport.c
@@ -19,7 +19,7 @@ nv40_state_viewport_validate(struct nv40_context *nv40)
 		return FALSE;
 	nv40->state.viewport_bypass = bypass;
 
-	so = so_new(11, 0);
+	so = so_new(2, 9, 0);
 	if (!bypass) {
 		so_method(so, nv40->screen->curie,
 			  NV40TCL_VIEWPORT_TRANSLATE_X, 8);
diff --git a/src/gallium/drivers/nv40/nv40_vbo.c b/src/gallium/drivers/nv40/nv40_vbo.c
index af3fcf6a34..a777898f68 100644
--- a/src/gallium/drivers/nv40/nv40_vbo.c
+++ b/src/gallium/drivers/nv40/nv40_vbo.c
@@ -164,18 +164,21 @@ nv40_vbo_static_attrib(struct nv40_context *nv40, struct nouveau_stateobj *so,
 	return TRUE;
 }
 
-boolean
+void
 nv40_draw_arrays(struct pipe_context *pipe,
 		 unsigned mode, unsigned start, unsigned count)
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
-	struct nouveau_channel *chan = nv40->screen->base.channel;
+	struct nv40_screen *screen = nv40->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *curie = screen->curie;
 	unsigned restart;
 
 	nv40_vbo_set_idxbuf(nv40, NULL, 0);
 	if (FORCE_SWTNL || !nv40_state_validate(nv40)) {
-		return nv40_draw_elements_swtnl(pipe, NULL, 0,
-						mode, start, count);
+		nv40_draw_elements_swtnl(pipe, NULL, 0,
+                                         mode, start, count);
+                return;
 	}
 
 	while (count) {
@@ -186,17 +189,17 @@ nv40_draw_arrays(struct pipe_context *pipe,
 		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 256,
 					mode, start, count, &restart);
 		if (!vc) {
-			FIRE_RING(NULL);
+			FIRE_RING(chan);
 			continue;
 		}
 
-		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
-		OUT_RING  (nvgl_primitive(mode));
+		BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (chan, nvgl_primitive(mode));
 
 		nr = (vc & 0xff);
 		if (nr) {
-			BEGIN_RING(curie, NV40TCL_VB_VERTEX_BATCH, 1);
-			OUT_RING  (((nr - 1) << 24) | start);
+			BEGIN_RING(chan, curie, NV40TCL_VB_VERTEX_BATCH, 1);
+			OUT_RING  (chan, ((nr - 1) << 24) | start);
 			start += nr;
 		}
 
@@ -206,29 +209,30 @@ nv40_draw_arrays(struct pipe_context *pipe,
 
 			nr -= push;
 
-			BEGIN_RING_NI(curie, NV40TCL_VB_VERTEX_BATCH, push);
+			BEGIN_RING_NI(chan, curie, NV40TCL_VB_VERTEX_BATCH, push);
 			while (push--) {
-				OUT_RING(((0x100 - 1) << 24) | start);
+				OUT_RING(chan, ((0x100 - 1) << 24) | start);
 				start += 0x100;
 			}
 		}
 
-		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
-		OUT_RING  (0);
+		BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (chan, 0);
 
 		count -= vc;
 		start = restart;
 	}
 
 	pipe->flush(pipe, 0, NULL);
-	return TRUE;
 }
 
 static INLINE void
 nv40_draw_elements_u08(struct nv40_context *nv40, void *ib,
 		       unsigned mode, unsigned start, unsigned count)
 {
-	struct nouveau_channel *chan = nv40->screen->base.channel;
+	struct nv40_screen *screen = nv40->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *curie = screen->curie;
 
 	while (count) {
 		uint8_t *elts = (uint8_t *)ib + start;
@@ -239,17 +243,17 @@ nv40_draw_elements_u08(struct nv40_context *nv40, void *ib,
 		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 2,
 					mode, start, count, &restart);
 		if (vc == 0) {
-			FIRE_RING(NULL);
+			FIRE_RING(chan);
 			continue;
 		}
 		count -= vc;
 
-		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
-		OUT_RING  (nvgl_primitive(mode));
+		BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (chan, nvgl_primitive(mode));
 
 		if (vc & 1) {
-			BEGIN_RING(curie, NV40TCL_VB_ELEMENT_U32, 1);
-			OUT_RING  (elts[0]);
+			BEGIN_RING(chan, curie, NV40TCL_VB_ELEMENT_U32, 1);
+			OUT_RING  (chan, elts[0]);
 			elts++; vc--;
 		}
 
@@ -258,16 +262,16 @@ nv40_draw_elements_u08(struct nv40_context *nv40, void *ib,
 
 			push = MIN2(vc, 2047 * 2);
 
-			BEGIN_RING_NI(curie, NV40TCL_VB_ELEMENT_U16, push >> 1);
+			BEGIN_RING_NI(chan, curie, NV40TCL_VB_ELEMENT_U16, push >> 1);
 			for (i = 0; i < push; i+=2)
-				OUT_RING((elts[i+1] << 16) | elts[i]);
+				OUT_RING(chan, (elts[i+1] << 16) | elts[i]);
 
 			vc -= push;
 			elts += push;
 		}
 
-		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
-		OUT_RING  (0);
+		BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (chan, 0);
 
 		start = restart;
 	}
@@ -277,7 +281,9 @@ static INLINE void
 nv40_draw_elements_u16(struct nv40_context *nv40, void *ib,
 		       unsigned mode, unsigned start, unsigned count)
 {
-	struct nouveau_channel *chan = nv40->screen->base.channel;
+	struct nv40_screen *screen = nv40->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *curie = screen->curie;
 
 	while (count) {
 		uint16_t *elts = (uint16_t *)ib + start;
@@ -288,17 +294,17 @@ nv40_draw_elements_u16(struct nv40_context *nv40, void *ib,
 		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 2,
 					mode, start, count, &restart);
 		if (vc == 0) {
-			FIRE_RING(NULL);
+			FIRE_RING(chan);
 			continue;
 		}
 		count -= vc;
 
-		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
-		OUT_RING  (nvgl_primitive(mode));
+		BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (chan, nvgl_primitive(mode));
 
 		if (vc & 1) {
-			BEGIN_RING(curie, NV40TCL_VB_ELEMENT_U32, 1);
-			OUT_RING  (elts[0]);
+			BEGIN_RING(chan, curie, NV40TCL_VB_ELEMENT_U32, 1);
+			OUT_RING  (chan, elts[0]);
 			elts++; vc--;
 		}
 
@@ -307,16 +313,16 @@ nv40_draw_elements_u16(struct nv40_context *nv40, void *ib,
 
 			push = MIN2(vc, 2047 * 2);
 
-			BEGIN_RING_NI(curie, NV40TCL_VB_ELEMENT_U16, push >> 1);
+			BEGIN_RING_NI(chan, curie, NV40TCL_VB_ELEMENT_U16, push >> 1);
 			for (i = 0; i < push; i+=2)
-				OUT_RING((elts[i+1] << 16) | elts[i]);
+				OUT_RING(chan, (elts[i+1] << 16) | elts[i]);
 
 			vc -= push;
 			elts += push;
 		}
 
-		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
-		OUT_RING  (0);
+		BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (chan, 0);
 
 		start = restart;
 	}
@@ -326,7 +332,9 @@ static INLINE void
 nv40_draw_elements_u32(struct nv40_context *nv40, void *ib,
 		       unsigned mode, unsigned start, unsigned count)
 {
-	struct nouveau_channel *chan = nv40->screen->base.channel;
+	struct nv40_screen *screen = nv40->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *curie = screen->curie;
 
 	while (count) {
 		uint32_t *elts = (uint32_t *)ib + start;
@@ -337,32 +345,32 @@ nv40_draw_elements_u32(struct nv40_context *nv40, void *ib,
 		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 5, 1,
 					mode, start, count, &restart);
 		if (vc == 0) {
-			FIRE_RING(NULL);
+			FIRE_RING(chan);
 			continue;
 		}
 		count -= vc;
 
-		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
-		OUT_RING  (nvgl_primitive(mode));
+		BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (chan, nvgl_primitive(mode));
 
 		while (vc) {
 			push = MIN2(vc, 2047);
 
-			BEGIN_RING_NI(curie, NV40TCL_VB_ELEMENT_U32, push);
-			OUT_RINGp    (elts, push);
+			BEGIN_RING_NI(chan, curie, NV40TCL_VB_ELEMENT_U32, push);
+			OUT_RINGp    (chan, elts, push);
 
 			vc -= push;
 			elts += push;
 		}
 
-		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
-		OUT_RING  (0);
+		BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (chan, 0);
 
 		start = restart;
 	}
 }
 
-static boolean
+static void
 nv40_draw_elements_inline(struct pipe_context *pipe,
 			  struct pipe_buffer *ib, unsigned ib_size,
 			  unsigned mode, unsigned start, unsigned count)
@@ -393,15 +401,16 @@ nv40_draw_elements_inline(struct pipe_context *pipe,
 	}
 
 	pipe_buffer_unmap(pscreen, ib);
-	return TRUE;
 }
 
-static boolean
+static void
 nv40_draw_elements_vbo(struct pipe_context *pipe,
 		       unsigned mode, unsigned start, unsigned count)
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
-	struct nouveau_channel *chan = nv40->screen->base.channel;
+	struct nv40_screen *screen = nv40->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *curie = screen->curie;
 	unsigned restart;
 
 	while (count) {
@@ -412,17 +421,17 @@ nv40_draw_elements_vbo(struct pipe_context *pipe,
 		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 256,
 					mode, start, count, &restart);
 		if (!vc) {
-			FIRE_RING(NULL);
+			FIRE_RING(chan);
 			continue;
 		}
 		
-		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
-		OUT_RING  (nvgl_primitive(mode));
+		BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (chan, nvgl_primitive(mode));
 
 		nr = (vc & 0xff);
 		if (nr) {
-			BEGIN_RING(curie, NV40TCL_VB_INDEX_BATCH, 1);
-			OUT_RING  (((nr - 1) << 24) | start);
+			BEGIN_RING(chan, curie, NV40TCL_VB_INDEX_BATCH, 1);
+			OUT_RING  (chan, ((nr - 1) << 24) | start);
 			start += nr;
 		}
 
@@ -432,24 +441,22 @@ nv40_draw_elements_vbo(struct pipe_context *pipe,
 
 			nr -= push;
 
-			BEGIN_RING_NI(curie, NV40TCL_VB_INDEX_BATCH, push);
+			BEGIN_RING_NI(chan, curie, NV40TCL_VB_INDEX_BATCH, push);
 			while (push--) {
-				OUT_RING(((0x100 - 1) << 24) | start);
+				OUT_RING(chan, ((0x100 - 1) << 24) | start);
 				start += 0x100;
 			}
 		}
 
-		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
-		OUT_RING  (0);
+		BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (chan, 0);
 
 		count -= vc;
 		start = restart;
 	}
-
-	return TRUE;
 }
 
-boolean
+void
 nv40_draw_elements(struct pipe_context *pipe,
 		   struct pipe_buffer *indexBuffer, unsigned indexSize,
 		   unsigned mode, unsigned start, unsigned count)
@@ -459,8 +466,9 @@ nv40_draw_elements(struct pipe_context *pipe,
 
 	idxbuf = nv40_vbo_set_idxbuf(nv40, indexBuffer, indexSize);
 	if (FORCE_SWTNL || !nv40_state_validate(nv40)) {
-		return nv40_draw_elements_swtnl(pipe, NULL, 0,
-						mode, start, count);
+		nv40_draw_elements_swtnl(pipe, NULL, 0,
+                                         mode, start, count);
+                return;
 	}
 
 	if (idxbuf) {
@@ -471,7 +479,6 @@ nv40_draw_elements(struct pipe_context *pipe,
 	}
 
 	pipe->flush(pipe, 0, NULL);
-	return TRUE;
 }
 
 static boolean
@@ -484,9 +491,9 @@ nv40_vbo_validate(struct nv40_context *nv40)
 	unsigned vb_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD;
 	int hw;
 
-	vtxbuf = so_new(20, 18);
+	vtxbuf = so_new(3, 17, 18);
 	so_method(vtxbuf, curie, NV40TCL_VTXBUF_ADDRESS(0), nv40->vtxelt_nr);
-	vtxfmt = so_new(17, 0);
+	vtxfmt = so_new(1, 16, 0);
 	so_method(vtxfmt, curie, NV40TCL_VTXFMT(0), nv40->vtxelt_nr);
 
 	for (hw = 0; hw < nv40->vtxelt_nr; hw++) {
@@ -499,7 +506,7 @@ nv40_vbo_validate(struct nv40_context *nv40)
 
 		if (!vb->stride) {
 			if (!sattr)
-				sattr = so_new(16 * 5, 0);
+				sattr = so_new(16, 16 * 4, 0);
 
 			if (nv40_vbo_static_attrib(nv40, sattr, hw, ve, vb)) {
 				so_data(vtxbuf, 0);
diff --git a/src/gallium/drivers/nv40/nv40_vertprog.c b/src/gallium/drivers/nv40/nv40_vertprog.c
index d9fc31006f..8d80fcad38 100644
--- a/src/gallium/drivers/nv40/nv40_vertprog.c
+++ b/src/gallium/drivers/nv40/nv40_vertprog.c
@@ -834,7 +834,9 @@ static boolean
 nv40_vertprog_validate(struct nv40_context *nv40)
 { 
 	struct pipe_screen *pscreen = nv40->pipe.screen;
-	struct nouveau_grobj *curie = nv40->screen->curie;
+	struct nv40_screen *screen = nv40->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *curie = screen->curie;
 	struct nv40_vertex_program *vp;
 	struct pipe_buffer *constbuf;
 	boolean upload_code = FALSE, upload_data = FALSE;
@@ -884,7 +886,7 @@ check_gpu_resources:
 				assert(0);
 		}
 
-		so = so_new(7, 0);
+		so = so_new(3, 4, 0);
 		so_method(so, curie, NV40TCL_VP_START_FROM_ID, 1);
 		so_data  (so, vp->exec->start);
 		so_method(so, curie, NV40TCL_VP_ATTRIB_EN, 2);
@@ -974,9 +976,9 @@ check_gpu_resources:
 				       4 * sizeof(float));
 			}
 
-			BEGIN_RING(curie, NV40TCL_VP_UPLOAD_CONST_ID, 5);
-			OUT_RING  (i + vp->data->start);
-			OUT_RINGp ((uint32_t *)vpd->value, 4);
+			BEGIN_RING(chan, curie, NV40TCL_VP_UPLOAD_CONST_ID, 5);
+			OUT_RING  (chan, i + vp->data->start);
+			OUT_RINGp (chan, (uint32_t *)vpd->value, 4);
 		}
 
 		if (constbuf)
@@ -993,11 +995,11 @@ check_gpu_resources:
 			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[3]);
 		}
 #endif
-		BEGIN_RING(curie, NV40TCL_VP_UPLOAD_FROM_ID, 1);
-		OUT_RING  (vp->exec->start);
+		BEGIN_RING(chan, curie, NV40TCL_VP_UPLOAD_FROM_ID, 1);
+		OUT_RING  (chan, vp->exec->start);
 		for (i = 0; i < vp->nr_insns; i++) {
-			BEGIN_RING(curie, NV40TCL_VP_UPLOAD_INST(0), 4);
-			OUT_RINGp (vp->insns[i].data, 4);
+			BEGIN_RING(chan, curie, NV40TCL_VP_UPLOAD_INST(0), 4);
+			OUT_RINGp (chan, vp->insns[i].data, 4);
 		}
 	}
 
diff --git a/src/gallium/drivers/nv50/nv50_context.h b/src/gallium/drivers/nv50/nv50_context.h
index 5578a5838f..cbd4c3ff86 100644
--- a/src/gallium/drivers/nv50/nv50_context.h
+++ b/src/gallium/drivers/nv50/nv50_context.h
@@ -191,9 +191,9 @@ nv50_surface_do_copy(struct nv50_screen *screen, struct pipe_surface *dst,
 extern struct draw_stage *nv50_draw_render_stage(struct nv50_context *nv50);
 
 /* nv50_vbo.c */
-extern boolean nv50_draw_arrays(struct pipe_context *, unsigned mode,
+extern void nv50_draw_arrays(struct pipe_context *, unsigned mode,
 				unsigned start, unsigned count);
-extern boolean nv50_draw_elements(struct pipe_context *pipe,
+extern void nv50_draw_elements(struct pipe_context *pipe,
 				  struct pipe_buffer *indexBuffer,
 				  unsigned indexSize,
 				  unsigned mode, unsigned start,
diff --git a/src/gallium/drivers/nv50/nv50_miptree.c b/src/gallium/drivers/nv50/nv50_miptree.c
index 3f1edf0a13..cecb1efc90 100644
--- a/src/gallium/drivers/nv50/nv50_miptree.c
+++ b/src/gallium/drivers/nv50/nv50_miptree.c
@@ -145,7 +145,7 @@ nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *tmp)
 				  mt->level[0].tile_mode, tile_flags,
 				  &mt->base.bo);
 	if (ret) {
-		for (l = 0; l < pt->last_level; ++l)
+		for (l = 0; l <= pt->last_level; ++l)
 			FREE(mt->level[l].image_offset);
 		FREE(mt);
 		return NULL;
@@ -188,7 +188,7 @@ nv50_miptree_destroy(struct pipe_texture *pt)
 	struct nv50_miptree *mt = nv50_miptree(pt);
 	unsigned l;
 
-	for (l = 0; l < pt->last_level; ++l)
+	for (l = 0; l <= pt->last_level; ++l)
 		FREE(mt->level[l].image_offset);
 
 	nouveau_bo_ref(NULL, &mt->base.bo);
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c
index 2d0b1818ef..069f815938 100644
--- a/src/gallium/drivers/nv50/nv50_program.c
+++ b/src/gallium/drivers/nv50/nv50_program.c
@@ -96,7 +96,11 @@ struct nv50_reg {
 
 #define NV50_MOD_NEG 1
 #define NV50_MOD_ABS 2
+#define NV50_MOD_NEG_ABS (NV50_MOD_NEG | NV50_MOD_ABS)
 #define NV50_MOD_SAT 4
+#define NV50_MOD_I32 8
+
+/* NV50_MOD_I32 is used to indicate integer mode for neg/abs */
 
 /* STACK: Conditionals and loops have to use the (per warp) stack.
  * Stack entries consist of an entry type (divergent path, join at),
@@ -134,6 +138,7 @@ struct nv50_pc {
 	uint8_t addr_alloc; /* set bit indicates used for TGSI_FILE_ADDRESS */
 
 	struct nv50_reg *temp_temp[16];
+	struct nv50_program_exec *temp_temp_exec[16];
 	unsigned temp_temp_nr;
 
 	/* broadcast and destination replacement regs */
@@ -241,7 +246,8 @@ alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
 		}
 	}
 
-	assert(0);
+	NOUVEAU_ERR("out of registers\n");
+	abort();
 }
 
 static INLINE struct nv50_reg *
@@ -281,7 +287,8 @@ alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
 		}
 	}
 
-	assert(0);
+	NOUVEAU_ERR("out of registers\n");
+	abort();
 	return NULL;
 }
 
@@ -343,23 +350,29 @@ free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4])
 }
 
 static struct nv50_reg *
-temp_temp(struct nv50_pc *pc)
+temp_temp(struct nv50_pc *pc, struct nv50_program_exec *e)
 {
 	if (pc->temp_temp_nr >= 16)
 		assert(0);
 
 	pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
+	pc->temp_temp_exec[pc->temp_temp_nr] = e; /* owning instruction; kill_temp_temp(pc, e) frees this temp */
 	return pc->temp_temp[pc->temp_temp_nr++];
 }
 
+/* Free the temps that temp_temp() registered for e.  This *must* be called
+ * for every nv50_program_exec passed to temp_temp(), or its temps are leaked!
+ */
 static void
-kill_temp_temp(struct nv50_pc *pc)
+kill_temp_temp(struct nv50_pc *pc, struct nv50_program_exec *e)
 {
 	int i;
 
 	for (i = 0; i < pc->temp_temp_nr; i++)
-		free_temp(pc, pc->temp_temp[i]);
-	pc->temp_temp_nr = 0;
+		if (pc->temp_temp_exec[i] == e)
+			free_temp(pc, pc->temp_temp[i]);
+	if (!e) /* the slot table is only reset by the e == NULL call */
+		pc->temp_temp_nr = 0;
 }
 
 static int
@@ -421,6 +434,8 @@ emit(struct nv50_pc *pc, struct nv50_program_exec *e)
 		p->exec_head = e;
 	p->exec_tail = e;
 	p->exec_size += (e->inst[0] & 1) ? 2 : 1;
+
+	kill_temp_temp(pc, e);
 }
 
 static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *);
@@ -776,7 +791,7 @@ set_src_0_restricted(struct nv50_pc *pc, struct nv50_reg *src,
 	struct nv50_reg *temp;
 
 	if (src->type != P_TEMP) {
-		temp = temp_temp(pc);
+		temp = temp_temp(pc, e);
 		emit_mov(pc, temp, src);
 		src = temp;
 	}
@@ -795,7 +810,7 @@ set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
 		e->inst[1] |= 0x00200000;
 	} else
 	if (src->type == P_CONST || src->type == P_IMMD) {
-		struct nv50_reg *temp = temp_temp(pc);
+		struct nv50_reg *temp = temp_temp(pc, e);
 
 		emit_mov(pc, temp, src);
 		src = temp;
@@ -811,7 +826,7 @@ static void
 set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
 {
 	if (src->type == P_ATTR) {
-		struct nv50_reg *temp = temp_temp(pc);
+		struct nv50_reg *temp = temp_temp(pc, e);
 
 		emit_mov(pc, temp, src);
 		src = temp;
@@ -819,7 +834,7 @@ set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
 	if (src->type == P_CONST || src->type == P_IMMD) {
 		assert(!(e->inst[0] & 0x00800000));
 		if (e->inst[0] & 0x01000000) {
-			struct nv50_reg *temp = temp_temp(pc);
+			struct nv50_reg *temp = temp_temp(pc, e);
 
 			emit_mov(pc, temp, src);
 			src = temp;
@@ -841,7 +856,7 @@ set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
 	set_long(pc, e);
 
 	if (src->type == P_ATTR) {
-		struct nv50_reg *temp = temp_temp(pc);
+		struct nv50_reg *temp = temp_temp(pc, e);
 
 		emit_mov(pc, temp, src);
 		src = temp;
@@ -849,7 +864,7 @@ set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
 	if (src->type == P_CONST || src->type == P_IMMD) {
 		assert(!(e->inst[0] & 0x01000000));
 		if (e->inst[0] & 0x00800000) {
-			struct nv50_reg *temp = temp_temp(pc);
+			struct nv50_reg *temp = temp_temp(pc, e);
 
 			emit_mov(pc, temp, src);
 			src = temp;
@@ -864,6 +879,26 @@ set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
 }
 
 static void
+set_half_src(struct nv50_pc *pc, struct nv50_reg *src, int lh,
+	     struct nv50_program_exec *e, int pos) /* ORs the half-register index (hw * 2 + lh) into e->inst at bit pos */
+{
+	struct nv50_reg *r = src; /* register that actually gets encoded */
+
+	alloc_reg(pc, r);
+	if (r->type != P_TEMP) {
+		r = temp_temp(pc, e); /* non-TEMP source: copy it into a temp tied to e */
+		emit_mov(pc, r, src);
+	}
+
+	if (r->hw > (NV50_SU_MAX_TEMP / 2)) {
+		NOUVEAU_ERR("out of low GPRs\n");
+		abort();
+	}
+
+	e->inst[pos / 32] |= ((r->hw * 2) + lh) << (pos % 32); /* r, not src: src may have been replaced by the temp above */
+}
+
+static void
 emit_mov_from_pred(struct nv50_pc *pc, struct nv50_reg *dst, int pred)
 {
 	struct nv50_program_exec *e = exec(pc);
@@ -967,6 +1002,13 @@ emit_arl(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
 	emit(pc, e);
 }
 
+#define NV50_MAX_F32 0x880
+#define NV50_MAX_S32 0x08c
+#define NV50_MAX_U32 0x084
+#define NV50_MIN_F32 0x8a0
+#define NV50_MIN_S32 0x0ac
+#define NV50_MIN_U32 0x0a4
+
 static void
 emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
 	    struct nv50_reg *src0, struct nv50_reg *src1)
@@ -974,8 +1016,8 @@ emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
 	struct nv50_program_exec *e = exec(pc);
 
 	set_long(pc, e);
-	e->inst[0] |= 0xb0000000;
-	e->inst[1] |= (sub << 29);
+	e->inst[0] |= 0x30000000 | ((sub & 0x800) << 20);
+	e->inst[1] |= (sub << 24);
 
 	check_swap_src_0_1(pc, &src0, &src1);
 	set_dst(pc, dst, e);
@@ -1039,6 +1081,69 @@ emit_bitop2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
 }
 
 static void
+emit_not(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	e->inst[0] = 0xd0000000;
+	e->inst[1] = 0x0402c000;
+	set_long(pc, e);
+	set_dst(pc, dst, e);
+	set_src_1(pc, src, e); /* the operand goes in the src1 slot; no src0 is set */
+
+	emit(pc, e); /* emit() also frees temps that set_src_1() tied to e */
+}
+
+static void
+emit_shift(struct nv50_pc *pc, struct nv50_reg *dst,
+	   struct nv50_reg *src0, struct nv50_reg *src1, unsigned dir) /* dir is a TGSI shift opcode */
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	e->inst[0] = 0x30000000;
+	e->inst[1] = 0xc4000000;
+
+	set_long(pc, e);
+	set_dst(pc, dst, e);
+	set_src_0(pc, src0, e);
+
+	if (src1->type == P_IMMD) {
+		e->inst[1] |= (1 << 20); /* set only on the immediate path */
+		e->inst[0] |= (pc->immd_buf[src1->hw] & 0x7f) << 16; /* low 7 bits of the immediate, inst[0] bits 16..22 */
+	} else
+		set_src_1(pc, src1, e);
+
+	if (dir != TGSI_OPCODE_SHL) /* every other opcode sets bit 29 - presumably "shift right" */
+		e->inst[1] |= (1 << 29);
+
+	if (dir == TGSI_OPCODE_ISHR) /* ISHR additionally sets bit 27 - presumably arithmetic shift */
+		e->inst[1] |= (1 << 27);
+
+	emit(pc, e);
+}
+
+static void
+emit_shl_imm(struct nv50_pc *pc, struct nv50_reg *dst,
+	     struct nv50_reg *src, int s) /* s < 0 selects the opposite direction by -s */
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	e->inst[0] = 0x30000000;
+	e->inst[1] = 0xc4100000; /* emit_shift() base word plus its immediate flag (1 << 20) */
+	if (s < 0) {
+		e->inst[1] |= 1 << 29; /* same bit emit_shift() sets for non-SHL opcodes */
+		s = -s;
+	}
+	e->inst[0] |= ((s & 0x7f) << 16); /* count goes in inst[0] as in emit_shift(); in inst[1] it would overlap the flag bit */
+
+	set_long(pc, e);
+	set_dst(pc, dst, e);
+	set_src_0(pc, src, e);
+
+	emit(pc, e);
+}
+
+static void
 emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
 	 struct nv50_reg *src1, struct nv50_reg *src2)
 {
@@ -1142,36 +1247,41 @@ emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 	emit(pc, e);
 }
 
-#define CVTOP_RN	0x01
-#define CVTOP_FLOOR	0x03
-#define CVTOP_CEIL	0x05
-#define CVTOP_TRUNC	0x07
-#define CVTOP_SAT	0x08
-#define CVTOP_ABS	0x10
-
-/* 0x04 == 32 bit dst */
-/* 0x40 == dst is float */
-/* 0x80 == src is float */
-#define CVT_F32_F32 0xc4
-#define CVT_F32_S32 0x44
-#define CVT_S32_F32 0x8c
-#define CVT_S32_S32 0x0c
-#define CVT_NEG     0x20
-#define CVT_RI      0x08
+#define CVT_RN    (0x00 << 16)
+#define CVT_FLOOR (0x02 << 16)
+#define CVT_CEIL  (0x04 << 16)
+#define CVT_TRUNC (0x06 << 16)
+#define CVT_SAT   (0x08 << 16)
+#define CVT_ABS   (0x10 << 16)
+
+#define CVT_X32_X32 0x04004000
+#define CVT_X32_S32 0x04014000
+#define CVT_F32_F32 ((0xc0 << 24) | CVT_X32_X32)
+#define CVT_S32_F32 ((0x88 << 24) | CVT_X32_X32)
+#define CVT_U32_F32 ((0x80 << 24) | CVT_X32_X32)
+#define CVT_F32_S32 ((0x40 << 24) | CVT_X32_S32)
+#define CVT_F32_U32 ((0x40 << 24) | CVT_X32_X32)
+#define CVT_S32_S32 ((0x08 << 24) | CVT_X32_S32)
+#define CVT_S32_U32 ((0x08 << 24) | CVT_X32_X32)
+#define CVT_U32_S32 ((0x00 << 24) | CVT_X32_S32)
+
+#define CVT_NEG 0x20000000
+#define CVT_RI  0x08000000
 
 static void
 emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
-	 int wp, unsigned cvn, unsigned fmt)
+	 int wp, uint32_t cvn)
 {
 	struct nv50_program_exec *e;
 
 	e = exec(pc);
-	set_long(pc, e);
 
-	e->inst[0] |= 0xa0000000;
-	e->inst[1] |= 0x00004000; /* 32 bit src */
-	e->inst[1] |= (cvn << 16);
-	e->inst[1] |= (fmt << 24);
+	if (src->mod & NV50_MOD_NEG) cvn |= CVT_NEG;
+	if (src->mod & NV50_MOD_ABS) cvn |= CVT_ABS;
+
+	e->inst[0] = 0xa0000000;
+	e->inst[1] = cvn;
+	set_long(pc, e);
 	set_src_0(pc, src, e);
 
 	if (wp >= 0)
@@ -1196,10 +1306,12 @@ emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
  *  0x6 = GE
  *  0x7 = set condition code ? (used before bra.lt/le/gt/ge)
  *  0x8 = unordered bit (allows NaN)
+ *
+ *  mode = 0x04 (u32), 0x0c (s32), 0x80 (f32)
  */
 static void
 emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp,
-	 struct nv50_reg *src0, struct nv50_reg *src1)
+	 struct nv50_reg *src0, struct nv50_reg *src1, uint8_t mode)
 {
 	static const unsigned cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
 
@@ -1214,16 +1326,10 @@ emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp,
 	if (dst && dst->type != P_TEMP)
 		dst = alloc_temp(pc, NULL);
 
-	/* set.u32 */
 	set_long(pc, e);
-	e->inst[0] |= 0xb0000000;
+	e->inst[0] |= 0x30000000 | (mode << 24);
 	e->inst[1] |= 0x60000000 | (ccode << 14);
 
-	/* XXX: decuda will disasm as .u16 and use .lo/.hi regs, but
-	 * that doesn't seem to match what the hw actually does
-	e->inst[1] |= 0x04000000; << breaks things, u32 by default ?
-	 */
-
 	if (wp >= 0)
 		set_pred_wr(pc, 1, wp, e);
 	if (dst)
@@ -1238,33 +1344,146 @@ emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp,
 
 	emit(pc, e);
 
-	/* cvt.f32.u32/s32 (?) if we didn't only write the predicate */
-	if (rdst)
-		emit_cvt(pc, rdst, dst, -1, CVTOP_ABS | CVTOP_RN, CVT_F32_S32);
+	if (rdst && mode == 0x80) /* convert to float ? */
+		emit_cvt(pc, rdst, dst, -1, CVT_ABS | CVT_F32_S32);
 	if (rdst && rdst != dst)
 		free_temp(pc, dst);
 }
 
-static INLINE unsigned
-map_tgsi_setop_cc(unsigned op)
+static INLINE void
+map_tgsi_setop_hw(unsigned op, uint8_t *cc, uint8_t *ty) /* ty values match the emit_set() mode encoding: 0x04 u32, 0x0c s32, 0x80 f32 */
 {
 	switch (op) {
-	case TGSI_OPCODE_SLT: return 0x1;
-	case TGSI_OPCODE_SGE: return 0x6;
-	case TGSI_OPCODE_SEQ: return 0x2;
-	case TGSI_OPCODE_SGT: return 0x4;
-	case TGSI_OPCODE_SLE: return 0x3;
-	case TGSI_OPCODE_SNE: return 0xd;
+	case TGSI_OPCODE_SLT: *cc = 0x1; *ty = 0x80; break;
+	case TGSI_OPCODE_SGE: *cc = 0x6; *ty = 0x80; break;
+	case TGSI_OPCODE_SEQ: *cc = 0x2; *ty = 0x80; break;
+	case TGSI_OPCODE_SGT: *cc = 0x4; *ty = 0x80; break;
+	case TGSI_OPCODE_SLE: *cc = 0x3; *ty = 0x80; break;
+	case TGSI_OPCODE_SNE: *cc = 0xd; *ty = 0x80; break; /* 0xd = 0x5 with the 0x8 bit also set */
+
+	case TGSI_OPCODE_ISLT: *cc = 0x1; *ty = 0x0c; break;
+	case TGSI_OPCODE_ISGE: *cc = 0x6; *ty = 0x0c; break;
+	case TGSI_OPCODE_USEQ: *cc = 0x2; *ty = 0x04; break;
+	case TGSI_OPCODE_USGE: *cc = 0x6; *ty = 0x04; break;
+	case TGSI_OPCODE_USLT: *cc = 0x1; *ty = 0x04; break;
+	case TGSI_OPCODE_USNE: *cc = 0x5; *ty = 0x04; break;
 	default:
 		assert(0);
-		return 0;
+		return; /* *cc and *ty are left unwritten for unknown opcodes */
+	}
+}
+
+static void
+emit_add_b32(struct nv50_pc *pc, struct nv50_reg *dst,
+	     struct nv50_reg *src0, struct nv50_reg *rsrc1)
+{
+	struct nv50_program_exec *e = exec(pc);
+	struct nv50_reg *src1; /* operand actually encoded; may be a temp copy of rsrc1 */
+
+	e->inst[0] = 0x20000000;
+
+	alloc_reg(pc, rsrc1);
+	check_swap_src_0_1(pc, &src0, &rsrc1);
+
+	src1 = rsrc1;
+	if (src0->mod & rsrc1->mod & NV50_MOD_NEG) { /* both negated: only one negate bit is set below */
+		src1 = temp_temp(pc, e);
+		emit_cvt(pc, src1, rsrc1, -1, CVT_S32_S32); /* emit_cvt() folds rsrc1's NEG/ABS modifiers into the temp */
+	}
+
+	if (!pc->allow32 || src1->hw > 63 ||
+	    (src1->type != P_TEMP && src1->type != P_IMMD))
+		set_long(pc, e);
+
+	set_dst(pc, dst, e);
+	set_src_0(pc, src0, e);
+
+	if (is_long(e)) { /* long form: second operand goes in the src2 slot */
+		e->inst[1] |= 1 << 26;
+		set_src_2(pc, src1, e);
+	} else { /* short form: second operand is an immediate or the src1 slot */
+		e->inst[0] |= 0x8000;
+		if (src1->type == P_IMMD)
+			set_immd(pc, src1, e);
+		else
+			set_src_1(pc, src1, e);
 	}
+
+	if (src0->mod & NV50_MOD_NEG) /* bit 28 is set when src0 carries NEG */
+		e->inst[0] |= 1 << 28;
+	else
+	if (src1->mod & NV50_MOD_NEG) /* otherwise bit 22 is set when src1 carries NEG */
+		e->inst[0] |= 1 << 22;
+
+	emit(pc, e);
+}
+
+static void
+emit_mad_u16(struct nv50_pc *pc, struct nv50_reg *dst,
+	     struct nv50_reg *src0, int lh_0, struct nv50_reg *src1, int lh_1,
+	     struct nv50_reg *src2) /* lh_0 / lh_1 are passed to set_half_src() as the half selectors */
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	e->inst[0] = 0x60000000;
+	if (!pc->allow32)
+		set_long(pc, e);
+	set_dst(pc, dst, e);
+
+	set_half_src(pc, src0, lh_0, e, 9); /* first half-register operand at bit 9 */
+	set_half_src(pc, src1, lh_1, e, 16); /* second half-register operand at bit 16 */
+	alloc_reg(pc, src2);
+	if (is_long(e) || (src2->type != P_TEMP) || (src2->hw != dst->hw)) /* src2 is skipped only in short form when it is a TEMP with dst's hw index */
+		set_src_2(pc, src2, e);
+
+	emit(pc, e);
+}
+
+static void
+emit_mul_u16(struct nv50_pc *pc, struct nv50_reg *dst,
+	     struct nv50_reg *src0, int lh_0, struct nv50_reg *src1, int lh_1) /* lh_0 / lh_1 are passed to set_half_src() as the half selectors */
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	e->inst[0] = 0x40000000;
+	set_long(pc, e); /* always the long form, unlike emit_mad_u16() */
+	set_dst(pc, dst, e);
+
+	set_half_src(pc, src0, lh_0, e, 9); /* first half-register operand at bit 9 */
+	set_half_src(pc, src1, lh_1, e, 16); /* second half-register operand at bit 16 */
+
+	emit(pc, e);
+}
+
+static void
+emit_sad(struct nv50_pc *pc, struct nv50_reg *dst,
+	 struct nv50_reg *src0, struct nv50_reg *src1, struct nv50_reg *src2)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	e->inst[0] = 0x50000000;
+	if (!pc->allow32)
+		set_long(pc, e);
+	check_swap_src_0_1(pc, &src0, &src1);
+	set_dst(pc, dst, e);
+	set_src_0(pc, src0, e);
+	set_src_1(pc, src1, e);
+	alloc_reg(pc, src2);
+	if (is_long(e) || (src2->type != dst->type) || (src2->hw != dst->hw)) /* src2 is skipped only in short form when it has dst's type and hw index */
+		set_src_2(pc, src2, e);
+
+	if (is_long(e)) /* checked again here because set_src_2() calls set_long() */
+		e->inst[1] |= 0x0c << 24; /* long-form bits; convert_to_long() case 0x5 produces the same value */
+	else
+		e->inst[0] |= 0x81 << 8; /* short-form bits; convert_to_long() case 0x5 masks these out */
+
+	emit(pc, e);
 }
 
 static INLINE void
 emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 {
-	emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32 | CVT_RI);
+	emit_cvt(pc, dst, src, -1, CVT_FLOOR | CVT_F32_F32 | CVT_RI); /* rounding, format and RI bits are merged into emit_cvt()'s single cvn word */
 }
 
 static void
@@ -1282,15 +1501,9 @@ emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
 }
 
 static INLINE void
-emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
-{
-	emit_cvt(pc, dst, src, -1, CVTOP_ABS, CVT_F32_F32);
-}
-
-static INLINE void
 emit_sat(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 {
-	emit_cvt(pc, dst, src, -1, CVTOP_SAT, CVT_F32_F32);
+	emit_cvt(pc, dst, src, -1, CVT_SAT | CVT_F32_F32); /* f32 -> f32 convert with CVT_SAT set; NEG/ABS come from src->mod inside emit_cvt() */
 }
 
 static void
@@ -1308,18 +1521,18 @@ emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
 
 	if (mask & (3 << 1)) {
 		tmp[0] = alloc_temp(pc, NULL);
-		emit_minmax(pc, 4, tmp[0], src[0], zero);
+		emit_minmax(pc, NV50_MAX_F32, tmp[0], src[0], zero);
 	}
 
 	if (mask & (1 << 2)) {
 		set_pred_wr(pc, 1, 0, pc->p->exec_tail);
 
-		tmp[1] = temp_temp(pc);
-		emit_minmax(pc, 4, tmp[1], src[1], zero);
+		tmp[1] = temp_temp(pc, NULL);
+		emit_minmax(pc, NV50_MAX_F32, tmp[1], src[1], zero);
 
-		tmp[3] = temp_temp(pc);
-		emit_minmax(pc, 4, tmp[3], src[3], neg128);
-		emit_minmax(pc, 5, tmp[3], tmp[3], pos128);
+		tmp[3] = temp_temp(pc, NULL);
+		emit_minmax(pc, NV50_MAX_F32, tmp[3], src[3], neg128);
+		emit_minmax(pc, NV50_MIN_F32, tmp[3], tmp[3], pos128);
 
 		emit_pow(pc, dst[2], tmp[1], tmp[3]);
 		emit_mov(pc, dst[2], zero);
@@ -1347,12 +1560,6 @@ emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
 	FREE(one);
 }
 
-static INLINE void
-emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
-{
-	emit_cvt(pc, dst, src, -1, CVTOP_RN, CVT_F32_F32 | CVT_NEG);
-}
-
 static void
 emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
 {
@@ -1364,14 +1571,9 @@ emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
 	set_long(pc, e); /* sets cond code to ALWAYS */
 
 	if (src) {
-		unsigned cvn = CVT_F32_F32;
-
 		set_pred(pc, 0x1 /* cc = LT */, r_pred, e);
-
-		if (src->mod & NV50_MOD_NEG)
-			cvn |= CVT_NEG;
-		/* write predicate reg */
-		emit_cvt(pc, NULL, src, r_pred, CVTOP_RN, cvn);
+		/* write to predicate reg */
+		emit_cvt(pc, NULL, src, r_pred, CVT_F32_F32);
 	}
 
 	emit(pc, e);
@@ -1474,8 +1676,8 @@ load_cube_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4],
 	src[1]->mod |= NV50_MOD_ABS;
 	src[2]->mod |= NV50_MOD_ABS;
 
-	emit_minmax(pc, 4, t[2], src[0], src[1]);
-	emit_minmax(pc, 4, t[2], src[2], t[2]);
+	emit_minmax(pc, NV50_MAX_F32, t[2], src[0], src[1]);
+	emit_minmax(pc, NV50_MAX_F32, t[2], src[2], t[2]);
 
 	src[0]->mod = mod[0];
 	src[1]->mod = mod[1];
@@ -1778,6 +1980,21 @@ convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
 		q = 0x0403c000;
 		m = 0xffff7fff;
 		break;
+	case 0x2:
+	case 0x3:
+		/* ADD, SUB, SUBR b32 */
+		m = ~(0x8000 | (127 << 16));
+		q = ((e->inst[0] & (~m)) >> 2) | (1 << 26);
+		break;
+	case 0x5:
+		/* SAD */
+		m = ~(0x81 << 8);
+		q = (0x0c << 24) | ((e->inst[0] & (0x7f << 2)) << 12);
+		break;
+	case 0x6:
+		/* MAD u16 */
+		q = (e->inst[0] & (0x7f << 2)) << 12;
+		break;
 	case 0x8:
 		/* INTERP (move centroid, perspective and flat bits) */
 		m = ~0x03000100;
@@ -1814,8 +2031,8 @@ convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
 }
 
 /* Some operations support an optional negation flag. */
-static boolean
-negate_supported(const struct tgsi_full_instruction *insn, int i)
+static int
+get_supported_mods(const struct tgsi_full_instruction *insn, int i)
 {
 	switch (insn->Instruction.Opcode) {
 	case TGSI_OPCODE_ADD:
@@ -1835,9 +2052,36 @@ negate_supported(const struct tgsi_full_instruction *insn, int i)
 	case TGSI_OPCODE_SCS:
 	case TGSI_OPCODE_SIN:
 	case TGSI_OPCODE_SUB:
-		return TRUE;
+		return NV50_MOD_NEG;
+	case TGSI_OPCODE_MAX:
+	case TGSI_OPCODE_MIN:
+	case TGSI_OPCODE_INEG: /* tgsi src sign toggle/set would be stupid */
+		return NV50_MOD_ABS;
+	case TGSI_OPCODE_CEIL:
+	case TGSI_OPCODE_FLR:
+	case TGSI_OPCODE_TRUNC:
+		return NV50_MOD_NEG | NV50_MOD_ABS;
+	case TGSI_OPCODE_F2I:
+	case TGSI_OPCODE_F2U:
+	case TGSI_OPCODE_I2F:
+	case TGSI_OPCODE_U2F:
+		return NV50_MOD_NEG | NV50_MOD_ABS | NV50_MOD_I32;
+	case TGSI_OPCODE_UADD:
+		return NV50_MOD_NEG | NV50_MOD_I32;
+	case TGSI_OPCODE_SAD:
+	case TGSI_OPCODE_SHL:
+	case TGSI_OPCODE_IMAX:
+	case TGSI_OPCODE_IMIN:
+	case TGSI_OPCODE_ISHR:
+	case TGSI_OPCODE_NOT:
+	case TGSI_OPCODE_UMAD:
+	case TGSI_OPCODE_UMAX:
+	case TGSI_OPCODE_UMIN:
+	case TGSI_OPCODE_UMUL:
+	case TGSI_OPCODE_USHR:
+		return NV50_MOD_I32;
 	default:
-		return FALSE;
+		return 0;
 	}
 }
 
@@ -1944,11 +2188,11 @@ tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
 
 static struct nv50_reg *
 tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
-	 boolean neg)
+	 int mod)
 {
 	struct nv50_reg *r = NULL;
-	struct nv50_reg *temp;
-	unsigned sgn, c, swz;
+	struct nv50_reg *temp = NULL;
+	unsigned sgn, c, swz, cvn;
 
 	if (src->Register.File != TGSI_FILE_CONSTANT)
 		assert(!src->Register.Indirect);
@@ -1988,7 +2232,7 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
 			r = &pc->immd[src->Register.Index * 4 + c];
 			break;
 		case TGSI_FILE_SAMPLER:
-			break;
+			return NULL;
 		case TGSI_FILE_ADDRESS:
 			r = pc->addr[src->Register.Index * 4 + c];
 			assert(r);
@@ -2003,35 +2247,34 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
 		break;
 	}
 
+	cvn = (mod & NV50_MOD_I32) ? CVT_S32_S32 : CVT_F32_F32;
+
 	switch (sgn) {
-	case TGSI_UTIL_SIGN_KEEP:
-		break;
 	case TGSI_UTIL_SIGN_CLEAR:
-		temp = temp_temp(pc);
-		emit_abs(pc, temp, r);
-		r = temp;
-		break;
-	case TGSI_UTIL_SIGN_TOGGLE:
-		if (neg)
-			r->mod = NV50_MOD_NEG;
-		else {
-			temp = temp_temp(pc);
-			emit_neg(pc, temp, r);
-			r = temp;
-		}
+		r->mod = NV50_MOD_ABS;
 		break;
 	case TGSI_UTIL_SIGN_SET:
-		temp = temp_temp(pc);
-		emit_cvt(pc, temp, r, -1, CVTOP_ABS, CVT_F32_F32 | CVT_NEG);
-		r = temp;
+		r->mod = NV50_MOD_NEG_ABS;
+		break;
+	case TGSI_UTIL_SIGN_TOGGLE:
+		r->mod = NV50_MOD_NEG;
 		break;
 	default:
-		assert(0);
+		assert(!r->mod && sgn == TGSI_UTIL_SIGN_KEEP);
 		break;
 	}
 
-	if (r && r->acc >= 0 && r != temp)
-		return reg_instance(pc, r);
+	if ((r->mod & mod) != r->mod) {
+		temp = temp_temp(pc, NULL);
+		emit_cvt(pc, temp, r, -1, cvn);
+		r->mod = 0;
+		r = temp;
+	} else
+		r->mod |= mod & NV50_MOD_I32;
+
+	assert(r);
+	if (r->acc >= 0 && r != temp)
+		return reg_instance(pc, r); /* will clear r->mod */
 	return r;
 }
 
@@ -2195,22 +2438,22 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
 		const struct tgsi_full_src_register *fs = &inst->Src[i];
 		unsigned src_mask;
-		boolean neg_supp;
+		int mod_supp;
 
 		src_mask = nv50_tgsi_src_mask(inst, i);
-		neg_supp = negate_supported(inst, i);
+		mod_supp = get_supported_mods(inst, i);
 
 		if (fs->Register.File == TGSI_FILE_SAMPLER)
 			unit = fs->Register.Index;
 
 		for (c = 0; c < 4; c++)
 			if (src_mask & (1 << c))
-				src[i][c] = tgsi_src(pc, c, fs, neg_supp);
+				src[i][c] = tgsi_src(pc, c, fs, mod_supp);
 	}
 
 	brdc = temp = pc->r_brdc;
 	if (brdc && brdc->type != P_TEMP) {
-		temp = temp_temp(pc);
+		temp = temp_temp(pc, NULL);
 		if (sat)
 			brdc = temp;
 	} else
@@ -2219,7 +2462,7 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 			if (!(mask & (1 << c)) || dst[c]->type == P_TEMP)
 				continue;
 			/* rdst[c] = dst[c]; */ /* done above */
-			dst[c] = temp_temp(pc);
+			dst[c] = temp_temp(pc, NULL);
 		}
 	}
 
@@ -2230,7 +2473,8 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
-			emit_abs(pc, dst[c], src[0][c]);
+			emit_cvt(pc, dst[c], src[0][c], -1,
+				 CVT_ABS | CVT_F32_F32);
 		}
 		break;
 	case TGSI_OPCODE_ADD:
@@ -2252,8 +2496,8 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 		break;
 	case TGSI_OPCODE_ARL:
 		assert(src[0][0]);
-		temp = temp_temp(pc);
-		emit_cvt(pc, temp, src[0][0], -1, CVTOP_FLOOR, CVT_S32_F32);
+		temp = temp_temp(pc, NULL);
+		emit_cvt(pc, temp, src[0][0], -1, CVT_FLOOR | CVT_S32_F32);
 		emit_arl(pc, dst[0], temp, 4);
 		break;
 	case TGSI_OPCODE_BGNLOOP:
@@ -2282,7 +2526,7 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 			if (!(mask & (1 << c)))
 				continue;
 			emit_cvt(pc, dst[c], src[0][c], -1,
-				 CVTOP_CEIL, CVT_F32_F32 | CVT_RI);
+				 CVT_CEIL | CVT_F32_F32 | CVT_RI);
 		}
 		break;
 	case TGSI_OPCODE_CMP:
@@ -2290,7 +2534,7 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
-			emit_cvt(pc, NULL, src[0][c], 1, CVTOP_RN, CVT_F32_F32);
+			emit_cvt(pc, NULL, src[0][c], 1, CVT_F32_F32);
 			emit_mov(pc, dst[c], src[1][c]);
 			set_pred(pc, 0x1, 1, pc->p->exec_tail); /* @SF */
 			emit_mov(pc, dst[c], src[2][c]);
@@ -2309,7 +2553,7 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 			if (!(mask &= 7))
 				break;
 			if (temp == dst[3])
-				temp = brdc = temp_temp(pc);
+				temp = brdc = temp_temp(pc, NULL);
 		}
 		emit_precossin(pc, temp, src[0][0]);
 		emit_flop(pc, NV50_FLOP_COS, brdc, temp);
@@ -2397,8 +2641,8 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 		struct nv50_reg *t[2];
 
 		assert(!temp);
-		t[0] = temp_temp(pc);
-		t[1] = temp_temp(pc);
+		t[0] = temp_temp(pc, NULL);
+		t[1] = temp_temp(pc, NULL);
 
 		if (mask & 0x6)
 			emit_mov(pc, t[0], src[0][0]);
@@ -2419,6 +2663,22 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 			emit_mov_immdval(pc, dst[3], 1.0f);
 	}
 		break;
+	case TGSI_OPCODE_F2I:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_cvt(pc, dst[c], src[0][c], -1,
+				 CVT_TRUNC | CVT_S32_F32);
+		}
+		break;
+	case TGSI_OPCODE_F2U:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_cvt(pc, dst[c], src[0][c], -1,
+				 CVT_TRUNC | CVT_U32_F32);
+		}
+		break;
 	case TGSI_OPCODE_FLR:
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
@@ -2427,7 +2687,7 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 		}
 		break;
 	case TGSI_OPCODE_FRC:
-		temp = temp_temp(pc);
+		temp = temp_temp(pc, NULL);
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
@@ -2435,14 +2695,42 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 			emit_sub(pc, dst[c], src[0][c], temp);
 		}
 		break;
+	case TGSI_OPCODE_I2F:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_cvt(pc, dst[c], src[0][c], -1, CVT_F32_S32);
+		}
+		break;
 	case TGSI_OPCODE_IF:
 		assert(pc->if_lvl < NV50_MAX_COND_NESTING);
-		emit_cvt(pc, NULL, src[0][0], 0, CVTOP_ABS | CVTOP_RN,
-			 CVT_F32_F32);
+		emit_cvt(pc, NULL, src[0][0], 0, CVT_ABS | CVT_F32_F32);
 		pc->if_join[pc->if_lvl] = emit_joinat(pc);
 		pc->if_insn[pc->if_lvl++] = emit_branch(pc, 0, 2);;
 		terminate_mbb(pc);
 		break;
+	case TGSI_OPCODE_IMAX:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_minmax(pc, 0x08c, dst[c], src[0][c], src[1][c]);
+		}
+		break;
+	case TGSI_OPCODE_IMIN:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_minmax(pc, 0x0ac, dst[c], src[0][c], src[1][c]);
+		}
+		break;
+	case TGSI_OPCODE_INEG:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_cvt(pc, dst[c], src[0][c], -1,
+				 CVT_S32_S32 | CVT_NEG);
+		}
+		break;
 	case TGSI_OPCODE_KIL:
 		assert(src[0][0] && src[0][1] && src[0][2] && src[0][3]);
 		emit_kil(pc, src[0][0]);
@@ -2463,13 +2751,13 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 	{
 		struct nv50_reg *t[2];
 
-		t[0] = temp_temp(pc);
+		t[0] = temp_temp(pc, NULL);
 		if (mask & (1 << 1))
-			t[1] = temp_temp(pc);
+			t[1] = temp_temp(pc, NULL);
 		else
 			t[1] = t[0];
 
-		emit_abs(pc, t[0], src[0][0]);
+		emit_cvt(pc, t[0], src[0][0], -1, CVT_ABS | CVT_F32_F32);
 		emit_flop(pc, NV50_FLOP_LG2, t[1], t[0]);
 		if (mask & (1 << 2))
 			emit_mov(pc, dst[2], t[1]);
@@ -2488,7 +2776,7 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 	}
 		break;
 	case TGSI_OPCODE_LRP:
-		temp = temp_temp(pc);
+		temp = temp_temp(pc, NULL);
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
@@ -2507,14 +2795,14 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
-			emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]);
+			emit_minmax(pc, 0x880, dst[c], src[0][c], src[1][c]);
 		}
 		break;
 	case TGSI_OPCODE_MIN:
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
-			emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]);
+			emit_minmax(pc, 0x8a0, dst[c], src[0][c], src[1][c]);
 		}
 		break;
 	case TGSI_OPCODE_MOV:
@@ -2531,10 +2819,19 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 			emit_mul(pc, dst[c], src[0][c], src[1][c]);
 		}
 		break;
+	case TGSI_OPCODE_NOT:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_not(pc, dst[c], src[0][c]);
+		}
+		break;
 	case TGSI_OPCODE_POW:
 		emit_pow(pc, brdc, src[0][0], src[1][0]);
 		break;
 	case TGSI_OPCODE_RCP:
+		if (!sat && popcnt4(mask) == 1)
+			brdc = dst[ffs(mask) - 1];
 		emit_flop(pc, NV50_FLOP_RCP, brdc, src[0][0]);
 		break;
 	case TGSI_OPCODE_RET:
@@ -2543,11 +2840,20 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 		emit_ret(pc, -1, 0);
 		break;
 	case TGSI_OPCODE_RSQ:
+		if (!sat && popcnt4(mask) == 1)
+			brdc = dst[ffs(mask) - 1];
 		src[0][0]->mod |= NV50_MOD_ABS;
 		emit_flop(pc, NV50_FLOP_RSQ, brdc, src[0][0]);
 		break;
+	case TGSI_OPCODE_SAD:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_sad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
+		}
+		break;
 	case TGSI_OPCODE_SCS:
-		temp = temp_temp(pc);
+		temp = temp_temp(pc, NULL);
 		if (mask & 3)
 			emit_precossin(pc, temp, src[0][0]);
 		if (mask & (1 << 0))
@@ -2559,6 +2865,16 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 		if (mask & (1 << 3))
 			emit_mov_immdval(pc, dst[3], 1.0);
 		break;
+	case TGSI_OPCODE_SHL:
+	case TGSI_OPCODE_ISHR:
+	case TGSI_OPCODE_USHR:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_shift(pc, dst[c], src[0][c], src[1][c],
+				   inst->Instruction.Opcode);
+		}
+		break;
 	case TGSI_OPCODE_SIN:
 		if (mask & 8) {
 			emit_precossin(pc, temp, src[0][3]);
@@ -2566,7 +2882,7 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 			if (!(mask &= 7))
 				break;
 			if (temp == dst[3])
-				temp = brdc = temp_temp(pc);
+				temp = brdc = temp_temp(pc, NULL);
 		}
 		emit_precossin(pc, temp, src[0][0]);
 		emit_flop(pc, NV50_FLOP_SIN, brdc, temp);
@@ -2577,12 +2893,23 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 	case TGSI_OPCODE_SGT:
 	case TGSI_OPCODE_SLE:
 	case TGSI_OPCODE_SNE:
-		i = map_tgsi_setop_cc(inst->Instruction.Opcode);
+	case TGSI_OPCODE_ISLT:
+	case TGSI_OPCODE_ISGE:
+	case TGSI_OPCODE_USEQ:
+	case TGSI_OPCODE_USGE:
+	case TGSI_OPCODE_USLT:
+	case TGSI_OPCODE_USNE:
+	{
+		uint8_t cc, ty;
+
+		map_tgsi_setop_hw(inst->Instruction.Opcode, &cc, &ty);
+
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
-			emit_set(pc, i, dst[c], -1, src[0][c], src[1][c]);
+			emit_set(pc, cc, dst[c], -1, src[0][c], src[1][c], ty);
 		}
+	}
 		break;
 	case TGSI_OPCODE_SUB:
 		for (c = 0; c < 4; c++) {
@@ -2612,11 +2939,72 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 			if (!(mask & (1 << c)))
 				continue;
 			emit_cvt(pc, dst[c], src[0][c], -1,
-				 CVTOP_TRUNC, CVT_F32_F32 | CVT_RI);
+				 CVT_TRUNC | CVT_F32_F32 | CVT_RI);
+		}
+		break;
+	case TGSI_OPCODE_U2F:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_cvt(pc, dst[c], src[0][c], -1, CVT_F32_U32);
+		}
+		break;
+	case TGSI_OPCODE_UADD:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_add_b32(pc, dst[c], src[0][c], src[1][c]);
+		}
+		break;
+	case TGSI_OPCODE_UMAX:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_minmax(pc, 0x084, dst[c], src[0][c], src[1][c]);
 		}
 		break;
+	case TGSI_OPCODE_UMIN:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_minmax(pc, 0x0a4, dst[c], src[0][c], src[1][c]);
+		}
+		break;
+	case TGSI_OPCODE_UMAD:
+	{
+		assert(!temp);
+		temp = temp_temp(pc, NULL);
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_mul_u16(pc, temp, src[0][c], 0, src[1][c], 1);
+			emit_mad_u16(pc, temp, src[0][c], 1, src[1][c], 0,
+				     temp);
+			emit_shl_imm(pc, temp, temp, 16);
+			emit_mad_u16(pc, temp, src[0][c], 0, src[1][c], 0,
+				     temp);
+			emit_add_b32(pc, dst[c], temp, src[2][c]);
+		}
+	}
+		break;
+	case TGSI_OPCODE_UMUL:
+	{
+		assert(!temp);
+		temp = temp_temp(pc, NULL);
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_mul_u16(pc, temp, src[0][c], 0, src[1][c], 1);
+			emit_mad_u16(pc, temp, src[0][c], 1, src[1][c], 0,
+				     temp);
+			emit_shl_imm(pc, temp, temp, 16);
+			emit_mad_u16(pc, dst[c], src[0][c], 0, src[1][c], 0,
+				     temp);
+		}
+	}
+		break;
 	case TGSI_OPCODE_XPD:
-		temp = temp_temp(pc);
+		temp = temp_temp(pc, NULL);
 		if (mask & (1 << 0)) {
 			emit_mul(pc, temp, src[0][2], src[1][1]);
 			emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
@@ -2670,7 +3058,7 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 		}
 	}
 
-	kill_temp_temp(pc);
+	kill_temp_temp(pc, NULL);
 	pc->reg_instance_nr = 0;
 
 	return TRUE;
@@ -2679,7 +3067,7 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 static void
 prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn)
 {
-	struct nv50_reg *reg = NULL;
+	struct nv50_reg *r, *reg = NULL;
 	const struct tgsi_full_src_register *src;
 	const struct tgsi_dst_register *dst;
 	unsigned i, c, k, mask;
@@ -2725,7 +3113,15 @@ prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn)
 				continue;
 			k = tgsi_util_get_full_src_register_swizzle(src, c);
 
-			reg[src->Register.Index * 4 + k].acc = pc->insn_nr;
+			r = &reg[src->Register.Index * 4 + k];
+
+			/* If used before written, pre-allocate the reg,
+			 * lest we overwrite results from a subroutine.
+			 */
+			if (!r->acc && r->type == P_TEMP)
+				alloc_reg(pc, r);
+
+			r->acc = pc->insn_nr;
 		}
 	}
 }
@@ -2814,7 +3210,7 @@ nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn,
 
 	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
 		unsigned chn, mask = nv50_tgsi_src_mask(insn, i);
-		boolean neg_supp = negate_supported(insn, i);
+		int ms = get_supported_mods(insn, i);
 
 		fs = &insn->Src[i];
 		if (fs->Register.File != fd->Register.File ||
@@ -2832,10 +3228,12 @@ nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn,
 			if (!(fd->Register.WriteMask & (1 << c)))
 				continue;
 
-			/* no danger if src is copied to TEMP first */
-			if ((s != TGSI_UTIL_SIGN_KEEP) &&
-			    (s != TGSI_UTIL_SIGN_TOGGLE || !neg_supp))
-				continue;
+			if (s == TGSI_UTIL_SIGN_TOGGLE && !(ms & NV50_MOD_NEG))
+					continue;
+			if (s == TGSI_UTIL_SIGN_CLEAR && !(ms & NV50_MOD_ABS))
+					continue;
+			if ((s == TGSI_UTIL_SIGN_SET) && ((ms & 3) != 3))
+					continue;
 
 			rdep[c] |= nv50_tgsi_dst_revdep(
 				insn->Instruction.Opcode, i, chn);
@@ -2859,7 +3257,7 @@ nv50_tgsi_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 	if (is_scalar_op(insn.Instruction.Opcode)) {
 		pc->r_brdc = tgsi_broadcast_dst(pc, fd, deqs);
 		if (!pc->r_brdc)
-			pc->r_brdc = temp_temp(pc);
+			pc->r_brdc = temp_temp(pc, NULL);
 		return nv50_program_tx_insn(pc, &insn);
 	}
 	pc->r_brdc = NULL;
@@ -3224,6 +3622,8 @@ free_nv50_pc(struct nv50_pc *pc)
 		FREE(pc->attr);
 	if (pc->temp)
 		FREE(pc->temp);
+	if (pc->insn_pos)
+		FREE(pc->insn_pos);
 
 	FREE(pc);
 }
@@ -3579,7 +3979,7 @@ nv50_vertprog_validate(struct nv50_context *nv50)
 	nv50_program_validate_data(nv50, p);
 	nv50_program_validate_code(nv50, p);
 
-	so = so_new(13, 2);
+	so = so_new(5, 8, 2);
 	so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
 	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
 		      NOUVEAU_BO_HIGH, 0, 0);
@@ -3615,7 +4015,7 @@ nv50_fragprog_validate(struct nv50_context *nv50)
 	nv50_program_validate_data(nv50, p);
 	nv50_program_validate_code(nv50, p);
 
-	so = so_new(64, 2);
+	so = so_new(6, 7, 2);
 	so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
 	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
 		      NOUVEAU_BO_HIGH, 0, 0);
@@ -3635,12 +4035,13 @@ nv50_fragprog_validate(struct nv50_context *nv50)
 	so_ref(NULL, &so);
 }
 
-static void
+static uint32_t
 nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base)
 {
 	struct nv50_program *fp = nv50->fragprog;
 	struct nv50_program *vp = nv50->vertprog;
 	unsigned i, c, m = base;
+	uint32_t origin = 0x00000010;
 
 	/* XXX: this might not work correctly in all cases yet - we'll
 	 * just assume that an FP generic input that is not written in
@@ -3674,7 +4075,9 @@ nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base)
 			if (mode == PIPE_SPRITE_COORD_NONE) {
 				m += n;
 				continue;
-			}
+			} else
+			if (mode == PIPE_SPRITE_COORD_LOWER_LEFT)
+				origin = 0;
 		}
 
 		/* this is either PointCoord or replaced by sprite coords */
@@ -3685,6 +4088,7 @@ nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base)
 			++m;
 		}
 	}
+	return origin;
 }
 
 static int
@@ -3783,7 +4187,7 @@ nv50_linkage_validate(struct nv50_context *nv50)
 	}
 
 	/* now fill the stateobj */
-	so = so_new(64, 0);
+	so = so_new(7, 57, 0);
 
 	n = (m + 3) / 4;
 	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1);
@@ -3801,7 +4205,9 @@ nv50_linkage_validate(struct nv50_context *nv50)
 	so_datap (so, lin, 4);
 
 	if (nv50->rasterizer->pipe.point_sprite) {
-		nv50_pntc_replace(nv50, pcrd, (reg[4] >> 8) & 0xff);
+		so_method(so, tesla, NV50TCL_POINT_SPRITE_CTRL, 1);
+		so_data  (so,
+			  nv50_pntc_replace(nv50, pcrd, (reg[4] >> 8) & 0xff));
 
 		so_method(so, tesla, NV50TCL_POINT_COORD_REPLACE_MAP(0), 8);
 		so_datap (so, pcrd, 8);
diff --git a/src/gallium/drivers/nv50/nv50_query.c b/src/gallium/drivers/nv50/nv50_query.c
index 5d9e18218a..5a4ab3508b 100644
--- a/src/gallium/drivers/nv50/nv50_query.c
+++ b/src/gallium/drivers/nv50/nv50_query.c
@@ -111,7 +111,7 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq,
 
 	if (!q->ready) {
 		ret = nouveau_bo_map(q->bo, NOUVEAU_BO_RD |
-				     wait ? 0 : NOUVEAU_BO_NOWAIT);
+				     (wait ? 0 : NOUVEAU_BO_NOWAIT));
 		if (ret)
 			return false;
 		q->result = ((uint32_t *)q->bo->map)[1];
diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c
index 7e039ea82e..28e2b35dea 100644
--- a/src/gallium/drivers/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nv50/nv50_screen.c
@@ -189,6 +189,28 @@ nv50_screen_destroy(struct pipe_screen *pscreen)
 	FREE(screen);
 }
 
+static int
+nv50_pre_pipebuffer_map(struct pipe_screen *pscreen, struct pipe_buffer *pb,
+	unsigned usage) /* 'usage' is not examined by this callback */
+{
+	struct nv50_screen *screen = nv50_screen(pscreen);
+	struct nv50_context *ctx = screen->cur_ctx; /* NULL-checked below */
+
+	if (!(pb->usage & PIPE_BUFFER_USAGE_VERTEX))
+		return 0; /* only vertex buffers need handling here */
+
+	/* Our vtxbuf got mapped, so it can no longer be considered part of the
+	 * current state; remove it to avoid emitting reloc markers.
+	 */
+	if (ctx && ctx->state.vtxbuf && so_bo_is_reloc(ctx->state.vtxbuf,
+			nouveau_bo(pb))) {
+		so_ref(NULL, &ctx->state.vtxbuf);
+		ctx->dirty |= NV50_NEW_ARRAYS;
+	}
+
+	return 0; /* NOTE(review): always 0; presumably "success" — confirm */
+}
+
 struct pipe_screen *
 nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 {
@@ -216,6 +238,7 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 	pscreen->get_param = nv50_screen_get_param;
 	pscreen->get_paramf = nv50_screen_get_paramf;
 	pscreen->is_format_supported = nv50_screen_is_format_supported;
+	screen->base.pre_pipebuffer_map_callback = nv50_pre_pipebuffer_map;
 
 	nv50_screen_init_miptree_functions(pscreen);
 	nv50_transfer_init_screen_functions(pscreen);
@@ -228,7 +251,6 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 		nv50_screen_destroy(pscreen);
 		return NULL;
 	}
-	BIND_RING(chan, screen->m2mf, 1);
 
 	/* 2D object */
 	ret = nouveau_grobj_alloc(chan, 0xbeef502d, NV50_2D, &screen->eng2d);
@@ -237,7 +259,6 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 		nv50_screen_destroy(pscreen);
 		return NULL;
 	}
-	BIND_RING(chan, screen->eng2d, 2);
 
 	/* 3D object */
 	switch (chipset & 0xf0) {
@@ -273,7 +294,6 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 		nv50_screen_destroy(pscreen);
 		return NULL;
 	}
-	BIND_RING(chan, screen->tesla, 3);
 
 	/* Sync notifier */
 	ret = nouveau_notifier_alloc(chan, 0xbeef0301, 1, &screen->sync);
@@ -284,7 +304,7 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 	}
 
 	/* Static M2MF init */
-	so = so_new(32, 0);
+	so = so_new(1, 3, 0);
 	so_method(so, screen->m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_DMA_NOTIFY, 3);
 	so_data  (so, screen->sync->handle);
 	so_data  (so, chan->vram->handle);
@@ -293,7 +313,7 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 	so_ref (NULL, &so);
 
 	/* Static 2D init */
-	so = so_new(64, 0);
+	so = so_new(4, 7, 0);
 	so_method(so, screen->eng2d, NV50_2D_DMA_NOTIFY, 4);
 	so_data  (so, screen->sync->handle);
 	so_data  (so, chan->vram->handle);
@@ -309,7 +329,7 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 	so_ref(NULL, &so);
 
 	/* Static tesla init */
-	so = so_new(256, 20);
+	so = so_new(40, 84, 20);
 
 	so_method(so, screen->tesla, NV50TCL_COND_MODE, 1);
 	so_data  (so, NV50TCL_COND_MODE_ALWAYS);
diff --git a/src/gallium/drivers/nv50/nv50_screen.h b/src/gallium/drivers/nv50/nv50_screen.h
index 61e24a5b57..a038a4e3c2 100644
--- a/src/gallium/drivers/nv50/nv50_screen.h
+++ b/src/gallium/drivers/nv50/nv50_screen.h
@@ -2,6 +2,7 @@
 #define __NV50_SCREEN_H__
 
 #include "nouveau/nouveau_screen.h"
+#include "nv50_context.h"
 
 struct nv50_screen {
 	struct nouveau_screen base;
@@ -9,6 +10,7 @@ struct nv50_screen {
 	struct nouveau_winsys *nvws;
 
 	unsigned cur_pctx;
+	struct nv50_context *cur_ctx;
 
 	struct nouveau_grobj *tesla;
 	struct nouveau_grobj *eng2d;
diff --git a/src/gallium/drivers/nv50/nv50_state.c b/src/gallium/drivers/nv50/nv50_state.c
index 30b2b0f91b..1f67df814b 100644
--- a/src/gallium/drivers/nv50/nv50_state.c
+++ b/src/gallium/drivers/nv50/nv50_state.c
@@ -35,7 +35,7 @@ static void *
 nv50_blend_state_create(struct pipe_context *pipe,
 			const struct pipe_blend_state *cso)
 {
-	struct nouveau_stateobj *so = so_new(64, 0);
+	struct nouveau_stateobj *so = so_new(5, 24, 0);
 	struct nouveau_grobj *tesla = nv50_context(pipe)->screen->tesla;
 	struct nv50_blend_stateobj *bso = CALLOC_STRUCT(nv50_blend_stateobj);
 	unsigned cmask = 0, i;
@@ -146,7 +146,6 @@ nv50_sampler_state_create(struct pipe_context *pipe,
 		  (wrap_mode(cso->wrap_r) << 6));
 
 	switch (cso->mag_img_filter) {
-	case PIPE_TEX_FILTER_ANISO:
 	case PIPE_TEX_FILTER_LINEAR:
 		tsc[1] |= NV50TSC_1_1_MAGF_LINEAR;
 		break;
@@ -157,7 +156,6 @@ nv50_sampler_state_create(struct pipe_context *pipe,
 	}
 
 	switch (cso->min_img_filter) {
-	case PIPE_TEX_FILTER_ANISO:
 	case PIPE_TEX_FILTER_LINEAR:
 		tsc[1] |= NV50TSC_1_1_MINF_LINEAR;
 		break;
@@ -280,7 +278,7 @@ static void *
 nv50_rasterizer_state_create(struct pipe_context *pipe,
 			     const struct pipe_rasterizer_state *cso)
 {
-	struct nouveau_stateobj *so = so_new(64, 0);
+	struct nouveau_stateobj *so = so_new(15, 21, 0);
 	struct nouveau_grobj *tesla = nv50_context(pipe)->screen->tesla;
 	struct nv50_rasterizer_stateobj *rso =
 		CALLOC_STRUCT(nv50_rasterizer_stateobj);
@@ -425,7 +423,7 @@ nv50_depth_stencil_alpha_state_create(struct pipe_context *pipe,
 {
 	struct nouveau_grobj *tesla = nv50_context(pipe)->screen->tesla;
 	struct nv50_zsa_stateobj *zsa = CALLOC_STRUCT(nv50_zsa_stateobj);
-	struct nouveau_stateobj *so = so_new(64, 0);
+	struct nouveau_stateobj *so = so_new(8, 22, 0);
 
 	so_method(so, tesla, NV50TCL_DEPTH_WRITE_ENABLE, 1);
 	so_data  (so, cso->depth.writemask ? 1 : 0);
diff --git a/src/gallium/drivers/nv50/nv50_state_validate.c b/src/gallium/drivers/nv50/nv50_state_validate.c
index c8bdf9dc27..f83232f43c 100644
--- a/src/gallium/drivers/nv50/nv50_state_validate.c
+++ b/src/gallium/drivers/nv50/nv50_state_validate.c
@@ -33,7 +33,7 @@ static void
 nv50_state_validate_fb(struct nv50_context *nv50)
 {
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
-	struct nouveau_stateobj *so = so_new(128, 18);
+	struct nouveau_stateobj *so = so_new(32, 79, 18);
 	struct pipe_framebuffer_state *fb = &nv50->framebuffer;
 	unsigned i, w, h, gw = 0;
 
@@ -185,6 +185,9 @@ nv50_state_emit(struct nv50_context *nv50)
 	struct nv50_screen *screen = nv50->screen;
 	struct nouveau_channel *chan = screen->base.channel;
 
+	/* I don't want to copy headers from the winsys. */
+	screen->cur_ctx = nv50;
+
 	if (nv50->pctx_id != screen->cur_pctx) {
 		if (nv50->state.fb)
 			nv50->state.dirty |= NV50_NEW_FRAMEBUFFER;
@@ -296,7 +299,7 @@ nv50_state_validate(struct nv50_context *nv50)
 		so_ref(nv50->rasterizer->so, &nv50->state.rast);
 
 	if (nv50->dirty & NV50_NEW_BLEND_COLOUR) {
-		so = so_new(5, 0);
+		so = so_new(1, 4, 0);
 		so_method(so, tesla, NV50TCL_BLEND_COLOR(0), 4);
 		so_data  (so, fui(nv50->blend_colour.color[0]));
 		so_data  (so, fui(nv50->blend_colour.color[1]));
@@ -307,7 +310,7 @@ nv50_state_validate(struct nv50_context *nv50)
 	}
 
 	if (nv50->dirty & NV50_NEW_STIPPLE) {
-		so = so_new(33, 0);
+		so = so_new(1, 32, 0);
 		so_method(so, tesla, NV50TCL_POLYGON_STIPPLE_PATTERN(0), 32);
 		for (i = 0; i < 32; i++)
 			so_data(so, util_bswap32(nv50->stipple.stipple[i]));
@@ -324,7 +327,7 @@ nv50_state_validate(struct nv50_context *nv50)
 			goto scissor_uptodate;
 		nv50->state.scissor_enabled = rast->scissor;
 
-		so = so_new(3, 0);
+		so = so_new(1, 2, 0);
 		so_method(so, tesla, NV50TCL_SCISSOR_HORIZ(0), 2);
 		if (nv50->state.scissor_enabled) {
 			so_data(so, (s->maxx << 16) | s->minx);
@@ -353,7 +356,7 @@ scissor_uptodate:
 			goto viewport_uptodate;
 		nv50->state.viewport_bypass = bypass;
 
-		so = so_new(14, 0);
+		so = so_new(5, 9, 0);
 		if (!bypass) {
 			so_method(so, tesla, NV50TCL_VIEWPORT_TRANSLATE_X(0), 3);
 			so_data  (so, fui(nv50->viewport.translate[0]));
@@ -397,7 +400,8 @@ viewport_uptodate:
 		for (i = 0; i < PIPE_SHADER_TYPES; ++i)
 			nr += nv50->sampler_nr[i];
 
-		so = so_new(nr * 8 + 24 * PIPE_SHADER_TYPES + 2, 4);
+		so = so_new(1+ 5 * PIPE_SHADER_TYPES, 1+ 19 * PIPE_SHADER_TYPES
+					+ nr * 8, PIPE_SHADER_TYPES * 2);
 
 		nv50_validate_samplers(nv50, so, PIPE_SHADER_VERTEX);
 		nv50_validate_samplers(nv50, so, PIPE_SHADER_FRAGMENT);
diff --git a/src/gallium/drivers/nv50/nv50_tex.c b/src/gallium/drivers/nv50/nv50_tex.c
index c4ca096d6a..bef548b728 100644
--- a/src/gallium/drivers/nv50/nv50_tex.c
+++ b/src/gallium/drivers/nv50/nv50_tex.c
@@ -199,16 +199,18 @@ nv50_tex_validate(struct nv50_context *nv50)
 {
 	struct nouveau_stateobj *so;
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
-	unsigned p, push, nrlc;
+	unsigned p, start, push, nrlc;
 
-	for (nrlc = 0, push = 0, p = 0; p < PIPE_SHADER_TYPES; ++p) {
+	for (nrlc = 0, start = 0, push = 0, p = 0; p < PIPE_SHADER_TYPES; ++p) {
+		start += MAX2(nv50->miptree_nr[p], nv50->state.miptree_nr[p]);
 		push += MAX2(nv50->miptree_nr[p], nv50->state.miptree_nr[p]);
 		nrlc += nv50->miptree_nr[p];
 	}
-	push = push * 11 + 23 * PIPE_SHADER_TYPES + 4;
+	start = start * 2 + 4 * PIPE_SHADER_TYPES + 2;
+	push = push * 9 + 19 * PIPE_SHADER_TYPES + 2;
 	nrlc = nrlc * 2 + 2 * PIPE_SHADER_TYPES;
 
-	so = so_new(push, nrlc);
+	so = so_new(start, push, nrlc);
 
 	if (nv50_validate_textures(nv50, so, PIPE_SHADER_VERTEX) == FALSE ||
 	    nv50_validate_textures(nv50, so, PIPE_SHADER_FRAGMENT) == FALSE) {
diff --git a/src/gallium/drivers/nv50/nv50_vbo.c b/src/gallium/drivers/nv50/nv50_vbo.c
index 602adfc50d..f2e510fba6 100644
--- a/src/gallium/drivers/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nv50/nv50_vbo.c
@@ -152,7 +152,7 @@ nv50_vbo_vtxelt_to_hw(struct pipe_vertex_element *ve)
 	return (hw_type | hw_size);
 }
 
-boolean
+void
 nv50_draw_arrays(struct pipe_context *pipe, unsigned mode, unsigned start,
 		 unsigned count)
 {
@@ -182,7 +182,9 @@ nv50_draw_arrays(struct pipe_context *pipe, unsigned mode, unsigned start,
 	BEGIN_RING(chan, tesla, NV50TCL_VERTEX_END, 1);
 	OUT_RING  (chan, 0);
 
-	return ret;
+        /* XXX: not sure what to do if ret != TRUE: flush and retry?
+         */
+        assert(ret);
 }
 
 static INLINE boolean
@@ -275,7 +277,7 @@ nv50_draw_elements_inline_u32(struct nv50_context *nv50, uint32_t *map,
 	return TRUE;
 }
 
-boolean
+void
 nv50_draw_elements(struct pipe_context *pipe,
 		   struct pipe_buffer *indexBuffer, unsigned indexSize,
 		   unsigned mode, unsigned start, unsigned count)
@@ -317,8 +319,10 @@ nv50_draw_elements(struct pipe_context *pipe,
 	OUT_RING  (chan, 0);
 
 	pipe_buffer_unmap(pscreen, indexBuffer);
-
-	return ret;
+        
+        /* XXX: what to do if ret != TRUE?  Flush and retry?
+         */
+	assert(ret);
 }
 
 static INLINE boolean
@@ -350,7 +354,7 @@ nv50_vbo_static_attrib(struct nv50_context *nv50, unsigned attrib,
 
 	so = *pso;
 	if (!so)
-		*pso = so = so_new(nv50->vtxelt_nr * 5, 0);
+		*pso = so = so_new(nv50->vtxelt_nr, nv50->vtxelt_nr * 4, 0);
 
 	switch (ve->nr_components) {
 	case 4:
@@ -411,8 +415,8 @@ nv50_vbo_validate(struct nv50_context *nv50)
 	n_ve = MAX2(nv50->vtxelt_nr, nv50->state.vtxelt_nr);
 
 	vtxattr = NULL;
-	vtxbuf = so_new(n_ve * 7, nv50->vtxelt_nr * 4);
-	vtxfmt = so_new(n_ve + 1, 0);
+	vtxbuf = so_new(n_ve * 2, n_ve * 5, nv50->vtxelt_nr * 4);
+	vtxfmt = so_new(1, n_ve, 0);
 	so_method(vtxfmt, tesla, NV50TCL_VERTEX_ARRAY_ATTRIB(0), n_ve);
 
 	for (i = 0; i < nv50->vtxelt_nr; i++) {
diff --git a/src/gallium/drivers/r300/r300_blit.c b/src/gallium/drivers/r300/r300_blit.c
index ffe066d536..c14414fff6 100644
--- a/src/gallium/drivers/r300/r300_blit.c
+++ b/src/gallium/drivers/r300/r300_blit.c
@@ -27,9 +27,9 @@
 
 static void r300_blitter_save_states(struct r300_context* r300)
 {
-    util_blitter_save_blend(r300->blitter, r300->blend_state);
-    util_blitter_save_depth_stencil_alpha(r300->blitter, r300->dsa_state);
-    util_blitter_save_rasterizer(r300->blitter, r300->rs_state);
+    util_blitter_save_blend(r300->blitter, r300->blend_state.state);
+    util_blitter_save_depth_stencil_alpha(r300->blitter, r300->dsa_state.state);
+    util_blitter_save_rasterizer(r300->blitter, r300->rs_state.state);
     util_blitter_save_fragment_shader(r300->blitter, r300->fs);
     util_blitter_save_vertex_shader(r300->blitter, r300->vs);
 }
diff --git a/src/gallium/drivers/r300/r300_chipset.c b/src/gallium/drivers/r300/r300_chipset.c
index 51fdb82ff3..92de297ef1 100644
--- a/src/gallium/drivers/r300/r300_chipset.c
+++ b/src/gallium/drivers/r300/r300_chipset.c
@@ -33,6 +33,7 @@ void r300_parse_chipset(struct r300_capabilities* caps)
     /* Reasonable defaults */
     caps->num_vert_fpus = 4;
     caps->has_tcl = debug_get_bool_option("RADEON_NO_TCL", FALSE) ? FALSE : TRUE;
+    caps->is_r400 = FALSE;
     caps->is_r500 = FALSE;
     caps->high_second_pipe = FALSE;
 
@@ -123,6 +124,7 @@ void r300_parse_chipset(struct r300_capabilities* caps)
         case 0x4A54:
             caps->family = CHIP_FAMILY_R420;
             caps->num_vert_fpus = 6;
+            caps->is_r400 = TRUE;
             break;
 
         case 0x5548:
@@ -136,6 +138,7 @@ void r300_parse_chipset(struct r300_capabilities* caps)
         case 0x5D57:
             caps->family = CHIP_FAMILY_R423;
             caps->num_vert_fpus = 6;
+            caps->is_r400 = TRUE;
             break;
 
         case 0x554C:
@@ -147,6 +150,7 @@ void r300_parse_chipset(struct r300_capabilities* caps)
         case 0x5D4A:
             caps->family = CHIP_FAMILY_R430;
             caps->num_vert_fpus = 6;
+            caps->is_r400 = TRUE;
             break;
 
         case 0x5D4C:
@@ -157,6 +161,7 @@ void r300_parse_chipset(struct r300_capabilities* caps)
         case 0x5D52:
             caps->family = CHIP_FAMILY_R480;
             caps->num_vert_fpus = 6;
+            caps->is_r400 = TRUE;
             break;
 
         case 0x4B48:
@@ -166,6 +171,7 @@ void r300_parse_chipset(struct r300_capabilities* caps)
         case 0x4B4C:
             caps->family = CHIP_FAMILY_R481;
             caps->num_vert_fpus = 6;
+            caps->is_r400 = TRUE;
             break;
 
         case 0x5E4C:
@@ -182,6 +188,7 @@ void r300_parse_chipset(struct r300_capabilities* caps)
         case 0x5E4D:
             caps->family = CHIP_FAMILY_RV410;
             caps->num_vert_fpus = 6;
+            caps->is_r400 = TRUE;
             break;
 
         case 0x5954:
@@ -212,6 +219,7 @@ void r300_parse_chipset(struct r300_capabilities* caps)
         case 0x791F:
             caps->family = CHIP_FAMILY_RS690;
             caps->has_tcl = FALSE;
+            caps->is_r400 = TRUE;
             break;
 
         case 0x793F:
@@ -219,6 +227,7 @@ void r300_parse_chipset(struct r300_capabilities* caps)
         case 0x7942:
             caps->family = CHIP_FAMILY_RS600;
             caps->has_tcl = FALSE;
+            caps->is_r400 = TRUE;
             break;
 
         case 0x796C:
@@ -227,6 +236,7 @@ void r300_parse_chipset(struct r300_capabilities* caps)
         case 0x796F:
             caps->family = CHIP_FAMILY_RS740;
             caps->has_tcl = FALSE;
+            caps->is_r400 = TRUE;
             break;
 
         case 0x7100:
diff --git a/src/gallium/drivers/r300/r300_chipset.h b/src/gallium/drivers/r300/r300_chipset.h
index 0633a8b8a7..2808486492 100644
--- a/src/gallium/drivers/r300/r300_chipset.h
+++ b/src/gallium/drivers/r300/r300_chipset.h
@@ -40,11 +40,18 @@ struct r300_capabilities {
     unsigned num_z_pipes;
     /* Whether or not TCL is physically present */
     boolean has_tcl;
+    /* Whether or not this is R400. The differences compared to their R3xx
+     * cousins are:
+     * - Extended fragment shader registers
+     * - Blend LTE/GTE thresholds */
+    boolean is_r400;
     /* Whether or not this is an RV515 or newer; R500s have many differences
      * that require extra consideration, compared to their R3xx cousins:
      * - Extra bit of width and height on texture sizes
      * - Blend color is split across two registers
-     * - Universal Shader (US) block used for fragment shaders */
+     * - Blend LTE/GTE thresholds
+     * - Universal Shader (US) block used for fragment shaders
+     * - FP16 blending and multisampling */
     boolean is_r500;
     /* Whether or not the second pixel pipe is accessed with the high bit */
     boolean high_second_pipe;
diff --git a/src/gallium/drivers/r300/r300_context.c b/src/gallium/drivers/r300/r300_context.c
index d5c2d63d39..5e4f6552c3 100644
--- a/src/gallium/drivers/r300/r300_context.c
+++ b/src/gallium/drivers/r300/r300_context.c
@@ -30,6 +30,7 @@
 
 #include "r300_blit.h"
 #include "r300_context.h"
+#include "r300_emit.h"
 #include "r300_flush.h"
 #include "r300_query.h"
 #include "r300_render.h"
@@ -69,11 +70,13 @@ static void r300_destroy_context(struct pipe_context* context)
         FREE(query);
     }
 
-    FREE(r300->blend_color_state);
+    FREE(r300->blend_color_state.state);
+    FREE(r300->clip_state.state);
     FREE(r300->rs_block);
-    FREE(r300->scissor_state);
+    FREE(r300->scissor_state.state);
     FREE(r300->vertex_info);
-    FREE(r300->viewport_state);
+    FREE(r300->viewport_state.state);
+    FREE(r300->ztop_state.state);
     FREE(r300);
 }
 
@@ -107,6 +110,35 @@ static void r300_flush_cb(void *data)
     cs_context_copy->context.flush(&cs_context_copy->context, 0, NULL);
 }
 
+/* Set up one state atom and append it to the tail of r300->atom_list. */
+#define R300_INIT_ATOM(atomname, atomsize) do { \
+    r300->atomname##_state.name = #atomname; \
+    r300->atomname##_state.state = NULL; \
+    r300->atomname##_state.size = (atomsize); \
+    r300->atomname##_state.emit = r300_emit_##atomname##_state; \
+    r300->atomname##_state.dirty = FALSE; \
+    r300->atomname##_state.always_dirty = FALSE; \
+    insert_at_tail(&r300->atom_list, &r300->atomname##_state); \
+} while (0)
+
+static void r300_setup_atoms(struct r300_context* r300)
+{
+    /* Create the actual atom list. Atoms are examined and emitted in the
+     * order they appear here, which can affect performance and conformance
+     * if not handled with care. Each size is only an upper bound in dwords
+     * (some atoms never change size, others change every emit), there to
+     * keep the emission machinery from underallocating space. */
+    make_empty_list(&r300->atom_list);
+    R300_INIT_ATOM(ztop, 2);
+    R300_INIT_ATOM(blend, 8);
+    R300_INIT_ATOM(blend_color, 3);
+    R300_INIT_ATOM(clip, 29);
+    R300_INIT_ATOM(dsa, 8);
+    R300_INIT_ATOM(rs, 22);
+    R300_INIT_ATOM(scissor, 3);
+    R300_INIT_ATOM(viewport, 9);
+}
+
 struct pipe_context* r300_create_context(struct pipe_screen* screen,
                                          struct radeon_winsys* radeon_winsys)
 {
@@ -155,11 +187,15 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
     r300->shader_hash_table = util_hash_table_create(r300_shader_key_hash,
         r300_shader_key_compare);
 
-    r300->blend_color_state = CALLOC_STRUCT(r300_blend_color_state);
+    r300_setup_atoms(r300);
+
+    r300->blend_color_state.state = CALLOC_STRUCT(r300_blend_color_state);
+    r300->clip_state.state = CALLOC_STRUCT(pipe_clip_state);
     r300->rs_block = CALLOC_STRUCT(r300_rs_block);
-    r300->scissor_state = CALLOC_STRUCT(r300_scissor_state);
+    r300->scissor_state.state = CALLOC_STRUCT(pipe_scissor_state);
     r300->vertex_info = CALLOC_STRUCT(r300_vertex_info);
-    r300->viewport_state = CALLOC_STRUCT(r300_viewport_state);
+    r300->viewport_state.state = CALLOC_STRUCT(r300_viewport_state);
+    r300->ztop_state.state = CALLOC_STRUCT(r300_ztop_state);
 
     /* Open up the OQ BO. */
     r300->oqbo = screen->buffer_create(screen, 4096,
diff --git a/src/gallium/drivers/r300/r300_context.h b/src/gallium/drivers/r300/r300_context.h
index 232530b7dc..682b9179c8 100644
--- a/src/gallium/drivers/r300/r300_context.h
+++ b/src/gallium/drivers/r300/r300_context.h
@@ -30,9 +30,28 @@
 #include "pipe/p_context.h"
 #include "pipe/p_inlines.h"
 
+struct r300_context;
+
 struct r300_fragment_shader;
 struct r300_vertex_shader;
 
+struct r300_atom {
+    /* List pointers; atoms are linked into r300_context::atom_list. */
+    struct r300_atom *prev, *next;
+    /* Name, for debugging. */
+    const char* name;
+    /* Opaque state; handed back to emit() as its second argument. */
+    void* state;
+    /* Emit the state to the context. */
+    void (*emit)(struct r300_context*, void*);
+    /* Upper bound on number of dwords to emit. */
+    unsigned size;
+    /* Whether this atom should be emitted; cleared once it has been. */
+    boolean dirty;
+    /* Another dirty flag that is never automatically cleared. */
+    boolean always_dirty;
+};
+
 struct r300_blend_state {
     uint32_t blend_control;       /* R300_RB3D_CBLEND: 0x4e04 */
     uint32_t alpha_blend_control; /* R300_RB3D_ABLEND: 0x4e08 */
@@ -62,11 +81,6 @@ struct r300_rs_state {
     /* Draw-specific rasterizer state */
     struct pipe_rasterizer_state rs;
 
-    /* Whether or not to enable the VTE. This is referenced at the very
-     * last moment during emission of VTE state, to decide whether or not
-     * the VTE should be used for transformation. */
-    boolean enable_vte;
-
     uint32_t vap_control_status;    /* R300_VAP_CNTL_STATUS: 0x2140 */
     uint32_t point_size;            /* R300_GA_POINT_SIZE: 0x421c */
     uint32_t point_minmax;          /* R300_GA_POINT_MINMAX: 0x4230 */
@@ -102,19 +116,6 @@ struct r300_sampler_state {
     unsigned min_lod, max_lod;
 };
 
-struct r300_scissor_regs {
-    uint32_t top_left;     /* R300_SC_SCISSORS_TL: 0x43e0 */
-    uint32_t bottom_right; /* R300_SC_SCISSORS_BR: 0x43e4 */
-
-    /* Whether everything is culled by scissoring. */
-    boolean empty_area;
-};
-
-struct r300_scissor_state {
-    struct r300_scissor_regs framebuffer;
-    struct r300_scissor_regs scissor;
-};
-
 struct r300_texture_state {
     uint32_t format0; /* R300_TX_FORMAT0: 0x4480 */
     uint32_t format1; /* R300_TX_FORMAT1: 0x44c0 */
@@ -135,24 +136,17 @@ struct r300_ztop_state {
     uint32_t z_buffer_top;      /* R300_ZB_ZTOP: 0x4f14 */
 };
 
-#define R300_NEW_BLEND           0x00000001
-#define R300_NEW_BLEND_COLOR     0x00000002
-#define R300_NEW_CLIP            0x00000004
-#define R300_NEW_DSA             0x00000008
 #define R300_NEW_FRAMEBUFFERS    0x00000010
 #define R300_NEW_FRAGMENT_SHADER 0x00000020
 #define R300_NEW_FRAGMENT_SHADER_CONSTANTS    0x00000040
-#define R300_NEW_RASTERIZER      0x00000080
 #define R300_NEW_RS_BLOCK        0x00000100
 #define R300_NEW_SAMPLER         0x00000200
 #define R300_ANY_NEW_SAMPLERS    0x0001fe00
-#define R300_NEW_SCISSOR         0x00020000
 #define R300_NEW_TEXTURE         0x00040000
 #define R300_ANY_NEW_TEXTURES    0x03fc0000
 #define R300_NEW_VERTEX_FORMAT   0x04000000
 #define R300_NEW_VERTEX_SHADER   0x08000000
 #define R300_NEW_VERTEX_SHADER_CONSTANTS    0x10000000
-#define R300_NEW_VIEWPORT        0x20000000
 #define R300_NEW_QUERY           0x40000000
 #define R300_NEW_KITCHEN_SINK    0x7fffffff
 
@@ -194,6 +188,12 @@ struct r300_query {
     struct r300_query* next;
 };
 
+enum r300_buffer_tiling {
+    R300_BUFFER_LINEAR = 0,
+    R300_BUFFER_TILED,
+    R300_BUFFER_SQUARETILED
+};
+
 struct r300_texture {
     /* Parent class */
     struct pipe_texture tex;
@@ -230,6 +230,9 @@ struct r300_texture {
 
     /* Registers carrying texture format data. */
     struct r300_texture_state state;
+
+    /* Buffer tiling */
+    enum r300_buffer_tiling microtile, macrotile;
 };
 
 struct r300_vertex_info {
@@ -273,38 +276,40 @@ struct r300_context {
     struct r300_vertex_info* vertex_info;
 
     /* Various CSO state objects. */
+    /* Beginning of atom list. */
+    struct r300_atom atom_list;
     /* Blend state. */
-    struct r300_blend_state* blend_state;
+    struct r300_atom blend_state;
     /* Blend color state. */
-    struct r300_blend_color_state* blend_color_state;
+    struct r300_atom blend_color_state;
     /* User clip planes. */
-    struct pipe_clip_state clip_state;
+    struct r300_atom clip_state;
     /* Shader constants. */
     struct r300_constant_buffer shader_constants[PIPE_SHADER_TYPES];
     /* Depth, stencil, and alpha state. */
-    struct r300_dsa_state* dsa_state;
+    struct r300_atom dsa_state;
     /* Fragment shader. */
     struct r300_fragment_shader* fs;
     /* Framebuffer state. We currently don't need our own version of this. */
     struct pipe_framebuffer_state framebuffer_state;
     /* Rasterizer state. */
-    struct r300_rs_state* rs_state;
+    struct r300_atom rs_state;
     /* RS block state. */
     struct r300_rs_block* rs_block;
     /* Sampler states. */
     struct r300_sampler_state* sampler_states[8];
     int sampler_count;
     /* Scissor state. */
-    struct r300_scissor_state* scissor_state;
+    struct r300_atom scissor_state;
     /* Texture states. */
     struct r300_texture* textures[8];
     int texture_count;
     /* Vertex shader. */
     struct r300_vertex_shader* vs;
     /* Viewport state. */
-    struct r300_viewport_state* viewport_state;
+    struct r300_atom viewport_state;
     /* ZTOP state. */
-    struct r300_ztop_state ztop_state;
+    struct r300_atom ztop_state;
 
     /* Vertex buffers for Gallium. */
     struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
@@ -317,6 +322,8 @@ struct r300_context {
     uint32_t dirty_state;
     /* Flag indicating whether or not the HW is dirty. */
     uint32_t dirty_hw;
+    /* Whether the TCL engine should be in bypass mode. */
+    boolean tcl_bypass;
 
     /** Combination of DBG_xxx flags */
     unsigned debug;
diff --git a/src/gallium/drivers/r300/r300_cs.h b/src/gallium/drivers/r300/r300_cs.h
index d142fee050..151f72b0fe 100644
--- a/src/gallium/drivers/r300/r300_cs.h
+++ b/src/gallium/drivers/r300/r300_cs.h
@@ -52,7 +52,7 @@
 #define CS_LOCALS(context) \
     struct r300_context* const cs_context_copy = (context); \
     struct radeon_winsys* cs_winsys = cs_context_copy->winsys; \
-    int cs_count = 0;
+    int cs_count = 0; (void) cs_count;
 
 #define CHECK_CS(size) \
     assert(cs_winsys->check_cs(cs_winsys, (size)))
diff --git a/src/gallium/drivers/r300/r300_emit.c b/src/gallium/drivers/r300/r300_emit.c
index 1dc9216a7b..9f93327e59 100644
--- a/src/gallium/drivers/r300/r300_emit.c
+++ b/src/gallium/drivers/r300/r300_emit.c
@@ -25,6 +25,7 @@
 
 #include "util/u_format.h"
 #include "util/u_math.h"
+#include "util/u_simple_list.h"
 
 #include "r300_context.h"
 #include "r300_cs.h"
@@ -36,11 +37,13 @@
 #include "r300_texture.h"
 #include "r300_vs.h"
 
-void r300_emit_blend_state(struct r300_context* r300,
-                           struct r300_blend_state* blend)
+void r300_emit_blend_state(struct r300_context* r300, void* state)
 {
+    struct r300_blend_state* blend = (struct r300_blend_state*)state;
     CS_LOCALS(r300);
+
     BEGIN_CS(8);
+    OUT_CS_REG(R300_RB3D_ROPCNTL, blend->rop);
     OUT_CS_REG_SEQ(R300_RB3D_CBLEND, 3);
     if (r300->framebuffer_state.nr_cbufs) {
         OUT_CS(blend->blend_control);
@@ -52,14 +55,13 @@ void r300_emit_blend_state(struct r300_context* r300,
         OUT_CS(0);
         /* XXX also disable fastfill here once it's supported */
     }
-    OUT_CS_REG(R300_RB3D_ROPCNTL, blend->rop);
     OUT_CS_REG(R300_RB3D_DITHER_CTL, blend->dither);
     END_CS;
 }
 
-void r300_emit_blend_color_state(struct r300_context* r300,
-                                 struct r300_blend_color_state* bc)
+void r300_emit_blend_color_state(struct r300_context* r300, void* state)
 {
+    struct r300_blend_color_state* bc = (struct r300_blend_color_state*)state;
     struct r300_screen* r300screen = r300_screen(r300->context.screen);
     CS_LOCALS(r300);
 
@@ -76,9 +78,9 @@ void r300_emit_blend_color_state(struct r300_context* r300,
     }
 }
 
-void r300_emit_clip_state(struct r300_context* r300,
-                          struct pipe_clip_state* clip)
+void r300_emit_clip_state(struct r300_context* r300, void* state)
 {
+    struct pipe_clip_state* clip = (struct pipe_clip_state*)state;
     int i;
     struct r300_screen* r300screen = r300_screen(r300->context.screen);
     CS_LOCALS(r300);
@@ -106,13 +108,13 @@ void r300_emit_clip_state(struct r300_context* r300,
 
 }
 
-void r300_emit_dsa_state(struct r300_context* r300,
-                           struct r300_dsa_state* dsa)
+void r300_emit_dsa_state(struct r300_context* r300, void* state)
 {
+    struct r300_dsa_state* dsa = (struct r300_dsa_state*)state;
     struct r300_screen* r300screen = r300_screen(r300->context.screen);
     CS_LOCALS(r300);
 
-    BEGIN_CS(r300screen->caps->is_r500 ? 10 : 8);
+    BEGIN_CS(r300screen->caps->is_r500 ? 8 : 6);
     OUT_CS_REG(R300_FG_ALPHA_FUNC, dsa->alpha_function);
 
     /* not needed since we use the 8bit alpha ref */
@@ -121,10 +123,16 @@ void r300_emit_dsa_state(struct r300_context* r300,
     }*/
 
     OUT_CS_REG_SEQ(R300_ZB_CNTL, 3);
-    OUT_CS(dsa->z_buffer_control);
-    OUT_CS(dsa->z_stencil_control);
+
+    if (r300->framebuffer_state.zsbuf) {
+        OUT_CS(dsa->z_buffer_control);
+        OUT_CS(dsa->z_stencil_control);
+    } else {
+        OUT_CS(0);
+        OUT_CS(0);
+    }
+
     OUT_CS(dsa->stencil_ref_mask);
-    OUT_CS_REG(R300_ZB_ZTOP, r300->ztop_state.z_buffer_top);
 
     /* XXX it seems r3xx doesn't support STENCILREFMASK_BF */
     if (r300screen->caps->is_r500) {
@@ -138,6 +146,8 @@ static const float * get_shader_constant(
     struct rc_constant * constant,
     struct r300_constant_buffer * externals)
 {
+    struct r300_viewport_state* viewport =
+        (struct r300_viewport_state*)r300->viewport_state.state;
     static float vec[4] = { 0.0, 0.0, 0.0, 1.0 };
     struct pipe_texture *tex;
 
@@ -160,11 +170,31 @@ static const float * get_shader_constant(
 
                 /* Texture compare-fail value. */
                 /* XXX Since Gallium doesn't support GL_ARB_shadow_ambient,
-                 * this is always (0,0,0,0). */
+                 * this is always (0,0,0,0), right? */
                 case RC_STATE_SHADOW_AMBIENT:
                     vec[3] = 0;
                     break;
 
+                case RC_STATE_R300_VIEWPORT_SCALE:
+                    if (r300->tcl_bypass) {
+                        vec[0] = 1;
+                        vec[1] = 1;
+                        vec[2] = 1;
+                    } else {
+                        vec[0] = viewport->xscale;
+                        vec[1] = viewport->yscale;
+                        vec[2] = viewport->zscale;
+                    }
+                    break;
+
+                case RC_STATE_R300_VIEWPORT_OFFSET:
+                    /* vec is static, so write it unconditionally: in TCL
+                     * bypass the offset must be 0, not the previous value. */
+                    vec[0] = r300->tcl_bypass ? 0.0f : viewport->xoffset;
+                    vec[1] = r300->tcl_bypass ? 0.0f : viewport->yoffset;
+                    vec[2] = r300->tcl_bypass ? 0.0f : viewport->zoffset;
+                    break;
+
                 default:
                     debug_printf("r300: Implementation error: "
                         "Unknown RC_CONSTANT type %d\n", constant->u.State[0]);
@@ -283,6 +313,22 @@ void r300_emit_fs_constant_buffer(struct r300_context* r300,
     END_CS;
 }
 
+/* Emit FG_DEPTH_SRC and US_W_FMT according to whether fs writes depth. */
+static void r300_emit_fragment_depth_config(struct r300_context* r300,
+                                            struct r300_fragment_shader* fs)
+{
+    boolean shader_depth;
+    CS_LOCALS(r300);
+
+    BEGIN_CS(4);
+    shader_depth = r300_fragment_shader_writes_depth(fs) ? TRUE : FALSE;
+    OUT_CS_REG(R300_FG_DEPTH_SRC, (shader_depth ? R300_FG_DEPTH_SRC_SHADER :
+                                                  R300_FG_DEPTH_SRC_SCAN));
+    OUT_CS_REG(R300_US_W_FMT,
+               (shader_depth ? R300_W_FMT_W24 : R300_W_FMT_W0) | R300_W_SRC_US);
+    END_CS;
+}
+
 void r500_emit_fragment_program_code(struct r300_context* r300,
                                      struct rX00_fragment_program_code* generic_code)
 {
@@ -374,8 +420,10 @@ void r300_emit_fb_state(struct r300_context* r300,
 
         OUT_CS_REG_SEQ(R300_RB3D_COLORPITCH0 + (4 * i), 1);
         OUT_CS_RELOC(tex->buffer, tex->pitch[surf->level] |
-                     r300_translate_colorformat(tex->tex.format), 0,
-                     RADEON_GEM_DOMAIN_VRAM, 0);
+                     r300_translate_colorformat(tex->tex.format) |
+                     R300_COLOR_TILE(tex->macrotile) |
+                     R300_COLOR_MICROTILE(tex->microtile),
+                     0, RADEON_GEM_DOMAIN_VRAM, 0);
 
         OUT_CS_REG(R300_US_OUT_FMT_0 + (4 * i),
             r300_translate_out_fmt(surf->format));
@@ -398,8 +446,10 @@ void r300_emit_fb_state(struct r300_context* r300,
         OUT_CS_REG(R300_ZB_FORMAT, r300_translate_zsformat(tex->tex.format));
 
         OUT_CS_REG_SEQ(R300_ZB_DEPTHPITCH, 1);
-        OUT_CS_RELOC(tex->buffer, tex->pitch[surf->level], 0,
-                     RADEON_GEM_DOMAIN_VRAM, 0);
+        OUT_CS_RELOC(tex->buffer, tex->pitch[surf->level] |
+                     R300_DEPTHMACROTILE(tex->macrotile) |
+                     R300_DEPTHMICROTILE(tex->microtile),
+                     0, RADEON_GEM_DOMAIN_VRAM, 0);
     }
 
     END_CS;
@@ -531,8 +581,9 @@ void r300_emit_query_end(struct r300_context* r300)
         r300_emit_query_finish(r300, query);
 }
 
-void r300_emit_rs_state(struct r300_context* r300, struct r300_rs_state* rs)
+void r300_emit_rs_state(struct r300_context* r300, void* state)
 {
+    struct r300_rs_state* rs = (struct r300_rs_state*)state;
     CS_LOCALS(r300);
 
     BEGIN_CS(22);
@@ -595,26 +646,63 @@ void r300_emit_rs_block_state(struct r300_context* r300,
     END_CS;
 }
 
-static void r300_emit_scissor_regs(struct r300_context* r300,
-                                   struct r300_scissor_regs* scissor)
+/* Atom emit callback: program the scissor rectangle registers.
+ *
+ * The rectangle starts as the framebuffer extent and is intersected with
+ * the pipe_scissor_state in `state` when the bound rasterizer state has
+ * scissoring enabled. */
+void r300_emit_scissor_state(struct r300_context* r300, void* state)
 {
+    unsigned minx, miny, maxx, maxy;
+    uint32_t top_left, bottom_right;
+    struct r300_screen* r300screen = r300_screen(r300->context.screen);
+    struct pipe_scissor_state* scissor = (struct pipe_scissor_state*)state;
     CS_LOCALS(r300);
 
-    BEGIN_CS(3);
-    OUT_CS_REG_SEQ(R300_SC_SCISSORS_TL, 2);
-    OUT_CS(scissor->top_left);
-    OUT_CS(scissor->bottom_right);
-    END_CS;
-}
+    minx = miny = 0;
+    maxx = r300->framebuffer_state.width;
+    maxy = r300->framebuffer_state.height;
 
-void r300_emit_scissor_state(struct r300_context* r300,
-                             struct r300_scissor_state* scissor)
-{
-    if (r300->rs_state->rs.scissor) {
-        r300_emit_scissor_regs(r300, &scissor->scissor);
+    if (((struct r300_rs_state*)r300->rs_state.state)->rs.scissor) {
+        minx = MAX2(minx, scissor->minx);
+        miny = MAX2(miny, scissor->miny);
+        maxx = MIN2(maxx, scissor->maxx);
+        maxy = MIN2(maxy, scissor->maxy);
+    }
+
+    /* Zero-area special case. maxx and maxy are decremented below, so a
+     * zero would wrap around and put garbage in the registers. Use a
+     * one-pixel maximum with a larger minimum instead, so the top-left
+     * corner lies beyond the bottom-right one and no pixel should pass. */
+    if (!maxx || !maxy) {
+        minx = 2;
+        miny = 2;
+        maxx = 1;
+        maxy = 1;
+    }
+
+    if (r300screen->caps->is_r500) {
+        top_left =
+            (minx << R300_SCISSORS_X_SHIFT) |
+            (miny << R300_SCISSORS_Y_SHIFT);
+        bottom_right =
+            ((maxx - 1) << R300_SCISSORS_X_SHIFT) |
+            ((maxy - 1) << R300_SCISSORS_Y_SHIFT);
     } else {
-        r300_emit_scissor_regs(r300, &scissor->framebuffer);
+        /* Offset of 1440 in non-R500 chipsets. */
+        top_left =
+            ((minx + 1440) << R300_SCISSORS_X_SHIFT) |
+            ((miny + 1440) << R300_SCISSORS_Y_SHIFT);
+        bottom_right =
+            (((maxx - 1) + 1440) << R300_SCISSORS_X_SHIFT) |
+            (((maxy - 1) + 1440) << R300_SCISSORS_Y_SHIFT);
     }
+
+    BEGIN_CS(3);
+    OUT_CS_REG_SEQ(R300_SC_SCISSORS_TL, 2);
+    OUT_CS(top_left);
+    OUT_CS(bottom_right);
+    END_CS;
 }
 
 void r300_emit_texture(struct r300_context* r300,
@@ -650,8 +722,10 @@ void r300_emit_texture(struct r300_context* r300,
     OUT_CS_REG(R300_TX_FORMAT1_0 + (offset * 4), tex->state.format1);
     OUT_CS_REG(R300_TX_FORMAT2_0 + (offset * 4), tex->state.format2);
     OUT_CS_REG_SEQ(R300_TX_OFFSET_0 + (offset * 4), 1);
-    OUT_CS_RELOC(tex->buffer, 0,
-            RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0, 0);
+    OUT_CS_RELOC(tex->buffer,
+                 R300_TXO_MACRO_TILE(tex->macrotile) |
+                 R300_TXO_MICRO_TILE(tex->microtile),
+                 RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0, 0);
     END_CS;
 }
 
@@ -717,32 +791,6 @@ void r300_emit_aos(struct r300_context* r300, unsigned offset)
     END_CS;
 }
 
-#if 0
-void r300_emit_draw_packet(struct r300_context* r300)
-{
-    CS_LOCALS(r300);
-
-    DBG(r300, DBG_DRAW, "r300: Preparing vertex buffer %p for render, "
-            "vertex size %d\n", r300->vbo,
-            r300->vertex_info->vinfo.size);
-    /* Set the pointer to our vertex buffer. The emitted values are this:
-     * PACKET3 [3D_LOAD_VBPNTR]
-     * COUNT   [1]
-     * FORMAT  [size | stride << 8]
-     * OFFSET  [offset into BO]
-     * VBPNTR  [relocated BO]
-     */
-    BEGIN_CS(7);
-    OUT_CS_PKT3(R300_PACKET3_3D_LOAD_VBPNTR, 3);
-    OUT_CS(1);
-    OUT_CS(r300->vertex_info->vinfo.size |
-            (r300->vertex_info->vinfo.size << 8));
-    OUT_CS(r300->vbo_offset);
-    OUT_CS_RELOC(r300->vbo, 0, RADEON_GEM_DOMAIN_GTT, 0, 0);
-    END_CS;
-}
-#endif
-
 void r300_emit_vertex_format_state(struct r300_context* r300)
 {
     int i;
@@ -867,26 +915,27 @@ void r300_emit_vs_constant_buffer(struct r300_context* r300,
     END_CS;
 }
 
-void r300_emit_viewport_state(struct r300_context* r300,
-                              struct r300_viewport_state* viewport)
+void r300_emit_viewport_state(struct r300_context* r300, void* state)
 {
+    struct r300_viewport_state* viewport = (struct r300_viewport_state*)state;
     CS_LOCALS(r300);
 
-    BEGIN_CS(9);
-    OUT_CS_REG_SEQ(R300_SE_VPORT_XSCALE, 6);
-    OUT_CS_32F(viewport->xscale);
-    OUT_CS_32F(viewport->xoffset);
-    OUT_CS_32F(viewport->yscale);
-    OUT_CS_32F(viewport->yoffset);
-    OUT_CS_32F(viewport->zscale);
-    OUT_CS_32F(viewport->zoffset);
-
-    if (r300->rs_state->enable_vte) {
-        OUT_CS_REG(R300_VAP_VTE_CNTL, viewport->vte_control);
-    } else {
+    if (r300->tcl_bypass) {
+        BEGIN_CS(2);
         OUT_CS_REG(R300_VAP_VTE_CNTL, 0);
+        END_CS;
+    } else {
+        BEGIN_CS(9);
+        OUT_CS_REG_SEQ(R300_SE_VPORT_XSCALE, 6);
+        OUT_CS_32F(viewport->xscale);
+        OUT_CS_32F(viewport->xoffset);
+        OUT_CS_32F(viewport->yscale);
+        OUT_CS_32F(viewport->yoffset);
+        OUT_CS_32F(viewport->zscale);
+        OUT_CS_32F(viewport->zoffset);
+        OUT_CS_REG(R300_VAP_VTE_CNTL, viewport->vte_control);
+        END_CS;
     }
-    END_CS;
 }
 
 void r300_emit_texture_count(struct r300_context* r300)
@@ -910,6 +959,16 @@ void r300_emit_texture_count(struct r300_context* r300)
 
 }
 
+/* Atom emit callback: write the ZTOP value held in `state` to ZB_ZTOP. */
+void r300_emit_ztop_state(struct r300_context* r300, void* state)
+{
+    struct r300_ztop_state* ztop_regs = state;
+    CS_LOCALS(r300);
+    BEGIN_CS(2);
+    OUT_CS_REG(R300_ZB_ZTOP, ztop_regs->z_buffer_top);
+    END_CS;
+}
+
 void r300_flush_textures(struct r300_context* r300)
 {
     CS_LOCALS(r300);
@@ -933,18 +992,24 @@ void r300_emit_dirty_state(struct r300_context* r300)
 {
     struct r300_screen* r300screen = r300_screen(r300->context.screen);
     struct r300_texture* tex;
-    int i, dirty_tex = 0;
+    struct r300_atom* atom;
+    unsigned i, dwords = 1024;
+    int dirty_tex = 0;
     boolean invalid = FALSE;
 
-    if (!(r300->dirty_state)) {
-        return;
+    /* Check the required number of dwords against the space remaining in the
+     * current CS object. If we need more, then flush. */
+
+    foreach(atom, &r300->atom_list) {
+        if (atom->dirty || atom->always_dirty) {
+            dwords += atom->size;
+        }
     }
 
-    /* Check size of CS. */
-    /* Make sure we have at least 8*1024 spare dwords. */
+    /* Make sure we have room for the dirty atoms plus 1024 spare dwords. */
     /* XXX It would be nice to know the number of dwords we really need to
      * XXX emit. */
-    if (!r300->winsys->check_cs(r300->winsys, 8*1024)) {
+    if (!r300->winsys->check_cs(r300->winsys, dwords)) {
         r300->context.flush(&r300->context, 0, NULL);
     }
 
@@ -984,10 +1049,12 @@ validate:
         }
     }
     /* ...occlusion query buffer... */
-    if (!r300->winsys->add_buffer(r300->winsys, r300->oqbo,
-                0, RADEON_GEM_DOMAIN_GTT)) {
-        r300->context.flush(&r300->context, 0, NULL);
-        goto validate;
+    if (r300->dirty_state & R300_NEW_QUERY) {
+        if (!r300->winsys->add_buffer(r300->winsys, r300->oqbo,
+                    0, RADEON_GEM_DOMAIN_GTT)) {
+            r300->context.flush(&r300->context, 0, NULL);
+            goto validate;
+        }
     }
     /* ...and vertex buffer. */
     if (r300->vbo) {
@@ -1015,27 +1082,15 @@ validate:
         r300->dirty_state &= ~R300_NEW_QUERY;
     }
 
-    if (r300->dirty_state & R300_NEW_BLEND) {
-        r300_emit_blend_state(r300, r300->blend_state);
-        r300->dirty_state &= ~R300_NEW_BLEND;
-    }
-
-    if (r300->dirty_state & R300_NEW_BLEND_COLOR) {
-        r300_emit_blend_color_state(r300, r300->blend_color_state);
-        r300->dirty_state &= ~R300_NEW_BLEND_COLOR;
-    }
-
-    if (r300->dirty_state & R300_NEW_CLIP) {
-        r300_emit_clip_state(r300, &r300->clip_state);
-        r300->dirty_state &= ~R300_NEW_CLIP;
-    }
-
-    if (r300->dirty_state & R300_NEW_DSA) {
-        r300_emit_dsa_state(r300, r300->dsa_state);
-        r300->dirty_state &= ~R300_NEW_DSA;
+    foreach(atom, &r300->atom_list) {
+        if (atom->dirty || atom->always_dirty) {
+            atom->emit(r300, atom->state);
+            atom->dirty = FALSE;
+        }
     }
 
     if (r300->dirty_state & R300_NEW_FRAGMENT_SHADER) {
+        r300_emit_fragment_depth_config(r300, r300->fs);
         if (r300screen->caps->is_r500) {
             r500_emit_fragment_program_code(r300, &r300->fs->shader->code);
         } else {
@@ -1060,21 +1115,11 @@ validate:
         r300->dirty_state &= ~R300_NEW_FRAMEBUFFERS;
     }
 
-    if (r300->dirty_state & R300_NEW_RASTERIZER) {
-        r300_emit_rs_state(r300, r300->rs_state);
-        r300->dirty_state &= ~R300_NEW_RASTERIZER;
-    }
-
     if (r300->dirty_state & R300_NEW_RS_BLOCK) {
         r300_emit_rs_block_state(r300, r300->rs_block);
         r300->dirty_state &= ~R300_NEW_RS_BLOCK;
     }
 
-    if (r300->dirty_state & R300_NEW_SCISSOR) {
-        r300_emit_scissor_state(r300, r300->scissor_state);
-        r300->dirty_state &= ~R300_NEW_SCISSOR;
-    }
-
     /* Samplers and textures are tracked separately but emitted together. */
     if (r300->dirty_state &
             (R300_ANY_NEW_SAMPLERS | R300_ANY_NEW_TEXTURES)) {
@@ -1096,11 +1141,6 @@ validate:
         r300->dirty_state &= ~(R300_ANY_NEW_SAMPLERS | R300_ANY_NEW_TEXTURES);
     }
 
-    if (r300->dirty_state & R300_NEW_VIEWPORT) {
-        r300_emit_viewport_state(r300, r300->viewport_state);
-        r300->dirty_state &= ~R300_NEW_VIEWPORT;
-    }
-
     if (dirty_tex) {
         r300_flush_textures(r300);
     }
diff --git a/src/gallium/drivers/r300/r300_emit.h b/src/gallium/drivers/r300/r300_emit.h
index 3797d3d332..05a6bfeae8 100644
--- a/src/gallium/drivers/r300/r300_emit.h
+++ b/src/gallium/drivers/r300/r300_emit.h
@@ -31,17 +31,13 @@ struct r300_vertex_program_code;
 
 void r300_emit_aos(struct r300_context* r300, unsigned offset);
 
-void r300_emit_blend_state(struct r300_context* r300,
-                           struct r300_blend_state* blend);
+void r300_emit_blend_state(struct r300_context* r300, void* state);
 
-void r300_emit_blend_color_state(struct r300_context* r300,
-                                 struct r300_blend_color_state* bc);
+void r300_emit_blend_color_state(struct r300_context* r300, void* state);
 
-void r300_emit_clip_state(struct r300_context* r300,
-                          struct pipe_clip_state* clip);
+void r300_emit_clip_state(struct r300_context* r300, void* state);
 
-void r300_emit_dsa_state(struct r300_context* r300,
-                         struct r300_dsa_state* dsa);
+void r300_emit_dsa_state(struct r300_context* r300, void* state);
 
 void r300_emit_fragment_program_code(struct r300_context* r300,
                                      struct rX00_fragment_program_code* generic_code);
@@ -63,13 +59,12 @@ void r300_emit_query_begin(struct r300_context* r300,
 
 void r300_emit_query_end(struct r300_context* r300);
 
-void r300_emit_rs_state(struct r300_context* r300, struct r300_rs_state* rs);
+void r300_emit_rs_state(struct r300_context* r300, void* state);
 
 void r300_emit_rs_block_state(struct r300_context* r300,
                               struct r300_rs_block* rs);
 
-void r300_emit_scissor_state(struct r300_context* r300,
-                             struct r300_scissor_state* scissor);
+void r300_emit_scissor_state(struct r300_context* r300, void* state);
 
 void r300_emit_texture(struct r300_context* r300,
                        struct r300_sampler_state* sampler,
@@ -89,11 +84,12 @@ void r300_emit_vs_constant_buffer(struct r300_context* r300,
 void r300_emit_vertex_shader(struct r300_context* r300,
                              struct r300_vertex_shader* vs);
 
-void r300_emit_viewport_state(struct r300_context* r300,
-                              struct r300_viewport_state* viewport);
+void r300_emit_viewport_state(struct r300_context* r300, void* state);
 
 void r300_emit_texture_count(struct r300_context* r300);
 
+void r300_emit_ztop_state(struct r300_context* r300, void* state);
+
 void r300_flush_textures(struct r300_context* r300);
 
 /* Emit all dirty state. */
diff --git a/src/gallium/drivers/r300/r300_flush.c b/src/gallium/drivers/r300/r300_flush.c
index 14a08241fc..c78a7673a3 100644
--- a/src/gallium/drivers/r300/r300_flush.c
+++ b/src/gallium/drivers/r300/r300_flush.c
@@ -37,6 +37,7 @@ static void r300_flush(struct pipe_context* pipe,
 {
     struct r300_context *r300 = r300_context(pipe);
     struct r300_query *query;
+    struct r300_atom *atom;
 
     CS_LOCALS(r300);
     /* We probably need to flush Draw, but we may have been called from
@@ -54,7 +55,15 @@ static void r300_flush(struct pipe_context* pipe,
         r300_emit_invariant_state(r300);
         r300->dirty_state = R300_NEW_KITCHEN_SINK;
         r300->dirty_hw = 0;
+
+        /* Everything is dirty again: flag every atom that has state. */
+        foreach(atom, &r300->atom_list) {
+            if (atom->state) {
+                atom->dirty = TRUE;
+            }
+        }
     }
+
     /* reset flushed query */
     foreach(query, &r300->query_list) {
         query->flushed = TRUE;
diff --git a/src/gallium/drivers/r300/r300_fs.c b/src/gallium/drivers/r300/r300_fs.c
index 4e1b61ca40..60ea9c171d 100644
--- a/src/gallium/drivers/r300/r300_fs.c
+++ b/src/gallium/drivers/r300/r300_fs.c
@@ -63,6 +63,11 @@ void r300_shader_read_fs_inputs(struct tgsi_shader_info* info,
                 fs_inputs->fog = i;
                 break;
 
+            case TGSI_SEMANTIC_POSITION:
+                assert(index == 0);
+                fs_inputs->wpos = i;
+                break;
+
             default:
                 assert(0);
         }
@@ -114,6 +119,9 @@ static void allocate_hardware_inputs(
     if (inputs->fog != ATTR_UNUSED) {
         allocate(mydata, inputs->fog, reg++);
     }
+    if (inputs->wpos != ATTR_UNUSED) {
+        allocate(mydata, inputs->wpos, reg++);
+    }
 }
 
 static void get_compare_state(
@@ -144,6 +152,7 @@ static void r300_translate_fragment_shader(
     struct r300_fragment_shader* fs = r300->fs;
     struct r300_fragment_program_compiler compiler;
     struct tgsi_to_rc ttr;
+    int wpos = fs->inputs.wpos;
 
     /* Setup the compiler. */
     memset(&compiler, 0, sizeof(compiler));
@@ -171,6 +180,18 @@ static void r300_translate_fragment_shader(
 
     fs->shadow_samplers = compiler.Base.Program.ShadowSamplers;
 
+    /**
+     * Transform the program to support WPOS.
+     *
+     * Introduce a small fragment at the start of the program that will be
+     * the only code that directly reads the WPOS input.
+     * All other code pieces that reference that input will be rewritten
+     * to read from a newly allocated temporary. */
+    if (wpos != ATTR_UNUSED) {
+        /* Moving the input to some other reg is not really necessary. */
+        rc_transform_fragment_wpos(&compiler.Base, wpos, wpos, TRUE);
+    }
+
     /* Invoke the compiler */
     r3xx_compile_fragment_program(&compiler);
     if (compiler.Base.Error) {
diff --git a/src/gallium/drivers/r300/r300_reg.h b/src/gallium/drivers/r300/r300_reg.h
index 0aa1da07f8..361813891f 100644
--- a/src/gallium/drivers/r300/r300_reg.h
+++ b/src/gallium/drivers/r300/r300_reg.h
@@ -1619,18 +1619,20 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define R300_TX_OFFSET_5                    0x4554
 #define R300_TX_OFFSET_6                    0x4558
 #define R300_TX_OFFSET_7                    0x455C
-	/* BEGIN: Guess from R200 */
+
 #       define R300_TXO_ENDIAN_NO_SWAP           (0 << 0)
 #       define R300_TXO_ENDIAN_BYTE_SWAP         (1 << 0)
 #       define R300_TXO_ENDIAN_WORD_SWAP         (2 << 0)
 #       define R300_TXO_ENDIAN_HALFDW_SWAP       (3 << 0)
-#       define R300_TXO_MACRO_TILE               (1 << 2)
+#       define R300_TXO_MACRO_TILE_LINEAR        (0 << 2)
+#       define R300_TXO_MACRO_TILE_TILED         (1 << 2)
+#       define R300_TXO_MACRO_TILE(x)            ((x) << 2)
 #       define R300_TXO_MICRO_TILE_LINEAR        (0 << 3)
-#       define R300_TXO_MICRO_TILE               (1 << 3)
-#       define R300_TXO_MICRO_TILE_SQUARE        (2 << 3)
+#       define R300_TXO_MICRO_TILE_TILED         (1 << 3)
+#       define R300_TXO_MICRO_TILE_TILED_SQUARE  (2 << 3)
+#       define R300_TXO_MICRO_TILE(x)            ((x) << 3)
 #       define R300_TXO_OFFSET_MASK              0xffffffe0
 #       define R300_TXO_OFFSET_SHIFT             5
-	/* END: Guess from R200 */
 
 /* 32 bit chroma key */
 #define R300_TX_CHROMA_KEY_0                      0x4580
@@ -2186,6 +2188,8 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #       define R300_DISCARD_SRC_PIXELS_SRC_ALPHA_1     (4 << 3)
 #       define R300_DISCARD_SRC_PIXELS_SRC_COLOR_1     (5 << 3)
 #       define R300_DISCARD_SRC_PIXELS_SRC_ALPHA_COLOR_1     (6 << 3)
+#       define R500_SRC_ALPHA_0_NO_READ                (1 << 30)
+#       define R500_SRC_ALPHA_1_NO_READ                (1 << 31)
 
 /* the following are shared between CBLEND and ABLEND */
 #       define R300_FCN_MASK                         (3  << 12)
@@ -2281,9 +2285,11 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #       define R300_COLORPITCH_MASK              0x00003FFE
 #       define R300_COLOR_TILE_DISABLE            (0 << 16)
 #       define R300_COLOR_TILE_ENABLE             (1 << 16)
+#       define R300_COLOR_TILE(x)                 ((x) << 16)
 #       define R300_COLOR_MICROTILE_DISABLE       (0 << 17)
 #       define R300_COLOR_MICROTILE_ENABLE        (1 << 17)
 #       define R300_COLOR_MICROTILE_ENABLE_SQUARE (2 << 17) /* Only available in 16-bit */
+#       define R300_COLOR_MICROTILE(x)            ((x) << 17)
 #       define R300_COLOR_ENDIAN_NO_SWAP          (0 << 19)
 #       define R300_COLOR_ENDIAN_WORD_SWAP        (1 << 19)
 #       define R300_COLOR_ENDIAN_DWORD_SWAP       (2 << 19)
@@ -2542,9 +2548,11 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #       define R300_DEPTHPITCH_MASK              0x00003FFC
 #       define R300_DEPTHMACROTILE_DISABLE      (0 << 16)
 #       define R300_DEPTHMACROTILE_ENABLE       (1 << 16)
+#       define R300_DEPTHMACROTILE(x)           ((x) << 16)
 #       define R300_DEPTHMICROTILE_LINEAR       (0 << 17)
 #       define R300_DEPTHMICROTILE_TILED        (1 << 17)
 #       define R300_DEPTHMICROTILE_TILED_SQUARE (2 << 17)
+#       define R300_DEPTHMICROTILE(x)           ((x) << 17)
 #       define R300_DEPTHENDIAN_NO_SWAP         (0 << 18)
 #       define R300_DEPTHENDIAN_WORD_SWAP       (1 << 18)
 #       define R300_DEPTHENDIAN_DWORD_SWAP      (2 << 18)
diff --git a/src/gallium/drivers/r300/r300_render.c b/src/gallium/drivers/r300/r300_render.c
index a89cb633e0..710d850163 100644
--- a/src/gallium/drivers/r300/r300_render.c
+++ b/src/gallium/drivers/r300/r300_render.c
@@ -26,6 +26,8 @@
 #include "draw/draw_context.h"
 #include "draw/draw_vbuf.h"
 
+#include "indices/u_indices.h"
+
 #include "pipe/p_inlines.h"
 
 #include "util/u_memory.h"
@@ -69,16 +71,11 @@ uint32_t r300_translate_primitive(unsigned prim)
     }
 }
 
-static boolean r300_nothing_to_draw(struct r300_context *r300)
-{
-    return r300->rs_state->rs.scissor &&
-           r300->scissor_state->scissor.empty_area;
-}
-
 static uint32_t r300_provoking_vertex_fixes(struct r300_context *r300,
                                             unsigned mode)
 {
-    uint32_t color_control = r300->rs_state->color_control;
+    struct r300_rs_state* rs = (struct r300_rs_state*)r300->rs_state.state;
+    uint32_t color_control = rs->color_control;
 
     /* By default (see r300_state.c:r300_create_rs_state) color_control is
      * initialized to provoking the first vertex.
@@ -98,7 +95,7 @@ static uint32_t r300_provoking_vertex_fixes(struct r300_context *r300,
      * ~ C.
      */
 
-    if (r300->rs_state->rs.flatshade_first) {
+    if (rs->rs.flatshade_first) {
         switch (mode) {
             case PIPE_PRIM_TRIANGLE_FAN:
                 color_control |= R300_GA_COLOR_CONTROL_PROVOKING_VERTEX_SECOND;
@@ -119,6 +116,44 @@ static uint32_t r300_provoking_vertex_fixes(struct r300_context *r300,
     return color_control;
 }
 
+/* Draw `count` vertices by embedding their data directly in the command
+ * stream (3D_DRAW_IMMD_2) instead of pointing the hardware at a buffer.
+ * Vertices are read from vertex buffer 0 only. */
+static void r300_emit_draw_immediate(struct r300_context *r300,
+                                     unsigned mode,
+                                     unsigned start,
+                                     unsigned count)
+{
+    struct pipe_buffer* vbo = r300->vertex_buffer[0].buffer;
+    unsigned vertex_size = r300->vertex_buffer[0].stride / sizeof(float);
+    unsigned i;
+    uint32_t* map;
+    CS_LOCALS(r300);
+
+    /* NOTE(review): offset and length are given in dwords here; confirm
+     * whether pipe_buffer_map_range() expects bytes. */
+    map = (uint32_t*)pipe_buffer_map_range(r300->context.screen, vbo,
+            start * vertex_size, count * vertex_size,
+            PIPE_BUFFER_USAGE_CPU_READ);
+
+    BEGIN_CS(10 + count * vertex_size);
+    OUT_CS_REG(R300_GA_COLOR_CONTROL,
+            r300_provoking_vertex_fixes(r300, mode));
+    OUT_CS_REG(R300_VAP_VTX_SIZE, vertex_size);
+    OUT_CS_REG(R300_VAP_VF_MIN_VTX_INDX, 0);
+    OUT_CS_REG(R300_VAP_VF_MAX_VTX_INDX, count - 1);
+    OUT_CS_PKT3(R300_PACKET3_3D_DRAW_IMMD_2, count * vertex_size);
+    OUT_CS(R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_EMBEDDED | (count << 16) |
+            r300_translate_primitive(mode));
+    for (i = 0; i < count * vertex_size; i++) {
+        OUT_CS(*map);
+        map++;
+    }
+    END_CS;
+
+    pipe_buffer_unmap(r300->context.screen, vbo);
+}
+
 static void r300_emit_draw_arrays(struct r300_context *r300,
                                   unsigned mode,
                                   unsigned count)
@@ -212,43 +247,84 @@ validate:
     return TRUE;
 }
 
+static struct pipe_buffer* r300_translate_elts(struct r300_context* r300,
+                                               struct pipe_buffer* elts,
+                                               unsigned* size,
+                                               unsigned* mode,
+                                               unsigned* count)
+{
+    struct pipe_screen* screen = r300->context.screen;
+    struct pipe_buffer* new_elts;
+    void *in_map, *out_map;
+    unsigned out_prim, out_index_size, out_nr;
+    u_translate_func out_translate;
+    /* Pick a u_indices translator for the incoming size, mode and count. */
+    (void)u_index_translator(~0, *mode, *size, *count, PV_LAST, PV_LAST,
+        &out_prim, &out_index_size, &out_nr, &out_translate);
+    /* NOTE(review): creation and mapping results are not checked. */
+    new_elts = screen->buffer_create(screen, 32,
+                                     PIPE_BUFFER_USAGE_INDEX |
+                                     PIPE_BUFFER_USAGE_CPU_WRITE |
+                                     PIPE_BUFFER_USAGE_GPU_READ,
+                                     out_index_size * out_nr);
+
+    in_map = pipe_buffer_map(screen, elts, PIPE_BUFFER_USAGE_CPU_READ);
+    out_map = pipe_buffer_map(screen, new_elts, PIPE_BUFFER_USAGE_CPU_WRITE);
+
+    out_translate(in_map, *count, out_map);
+
+    pipe_buffer_unmap(screen, elts);
+    pipe_buffer_unmap(screen, new_elts);
+    /* Report the translated index size, primitive and count. */
+    *size = out_index_size;
+    *mode = out_prim;
+    *count = out_nr;
+    /* r300_draw_range_elements destroys this buffer after drawing. */
+    return new_elts;
+}
+
 /* This is the fast-path drawing & emission for HW TCL. */
-boolean r300_draw_range_elements(struct pipe_context* pipe,
-                                 struct pipe_buffer* indexBuffer,
-                                 unsigned indexSize,
-                                 unsigned minIndex,
-                                 unsigned maxIndex,
-                                 unsigned mode,
-                                 unsigned start,
-                                 unsigned count)
+void r300_draw_range_elements(struct pipe_context* pipe,
+                              struct pipe_buffer* indexBuffer,
+                              unsigned indexSize,
+                              unsigned minIndex,
+                              unsigned maxIndex,
+                              unsigned mode,
+                              unsigned start,
+                              unsigned count)
 {
     struct r300_context* r300 = r300_context(pipe);
+    struct pipe_buffer* orgIndexBuffer = indexBuffer;
 
     if (!u_trim_pipe_prim(mode, &count)) {
-        return FALSE;
+        return;
     }
 
     if (count > 65535) {
-        return FALSE;
-    }
-
-    if (r300_nothing_to_draw(r300)) {
-        return TRUE;
+        /* XXX: use aux/indices functions to split this into smaller
+         * primitives.
+         */
+        return;
     }
 
     r300_update_derived_state(r300);
 
     if (!r300_setup_vertex_buffers(r300)) {
-        return FALSE;
+        return;
+    }
+
+    if (indexSize == 1) {
+        indexBuffer = r300_translate_elts(r300, indexBuffer,
+            &indexSize, &mode, &count);
     }
 
     if (!r300->winsys->add_buffer(r300->winsys, indexBuffer,
                                   RADEON_GEM_DOMAIN_GTT, 0)) {
-        return FALSE;
+        goto cleanup;
     }
 
     if (!r300->winsys->validate(r300->winsys)) {
-        return FALSE;
+        goto cleanup;
     }
 
     r300_emit_dirty_state(r300);
@@ -258,49 +334,52 @@ boolean r300_draw_range_elements(struct pipe_context* pipe,
     r300_emit_draw_elements(r300, indexBuffer, indexSize, minIndex, maxIndex,
                             mode, start, count);
 
-    return TRUE;
+cleanup:
+    if (indexBuffer != orgIndexBuffer) {
+        pipe->screen->buffer_destroy(indexBuffer);
+    }
 }
 
 /* Simple helpers for context setup. Should probably be moved to util. */
-boolean r300_draw_elements(struct pipe_context* pipe,
-                           struct pipe_buffer* indexBuffer,
-                           unsigned indexSize, unsigned mode,
-                           unsigned start, unsigned count)
+void r300_draw_elements(struct pipe_context* pipe,
+                        struct pipe_buffer* indexBuffer,
+                        unsigned indexSize, unsigned mode,
+                        unsigned start, unsigned count)
 {
-    return pipe->draw_range_elements(pipe, indexBuffer, indexSize, 0, ~0,
-                                     mode, start, count);
+   pipe->draw_range_elements(pipe, indexBuffer, indexSize, 0, ~0,
+                             mode, start, count);
 }
 
-boolean r300_draw_arrays(struct pipe_context* pipe, unsigned mode,
-                         unsigned start, unsigned count)
+void r300_draw_arrays(struct pipe_context* pipe, unsigned mode,
+                      unsigned start, unsigned count)
 {
     struct r300_context* r300 = r300_context(pipe);
 
     if (!u_trim_pipe_prim(mode, &count)) {
-        return FALSE;
+        return;
     }
 
     if (count > 65535) {
-        return FALSE;
-    }
-
-    if (r300_nothing_to_draw(r300)) {
-        return TRUE;
+        /* XXX: driver needs to handle this -- use the functions in
+         * aux/indices to split this into several smaller primitives.
+         */
+        return;
     }
 
     r300_update_derived_state(r300);
 
     if (!r300_setup_vertex_buffers(r300)) {
-        return FALSE;
+        return;
     }
 
     r300_emit_dirty_state(r300);
 
-    r300_emit_aos(r300, start);
-
-    r300_emit_draw_arrays(r300, mode, count);
-
-    return TRUE;
+    if (FALSE && count <= 4 && r300->vertex_buffer_count == 1) {
+        r300_emit_draw_immediate(r300, mode, start, count);
+    } else {
+        r300_emit_aos(r300, start);
+        r300_emit_draw_arrays(r300, mode, count);
+    }
 }
 
 /****************************************************************************
@@ -309,7 +388,7 @@ boolean r300_draw_arrays(struct pipe_context* pipe, unsigned mode,
  ***************************************************************************/
 
 /* SW TCL arrays, using Draw. */
-boolean r300_swtcl_draw_arrays(struct pipe_context* pipe,
+void r300_swtcl_draw_arrays(struct pipe_context* pipe,
                                unsigned mode,
                                unsigned start,
                                unsigned count)
@@ -318,11 +397,7 @@ boolean r300_swtcl_draw_arrays(struct pipe_context* pipe,
     int i;
 
     if (!u_trim_pipe_prim(mode, &count)) {
-        return FALSE;
-    }
-
-    if (r300_nothing_to_draw(r300)) {
-        return TRUE;
+        return;
     }
 
     for (i = 0; i < r300->vertex_buffer_count; i++) {
@@ -346,12 +421,10 @@ boolean r300_swtcl_draw_arrays(struct pipe_context* pipe,
         pipe_buffer_unmap(pipe->screen, r300->vertex_buffer[i].buffer);
         draw_set_mapped_vertex_buffer(r300->draw, i, NULL);
     }
-
-    return TRUE;
 }
 
 /* SW TCL elements, using Draw. */
-boolean r300_swtcl_draw_range_elements(struct pipe_context* pipe,
+void r300_swtcl_draw_range_elements(struct pipe_context* pipe,
                                        struct pipe_buffer* indexBuffer,
                                        unsigned indexSize,
                                        unsigned minIndex,
@@ -365,11 +438,7 @@ boolean r300_swtcl_draw_range_elements(struct pipe_context* pipe,
     void* indices;
 
     if (!u_trim_pipe_prim(mode, &count)) {
-        return FALSE;
-    }
-
-    if (r300_nothing_to_draw(r300)) {
-        return TRUE;
+        return;
     }
 
     for (i = 0; i < r300->vertex_buffer_count; i++) {
@@ -400,8 +469,6 @@ boolean r300_swtcl_draw_range_elements(struct pipe_context* pipe,
     pipe_buffer_unmap(pipe->screen, indexBuffer);
     draw_set_mapped_element_buffer_range(r300->draw, 0, start,
                                          start + count - 1, NULL);
-
-    return TRUE;
 }
 
 /* Object for rendering using Draw. */
diff --git a/src/gallium/drivers/r300/r300_render.h b/src/gallium/drivers/r300/r300_render.h
index da83069083..27b5e6a963 100644
--- a/src/gallium/drivers/r300/r300_render.h
+++ b/src/gallium/drivers/r300/r300_render.h
@@ -25,35 +25,35 @@
 
 uint32_t r300_translate_primitive(unsigned prim);
 
-boolean r300_draw_range_elements(struct pipe_context* pipe,
-                                 struct pipe_buffer* indexBuffer,
-                                 unsigned indexSize,
-                                 unsigned minIndex,
-                                 unsigned maxIndex,
-                                 unsigned mode,
-                                 unsigned start,
-                                 unsigned count);
-
-boolean r300_draw_elements(struct pipe_context* pipe,
-                           struct pipe_buffer* indexBuffer,
-                           unsigned indexSize, unsigned mode,
-                           unsigned start, unsigned count);
-
-boolean r300_draw_arrays(struct pipe_context* pipe, unsigned mode,
-                         unsigned start, unsigned count);
-
-boolean r300_swtcl_draw_arrays(struct pipe_context* pipe,
-                               unsigned mode,
-                               unsigned start,
-                               unsigned count);
-
-boolean r300_swtcl_draw_range_elements(struct pipe_context* pipe,
-                                       struct pipe_buffer* indexBuffer,
-                                       unsigned indexSize,
-                                       unsigned minIndex,
-                                       unsigned maxIndex,
-                                       unsigned mode,
-                                       unsigned start,
-                                       unsigned count);
+void r300_draw_range_elements(struct pipe_context* pipe,
+                              struct pipe_buffer* indexBuffer,
+                              unsigned indexSize,
+                              unsigned minIndex,
+                              unsigned maxIndex,
+                              unsigned mode,
+                              unsigned start,
+                              unsigned count);
+
+void r300_draw_elements(struct pipe_context* pipe,
+                        struct pipe_buffer* indexBuffer,
+                        unsigned indexSize, unsigned mode,
+                        unsigned start, unsigned count);
+
+void r300_draw_arrays(struct pipe_context* pipe, unsigned mode,
+                      unsigned start, unsigned count);
+
+void r300_swtcl_draw_arrays(struct pipe_context* pipe,
+                            unsigned mode,
+                            unsigned start,
+                            unsigned count);
+
+void r300_swtcl_draw_range_elements(struct pipe_context* pipe,
+                                    struct pipe_buffer* indexBuffer,
+                                    unsigned indexSize,
+                                    unsigned minIndex,
+                                    unsigned maxIndex,
+                                    unsigned mode,
+                                    unsigned start,
+                                    unsigned count);
 
 #endif /* R300_RENDER_H */
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index 2a8667d483..287664b1d2 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -83,6 +83,7 @@ static int r300_get_param(struct pipe_screen* pscreen, int param)
 
     switch (param) {
         case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
+        case PIPE_CAP_MAX_COMBINED_SAMPLERS:
             /* XXX I'm told this goes up to 16 */
             return 8;
         case PIPE_CAP_NPOT_TEXTURES:
@@ -143,9 +144,11 @@ static int r300_get_param(struct pipe_screen* pscreen, int param)
         case PIPE_CAP_BLEND_EQUATION_SEPARATE:
             return 1;
         case PIPE_CAP_SM3:
-            return 1;
-        case PIPE_CAP_MAX_COMBINED_SAMPLERS:
-            return 8;
+            if (r300screen->caps->is_r500) {
+                return 1;
+            } else {
+                return 0;
+            }
         default:
             debug_printf("r300: Implementation error: Bad param %d\n",
                 param);
diff --git a/src/gallium/drivers/r300/r300_shader_semantics.h b/src/gallium/drivers/r300/r300_shader_semantics.h
index 85184e2cfd..6796841b29 100644
--- a/src/gallium/drivers/r300/r300_shader_semantics.h
+++ b/src/gallium/drivers/r300/r300_shader_semantics.h
@@ -40,6 +40,7 @@ struct r300_shader_semantics {
     int bcolor[ATTR_COLOR_COUNT];
     int generic[ATTR_GENERIC_COUNT];
     int fog;
+    int wpos;
 };
 
 static INLINE void r300_shader_semantics_reset(
@@ -50,6 +51,7 @@ static INLINE void r300_shader_semantics_reset(
     info->pos = ATTR_UNUSED;
     info->psize = ATTR_UNUSED;
     info->fog = ATTR_UNUSED;
+    info->wpos = ATTR_UNUSED;
 
     for (i = 0; i < ATTR_COLOR_COUNT; i++) {
         info->color[i] = ATTR_UNUSED;
diff --git a/src/gallium/drivers/r300/r300_state.c b/src/gallium/drivers/r300/r300_state.c
index 49072462ec..281ff68449 100644
--- a/src/gallium/drivers/r300/r300_state.c
+++ b/src/gallium/drivers/r300/r300_state.c
@@ -1,5 +1,6 @@
 /*
  * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
+ * Copyright 2009 Marek Olšák <maraeo@gmail.com>
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -41,6 +42,120 @@
 /* r300_state: Functions used to intialize state context by translating
  * Gallium state objects into semi-native r300 state objects. */
 
+static boolean blend_discard_if_src_alpha_0(unsigned srcRGB, unsigned srcA,
+                                            unsigned dstRGB, unsigned dstA)
+{
+    /* If the blend equation is ADD or REVERSE_SUBTRACT,
+     * SRC_ALPHA == 0, and the following state is set, the colorbuffer
+     * will not be changed.
+     * Notice that the dst factors are the src factors inverted. */
+    return (srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA ||
+            srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE ||
+            srcRGB == PIPE_BLENDFACTOR_ZERO) &&
+           (srcA == PIPE_BLENDFACTOR_SRC_COLOR ||
+            srcA == PIPE_BLENDFACTOR_SRC_ALPHA ||
+            srcA == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE ||
+            srcA == PIPE_BLENDFACTOR_ZERO) &&
+           (dstRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
+            dstRGB == PIPE_BLENDFACTOR_ONE) &&
+           (dstA == PIPE_BLENDFACTOR_INV_SRC_COLOR ||
+            dstA == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
+            dstA == PIPE_BLENDFACTOR_ONE);
+}
+
+static boolean blend_discard_if_src_alpha_1(unsigned srcRGB, unsigned srcA,
+                                            unsigned dstRGB, unsigned dstA)
+{
+    /* If the blend equation is ADD or REVERSE_SUBTRACT,
+     * SRC_ALPHA == 1, and the following state is set, the colorbuffer
+     * will not be changed.
+     * Notice that the dst factors are the src factors inverted. */
+    return (srcRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
+            srcRGB == PIPE_BLENDFACTOR_ZERO) &&
+           (srcA == PIPE_BLENDFACTOR_INV_SRC_COLOR ||
+            srcA == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
+            srcA == PIPE_BLENDFACTOR_ZERO) &&
+           (dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA ||
+            dstRGB == PIPE_BLENDFACTOR_ONE) &&
+           (dstA == PIPE_BLENDFACTOR_SRC_COLOR ||
+            dstA == PIPE_BLENDFACTOR_SRC_ALPHA ||
+            dstA == PIPE_BLENDFACTOR_ONE);
+}
+
+static boolean blend_discard_if_src_color_0(unsigned srcRGB, unsigned srcA,
+                                            unsigned dstRGB, unsigned dstA)
+{
+    /* If the blend equation is ADD or REVERSE_SUBTRACT,
+     * SRC_COLOR == (0,0,0), and the following state is set, the colorbuffer
+     * will not be changed.
+     * Notice that the dst factors are the src factors inverted. */
+    return (srcRGB == PIPE_BLENDFACTOR_SRC_COLOR ||
+            srcRGB == PIPE_BLENDFACTOR_ZERO) &&
+           (srcA == PIPE_BLENDFACTOR_ZERO) &&
+           (dstRGB == PIPE_BLENDFACTOR_INV_SRC_COLOR ||
+            dstRGB == PIPE_BLENDFACTOR_ONE) &&
+           (dstA == PIPE_BLENDFACTOR_ONE);
+}
+
+static boolean blend_discard_if_src_color_1(unsigned srcRGB, unsigned srcA,
+                                            unsigned dstRGB, unsigned dstA)
+{
+    /* If the blend equation is ADD or REVERSE_SUBTRACT,
+     * SRC_COLOR == (1,1,1), and the following state is set, the colorbuffer
+     * will not be changed.
+     * Notice that the dst factors are the src factors inverted. */
+    return (srcRGB == PIPE_BLENDFACTOR_INV_SRC_COLOR ||
+            srcRGB == PIPE_BLENDFACTOR_ZERO) &&
+           (srcA == PIPE_BLENDFACTOR_ZERO) &&
+           (dstRGB == PIPE_BLENDFACTOR_SRC_COLOR ||
+            dstRGB == PIPE_BLENDFACTOR_ONE) &&
+           (dstA == PIPE_BLENDFACTOR_ONE);
+}
+
+static boolean blend_discard_if_src_alpha_color_0(unsigned srcRGB, unsigned srcA,
+                                                  unsigned dstRGB, unsigned dstA)
+{
+    /* If the blend equation is ADD or REVERSE_SUBTRACT,
+     * SRC_ALPHA_COLOR == (0,0,0,0), and the following state is set,
+     * the colorbuffer will not be changed.
+     * Notice that the dst factors are the src factors inverted. */
+    return (srcRGB == PIPE_BLENDFACTOR_SRC_COLOR ||
+            srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA ||
+            srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE ||
+            srcRGB == PIPE_BLENDFACTOR_ZERO) &&
+           (srcA == PIPE_BLENDFACTOR_SRC_COLOR ||
+            srcA == PIPE_BLENDFACTOR_SRC_ALPHA ||
+            srcA == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE ||
+            srcA == PIPE_BLENDFACTOR_ZERO) &&
+           (dstRGB == PIPE_BLENDFACTOR_INV_SRC_COLOR ||
+            dstRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
+            dstRGB == PIPE_BLENDFACTOR_ONE) &&
+           (dstA == PIPE_BLENDFACTOR_INV_SRC_COLOR ||
+            dstA == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
+            dstA == PIPE_BLENDFACTOR_ONE);
+}
+
+static boolean blend_discard_if_src_alpha_color_1(unsigned srcRGB, unsigned srcA,
+                                                  unsigned dstRGB, unsigned dstA)
+{
+    /* If the blend equation is ADD or REVERSE_SUBTRACT,
+     * SRC_ALPHA_COLOR == (1,1,1,1), and the following state is set,
+     * the colorbuffer will not be changed.
+     * Notice that the dst factors are the src factors inverted. */
+    return (srcRGB == PIPE_BLENDFACTOR_INV_SRC_COLOR ||
+            srcRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
+            srcRGB == PIPE_BLENDFACTOR_ZERO) &&
+           (srcA == PIPE_BLENDFACTOR_INV_SRC_COLOR ||
+            srcA == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
+            srcA == PIPE_BLENDFACTOR_ZERO) &&
+           (dstRGB == PIPE_BLENDFACTOR_SRC_COLOR ||
+            dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA ||
+            dstRGB == PIPE_BLENDFACTOR_ONE) &&
+           (dstA == PIPE_BLENDFACTOR_SRC_COLOR ||
+            dstA == PIPE_BLENDFACTOR_SRC_ALPHA ||
+            dstA == PIPE_BLENDFACTOR_ONE);
+}
+
 /* Create a new blend state based on the CSO blend state.
  *
  * This encompasses alpha blending, logic/raster ops, and blend dithering. */
@@ -66,7 +181,11 @@ static void* r300_create_blend_state(struct pipe_context* pipe,
             ( r300_translate_blend_factor(srcRGB) << R300_SRC_BLEND_SHIFT) |
             ( r300_translate_blend_factor(dstRGB) << R300_DST_BLEND_SHIFT);
 
-        /* optimization: some operations do not require the destination color */
+        /* Optimization: some operations do not require the destination color.
+         *
+         * When SRC_ALPHA_SATURATE is used, colorbuffer reads must be enabled,
+         * otherwise blending gives incorrect results. It seems to be
+         * a hardware bug. */
         if (eqRGB == PIPE_BLEND_MIN || eqA == PIPE_BLEND_MIN ||
             eqRGB == PIPE_BLEND_MAX || eqA == PIPE_BLEND_MAX ||
             dstRGB != PIPE_BLENDFACTOR_ZERO ||
@@ -78,11 +197,81 @@ static void* r300_create_blend_state(struct pipe_context* pipe,
             srcA == PIPE_BLENDFACTOR_DST_COLOR ||
             srcA == PIPE_BLENDFACTOR_DST_ALPHA ||
             srcA == PIPE_BLENDFACTOR_INV_DST_COLOR ||
-            srcA == PIPE_BLENDFACTOR_INV_DST_ALPHA)
+            srcA == PIPE_BLENDFACTOR_INV_DST_ALPHA ||
+            srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE) {
+            /* Enable reading from the colorbuffer. */
             blend->blend_control |= R300_READ_ENABLE;
 
-        /* XXX implement the optimization with DISCARD_SRC_PIXELS*/
-        /* XXX implement the optimization with SRC_ALPHA_?_NO_READ */
+            if (r300_screen(r300_context(pipe)->context.screen)->caps->is_r500) {
+                /* Optimization: Depending on incoming pixels, we can
+                 * conditionally disable the reading in hardware... */
+                if (eqRGB != PIPE_BLEND_MIN && eqA != PIPE_BLEND_MIN &&
+                    eqRGB != PIPE_BLEND_MAX && eqA != PIPE_BLEND_MAX) {
+                    /* Disable reading if SRC_ALPHA == 0. */
+                    if ((dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA ||
+                         dstRGB == PIPE_BLENDFACTOR_ZERO) &&
+                        (dstA == PIPE_BLENDFACTOR_SRC_COLOR ||
+                         dstA == PIPE_BLENDFACTOR_SRC_ALPHA ||
+                         dstA == PIPE_BLENDFACTOR_ZERO)) {
+                         blend->blend_control |= R500_SRC_ALPHA_0_NO_READ;
+                    }
+
+                    /* Disable reading if SRC_ALPHA == 1. */
+                    if ((dstRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
+                         dstRGB == PIPE_BLENDFACTOR_ZERO) &&
+                        (dstA == PIPE_BLENDFACTOR_INV_SRC_COLOR ||
+                         dstA == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
+                         dstA == PIPE_BLENDFACTOR_ZERO)) {
+                         blend->blend_control |= R500_SRC_ALPHA_1_NO_READ;
+                    }
+                }
+            }
+        }
+
+        /* Optimization: discard pixels which don't change the colorbuffer.
+         *
+         * The code below is non-trivial and some math is involved.
+         *
+         * Discarding pixels must be disabled when FP16 AA is enabled.
+         * This is a hardware bug. Also, this implementation wouldn't work
+         * with FP blending enabled and equation clamping disabled.
+         *
+         * Equations other than ADD and REVERSE_SUBTRACT are rarely used and
+         * therefore won't be optimized. */
+        if ((eqRGB == PIPE_BLEND_ADD || eqRGB == PIPE_BLEND_REVERSE_SUBTRACT) &&
+            (eqA == PIPE_BLEND_ADD || eqA == PIPE_BLEND_REVERSE_SUBTRACT)) {
+            /* ADD: X+Y
+             * REVERSE_SUBTRACT: Y-X
+             *
+             * The idea is:
+             * If X = src*srcFactor = 0 and Y = dst*dstFactor = dst
+             * (i.e. dstFactor = 1), then CB will not be changed.
+             *
+             * Given the srcFactor and dstFactor variables, we can derive
+             * what src and dst should be equal to and discard appropriate
+             * pixels.
+             */
+            if (blend_discard_if_src_alpha_0(srcRGB, srcA, dstRGB, dstA)) {
+                blend->blend_control |= R300_DISCARD_SRC_PIXELS_SRC_ALPHA_0;
+            } else if (blend_discard_if_src_alpha_1(srcRGB, srcA,
+                                                    dstRGB, dstA)) {
+                blend->blend_control |= R300_DISCARD_SRC_PIXELS_SRC_ALPHA_1;
+            } else if (blend_discard_if_src_color_0(srcRGB, srcA,
+                                                    dstRGB, dstA)) {
+                blend->blend_control |= R300_DISCARD_SRC_PIXELS_SRC_COLOR_0;
+            } else if (blend_discard_if_src_color_1(srcRGB, srcA,
+                                                    dstRGB, dstA)) {
+                blend->blend_control |= R300_DISCARD_SRC_PIXELS_SRC_COLOR_1;
+            } else if (blend_discard_if_src_alpha_color_0(srcRGB, srcA,
+                                                          dstRGB, dstA)) {
+                blend->blend_control |=
+                    R300_DISCARD_SRC_PIXELS_SRC_ALPHA_COLOR_0;
+            } else if (blend_discard_if_src_alpha_color_1(srcRGB, srcA,
+                                                          dstRGB, dstA)) {
+                blend->blend_control |=
+                    R300_DISCARD_SRC_PIXELS_SRC_ALPHA_COLOR_1;
+            }
+        }
 
         /* separate alpha */
         if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB) {
@@ -128,8 +317,8 @@ static void r300_bind_blend_state(struct pipe_context* pipe,
 {
     struct r300_context* r300 = r300_context(pipe);
 
-    r300->blend_state = (struct r300_blend_state*)state;
-    r300->dirty_state |= R300_NEW_BLEND;
+    r300->blend_state.state = state;
+    r300->blend_state.dirty = TRUE;
 }
 
 /* Free blend state. */
@@ -151,20 +340,24 @@ static void r300_set_blend_color(struct pipe_context* pipe,
                                  const struct pipe_blend_color* color)
 {
     struct r300_context* r300 = r300_context(pipe);
+    struct r300_screen* r300screen = r300_screen(pipe->screen);
+    struct r300_blend_color_state* state =
+        (struct r300_blend_color_state*)r300->blend_color_state.state;
     union util_color uc;
 
     util_pack_color(color->color, PIPE_FORMAT_A8R8G8B8_UNORM, &uc);
-    r300->blend_color_state->blend_color = uc.ui;
+    state->blend_color = uc.ui;
 
     /* XXX if FP16 blending is enabled, we should use the FP16 format */
-    r300->blend_color_state->blend_color_red_alpha =
+    state->blend_color_red_alpha =
         float_to_fixed10(color->color[0]) |
         (float_to_fixed10(color->color[3]) << 16);
-    r300->blend_color_state->blend_color_green_blue =
+    state->blend_color_green_blue =
         float_to_fixed10(color->color[2]) |
         (float_to_fixed10(color->color[1]) << 16);
 
-    r300->dirty_state |= R300_NEW_BLEND_COLOR;
+    r300->blend_color_state.size = r300screen->caps->is_r500 ? 3 : 2;
+    r300->blend_color_state.dirty = TRUE;
 }
 
 static void r300_set_clip_state(struct pipe_context* pipe,
@@ -173,12 +366,15 @@ static void r300_set_clip_state(struct pipe_context* pipe,
     struct r300_context* r300 = r300_context(pipe);
 
     if (r300_screen(pipe->screen)->caps->has_tcl) {
-        r300->clip_state = *state;
-        r300->dirty_state |= R300_NEW_CLIP;
+        memcpy(r300->clip_state.state, state, sizeof(struct pipe_clip_state));
+        r300->clip_state.size = 29;
     } else {
         draw_flush(r300->draw);
         draw_set_clip_state(r300->draw, state);
+        r300->clip_state.size = 2;
     }
+
+    r300->clip_state.dirty = TRUE;
 }
 
 /* Create a new depth, stencil, and alpha state based on the CSO dsa state.
@@ -271,9 +467,11 @@ static void r300_bind_dsa_state(struct pipe_context* pipe,
                                 void* state)
 {
     struct r300_context* r300 = r300_context(pipe);
+    struct r300_screen* r300screen = r300_screen(pipe->screen);
 
-    r300->dsa_state = (struct r300_dsa_state*)state;
-    r300->dirty_state |= R300_NEW_DSA;
+    r300->dsa_state.state = state;
+    r300->dsa_state.size = r300screen->caps->is_r500 ? 8 : 6;
+    r300->dsa_state.dirty = TRUE;
 }
 
 /* Free DSA state. */
@@ -283,37 +481,11 @@ static void r300_delete_dsa_state(struct pipe_context* pipe,
     FREE(state);
 }
 
-static void r300_set_scissor_regs(const struct pipe_scissor_state* state,
-                                  struct r300_scissor_regs *scissor,
-                                  boolean is_r500)
-{
-    if (is_r500) {
-        scissor->top_left =
-            (state->minx << R300_SCISSORS_X_SHIFT) |
-            (state->miny << R300_SCISSORS_Y_SHIFT);
-        scissor->bottom_right =
-            ((state->maxx - 1) << R300_SCISSORS_X_SHIFT) |
-            ((state->maxy - 1) << R300_SCISSORS_Y_SHIFT);
-    } else {
-        /* Offset of 1440 in non-R500 chipsets. */
-        scissor->top_left =
-            ((state->minx + 1440) << R300_SCISSORS_X_SHIFT) |
-            ((state->miny + 1440) << R300_SCISSORS_Y_SHIFT);
-        scissor->bottom_right =
-            (((state->maxx - 1) + 1440) << R300_SCISSORS_X_SHIFT) |
-            (((state->maxy - 1) + 1440) << R300_SCISSORS_Y_SHIFT);
-    }
-
-    scissor->empty_area = state->minx >= state->maxx ||
-                          state->miny >= state->maxy;
-}
-
 static void
     r300_set_framebuffer_state(struct pipe_context* pipe,
                                const struct pipe_framebuffer_state* state)
 {
     struct r300_context* r300 = r300_context(pipe);
-    struct pipe_scissor_state scissor;
 
     if (r300->draw) {
         draw_flush(r300->draw);
@@ -321,18 +493,12 @@ static void
 
     r300->framebuffer_state = *state;
 
-    scissor.minx = scissor.miny = 0;
-    scissor.maxx = state->width;
-    scissor.maxy = state->height;
-    r300_set_scissor_regs(&scissor, &r300->scissor_state->framebuffer,
-                          r300_screen(r300->context.screen)->caps->is_r500);
-
     /* Don't rely on the order of states being set for the first time. */
-    if (!r300->rs_state || !r300->rs_state->rs.scissor) {
-        r300->dirty_state |= R300_NEW_SCISSOR;
-    }
     r300->dirty_state |= R300_NEW_FRAMEBUFFERS;
-    r300->dirty_state |= R300_NEW_BLEND;
+
+    r300->blend_state.dirty = TRUE;
+    r300->dsa_state.dirty = TRUE;
+    r300->scissor_state.dirty = TRUE;
 }
 
 /* Create fragment shader state. */
@@ -367,6 +533,10 @@ static void r300_bind_fs_state(struct pipe_context* pipe, void* shader)
     r300->fs = fs;
     r300_pick_fragment_shader(r300);
 
+    if (r300->vs && r300_vertex_shader_setup_wpos(r300)) {
+        r300->dirty_state |= R300_NEW_VERTEX_FORMAT;
+    }
+
     r300->dirty_state |= R300_NEW_FRAGMENT_SHADER | R300_NEW_FRAGMENT_SHADER_CONSTANTS;
 }
 
@@ -407,8 +577,6 @@ static void* r300_create_rs_state(struct pipe_context* pipe,
     /* Copy rasterizer state for Draw. */
     rs->rs = *state;
 
-    rs->enable_vte = !state->bypass_vs_clip_and_viewport;
-
 #ifdef PIPE_ARCH_LITTLE_ENDIAN
     rs->vap_control_status = R300_VC_NO_SWAP;
 #else
@@ -524,12 +692,23 @@ static void r300_bind_rs_state(struct pipe_context* pipe, void* state)
         draw_set_rasterizer_state(r300->draw, &rs->rs);
     }
 
-    r300->rs_state = rs;
+    if (rs) {
+        r300->tcl_bypass = rs->rs.bypass_vs_clip_and_viewport;
+    } else {
+        r300->tcl_bypass = FALSE;
+    }
+
+    r300->rs_state.state = rs;
+    r300->rs_state.dirty = TRUE;
+    /* XXX Why is this still needed, dammit!? */
+    r300->scissor_state.dirty = TRUE;
+    r300->viewport_state.dirty = TRUE;
+
     /* XXX Clean these up when we move to atom emits */
-    r300->dirty_state |= R300_NEW_RASTERIZER;
     r300->dirty_state |= R300_NEW_RS_BLOCK;
-    r300->dirty_state |= R300_NEW_SCISSOR;
-    r300->dirty_state |= R300_NEW_VIEWPORT;
+    if (r300->fs && r300->fs->inputs.wpos != ATTR_UNUSED) {
+        r300->dirty_state |= R300_NEW_FRAGMENT_SHADER_CONSTANTS;
+    }
 }
 
 /* Free rasterizer state. */
@@ -556,7 +735,8 @@ static void*
 
     sampler->filter0 |= r300_translate_tex_filters(state->min_img_filter,
                                                    state->mag_img_filter,
-                                                   state->min_mip_filter);
+                                                   state->min_mip_filter,
+                                                   state->max_anisotropy > 1.0);
 
     /* Unfortunately, r300-r500 don't support floating-point mipmap lods. */
     /* We must pass these to the emit function to clamp them properly. */
@@ -664,49 +844,51 @@ static void r300_set_scissor_state(struct pipe_context* pipe,
 {
     struct r300_context* r300 = r300_context(pipe);
 
-    r300_set_scissor_regs(state, &r300->scissor_state->scissor,
-                          r300_screen(r300->context.screen)->caps->is_r500);
+    memcpy(r300->scissor_state.state, state,
+        sizeof(struct pipe_scissor_state));
 
-    /* Don't rely on the order of states being set for the first time. */
-    if (!r300->rs_state || r300->rs_state->rs.scissor) {
-        r300->dirty_state |= R300_NEW_SCISSOR;
-    }
+    r300->scissor_state.dirty = TRUE;
 }
 
 static void r300_set_viewport_state(struct pipe_context* pipe,
                                     const struct pipe_viewport_state* state)
 {
     struct r300_context* r300 = r300_context(pipe);
+    struct r300_viewport_state* viewport =
+        (struct r300_viewport_state*)r300->viewport_state.state;
 
     /* Do the transform in HW. */
-    r300->viewport_state->vte_control = R300_VTX_W0_FMT;
+    viewport->vte_control = R300_VTX_W0_FMT;
 
     if (state->scale[0] != 1.0f) {
-        r300->viewport_state->xscale = state->scale[0];
-        r300->viewport_state->vte_control |= R300_VPORT_X_SCALE_ENA;
+        viewport->xscale = state->scale[0];
+        viewport->vte_control |= R300_VPORT_X_SCALE_ENA;
     }
     if (state->scale[1] != 1.0f) {
-        r300->viewport_state->yscale = state->scale[1];
-        r300->viewport_state->vte_control |= R300_VPORT_Y_SCALE_ENA;
+        viewport->yscale = state->scale[1];
+        viewport->vte_control |= R300_VPORT_Y_SCALE_ENA;
     }
     if (state->scale[2] != 1.0f) {
-        r300->viewport_state->zscale = state->scale[2];
-        r300->viewport_state->vte_control |= R300_VPORT_Z_SCALE_ENA;
+        viewport->zscale = state->scale[2];
+        viewport->vte_control |= R300_VPORT_Z_SCALE_ENA;
     }
     if (state->translate[0] != 0.0f) {
-        r300->viewport_state->xoffset = state->translate[0];
-        r300->viewport_state->vte_control |= R300_VPORT_X_OFFSET_ENA;
+        viewport->xoffset = state->translate[0];
+        viewport->vte_control |= R300_VPORT_X_OFFSET_ENA;
     }
     if (state->translate[1] != 0.0f) {
-        r300->viewport_state->yoffset = state->translate[1];
-        r300->viewport_state->vte_control |= R300_VPORT_Y_OFFSET_ENA;
+        viewport->yoffset = state->translate[1];
+        viewport->vte_control |= R300_VPORT_Y_OFFSET_ENA;
     }
     if (state->translate[2] != 0.0f) {
-        r300->viewport_state->zoffset = state->translate[2];
-        r300->viewport_state->vte_control |= R300_VPORT_Z_OFFSET_ENA;
+        viewport->zoffset = state->translate[2];
+        viewport->vte_control |= R300_VPORT_Z_OFFSET_ENA;
     }
 
-    r300->dirty_state |= R300_NEW_VIEWPORT;
+    r300->viewport_state.dirty = TRUE;
+    if (r300->fs && r300->fs->inputs.wpos != ATTR_UNUSED) {
+        r300->dirty_state |= R300_NEW_FRAGMENT_SHADER_CONSTANTS;
+    }
 }
 
 static void r300_set_vertex_buffers(struct pipe_context* pipe,
@@ -778,7 +960,13 @@ static void r300_bind_vs_state(struct pipe_context* pipe, void* shader)
         }
 
         r300->vs = vs;
-        r300->dirty_state |= R300_NEW_VERTEX_SHADER | R300_NEW_VERTEX_SHADER_CONSTANTS;
+        if (r300->fs) {
+            r300_vertex_shader_setup_wpos(r300);
+        }
+
+        r300->dirty_state |=
+            R300_NEW_VERTEX_SHADER | R300_NEW_VERTEX_SHADER_CONSTANTS |
+            R300_NEW_VERTEX_FORMAT;
     } else {
         draw_flush(r300->draw);
         draw_bind_vertex_shader(r300->draw,
diff --git a/src/gallium/drivers/r300/r300_state_derived.c b/src/gallium/drivers/r300/r300_state_derived.c
index 727ae7ade6..192846411b 100644
--- a/src/gallium/drivers/r300/r300_state_derived.c
+++ b/src/gallium/drivers/r300/r300_state_derived.c
@@ -139,10 +139,10 @@ static void r300_vertex_psc(struct r300_context* r300)
 
     /* If TCL is bypassed, map vertex streams to equivalent VS output
      * locations. */
-    if (r300->rs_state->enable_vte) {
-        stream_tab = identity;
-    } else {
+    if (r300->tcl_bypass) {
         stream_tab = r300->vs->stream_loc_notcl;
+    } else {
+        stream_tab = identity;
     }
 
     /* Vertex shaders have no semantics on their inputs,
@@ -333,6 +333,8 @@ static void r300_update_rs_block(struct r300_context* r300,
     void (*rX00_rs_col_write)(struct r300_rs_block*, int, int);
     void (*rX00_rs_tex)(struct r300_rs_block*, int, int, boolean);
     void (*rX00_rs_tex_write)(struct r300_rs_block*, int, int);
+    boolean any_bcolor_used = vs_outputs->bcolor[0] != ATTR_UNUSED ||
+                              vs_outputs->bcolor[1] != ATTR_UNUSED;
 
     if (r300_screen(r300->context.screen)->caps->is_r500) {
         rX00_rs_col       = r500_rs_col;
@@ -348,7 +350,7 @@ static void r300_update_rs_block(struct r300_context* r300,
 
     /* Rasterize colors. */
     for (i = 0; i < ATTR_COLOR_COUNT; i++) {
-        if (vs_outputs->color[i] != ATTR_UNUSED) {
+        if (vs_outputs->color[i] != ATTR_UNUSED || any_bcolor_used) {
             /* Always rasterize if it's written by the VS,
              * otherwise it locks up. */
             rX00_rs_col(rs, col_count, i, FALSE);
@@ -410,6 +412,16 @@ static void r300_update_rs_block(struct r300_context* r300,
         }
     }
 
+    /* Rasterize WPOS. */
+    /* If the FS doesn't need it, it's not written by the VS. */
+    if (fs_inputs->wpos != ATTR_UNUSED) {
+        rX00_rs_tex(rs, tex_count, tex_count, FALSE);
+        rX00_rs_tex_write(rs, tex_count, fp_offset);
+
+        fp_offset++;
+        tex_count++;
+    }
+
     /* Rasterize at least one color, or bad things happen. */
     if (col_count == 0 && tex_count == 0) {
         rX00_rs_col(rs, 0, 0, TRUE);
@@ -496,7 +508,8 @@ static boolean r300_dsa_alpha_test_enabled(struct r300_dsa_state* dsa)
 
 static void r300_update_ztop(struct r300_context* r300)
 {
-    r300->ztop_state.z_buffer_top = R300_ZTOP_ENABLE;
+    struct r300_ztop_state* ztop_state =
+        (struct r300_ztop_state*)r300->ztop_state.state;
 
     /* This is important enough that I felt it warranted a comment.
      *
@@ -518,31 +531,37 @@ static void r300_update_ztop(struct r300_context* r300)
      * 5) Depth writes in fragment shader
      * 6) Outstanding occlusion queries
      *
+     * This register causes stalls all the way from SC to CB when changed,
+     * but it is buffered on-chip so it does not hurt to write it if it has
+     * not changed.
+     *
      * ~C.
      */
 
     /* ZS writes */
-    if (r300_dsa_writes_depth_stencil(r300->dsa_state) &&
-           (r300_dsa_alpha_test_enabled(r300->dsa_state) ||   /* (1) */
-            r300->fs->info.uses_kill)) {                      /* (2) */
-        r300->ztop_state.z_buffer_top = R300_ZTOP_DISABLE;
-    } else if (r300_fragment_shader_writes_depth(r300->fs)) { /* (5) */
-        r300->ztop_state.z_buffer_top = R300_ZTOP_DISABLE;
-    } else if (r300->query_current) {                         /* (6) */
-        r300->ztop_state.z_buffer_top = R300_ZTOP_DISABLE;
+    if (r300_dsa_writes_depth_stencil(r300->dsa_state.state) &&
+           (r300_dsa_alpha_test_enabled(r300->dsa_state.state) ||/* (1) */
+            r300->fs->info.uses_kill)) {                         /* (2) */
+        ztop_state->z_buffer_top = R300_ZTOP_DISABLE;
+    } else if (r300_fragment_shader_writes_depth(r300->fs)) {    /* (5) */
+        ztop_state->z_buffer_top = R300_ZTOP_DISABLE;
+    } else if (r300->query_current) {                            /* (6) */
+        ztop_state->z_buffer_top = R300_ZTOP_DISABLE;
+    } else {
+        ztop_state->z_buffer_top = R300_ZTOP_ENABLE;
     }
+
+    r300->ztop_state.dirty = TRUE;
 }
 
 void r300_update_derived_state(struct r300_context* r300)
 {
+    /* XXX */
     if (r300->dirty_state &
         (R300_NEW_FRAGMENT_SHADER | R300_NEW_VERTEX_SHADER |
-         R300_NEW_VERTEX_FORMAT)) {
+         R300_NEW_VERTEX_FORMAT) || r300->rs_state.dirty) {
         r300_update_derived_shader_state(r300);
     }
 
-    if (r300->dirty_state &
-            (R300_NEW_DSA | R300_NEW_FRAGMENT_SHADER | R300_NEW_QUERY)) {
-        r300_update_ztop(r300);
-    }
+    r300_update_ztop(r300);
 }
diff --git a/src/gallium/drivers/r300/r300_state_inlines.h b/src/gallium/drivers/r300/r300_state_inlines.h
index dbe42edd91..35be00e1b0 100644
--- a/src/gallium/drivers/r300/r300_state_inlines.h
+++ b/src/gallium/drivers/r300/r300_state_inlines.h
@@ -257,38 +257,37 @@ static INLINE uint32_t r300_translate_wrap(int wrap)
     }
 }
 
-static INLINE uint32_t r300_translate_tex_filters(int min, int mag, int mip)
+static INLINE uint32_t r300_translate_tex_filters(int min, int mag, int mip,
+                                                  int is_anisotropic)
 {
     uint32_t retval = 0;
-    switch (min) {
+    if (is_anisotropic)
+        retval |= R300_TX_MIN_FILTER_ANISO | R300_TX_MAG_FILTER_ANISO;
+    else {
+        switch (min) {
         case PIPE_TEX_FILTER_NEAREST:
             retval |= R300_TX_MIN_FILTER_NEAREST;
             break;
         case PIPE_TEX_FILTER_LINEAR:
             retval |= R300_TX_MIN_FILTER_LINEAR;
             break;
-        case PIPE_TEX_FILTER_ANISO:
-            retval |= R300_TX_MIN_FILTER_ANISO;
-            break;
         default:
             debug_printf("r300: Unknown texture filter %d\n", min);
             assert(0);
             break;
-    }
-    switch (mag) {
+        }
+        switch (mag) {
         case PIPE_TEX_FILTER_NEAREST:
             retval |= R300_TX_MAG_FILTER_NEAREST;
             break;
         case PIPE_TEX_FILTER_LINEAR:
             retval |= R300_TX_MAG_FILTER_LINEAR;
             break;
-        case PIPE_TEX_FILTER_ANISO:
-            retval |= R300_TX_MAG_FILTER_ANISO;
-            break;
         default:
             debug_printf("r300: Unknown texture filter %d\n", mag);
             assert(0);
             break;
+        }
     }
     switch (mip) {
         case PIPE_TEX_MIPFILTER_NONE:
diff --git a/src/gallium/drivers/r300/r300_state_invariant.c b/src/gallium/drivers/r300/r300_state_invariant.c
index bcd4c030f9..b0f309695c 100644
--- a/src/gallium/drivers/r300/r300_state_invariant.c
+++ b/src/gallium/drivers/r300/r300_state_invariant.c
@@ -43,7 +43,7 @@ void r300_emit_invariant_state(struct r300_context* r300)
     struct r300_capabilities* caps = r300_screen(r300->context.screen)->caps;
     CS_LOCALS(r300);
 
-    BEGIN_CS(20 + (caps->has_tcl ? 2: 0));
+    BEGIN_CS(16 + (caps->has_tcl ? 2: 0));
 
     /*** Graphics Backend (GB) ***/
     /* Various GB enables */
@@ -66,8 +66,6 @@ void r300_emit_invariant_state(struct r300_context* r300)
     OUT_CS_REG(R300_FG_FOG_COLOR_R, 0x0);
     OUT_CS_REG(R300_FG_FOG_COLOR_G, 0x0);
     OUT_CS_REG(R300_FG_FOG_COLOR_B, 0x0);
-    OUT_CS_REG(R300_FG_DEPTH_SRC, 0x0);
-    OUT_CS_REG(R300_US_W_FMT, 0x0);
 
     /*** VAP ***/
     /* Sign/normalize control */
@@ -117,10 +115,12 @@ void r300_emit_invariant_state(struct r300_context* r300)
     OUT_CS_REG(R300_SC_HYPERZ, 0x0000001C);
     OUT_CS_REG(R300_SC_EDGERULE, 0x2DA49525);
     OUT_CS_REG(R300_RB3D_AARESOLVE_CTL, 0x00000000);
-    if (caps->is_r500) {
-        OUT_CS_REG(R500_RB3D_DISCARD_SRC_PIXEL_LTE_THRESHOLD, 0x00000000);
-        OUT_CS_REG(R500_RB3D_DISCARD_SRC_PIXEL_GTE_THRESHOLD, 0xFFFFFFFF);
+
+    if (caps->family >= CHIP_FAMILY_RV350) {
+        OUT_CS_REG(R500_RB3D_DISCARD_SRC_PIXEL_LTE_THRESHOLD, 0x01010101);
+        OUT_CS_REG(R500_RB3D_DISCARD_SRC_PIXEL_GTE_THRESHOLD, 0xFEFEFEFE);
     }
+
     OUT_CS_REG(R300_ZB_BW_CNTL, 0x00000000);
     OUT_CS_REG(R300_ZB_DEPTHCLEARVALUE, 0x00000000);
     OUT_CS_REG(R300_ZB_HIZ_OFFSET, 0x00000000);
diff --git a/src/gallium/drivers/r300/r300_texture.c b/src/gallium/drivers/r300/r300_texture.c
index 9a96206a4d..a9bbdd56d8 100644
--- a/src/gallium/drivers/r300/r300_texture.c
+++ b/src/gallium/drivers/r300/r300_texture.c
@@ -30,6 +30,18 @@
 #include "r300_texture.h"
 #include "r300_screen.h"
 
+#define TILE_WIDTH 0
+#define TILE_HEIGHT 1
+
+static const unsigned microblock_table[5][3][2] = {
+    /*linear  tiled   square-tiled */
+    {{32, 1}, {8, 4}, {0, 0}}, /*   8 bits per pixel */
+    {{16, 1}, {8, 2}, {4, 4}}, /*  16 bits per pixel */
+    {{ 8, 1}, {4, 2}, {0, 0}}, /*  32 bits per pixel */
+    {{ 4, 1}, {0, 0}, {2, 2}}, /*  64 bits per pixel */
+    {{ 2, 1}, {0, 0}, {0, 0}}  /* 128 bits per pixel */
+};
+
 static void r300_setup_texture_state(struct r300_texture* tex, boolean is_r500)
 {
     struct r300_texture_state* state = &tex->state;
@@ -92,33 +104,67 @@ unsigned r300_texture_get_offset(struct r300_texture* tex, unsigned level,
 }
 
 /**
+ * Return the width (dim==TILE_WIDTH) or height (dim==TILE_HEIGHT) of one tile
+ * of the given texture.
+ */
+static unsigned r300_texture_get_tile_size(struct r300_texture* tex, int dim)
+{
+    unsigned pixsize, tile_size;
+
+    pixsize = util_format_get_blocksize(tex->tex.format);
+    tile_size = microblock_table[util_logbase2(pixsize)][tex->microtile][dim] *
+                (tex->macrotile == R300_BUFFER_TILED ? 8 : 1);
+
+    assert(tile_size);
+    return tile_size;
+}
+
+/**
  * Return the stride, in bytes, of the texture images of the given texture
  * at the given level.
  */
 unsigned r300_texture_get_stride(struct r300_texture* tex, unsigned level)
 {
+    unsigned tile_width, width;
+
     if (tex->stride_override)
         return tex->stride_override;
 
+    /* Check the level. */
     if (level > tex->tex.last_level) {
         debug_printf("%s: level (%u) > last_level (%u)\n", __FUNCTION__,
             level, tex->tex.last_level);
         return 0;
     }
 
-    return align(util_format_get_stride(tex->tex.format, u_minify(tex->tex.width0, level)), 32);
+    tile_width = r300_texture_get_tile_size(tex, TILE_WIDTH);
+    width = align(u_minify(tex->tex.width0, level), tile_width);
+
+    /* Should already be aligned except for S3TC. */
+    return align(util_format_get_stride(tex->tex.format, width), 32);
+}
+
+static unsigned r300_texture_get_nblocksy(struct r300_texture* tex,
+                                          unsigned level)
+{
+    unsigned height, tile_height;
+
+    tile_height = r300_texture_get_tile_size(tex, TILE_HEIGHT);
+    height = align(u_minify(tex->tex.height0, level), tile_height);
+
+    return util_format_get_nblocksy(tex->tex.format, height);
 }
 
 static void r300_setup_miptree(struct r300_texture* tex)
 {
     struct pipe_texture* base = &tex->tex;
-    int stride, size, layer_size;
-    int i;
+    unsigned stride, size, layer_size, nblocksy, i;
 
-    for (i = 0; i <= base->last_level; i++) {
-        unsigned nblocksy = util_format_get_nblocksy(base->format, u_minify(base->height0, i));
+    debug_printf("r300: Making miptree for texture, format %s\n", pf_name(base->format));
 
+    for (i = 0; i <= base->last_level; i++) {
         stride = r300_texture_get_stride(tex, i);
+        nblocksy = r300_texture_get_nblocksy(tex, i);
         layer_size = stride * nblocksy;
 
         if (base->target == PIPE_TEXTURE_CUBE)
@@ -132,9 +178,9 @@ static void r300_setup_miptree(struct r300_texture* tex)
         tex->pitch[i] = stride / util_format_get_blocksize(base->format);
 
         debug_printf("r300: Texture miptree: Level %d "
-                "(%dx%dx%d px, pitch %d bytes)\n",
+                "(%dx%dx%d px, pitch %d bytes) %d bytes total\n",
                 i, u_minify(base->width0, i), u_minify(base->height0, i),
-                u_minify(base->depth0, i), stride);
+                u_minify(base->depth0, i), stride, tex->size);
     }
 }
 
@@ -163,7 +209,7 @@ static struct pipe_texture*
     r300_setup_miptree(tex);
     r300_setup_texture_state(tex, r300_screen(screen)->caps->is_r500);
 
-    tex->buffer = screen->buffer_create(screen, 1024,
+    tex->buffer = screen->buffer_create(screen, 2048,
                                         PIPE_BUFFER_USAGE_PIXEL,
                                         tex->size);
 
diff --git a/src/gallium/drivers/r300/r300_vs.c b/src/gallium/drivers/r300/r300_vs.c
index c4ed0d712f..68aef70872 100644
--- a/src/gallium/drivers/r300/r300_vs.c
+++ b/src/gallium/drivers/r300/r300_vs.c
@@ -22,6 +22,7 @@
  * USE OR OTHER DEALINGS IN THE SOFTWARE. */
 
 #include "r300_vs.h"
+#include "r300_fs.h"
 
 #include "r300_context.h"
 #include "r300_screen.h"
@@ -33,6 +34,8 @@
 
 #include "radeon_compiler.h"
 
+#include "util/u_math.h"
+
 /* Convert info about VS output semantics into r300_shader_semantics. */
 static void r300_shader_read_vs_outputs(
     struct tgsi_shader_info* info,
@@ -88,11 +91,13 @@ static void r300_shader_read_vs_outputs(
     }
 }
 
-static void r300_shader_vap_output_fmt(
-    struct r300_shader_semantics* vs_outputs,
-    uint* hwfmt)
+static void r300_shader_vap_output_fmt(struct r300_vertex_shader* vs)
 {
+    struct r300_shader_semantics* vs_outputs = &vs->outputs;
+    uint32_t* hwfmt = vs->hwfmt;
     int i, gen_count;
+    boolean any_bcolor_used = vs_outputs->bcolor[0] != ATTR_UNUSED ||
+                              vs_outputs->bcolor[1] != ATTR_UNUSED;
 
     /* Do the actual vertex_info setup.
      *
@@ -119,13 +124,19 @@ static void r300_shader_vap_output_fmt(
 
     /* Colors. */
     for (i = 0; i < ATTR_COLOR_COUNT; i++) {
-        if (vs_outputs->color[i] != ATTR_UNUSED) {
+        if (vs_outputs->color[i] != ATTR_UNUSED || any_bcolor_used) {
             hwfmt[1] |= R300_INPUT_CNTL_COLOR;
             hwfmt[2] |= R300_VAP_OUTPUT_VTX_FMT_0__COLOR_0_PRESENT << i;
         }
     }
 
-    /* XXX Back-face colors. */
+    /* Back-face colors. */
+    if (any_bcolor_used) {
+        for (i = 0; i < ATTR_COLOR_COUNT; i++) {
+            hwfmt[1] |= R300_INPUT_CNTL_COLOR;
+            hwfmt[2] |= R300_VAP_OUTPUT_VTX_FMT_0__COLOR_0_PRESENT << (2+i);
+        }
+    }
 
     /* Texture coordinates. */
     gen_count = 0;
@@ -146,6 +157,9 @@ static void r300_shader_vap_output_fmt(
 
     /* XXX magic */
     assert(gen_count <= 8);
+
+    /* WPOS. */
+    vs->wpos_tex_output = gen_count;
 }
 
 /* Sets up stream mapping to equivalent VS outputs if TCL is bypassed
@@ -155,6 +169,8 @@ static void r300_stream_locations_notcl(
     int* stream_loc)
 {
     int i, tabi = 0, gen_count;
+    boolean any_bcolor_used = vs_outputs->bcolor[0] != ATTR_UNUSED ||
+                              vs_outputs->bcolor[1] != ATTR_UNUSED;
 
     /* Position. */
     stream_loc[tabi++] = 0;
@@ -166,14 +182,14 @@ static void r300_stream_locations_notcl(
 
     /* Colors. */
     for (i = 0; i < ATTR_COLOR_COUNT; i++) {
-        if (vs_outputs->color[i] != ATTR_UNUSED) {
+        if (vs_outputs->color[i] != ATTR_UNUSED || any_bcolor_used) {
             stream_loc[tabi++] = 2 + i;
         }
     }
 
     /* Back-face colors. */
-    for (i = 0; i < ATTR_COLOR_COUNT; i++) {
-        if (vs_outputs->bcolor[i] != ATTR_UNUSED) {
+    if (any_bcolor_used) {
+        for (i = 0; i < ATTR_COLOR_COUNT; i++) {
             stream_loc[tabi++] = 4 + i;
         }
     }
@@ -181,7 +197,7 @@ static void r300_stream_locations_notcl(
     /* Texture coordinates. */
     gen_count = 0;
     for (i = 0; i < ATTR_GENERIC_COUNT; i++) {
-        if (vs_outputs->bcolor[i] != ATTR_UNUSED) {
+        if (vs_outputs->generic[i] != ATTR_UNUSED) {
             assert(tabi < 16);
             stream_loc[tabi++] = 6 + gen_count;
             gen_count++;
@@ -195,8 +211,12 @@ static void r300_stream_locations_notcl(
         gen_count++;
     }
 
-    /* XXX magic */
-    assert(gen_count <= 8);
+    /* WPOS. */
+    if (vs_outputs->wpos != ATTR_UNUSED) {
+        assert(tabi < 16);
+        stream_loc[tabi++] = 6 + gen_count;
+        gen_count++;
+    }
 
     for (; tabi < 16;) {
         stream_loc[tabi++] = -1;
@@ -209,6 +229,8 @@ static void set_vertex_inputs_outputs(struct r300_vertex_program_compiler * c)
     struct r300_shader_semantics* outputs = &vs->outputs;
     struct tgsi_shader_info* info = &vs->info;
     int i, reg = 0;
+    boolean any_bcolor_used = outputs->bcolor[0] != ATTR_UNUSED ||
+                              outputs->bcolor[1] != ATTR_UNUSED;
 
     /* Fill in the input mapping */
     for (i = 0; i < info->num_inputs; i++)
@@ -226,14 +248,30 @@ static void set_vertex_inputs_outputs(struct r300_vertex_program_compiler * c)
         c->code->outputs[outputs->psize] = reg++;
     }
 
+    /* If we're writing back-facing colors, we need to send
+     * four colors to make front/back face color selection work.
+     * If the vertex program doesn't write all 4 colors, let's
+     * pretend it does by skipping output index reg so the colors
+     * get written into the appropriate output vectors.
+     */
+
     /* Colors. */
     for (i = 0; i < ATTR_COLOR_COUNT; i++) {
         if (outputs->color[i] != ATTR_UNUSED) {
             c->code->outputs[outputs->color[i]] = reg++;
+        } else if (any_bcolor_used) {
+            reg++;
         }
     }
 
-    /* XXX Back-face colors. */
+    /* Back-face colors. */
+    for (i = 0; i < ATTR_COLOR_COUNT; i++) {
+        if (outputs->bcolor[i] != ATTR_UNUSED) {
+            c->code->outputs[outputs->bcolor[i]] = reg++;
+        } else if (any_bcolor_used) {
+            reg++;
+        }
+    }
 
     /* Texture coordinates. */
     for (i = 0; i < ATTR_GENERIC_COUNT; i++) {
@@ -246,6 +284,33 @@ static void set_vertex_inputs_outputs(struct r300_vertex_program_compiler * c)
     if (outputs->fog != ATTR_UNUSED) {
         c->code->outputs[outputs->fog] = reg++;
     }
+
+    /* WPOS. */
+    if (outputs->wpos != ATTR_UNUSED) {
+        c->code->outputs[outputs->wpos] = reg++;
+    }
+}
+
+static void r300_insert_wpos(struct r300_vertex_program_compiler* c,
+                             struct r300_shader_semantics* outputs)
+{
+    int i, lastOutput = 0;
+
+    /* Find the max output index. */
+    lastOutput = MAX2(lastOutput, outputs->psize);
+    for (i = 0; i < ATTR_COLOR_COUNT; i++) {
+        lastOutput = MAX2(lastOutput, outputs->color[i]);
+        lastOutput = MAX2(lastOutput, outputs->bcolor[i]);
+    }
+    for (i = 0; i < ATTR_GENERIC_COUNT; i++) {
+        lastOutput = MAX2(lastOutput, outputs->generic[i]);
+    }
+    lastOutput = MAX2(lastOutput, outputs->fog);
+
+    /* Set WPOS after the last output. */
+    lastOutput++;
+    rc_copy_output(&c->Base, 0, lastOutput); /* out[lastOutput] = out[0]; */
+    outputs->wpos = lastOutput;
 }
 
 void r300_translate_vertex_shader(struct r300_context* r300,
@@ -256,8 +321,6 @@ void r300_translate_vertex_shader(struct r300_context* r300,
 
     /* Initialize. */
     r300_shader_read_vs_outputs(&vs->info, &vs->outputs);
-    r300_shader_vap_output_fmt(&vs->outputs, vs->hwfmt);
-    r300_stream_locations_notcl(&vs->outputs, vs->stream_loc_notcl);
 
     /* Setup the compiler */
     rc_init(&compiler.Base);
@@ -277,9 +340,15 @@ void r300_translate_vertex_shader(struct r300_context* r300,
 
     r300_tgsi_to_rc(&ttr, vs->state.tokens);
 
-    compiler.RequiredOutputs = ~(~0 << vs->info.num_outputs);
+    compiler.RequiredOutputs = ~(~0 << (vs->info.num_outputs+1));
     compiler.SetHwInputOutput = &set_vertex_inputs_outputs;
 
+    /* Insert the WPOS output. */
+    r300_insert_wpos(&compiler, &vs->outputs);
+
+    r300_shader_vap_output_fmt(vs);
+    r300_stream_locations_notcl(&vs->outputs, vs->stream_loc_notcl);
+
     /* Invoke the compiler */
     r3xx_compile_vertex_program(&compiler);
     if (compiler.Base.Error) {
@@ -292,3 +361,30 @@ void r300_translate_vertex_shader(struct r300_context* r300,
     rc_destroy(&compiler.Base);
     vs->translated = TRUE;
 }
+
+boolean r300_vertex_shader_setup_wpos(struct r300_context* r300)
+{
+    struct r300_vertex_shader* vs = r300->vs;
+    int tex_output = r300->vs->wpos_tex_output;
+    uint32_t tex_fmt = R300_INPUT_CNTL_TC0 << tex_output;
+    uint32_t* hwfmt = vs->hwfmt;
+
+    if (r300->fs->inputs.wpos != ATTR_UNUSED) {
+        /* Enable WPOS in VAP. */
+        if (!(hwfmt[1] & tex_fmt)) {
+            hwfmt[1] |= tex_fmt;
+            hwfmt[3] |= (4 << (3 * tex_output));
+
+            assert(tex_output < 8);
+            return TRUE;
+        }
+    } else {
+        /* Disable WPOS in VAP. */
+        if (hwfmt[1] & tex_fmt) {
+            hwfmt[1] &= ~tex_fmt;
+            hwfmt[3] &= ~(4 << (3 * tex_output));
+            return TRUE;
+        }
+    }
+    return FALSE;
+}
diff --git a/src/gallium/drivers/r300/r300_vs.h b/src/gallium/drivers/r300/r300_vs.h
index 67e9db5366..18cfeee3cd 100644
--- a/src/gallium/drivers/r300/r300_vs.h
+++ b/src/gallium/drivers/r300/r300_vs.h
@@ -43,6 +43,9 @@ struct r300_vertex_shader {
     /* Stream locations for SWTCL or if TCL is bypassed. */
     int stream_loc_notcl[16];
 
+    /* Output stream location for WPOS. */
+    int wpos_tex_output;
+
     /* Has this shader been translated yet? */
     boolean translated;
 
@@ -53,4 +56,7 @@ struct r300_vertex_shader {
 void r300_translate_vertex_shader(struct r300_context* r300,
                                   struct r300_vertex_shader* vs);
 
+/* Return TRUE if VAP (hwfmt) needs to be re-emitted. */
+boolean r300_vertex_shader_setup_wpos(struct r300_context* r300);
+
 #endif /* R300_VS_H */
diff --git a/src/gallium/drivers/softpipe/sp_draw_arrays.c b/src/gallium/drivers/softpipe/sp_draw_arrays.c
index b3ece9d8ed..2a27e5ce64 100644
--- a/src/gallium/drivers/softpipe/sp_draw_arrays.c
+++ b/src/gallium/drivers/softpipe/sp_draw_arrays.c
@@ -103,7 +103,7 @@ softpipe_unmap_constant_buffers(struct softpipe_context *sp)
  * Basically, map the vertex buffers (and drawing surfaces), then hand off
  * the drawing to the 'draw' module.
  */
-static boolean
+static void
 softpipe_draw_range_elements_instanced(struct pipe_context *pipe,
                                        struct pipe_buffer *indexBuffer,
                                        unsigned indexSize,
@@ -116,24 +116,24 @@ softpipe_draw_range_elements_instanced(struct pipe_context *pipe,
                                        unsigned instanceCount);
 
 
-boolean
+void
 softpipe_draw_arrays(struct pipe_context *pipe, unsigned mode,
                      unsigned start, unsigned count)
 {
-   return softpipe_draw_range_elements_instanced(pipe,
-                                                 NULL,
-                                                 0,
-                                                 0,
-                                                 0xffffffff,
-                                                 mode,
-                                                 start,
-                                                 count,
-                                                 0,
-                                                 1);
+   softpipe_draw_range_elements_instanced(pipe,
+                                          NULL,
+                                          0,
+                                          0,
+                                          0xffffffff,
+                                          mode,
+                                          start,
+                                          count,
+                                          0,
+                                          1);
 }
 
 
-boolean
+void
 softpipe_draw_range_elements(struct pipe_context *pipe,
                              struct pipe_buffer *indexBuffer,
                              unsigned indexSize,
@@ -141,35 +141,35 @@ softpipe_draw_range_elements(struct pipe_context *pipe,
                              unsigned max_index,
                              unsigned mode, unsigned start, unsigned count)
 {
-   return softpipe_draw_range_elements_instanced(pipe,
-                                                 indexBuffer,
-                                                 indexSize,
-                                                 min_index,
-                                                 max_index,
-                                                 mode,
-                                                 start,
-                                                 count,
-                                                 0,
-                                                 1);
+   softpipe_draw_range_elements_instanced(pipe,
+                                          indexBuffer,
+                                          indexSize,
+                                          min_index,
+                                          max_index,
+                                          mode,
+                                          start,
+                                          count,
+                                          0,
+                                          1);
 }
 
 
-boolean
+void
 softpipe_draw_elements(struct pipe_context *pipe,
                        struct pipe_buffer *indexBuffer,
                        unsigned indexSize,
                        unsigned mode, unsigned start, unsigned count)
 {
-   return softpipe_draw_range_elements_instanced(pipe,
-                                                 indexBuffer,
-                                                 indexSize,
-                                                 0,
-                                                 0xffffffff,
-                                                 mode,
-                                                 start,
-                                                 count,
-                                                 0,
-                                                 1);
+   softpipe_draw_range_elements_instanced(pipe,
+                                          indexBuffer,
+                                          indexSize,
+                                          0,
+                                          0xffffffff,
+                                          mode,
+                                          start,
+                                          count,
+                                          0,
+                                          1);
 }
 
 void
@@ -214,7 +214,7 @@ softpipe_draw_elements_instanced(struct pipe_context *pipe,
                                           instanceCount);
 }
 
-static boolean
+static void
 softpipe_draw_range_elements_instanced(struct pipe_context *pipe,
                                        struct pipe_buffer *indexBuffer,
                                        unsigned indexSize,
@@ -231,7 +231,7 @@ softpipe_draw_range_elements_instanced(struct pipe_context *pipe,
    unsigned i;
 
    if (!softpipe_check_render_cond(sp))
-      return TRUE;
+      return;
 
    sp->reduced_api_prim = u_reduced_prim(mode);
 
@@ -290,6 +290,4 @@ softpipe_draw_range_elements_instanced(struct pipe_context *pipe,
    softpipe_unmap_constant_buffers(sp);
 
    sp->dirty_render_cache = TRUE;
-
-   return TRUE;
 }
diff --git a/src/gallium/drivers/softpipe/sp_state.h b/src/gallium/drivers/softpipe/sp_state.h
index 3153d6e6a4..0f9b1546df 100644
--- a/src/gallium/drivers/softpipe/sp_state.h
+++ b/src/gallium/drivers/softpipe/sp_state.h
@@ -184,14 +184,14 @@ void softpipe_set_vertex_buffers(struct pipe_context *,
 void softpipe_update_derived( struct softpipe_context *softpipe );
 
 
-boolean softpipe_draw_arrays(struct pipe_context *pipe, unsigned mode,
-			     unsigned start, unsigned count);
-
-boolean softpipe_draw_elements(struct pipe_context *pipe,
-			       struct pipe_buffer *indexBuffer,
-			       unsigned indexSize,
-			       unsigned mode, unsigned start, unsigned count);
-boolean
+void softpipe_draw_arrays(struct pipe_context *pipe, unsigned mode,
+                          unsigned start, unsigned count);
+
+void softpipe_draw_elements(struct pipe_context *pipe,
+                            struct pipe_buffer *indexBuffer,
+                            unsigned indexSize,
+                            unsigned mode, unsigned start, unsigned count);
+void
 softpipe_draw_range_elements(struct pipe_context *pipe,
                              struct pipe_buffer *indexBuffer,
                              unsigned indexSize,
diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c b/src/gallium/drivers/softpipe/sp_tex_sample.c
index e26153b1d9..1ae8fecacf 100644
--- a/src/gallium/drivers/softpipe/sp_tex_sample.c
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.c
@@ -2,7 +2,7 @@
  * 
  * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
  * All Rights Reserved.
- * Copyright 2008 VMware, Inc.  All rights reserved.
+ * Copyright 2008-2010 VMware, Inc.  All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
@@ -514,21 +514,15 @@ static float
 compute_lambda_1d(const struct sp_sampler_varient *samp,
                   const float s[QUAD_SIZE],
                   const float t[QUAD_SIZE],
-                  const float p[QUAD_SIZE],
-                  float lodbias)
+                  const float p[QUAD_SIZE])
 {
    const struct pipe_texture *texture = samp->texture;
    const struct pipe_sampler_state *sampler = samp->sampler;
    float dsdx = fabsf(s[QUAD_BOTTOM_RIGHT] - s[QUAD_BOTTOM_LEFT]);
    float dsdy = fabsf(s[QUAD_TOP_LEFT]     - s[QUAD_BOTTOM_LEFT]);
    float rho = MAX2(dsdx, dsdy) * texture->width0;
-   float lambda;
-
-   lambda = util_fast_log2(rho);
-   lambda += lodbias + sampler->lod_bias;
-   lambda = CLAMP(lambda, sampler->min_lod, sampler->max_lod);
 
-   return lambda;
+   return util_fast_log2(rho);
 }
 
 
@@ -536,8 +530,7 @@ static float
 compute_lambda_2d(const struct sp_sampler_varient *samp,
                   const float s[QUAD_SIZE],
                   const float t[QUAD_SIZE],
-                  const float p[QUAD_SIZE],
-                  float lodbias)
+                  const float p[QUAD_SIZE])
 {
    const struct pipe_texture *texture = samp->texture;
    const struct pipe_sampler_state *sampler = samp->sampler;
@@ -548,13 +541,8 @@ compute_lambda_2d(const struct sp_sampler_varient *samp,
    float maxx = MAX2(dsdx, dsdy) * texture->width0;
    float maxy = MAX2(dtdx, dtdy) * texture->height0;
    float rho  = MAX2(maxx, maxy);
-   float lambda;
 
-   lambda = util_fast_log2(rho);
-   lambda += lodbias + sampler->lod_bias;
-   lambda = CLAMP(lambda, sampler->min_lod, sampler->max_lod);
-
-   return lambda;
+   return util_fast_log2(rho);
 }
 
 
@@ -562,8 +550,7 @@ static float
 compute_lambda_3d(const struct sp_sampler_varient *samp,
                   const float s[QUAD_SIZE],
                   const float t[QUAD_SIZE],
-                  const float p[QUAD_SIZE],
-                  float lodbias)
+                  const float p[QUAD_SIZE])
 {
    const struct pipe_texture *texture = samp->texture;
    const struct pipe_sampler_state *sampler = samp->sampler;
@@ -576,31 +563,26 @@ compute_lambda_3d(const struct sp_sampler_varient *samp,
    float maxx = MAX2(dsdx, dsdy) * texture->width0;
    float maxy = MAX2(dtdx, dtdy) * texture->height0;
    float maxz = MAX2(dpdx, dpdy) * texture->depth0;
-   float rho, lambda;
+   float rho;
 
    rho = MAX2(maxx, maxy);
    rho = MAX2(rho, maxz);
 
-   lambda = util_fast_log2(rho);
-   lambda += lodbias + sampler->lod_bias;
-   lambda = CLAMP(lambda, sampler->min_lod, sampler->max_lod);
-
-   return lambda;
+   return util_fast_log2(rho);
 }
 
 
 /**
  * Compute lambda for a vertex texture sampler.
- * Since there aren't derivatives to use, just return the LOD bias.
+ * Since there aren't derivatives to use, just return 0.
  */
 static float
 compute_lambda_vert(const struct sp_sampler_varient *samp,
                     const float s[QUAD_SIZE],
                     const float t[QUAD_SIZE],
-                    const float p[QUAD_SIZE],
-                    float lodbias)
+                    const float p[QUAD_SIZE])
 {
-   return lodbias;
+   return 0.0f;
 }
 
 
@@ -769,7 +751,8 @@ img_filter_2d_linear_repeat_POT(struct tgsi_sampler *tgsi_sampler,
                                 const float s[QUAD_SIZE],
                                 const float t[QUAD_SIZE],
                                 const float p[QUAD_SIZE],
-                                float lodbias,
+                                const float c0[QUAD_SIZE],
+                                enum tgsi_sampler_control control,
                                 float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
    const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
@@ -827,7 +810,8 @@ img_filter_2d_nearest_repeat_POT(struct tgsi_sampler *tgsi_sampler,
                                  const float s[QUAD_SIZE],
                                  const float t[QUAD_SIZE],
                                  const float p[QUAD_SIZE],
-                                 float lodbias,
+                                 const float c0[QUAD_SIZE],
+                                 enum tgsi_sampler_control control,
                                  float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
    const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
@@ -866,7 +850,8 @@ img_filter_2d_nearest_clamp_POT(struct tgsi_sampler *tgsi_sampler,
                                 const float s[QUAD_SIZE],
                                 const float t[QUAD_SIZE],
                                 const float p[QUAD_SIZE],
-                                float lodbias,
+                                const float c0[QUAD_SIZE],
+                                enum tgsi_sampler_control control,
                                 float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
    const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
@@ -914,7 +899,8 @@ img_filter_1d_nearest(struct tgsi_sampler *tgsi_sampler,
                         const float s[QUAD_SIZE],
                         const float t[QUAD_SIZE],
                         const float p[QUAD_SIZE],
-                        float lodbias,
+                        const float c0[QUAD_SIZE],
+                        enum tgsi_sampler_control control,
                         float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
    const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
@@ -949,7 +935,8 @@ img_filter_2d_nearest(struct tgsi_sampler *tgsi_sampler,
                       const float s[QUAD_SIZE],
                       const float t[QUAD_SIZE],
                       const float p[QUAD_SIZE],
-                      float lodbias,
+                      const float c0[QUAD_SIZE],
+                      enum tgsi_sampler_control control,
                       float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
    const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
@@ -996,7 +983,8 @@ img_filter_cube_nearest(struct tgsi_sampler *tgsi_sampler,
                         const float s[QUAD_SIZE],
                         const float t[QUAD_SIZE],
                         const float p[QUAD_SIZE],
-                        float lodbias,
+                        const float c0[QUAD_SIZE],
+                        enum tgsi_sampler_control control,
                         float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
    const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
@@ -1035,7 +1023,8 @@ img_filter_3d_nearest(struct tgsi_sampler *tgsi_sampler,
                       const float s[QUAD_SIZE],
                       const float t[QUAD_SIZE],
                       const float p[QUAD_SIZE],
-                      float lodbias,
+                      const float c0[QUAD_SIZE],
+                      enum tgsi_sampler_control control,
                       float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
    const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
@@ -1076,7 +1065,8 @@ img_filter_1d_linear(struct tgsi_sampler *tgsi_sampler,
                      const float s[QUAD_SIZE],
                      const float t[QUAD_SIZE],
                      const float p[QUAD_SIZE],
-                     float lodbias,
+                     const float c0[QUAD_SIZE],
+                     enum tgsi_sampler_control control,
                      float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
    const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
@@ -1115,7 +1105,8 @@ img_filter_2d_linear(struct tgsi_sampler *tgsi_sampler,
                      const float s[QUAD_SIZE],
                      const float t[QUAD_SIZE],
                      const float p[QUAD_SIZE],
-                     float lodbias,
+                     const float c0[QUAD_SIZE],
+                     enum tgsi_sampler_control control,
                      float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
    const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
@@ -1161,7 +1152,8 @@ img_filter_cube_linear(struct tgsi_sampler *tgsi_sampler,
                        const float s[QUAD_SIZE],
                        const float t[QUAD_SIZE],
                        const float p[QUAD_SIZE],
-                       float lodbias,
+                       const float c0[QUAD_SIZE],
+                       enum tgsi_sampler_control control,
                        float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
    const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
@@ -1209,7 +1201,8 @@ img_filter_3d_linear(struct tgsi_sampler *tgsi_sampler,
                      const float s[QUAD_SIZE],
                      const float t[QUAD_SIZE],
                      const float p[QUAD_SIZE],
-                     float lodbias,
+                     const float c0[QUAD_SIZE],
+                     enum tgsi_sampler_control control,
                      float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
    const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
@@ -1261,29 +1254,60 @@ img_filter_3d_linear(struct tgsi_sampler *tgsi_sampler,
 }
 
 
+/* Calculate level of detail for every fragment.
+ * Note that lambda has already been biased by global LOD bias.
+ */
+static INLINE void
+compute_lod(const struct pipe_sampler_state *sampler,
+            const float biased_lambda,
+            const float lodbias[QUAD_SIZE],
+            float lod[QUAD_SIZE])
+{
+   uint i;
+
+   for (i = 0; i < QUAD_SIZE; i++) {
+      lod[i] = biased_lambda + lodbias[i];
+      lod[i] = CLAMP(lod[i], sampler->min_lod, sampler->max_lod);
+   }
+}
+
+
 static void
 mip_filter_linear(struct tgsi_sampler *tgsi_sampler,
                   const float s[QUAD_SIZE],
                   const float t[QUAD_SIZE],
                   const float p[QUAD_SIZE],
-                  float lodbias,
+                  const float c0[QUAD_SIZE],
+                  enum tgsi_sampler_control control,
                   float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
    struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
    const struct pipe_texture *texture = samp->texture;
    int level0;
    float lambda;
+   float lod[QUAD_SIZE];
+
+   if (control == tgsi_sampler_lod_bias) {
+      lambda = samp->compute_lambda(samp, s, t, p) + samp->sampler->lod_bias;
+      compute_lod(samp->sampler, lambda, c0, lod);
+   } else {
+      assert(control == tgsi_sampler_lod_explicit);
 
-   lambda = samp->compute_lambda(samp, s, t, p, lodbias);
+      memcpy(lod, c0, sizeof(lod));
+   }
+
+   /* XXX: Take into account all lod values.
+    */
+   lambda = lod[0];
    level0 = (int)lambda;
 
    if (lambda < 0.0) { 
       samp->level = 0;
-      samp->mag_img_filter( tgsi_sampler, s, t, p, 0, rgba );
+      samp->mag_img_filter(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba);
    }
    else if (level0 >= texture->last_level) {
       samp->level = texture->last_level;
-      samp->min_img_filter( tgsi_sampler, s, t, p, 0, rgba );
+      samp->min_img_filter(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba);
    }
    else {
       float levelBlend = lambda - level0;
@@ -1292,10 +1316,10 @@ mip_filter_linear(struct tgsi_sampler *tgsi_sampler,
       int c,j;
 
       samp->level = level0;
-      samp->min_img_filter( tgsi_sampler, s, t, p, 0, rgba0 );
+      samp->min_img_filter(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba0);
 
       samp->level = level0+1;
-      samp->min_img_filter( tgsi_sampler, s, t, p, 0, rgba1 );
+      samp->min_img_filter(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba1);
 
       for (j = 0; j < QUAD_SIZE; j++) {
          for (c = 0; c < 4; c++) {
@@ -1311,23 +1335,36 @@ mip_filter_nearest(struct tgsi_sampler *tgsi_sampler,
                    const float s[QUAD_SIZE],
                    const float t[QUAD_SIZE],
                    const float p[QUAD_SIZE],
-                   float lodbias,
+                   const float c0[QUAD_SIZE],
+                   enum tgsi_sampler_control control,
                    float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
    struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
    const struct pipe_texture *texture = samp->texture;
    float lambda;
+   float lod[QUAD_SIZE];
 
-   lambda = samp->compute_lambda(samp, s, t, p, lodbias);
+   if (control == tgsi_sampler_lod_bias) {
+      lambda = samp->compute_lambda(samp, s, t, p) + samp->sampler->lod_bias;
+      compute_lod(samp->sampler, lambda, c0, lod);
+   } else {
+      assert(control == tgsi_sampler_lod_explicit);
+
+      memcpy(lod, c0, sizeof(lod));
+   }
+
+   /* XXX: Take into account all lod values.
+    */
+   lambda = lod[0];
 
    if (lambda < 0.0) { 
       samp->level = 0;
-      samp->mag_img_filter( tgsi_sampler, s, t, p, 0, rgba );
+      samp->mag_img_filter(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba);
    }
    else {
       samp->level = (int)(lambda + 0.5) ;
       samp->level = MIN2(samp->level, (int)texture->last_level);
-      samp->min_img_filter( tgsi_sampler, s, t, p, 0, rgba );
+      samp->min_img_filter(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba);
    }
 
 #if 0
@@ -1345,17 +1382,32 @@ mip_filter_none(struct tgsi_sampler *tgsi_sampler,
                 const float s[QUAD_SIZE],
                 const float t[QUAD_SIZE],
                 const float p[QUAD_SIZE],
-                float lodbias,
+                const float c0[QUAD_SIZE],
+                enum tgsi_sampler_control control,
                 float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
    struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
-   float lambda = samp->compute_lambda(samp, s, t, p, lodbias);
+   float lambda;
+   float lod[QUAD_SIZE];
+
+   if (control == tgsi_sampler_lod_bias) {
+      lambda = samp->compute_lambda(samp, s, t, p) + samp->sampler->lod_bias;
+      compute_lod(samp->sampler, lambda, c0, lod);
+   } else {
+      assert(control == tgsi_sampler_lod_explicit);
+
+      memcpy(lod, c0, sizeof(lod));
+   }
+
+   /* XXX: Take into account all lod values.
+    */
+   lambda = lod[0];
 
    if (lambda < 0.0) { 
-      samp->mag_img_filter( tgsi_sampler, s, t, p, 0, rgba );
+      samp->mag_img_filter(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba);
    }
    else {
-      samp->min_img_filter( tgsi_sampler, s, t, p, 0, rgba );
+      samp->min_img_filter(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba);
    }
 }
 
@@ -1371,15 +1423,28 @@ mip_filter_linear_2d_linear_repeat_POT(
    const float s[QUAD_SIZE],
    const float t[QUAD_SIZE],
    const float p[QUAD_SIZE],
-   float lodbias,
+   const float c0[QUAD_SIZE],
+   enum tgsi_sampler_control control,
    float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
    struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
    const struct pipe_texture *texture = samp->texture;
    int level0;
    float lambda;
+   float lod[QUAD_SIZE];
 
-   lambda = compute_lambda_2d(samp, s, t, p, lodbias);
+   if (control == tgsi_sampler_lod_bias) {
+      lambda = samp->compute_lambda(samp, s, t, p) + samp->sampler->lod_bias;
+      compute_lod(samp->sampler, lambda, c0, lod);
+   } else {
+      assert(control == tgsi_sampler_lod_explicit);
+
+      memcpy(lod, c0, sizeof(lod));
+   }
+
+   /* XXX: Take into account all lod values.
+    */
+   lambda = lod[0];
    level0 = (int)lambda;
 
    /* Catches both negative and large values of level0:
@@ -1390,7 +1455,7 @@ mip_filter_linear_2d_linear_repeat_POT(
       else
          samp->level = texture->last_level;
 
-      img_filter_2d_linear_repeat_POT( tgsi_sampler, s, t, p, 0, rgba );
+      img_filter_2d_linear_repeat_POT(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba);
    }
    else {
       float levelBlend = lambda - level0;
@@ -1399,10 +1464,10 @@ mip_filter_linear_2d_linear_repeat_POT(
       int c,j;
 
       samp->level = level0;
-      img_filter_2d_linear_repeat_POT( tgsi_sampler, s, t, p, 0, rgba0 );
+      img_filter_2d_linear_repeat_POT(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba0);
 
       samp->level = level0+1;
-      img_filter_2d_linear_repeat_POT( tgsi_sampler, s, t, p, 0, rgba1 );
+      img_filter_2d_linear_repeat_POT(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba1);
 
       for (j = 0; j < QUAD_SIZE; j++) {
          for (c = 0; c < 4; c++) {
@@ -1422,7 +1487,8 @@ sample_compare(struct tgsi_sampler *tgsi_sampler,
                const float s[QUAD_SIZE],
                const float t[QUAD_SIZE],
                const float p[QUAD_SIZE],
-               float lodbias,
+               const float c0[QUAD_SIZE],
+               enum tgsi_sampler_control control,
                float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
    struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
@@ -1430,7 +1496,7 @@ sample_compare(struct tgsi_sampler *tgsi_sampler,
    int j, k0, k1, k2, k3;
    float val;
 
-   samp->mip_filter( tgsi_sampler, s, t, p, lodbias, rgba );
+   samp->mip_filter(tgsi_sampler, s, t, p, c0, control, rgba);
 
    /**
     * Compare texcoord 'p' (aka R) against texture value 'rgba[0]'
@@ -1508,7 +1574,8 @@ sample_cube(struct tgsi_sampler *tgsi_sampler,
             const float s[QUAD_SIZE],
             const float t[QUAD_SIZE],
             const float p[QUAD_SIZE],
-            float lodbias,
+            const float c0[QUAD_SIZE],
+            enum tgsi_sampler_control control,
             float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
    struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
@@ -1589,7 +1656,7 @@ sample_cube(struct tgsi_sampler *tgsi_sampler,
     * is not active, this will point somewhere deeper into the
     * pipeline, eg. to mip_filter or even img_filter.
     */
-   samp->compare(tgsi_sampler, ssss, tttt, NULL, lodbias, rgba);
+   samp->compare(tgsi_sampler, ssss, tttt, NULL, c0, control, rgba);
 }
 
 
@@ -1862,7 +1929,7 @@ sp_create_sampler_varient( const struct pipe_sampler_state *sampler,
       break;
    }
 
-   if (sampler->compare_mode != FALSE) {
+   if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE) {
       samp->compare = sample_compare;
    }
    else {
diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.h b/src/gallium/drivers/softpipe/sp_tex_sample.h
index b0797711d3..b6e66c998a 100644
--- a/src/gallium/drivers/softpipe/sp_tex_sample.h
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.h
@@ -2,6 +2,7 @@
  * 
  * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
  * All Rights Reserved.
+ * Copyright 2010 VMware, Inc.  All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
@@ -46,14 +47,14 @@ typedef void (*wrap_linear_func)(const float s[4],
 typedef float (*compute_lambda_func)(const struct sp_sampler_varient *sampler,
                                      const float s[QUAD_SIZE],
                                      const float t[QUAD_SIZE],
-                                     const float p[QUAD_SIZE],
-                                     float lodbias);
+                                     const float p[QUAD_SIZE]);
 
 typedef void (*filter_func)(struct tgsi_sampler *tgsi_sampler,
                             const float s[QUAD_SIZE],
                             const float t[QUAD_SIZE],
                             const float p[QUAD_SIZE],
-                            float lodbias,
+                            const float c0[QUAD_SIZE],
+                            enum tgsi_sampler_control control,
                             float rgba[NUM_CHANNELS][QUAD_SIZE]);
 
 
diff --git a/src/gallium/drivers/svga/svga_context.h b/src/gallium/drivers/svga/svga_context.h
index fa7f6cb3bb..66259fd010 100644
--- a/src/gallium/drivers/svga/svga_context.h
+++ b/src/gallium/drivers/svga/svga_context.h
@@ -266,8 +266,6 @@ struct svga_hw_draw_state
    unsigned ts[16][TS_MAX];
    float cb[PIPE_SHADER_TYPES][CB_MAX][4];
 
-   unsigned shader_id[PIPE_SHADER_TYPES];
-   
    struct svga_shader_result *fs;
    struct svga_shader_result *vs;
    struct svga_hw_view_state views[PIPE_MAX_SAMPLERS];
diff --git a/src/gallium/drivers/svga/svga_pipe_draw.c b/src/gallium/drivers/svga/svga_pipe_draw.c
index 71a552862e..0f24ef4ee8 100644
--- a/src/gallium/drivers/svga/svga_pipe_draw.c
+++ b/src/gallium/drivers/svga/svga_pipe_draw.c
@@ -149,7 +149,7 @@ retry:
 
 
 
-static boolean
+static void
 svga_draw_range_elements( struct pipe_context *pipe,
                           struct pipe_buffer *index_buffer,
                           unsigned index_size,
@@ -162,7 +162,7 @@ svga_draw_range_elements( struct pipe_context *pipe,
    enum pipe_error ret = 0;
 
    if (!u_trim_pipe_prim( prim, &count ))
-      return TRUE;
+      return;
 
    /*
     * Mark currently bound target surfaces as dirty
@@ -183,7 +183,7 @@ svga_draw_range_elements( struct pipe_context *pipe,
 #ifdef DEBUG
    if (svga->curr.vs->base.id == svga->debug.disable_shader ||
        svga->curr.fs->base.id == svga->debug.disable_shader)
-      return 0;
+      return;
 #endif
 
    if (svga->state.sw.need_swtnl)
@@ -225,31 +225,29 @@ svga_draw_range_elements( struct pipe_context *pipe,
       svga_hwtnl_flush_retry( svga );
       svga_context_flush(svga, NULL);
    }
-
-   return ret == PIPE_OK;
 }
 
 
-static boolean 
+static void
 svga_draw_elements( struct pipe_context *pipe,
                     struct pipe_buffer *index_buffer,
                     unsigned index_size,
                     unsigned prim, unsigned start, unsigned count)
 {
-   return svga_draw_range_elements( pipe, index_buffer,
-                                    index_size,
-                                    0, 0xffffffff,
-                                    prim, start, count );
+   svga_draw_range_elements( pipe, index_buffer,
+                             index_size,
+                             0, 0xffffffff,
+                             prim, start, count );
 }
 
-static boolean 
+static void
 svga_draw_arrays( struct pipe_context *pipe,
                   unsigned prim, unsigned start, unsigned count)
 {
-   return svga_draw_range_elements(pipe, NULL, 0, 
-                                   start, start + count - 1, 
-                                   prim, 
-                                   start, count);
+   svga_draw_range_elements(pipe, NULL, 0, 
+                            start, start + count - 1, 
+                            prim, 
+                            start, count);
 }
 
 
diff --git a/src/gallium/drivers/svga/svga_pipe_fs.c b/src/gallium/drivers/svga/svga_pipe_fs.c
index a461a86dd3..5f1213e46a 100644
--- a/src/gallium/drivers/svga/svga_pipe_fs.c
+++ b/src/gallium/drivers/svga/svga_pipe_fs.c
@@ -111,6 +111,13 @@ void svga_delete_fs_state(struct pipe_context *pipe, void *shader)
       util_bitmask_clear( svga->fs_bm, result->id );
 
       svga_destroy_shader_result( result );
+
+      /*
+       * Remove stale references to this result to ensure a new result at the
+       * same address will be detected as a change.
+       */
+      if(result == svga->state.hw_draw.fs)
+         svga->state.hw_draw.fs = NULL;
    }
 
    FREE((void *)fs->base.tokens);
diff --git a/src/gallium/drivers/svga/svga_pipe_sampler.c b/src/gallium/drivers/svga/svga_pipe_sampler.c
index 78053e755e..460a101f8c 100644
--- a/src/gallium/drivers/svga/svga_pipe_sampler.c
+++ b/src/gallium/drivers/svga/svga_pipe_sampler.c
@@ -76,7 +76,6 @@ static INLINE unsigned translate_img_filter( unsigned filter )
    switch (filter) {
    case PIPE_TEX_FILTER_NEAREST: return SVGA3D_TEX_FILTER_NEAREST;
    case PIPE_TEX_FILTER_LINEAR:  return SVGA3D_TEX_FILTER_LINEAR;
-   case PIPE_TEX_FILTER_ANISO:   return SVGA3D_TEX_FILTER_ANISOTROPIC;
    default:
       assert(0);
       return SVGA3D_TEX_FILTER_NEAREST;
@@ -107,6 +106,8 @@ svga_create_sampler_state(struct pipe_context *pipe,
    cso->magfilter = translate_img_filter( sampler->mag_img_filter );
    cso->minfilter = translate_img_filter( sampler->min_img_filter );
    cso->aniso_level = MAX2( (unsigned) sampler->max_anisotropy, 1 );
+   if(cso->aniso_level != 1)
+      cso->magfilter = cso->minfilter = SVGA3D_TEX_FILTER_ANISOTROPIC;
    cso->lod_bias = sampler->lod_bias;
    cso->addressu = translate_wrap_mode(sampler->wrap_s);
    cso->addressv = translate_wrap_mode(sampler->wrap_t);
diff --git a/src/gallium/drivers/svga/svga_pipe_vs.c b/src/gallium/drivers/svga/svga_pipe_vs.c
index e82d10c259..7e6ab576ad 100644
--- a/src/gallium/drivers/svga/svga_pipe_vs.c
+++ b/src/gallium/drivers/svga/svga_pipe_vs.c
@@ -176,6 +176,13 @@ static void svga_delete_vs_state(struct pipe_context *pipe, void *shader)
       util_bitmask_clear( svga->vs_bm, result->id );
 
       svga_destroy_shader_result( result );
+
+      /*
+       * Remove stale references to this result to ensure a new result at the
+       * same address will be detected as a change.
+       */
+      if(result == svga->state.hw_draw.vs)
+         svga->state.hw_draw.vs = NULL;
    }
 
    FREE((void *)vs->base.tokens);
diff --git a/src/gallium/drivers/svga/svga_state_fs.c b/src/gallium/drivers/svga/svga_state_fs.c
index 1902b0106b..d29f3762d2 100644
--- a/src/gallium/drivers/svga/svga_state_fs.c
+++ b/src/gallium/drivers/svga/svga_state_fs.c
@@ -40,8 +40,13 @@
 static INLINE int compare_fs_keys( const struct svga_fs_compile_key *a,
                                    const struct svga_fs_compile_key *b )
 {
-   unsigned keysize = svga_fs_key_size( a );
-   return memcmp( a, b, keysize );
+   unsigned keysize_a = svga_fs_key_size( a );
+   unsigned keysize_b = svga_fs_key_size( b );
+
+   if (keysize_a != keysize_b) {
+      return (int)(keysize_a - keysize_b);
+   }
+   return memcmp( a, b, keysize_a );
 }
 
 
@@ -67,7 +72,7 @@ static enum pipe_error compile_fs( struct svga_context *svga,
                                    struct svga_shader_result **out_result )
 {
    struct svga_shader_result *result;
-   enum pipe_error ret;
+   enum pipe_error ret = PIPE_ERROR;
 
    result = svga_translate_fragment_program( fs, key );
    if (result == NULL) {
@@ -268,16 +273,13 @@ static int emit_hw_fs( struct svga_context *svga,
    assert(id != SVGA3D_INVALID_ID);
 
    if (result != svga->state.hw_draw.fs) {
-      if (id != svga->state.hw_draw.shader_id[PIPE_SHADER_FRAGMENT]) {
-         ret = SVGA3D_SetShader(svga->swc,
-                                SVGA3D_SHADERTYPE_PS,
-                                id );
-         if (ret)
-            return ret;
-      }
+      ret = SVGA3D_SetShader(svga->swc,
+                             SVGA3D_SHADERTYPE_PS,
+                             id );
+      if (ret)
+         return ret;
 
       svga->dirty |= SVGA_NEW_FS_RESULT;
-      svga->state.hw_draw.shader_id[PIPE_SHADER_FRAGMENT] = id;
       svga->state.hw_draw.fs = result;      
    }
 
diff --git a/src/gallium/drivers/svga/svga_state_vs.c b/src/gallium/drivers/svga/svga_state_vs.c
index 2313eafc37..fef652c0c0 100644
--- a/src/gallium/drivers/svga/svga_state_vs.c
+++ b/src/gallium/drivers/svga/svga_state_vs.c
@@ -150,16 +150,13 @@ static int emit_hw_vs( struct svga_context *svga,
    }
 
    if (result != svga->state.hw_draw.vs) {
-      if (id != svga->state.hw_draw.shader_id[PIPE_SHADER_VERTEX]) {
-         ret = SVGA3D_SetShader(svga->swc,
-                                SVGA3D_SHADERTYPE_VS,
-                                id );
-         if (ret)
-            return ret;
-      }
+      ret = SVGA3D_SetShader(svga->swc,
+                             SVGA3D_SHADERTYPE_VS,
+                             id );
+      if (ret)
+         return ret;
 
       svga->dirty |= SVGA_NEW_VS_RESULT;
-      svga->state.hw_draw.shader_id[PIPE_SHADER_VERTEX] = id;
       svga->state.hw_draw.vs = result;      
    }
 
diff --git a/src/gallium/drivers/svga/svga_tgsi.h b/src/gallium/drivers/svga/svga_tgsi.h
index 896c90a89a..737a2213af 100644
--- a/src/gallium/drivers/svga/svga_tgsi.h
+++ b/src/gallium/drivers/svga/svga_tgsi.h
@@ -39,26 +39,24 @@ struct tgsi_token;
 
 struct svga_vs_compile_key
 {
-   ubyte need_prescale:1;
-   ubyte allow_psiz:1;
    unsigned zero_stride_vertex_elements;
-   ubyte num_zero_stride_vertex_elements:6;
+   unsigned need_prescale:1;
+   unsigned allow_psiz:1;
+   unsigned num_zero_stride_vertex_elements:6;
 };
 
 struct svga_fs_compile_key
 {
-   boolean light_twoside:1;
-   boolean front_cw:1;
-   ubyte num_textures;
-   ubyte num_unnormalized_coords;
+   unsigned light_twoside:1;
+   unsigned front_cw:1;
+   unsigned num_textures:8;
+   unsigned num_unnormalized_coords:8;
    struct {
-      ubyte compare_mode       : 1;
-      ubyte compare_func       : 3;
-      ubyte unnormalized       : 1;
-
-      ubyte width_height_idx   : 7;
-
-      ubyte texture_target;
+      unsigned compare_mode:1;
+      unsigned compare_func:3;
+      unsigned unnormalized:1;
+      unsigned width_height_idx:7;
+      unsigned texture_target:8;
    } tex[PIPE_MAX_SAMPLERS];
 };
 
@@ -121,8 +119,7 @@ static INLINE unsigned svga_vs_key_size( const struct svga_vs_compile_key *key )
 
 static INLINE unsigned svga_fs_key_size( const struct svga_fs_compile_key *key )
 {
-   return (const char *)&key->tex[key->num_textures].texture_target -
-      (const char *)key;
+   return (const char *)&key->tex[key->num_textures] - (const char *)key;
 }
 
 struct svga_shader_result *
diff --git a/src/gallium/drivers/trace/tr_context.c b/src/gallium/drivers/trace/tr_context.c
index ad47a56fba..075e4f9a0b 100644
--- a/src/gallium/drivers/trace/tr_context.c
+++ b/src/gallium/drivers/trace/tr_context.c
@@ -161,16 +161,15 @@ trace_context_draw_block(struct trace_context *tr_ctx, int flag)
    pipe_mutex_unlock(tr_ctx->draw_mutex);
 }
 
-static INLINE boolean
+static INLINE void
 trace_context_draw_arrays(struct pipe_context *_pipe,
                           unsigned mode, unsigned start, unsigned count)
 {
    struct trace_context *tr_ctx = trace_context(_pipe);
    struct pipe_context *pipe = tr_ctx->pipe;
-   boolean result;
 
    if (tr_ctx->curr.fs->disabled || tr_ctx->curr.vs->disabled)
-      return 0;
+      return;
 
    trace_context_draw_block(tr_ctx, 1);
 
@@ -181,19 +180,15 @@ trace_context_draw_arrays(struct pipe_context *_pipe,
    trace_dump_arg(uint, start);
    trace_dump_arg(uint, count);
 
-   result = pipe->draw_arrays(pipe, mode, start, count);
-
-   trace_dump_ret(bool, result);
+   pipe->draw_arrays(pipe, mode, start, count);
 
    trace_dump_call_end();
 
    trace_context_draw_block(tr_ctx, 2);
-
-   return result;
 }
 
 
-static INLINE boolean
+static INLINE void
 trace_context_draw_elements(struct pipe_context *_pipe,
                           struct pipe_buffer *_indexBuffer,
                           unsigned indexSize,
@@ -203,10 +198,9 @@ trace_context_draw_elements(struct pipe_context *_pipe,
    struct trace_buffer *tr_buf = trace_buffer(_indexBuffer);
    struct pipe_context *pipe = tr_ctx->pipe;
    struct pipe_buffer *indexBuffer = tr_buf->buffer;
-   boolean result;
 
    if (tr_ctx->curr.fs->disabled || tr_ctx->curr.vs->disabled)
-      return 0;
+      return;
 
    trace_context_draw_block(tr_ctx, 1);
 
@@ -221,19 +215,15 @@ trace_context_draw_elements(struct pipe_context *_pipe,
    trace_dump_arg(uint, start);
    trace_dump_arg(uint, count);
 
-   result = pipe->draw_elements(pipe, indexBuffer, indexSize, mode, start, count);
-
-   trace_dump_ret(bool, result);
+   pipe->draw_elements(pipe, indexBuffer, indexSize, mode, start, count);
 
    trace_dump_call_end();
 
    trace_context_draw_block(tr_ctx, 2);
-
-   return result;
 }
 
 
-static INLINE boolean
+static INLINE void
 trace_context_draw_range_elements(struct pipe_context *_pipe,
                                   struct pipe_buffer *_indexBuffer,
                                   unsigned indexSize,
@@ -247,10 +237,9 @@ trace_context_draw_range_elements(struct pipe_context *_pipe,
    struct trace_buffer *tr_buf = trace_buffer(_indexBuffer);
    struct pipe_context *pipe = tr_ctx->pipe;
    struct pipe_buffer *indexBuffer = tr_buf->buffer;
-   boolean result;
 
    if (tr_ctx->curr.fs->disabled || tr_ctx->curr.vs->disabled)
-      return 0;
+      return;
 
    trace_context_draw_block(tr_ctx, 1);
 
@@ -267,18 +256,14 @@ trace_context_draw_range_elements(struct pipe_context *_pipe,
    trace_dump_arg(uint, start);
    trace_dump_arg(uint, count);
 
-   result = pipe->draw_range_elements(pipe,
-                                      indexBuffer,
-                                      indexSize, minIndex, maxIndex,
-                                      mode, start, count);
-
-   trace_dump_ret(bool, result);
+   pipe->draw_range_elements(pipe,
+                             indexBuffer,
+                             indexSize, minIndex, maxIndex,
+                             mode, start, count);
 
    trace_dump_call_end();
 
    trace_context_draw_block(tr_ctx, 2);
-
-   return result;
 }
 
 
diff --git a/src/gallium/drivers/trace/tr_dump_state.c b/src/gallium/drivers/trace/tr_dump_state.c
index 0102cc1876..86237e03bc 100644
--- a/src/gallium/drivers/trace/tr_dump_state.c
+++ b/src/gallium/drivers/trace/tr_dump_state.c
@@ -409,7 +409,7 @@ void trace_dump_sampler_state(const struct pipe_sampler_state *state)
    trace_dump_member(uint, state, min_img_filter);
    trace_dump_member(uint, state, min_mip_filter);
    trace_dump_member(uint, state, mag_img_filter);
-   trace_dump_member(bool, state, compare_mode);
+   trace_dump_member(uint, state, compare_mode);
    trace_dump_member(uint, state, compare_func);
    trace_dump_member(bool, state, normalized_coords);
    trace_dump_member(uint, state, prefilter);