59 files changed, 480 insertions, 366 deletions
diff --git a/src/gallium/drivers/galahad/glhd_context.c b/src/gallium/drivers/galahad/glhd_context.c
index a572ad22bd..8cbf0b1de4 100644
--- a/src/gallium/drivers/galahad/glhd_context.c
+++ b/src/gallium/drivers/galahad/glhd_context.c
@@ -381,6 +381,8 @@ galahad_create_vertex_elements_state(struct pipe_context *_pipe,
    struct galahad_context *glhd_pipe = galahad_context(_pipe);
    struct pipe_context *pipe = glhd_pipe->pipe;
 
+   /* XXX check if stride lines up with element size, at least for floats */
+
    return pipe->create_vertex_elements_state(pipe,
                                              num_elements,
                                              vertex_elements);
diff --git a/src/gallium/drivers/i915/i915_fpc_translate.c b/src/gallium/drivers/i915/i915_fpc_translate.c
index 25c53210be..9e20010c4a 100644
--- a/src/gallium/drivers/i915/i915_fpc_translate.c
+++ b/src/gallium/drivers/i915/i915_fpc_translate.c
@@ -924,6 +924,14 @@ i915_translate_instructions(struct i915_fp_compile *p,
       tgsi_parse_token( &parse );
 
       switch( parse.FullToken.Token.Type ) {
+      case TGSI_TOKEN_TYPE_PROPERTY:
+         /*
+          * We only support one cbuf, but we still need to ignore the property
+          * correctly so we don't hit the assert at the end of the switch case.
+          */
+         assert(parse.FullToken.FullProperty.Property.PropertyName ==
+                TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS);
+         break;
       case TGSI_TOKEN_TYPE_DECLARATION:
          if (parse.FullToken.FullDeclaration.Declaration.File
                   == TGSI_FILE_CONSTANT) {
diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c
index f66478e729..bdbc08e808 100644
--- a/src/gallium/drivers/i915/i915_screen.c
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -35,7 +35,6 @@
 #include "i915_debug.h"
 #include "i915_context.h"
 #include "i915_screen.h"
-#include "i915_surface.h"
 #include "i915_resource.h"
 #include "i915_winsys.h"
 #include "i915_public.h"
diff --git a/src/gallium/drivers/i965/intel_decode.h b/src/gallium/drivers/i965/intel_decode.h
index 7683097b86..6201a23d6a 100644
--- a/src/gallium/drivers/i965/intel_decode.h
+++ b/src/gallium/drivers/i965/intel_decode.h
@@ -25,5 +25,7 @@
  *
  */
 
+#include "pipe/p_compiler.h"
+
 int intel_decode(const uint32_t *data, int count, uint32_t hw_offset, uint32_t devid);
 void intel_decode_context_reset(void);
diff --git a/src/gallium/drivers/i965/intel_structs.h b/src/gallium/drivers/i965/intel_structs.h
index 522e3bd92c..ec6eec8910 100644
--- a/src/gallium/drivers/i965/intel_structs.h
+++ b/src/gallium/drivers/i965/intel_structs.h
@@ -1,6 +1,8 @@
 #ifndef INTEL_STRUCTS_H
 #define INTEL_STRUCTS_H
 
+#include "brw_types.h"
+
 struct br0 {
    GLuint length:8;
    GLuint pad0:3;
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_alpha.h b/src/gallium/drivers/llvmpipe/lp_bld_alpha.h
index 5c9392504f..06206a24d8 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_alpha.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_alpha.h
@@ -34,10 +34,12 @@
 #ifndef LP_BLD_ALPHA_H
 #define LP_BLD_ALPHA_H
 
+#include "pipe/p_compiler.h"
 
 #include "gallivm/lp_bld.h"
 
 struct pipe_alpha_state;
+struct gallivm_state;
 struct lp_type;
 struct lp_build_mask_context;
 
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.h b/src/gallium/drivers/llvmpipe/lp_bld_depth.h
index 038b136a28..e01fc46ec1 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.h
@@ -36,10 +36,14 @@
 #define LP_BLD_DEPTH_H
 
 
+#include "pipe/p_compiler.h"
+#include "pipe/p_state.h"
+
 #include "gallivm/lp_bld.h"
 
  
 struct pipe_depth_state;
+struct gallivm_state;
 struct util_format_description;
 struct lp_type;
 struct lp_build_mask_context;
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.c b/src/gallium/drivers/llvmpipe/lp_jit.c
index a775990f92..482a902dd2 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.c
+++ b/src/gallium/drivers/llvmpipe/lp_jit.c
@@ -36,9 +36,7 @@
 #include "util/u_memory.h"
 #include "gallivm/lp_bld_init.h"
 #include "gallivm/lp_bld_debug.h"
-#include "gallivm/lp_bld_intr.h"
 #include "lp_context.h"
-#include "lp_screen.h"
 #include "lp_jit.h"
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_perf.h b/src/gallium/drivers/llvmpipe/lp_perf.h
index b23a100b87..455adf7d6f 100644
--- a/src/gallium/drivers/llvmpipe/lp_perf.h
+++ b/src/gallium/drivers/llvmpipe/lp_perf.h
@@ -33,6 +33,7 @@
 #ifndef LP_PERF_H
 #define LP_PERF_H
 
+#include "pipe/p_compiler.h"
 
 /**
  * Various counters
diff --git a/src/gallium/drivers/llvmpipe/lp_scene_queue.h b/src/gallium/drivers/llvmpipe/lp_scene_queue.h
index fd7c65a2c8..dd9ab593b4 100644
--- a/src/gallium/drivers/llvmpipe/lp_scene_queue.h
+++ b/src/gallium/drivers/llvmpipe/lp_scene_queue.h
@@ -29,6 +29,8 @@
 #ifndef LP_SCENE_QUEUE
 #define LP_SCENE_QUEUE
 
+#include "pipe/p_compiler.h"
+
 struct lp_scene_queue;
 struct lp_scene;
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index 2c4943a69f..ae207617cc 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -90,7 +90,6 @@
 #include "lp_context.h"
 #include "lp_debug.h"
 #include "lp_perf.h"
-#include "lp_screen.h"
 #include "lp_setup.h"
 #include "lp_state.h"
 #include "lp_tex_sample.h"
diff --git a/src/gallium/drivers/nvfx/nv30_fragtex.c b/src/gallium/drivers/nvfx/nv30_fragtex.c
index 951fb202ed..b609891d31 100644
--- a/src/gallium/drivers/nvfx/nv30_fragtex.c
+++ b/src/gallium/drivers/nvfx/nv30_fragtex.c
@@ -71,6 +71,7 @@ nv30_fragtex_set(struct nvfx_context *nvfx, int unit)
 	struct nvfx_sampler_view* sv = (struct nvfx_sampler_view*)nvfx->fragment_sampler_views[unit];
 	struct nouveau_bo *bo = ((struct nvfx_miptree *)sv->base.texture)->base.bo;
 	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	struct nouveau_grobj *eng3d = nvfx->screen->eng3d;
 	unsigned txf;
 	unsigned tex_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD;
 	unsigned use_rect;
@@ -102,7 +103,7 @@ nv30_fragtex_set(struct nvfx_context *nvfx, int unit)
 	txf = sv->u.nv30.fmt[ps->compare + (use_rect ? 2 : 0)];
 
 	MARK_RING(chan, 9, 2);
-	OUT_RING(chan, RING_3D(NV30_3D_TEX_OFFSET(unit), 8));
+	BEGIN_RING(chan, eng3d, NV30_3D_TEX_OFFSET(unit), 8);
 	OUT_RELOC(chan, bo, sv->offset, tex_flags | NOUVEAU_BO_LOW, 0, 0);
 	OUT_RELOC(chan, bo, txf,
 		tex_flags | NOUVEAU_BO_OR,
diff --git a/src/gallium/drivers/nvfx/nv40_fragtex.c b/src/gallium/drivers/nvfx/nv40_fragtex.c
index e8ab403f72..563183d9d0 100644
--- a/src/gallium/drivers/nvfx/nv40_fragtex.c
+++ b/src/gallium/drivers/nvfx/nv40_fragtex.c
@@ -76,6 +76,7 @@ void
 nv40_fragtex_set(struct nvfx_context *nvfx, int unit)
 {
 	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	struct nouveau_grobj *eng3d = nvfx->screen->eng3d;
 	struct nvfx_sampler_state *ps = nvfx->tex_sampler[unit];
 	struct nvfx_sampler_view* sv = (struct nvfx_sampler_view*)nvfx->fragment_sampler_views[unit];
 	struct nouveau_bo *bo = ((struct nvfx_miptree *)sv->base.texture)->base.bo;
@@ -87,7 +88,7 @@ nv40_fragtex_set(struct nvfx_context *nvfx, int unit)
 	txf = sv->u.nv40.fmt[ps->compare] | ps->fmt;
 
 	MARK_RING(chan, 11, 2);
-	OUT_RING(chan, RING_3D(NV30_3D_TEX_OFFSET(unit), 8));
+	BEGIN_RING(chan, eng3d, NV30_3D_TEX_OFFSET(unit), 8);
 	OUT_RELOC(chan, bo, sv->offset, tex_flags | NOUVEAU_BO_LOW, 0, 0);
 	OUT_RELOC(chan, bo, txf, tex_flags | NOUVEAU_BO_OR,
 			NV30_3D_TEX_FORMAT_DMA0, NV30_3D_TEX_FORMAT_DMA1);
@@ -97,7 +98,7 @@ nv40_fragtex_set(struct nvfx_context *nvfx, int unit)
 	OUT_RING(chan, ps->filt | sv->filt);
 	OUT_RING(chan, sv->npot_size);
 	OUT_RING(chan, ps->bcol);
-	OUT_RING(chan, RING_3D(NV40_3D_TEX_SIZE1(unit), 1));
+	BEGIN_RING(chan, eng3d, NV40_3D_TEX_SIZE1(unit), 1);
 	OUT_RING(chan, sv->u.nv40.npot_size2);
 
 	nvfx->hw_txf[unit] = txf;
diff --git a/src/gallium/drivers/nvfx/nvfx_context.c b/src/gallium/drivers/nvfx/nvfx_context.c
index 95834d2327..6c8934d3a4 100644
--- a/src/gallium/drivers/nvfx/nvfx_context.c
+++ b/src/gallium/drivers/nvfx/nvfx_context.c
@@ -13,13 +13,13 @@ nvfx_flush(struct pipe_context *pipe, unsigned flags,
 	struct nvfx_context *nvfx = nvfx_context(pipe);
 	struct nvfx_screen *screen = nvfx->screen;
 	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *eng3d = screen->eng3d;
 
 	/* XXX: we need to actually be intelligent here */
 	if (flags & PIPE_FLUSH_TEXTURE_CACHE) {
-		WAIT_RING(chan, 4);
-		OUT_RING(chan, RING_3D(0x1fd8, 1));
+		BEGIN_RING(chan, eng3d, 0x1fd8, 1);
 		OUT_RING(chan, 2);
-		OUT_RING(chan, RING_3D(0x1fd8, 1));
+		BEGIN_RING(chan, eng3d, 0x1fd8, 1);
 		OUT_RING(chan, 1);
 	}
 
diff --git a/src/gallium/drivers/nvfx/nvfx_context.h b/src/gallium/drivers/nvfx/nvfx_context.h
index 6ef2a6945d..2238aa1ad0 100644
--- a/src/gallium/drivers/nvfx/nvfx_context.h
+++ b/src/gallium/drivers/nvfx/nvfx_context.h
@@ -339,30 +339,31 @@ extern void nvfx_init_vertprog_functions(struct nvfx_context *nvfx);
 /* nvfx_push.c */
 extern void nvfx_push_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info);
 
-/* must WAIT_RING(chan, ncomp + 1) or equivalent beforehand! */
-static inline void nvfx_emit_vtx_attr(struct nouveau_channel* chan, unsigned attrib, const float* v, unsigned ncomp)
+static inline void nvfx_emit_vtx_attr(struct nouveau_channel* chan,
+		struct nouveau_grobj *eng3d, unsigned attrib, const float* v,
+		unsigned ncomp)
 {
 	switch (ncomp) {
 	case 4:
-		OUT_RING(chan, RING_3D(NV30_3D_VTX_ATTR_4F_X(attrib), 4));
+		BEGIN_RING(chan, eng3d, NV30_3D_VTX_ATTR_4F_X(attrib), 4);
 		OUT_RING(chan, fui(v[0]));
 		OUT_RING(chan, fui(v[1]));
 		OUT_RING(chan,  fui(v[2]));
 		OUT_RING(chan,  fui(v[3]));
 		break;
 	case 3:
-		OUT_RING(chan, RING_3D(NV30_3D_VTX_ATTR_3F_X(attrib), 3));
+		BEGIN_RING(chan, eng3d, NV30_3D_VTX_ATTR_3F_X(attrib), 3);
 		OUT_RING(chan,  fui(v[0]));
 		OUT_RING(chan,  fui(v[1]));
 		OUT_RING(chan,  fui(v[2]));
 		break;
 	case 2:
-		OUT_RING(chan, RING_3D(NV30_3D_VTX_ATTR_2F_X(attrib), 2));
+		BEGIN_RING(chan, eng3d, NV30_3D_VTX_ATTR_2F_X(attrib), 2);
 		OUT_RING(chan,  fui(v[0]));
 		OUT_RING(chan,  fui(v[1]));
 		break;
 	case 1:
-		OUT_RING(chan, RING_3D(NV30_3D_VTX_ATTR_1F(attrib), 1));
+		BEGIN_RING(chan, eng3d, NV30_3D_VTX_ATTR_1F(attrib), 1);
 		OUT_RING(chan,  fui(v[0]));
 		break;
 	}
diff --git a/src/gallium/drivers/nvfx/nvfx_draw.c b/src/gallium/drivers/nvfx/nvfx_draw.c
index 61f888a8ea..81f1ec485d 100644
--- a/src/gallium/drivers/nvfx/nvfx_draw.c
+++ b/src/gallium/drivers/nvfx/nvfx_draw.c
@@ -28,10 +28,10 @@ nvfx_render_flush(struct draw_stage *stage, unsigned flags)
 	struct nvfx_render_stage *rs = nvfx_render_stage(stage);
 	struct nvfx_context *nvfx = rs->nvfx;
 	struct nouveau_channel *chan = nvfx->screen->base.channel;
+	struct nouveau_grobj *eng3d = nvfx->screen->eng3d;
 
 	if (rs->prim != NV30_3D_VERTEX_BEGIN_END_STOP) {
-		assert(AVAIL_RING(chan) >= 2);
-		OUT_RING(chan, RING_3D(NV30_3D_VERTEX_BEGIN_END, 1));
+		BEGIN_RING(chan, eng3d, NV30_3D_VERTEX_BEGIN_END, 1);
 		OUT_RING(chan, NV30_3D_VERTEX_BEGIN_END_STOP);
 		rs->prim = NV30_3D_VERTEX_BEGIN_END_STOP;
 	}
@@ -46,6 +46,7 @@ nvfx_render_prim(struct draw_stage *stage, struct prim_header *prim,
 
 	struct nvfx_screen *screen = nvfx->screen;
 	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *eng3d = screen->eng3d;
 	boolean no_elements = nvfx->vertprog->draw_no_elements;
 	unsigned num_attribs = nvfx->vertprog->draw_elements;
 
@@ -63,7 +64,7 @@ nvfx_render_prim(struct draw_stage *stage, struct prim_header *prim,
 	/* Switch primitive modes if necessary */
 	if (rs->prim != mode) {
 		if (rs->prim != NV30_3D_VERTEX_BEGIN_END_STOP) {
-			OUT_RING(chan, RING_3D(NV30_3D_VERTEX_BEGIN_END, 1));
+			BEGIN_RING(chan, eng3d, NV30_3D_VERTEX_BEGIN_END, 1);
 			OUT_RING(chan, NV30_3D_VERTEX_BEGIN_END_STOP);
 		}
 
@@ -74,23 +75,24 @@ nvfx_render_prim(struct draw_stage *stage, struct prim_header *prim,
 			int i;
 			for(i = 0; i < 32; ++i)
 			{
-				OUT_RING(chan, RING_3D(0x1dac, 1));
+				BEGIN_RING(chan, eng3d, 0x1dac, 1);
 				OUT_RING(chan, 0);
 			}
 		}
 
-		OUT_RING(chan, RING_3D(NV30_3D_VERTEX_BEGIN_END, 1));
+		BEGIN_RING(chan, eng3d, NV30_3D_VERTEX_BEGIN_END, 1);
 		OUT_RING  (chan, mode);
 		rs->prim = mode;
 	}
 
-	OUT_RING(chan, RING_3D_NI(NV30_3D_VERTEX_DATA, num_attribs * 4 * count));
 	if(no_elements) {
+		BEGIN_RING_NI(chan, eng3d, NV30_3D_VERTEX_DATA, 4);
 		OUT_RING(chan, 0);
 		OUT_RING(chan, 0);
 		OUT_RING(chan, 0);
 		OUT_RING(chan, 0);
 	} else {
+		BEGIN_RING_NI(chan, eng3d, NV30_3D_VERTEX_DATA, num_attribs * 4 * count);
 		for (unsigned i = 0; i < count; ++i)
 		{
 			struct vertex_header* v = prim->v[i];
diff --git a/src/gallium/drivers/nvfx/nvfx_fragprog.c b/src/gallium/drivers/nvfx/nvfx_fragprog.c
index 1740d72a8a..dbd7c77346 100644
--- a/src/gallium/drivers/nvfx/nvfx_fragprog.c
+++ b/src/gallium/drivers/nvfx/nvfx_fragprog.c
@@ -1233,6 +1233,7 @@ void
 nvfx_fragprog_validate(struct nvfx_context *nvfx)
 {
 	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	struct nouveau_grobj *eng3d = nvfx->screen->eng3d;
 	struct nvfx_pipe_fragment_program *pfp = nvfx->fragprog;
 	struct nvfx_vertex_program* vp;
 
@@ -1499,17 +1500,17 @@ update:
 		nvfx->hw_fragprog = fp;
 
 		MARK_RING(chan, 8, 1);
-		OUT_RING(chan, RING_3D(NV30_3D_FP_ACTIVE_PROGRAM, 1));
+		BEGIN_RING(chan, eng3d, NV30_3D_FP_ACTIVE_PROGRAM, 1);
 		OUT_RELOC(chan, fp->fpbo->bo, offset, NOUVEAU_BO_VRAM |
 			      NOUVEAU_BO_GART | NOUVEAU_BO_RD | NOUVEAU_BO_LOW |
 			      NOUVEAU_BO_OR, NV30_3D_FP_ACTIVE_PROGRAM_DMA0,
 			      NV30_3D_FP_ACTIVE_PROGRAM_DMA1);
-		OUT_RING(chan, RING_3D(NV30_3D_FP_CONTROL, 1));
+		BEGIN_RING(chan, eng3d, NV30_3D_FP_CONTROL, 1);
 		OUT_RING(chan, fp->fp_control);
 		if(!nvfx->is_nv4x) {
-			OUT_RING(chan, RING_3D(NV30_3D_FP_REG_CONTROL, 1));
+			BEGIN_RING(chan, eng3d, NV30_3D_FP_REG_CONTROL, 1);
 			OUT_RING(chan, (1<<16)|0x4);
-			OUT_RING(chan, RING_3D(NV30_3D_TEX_UNITS_ENABLE, 1));
+			BEGIN_RING(chan, eng3d, NV30_3D_TEX_UNITS_ENABLE, 1);
 			OUT_RING(chan, fp->samplers);
 		}
 	}
@@ -1518,8 +1519,7 @@ update:
 		unsigned pointsprite_control = fp->point_sprite_control | nvfx->rasterizer->pipe.point_quad_rasterization;
 		if(pointsprite_control != nvfx->hw_pointsprite_control)
 		{
-			WAIT_RING(chan, 2);
-			OUT_RING(chan, RING_3D(NV30_3D_POINT_SPRITE, 1));
+			BEGIN_RING(chan, eng3d, NV30_3D_POINT_SPRITE, 1);
 			OUT_RING(chan, pointsprite_control);
 			nvfx->hw_pointsprite_control = pointsprite_control;
 		}
diff --git a/src/gallium/drivers/nvfx/nvfx_fragtex.c b/src/gallium/drivers/nvfx/nvfx_fragtex.c
index fd0aff6a1a..1c4901df0e 100644
--- a/src/gallium/drivers/nvfx/nvfx_fragtex.c
+++ b/src/gallium/drivers/nvfx/nvfx_fragtex.c
@@ -177,6 +177,7 @@ void
 nvfx_fragtex_validate(struct nvfx_context *nvfx)
 {
 	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	struct nouveau_grobj *eng3d = nvfx->screen->eng3d;
 	unsigned samplers, unit;
 
 	samplers = nvfx->dirty_samplers;
@@ -197,9 +198,8 @@ nvfx_fragtex_validate(struct nvfx_context *nvfx)
 			else
 				nv40_fragtex_set(nvfx, unit);
 		} else {
-			WAIT_RING(chan, 2);
 			/* this is OK for nv40 too */
-			OUT_RING(chan, RING_3D(NV30_3D_TEX_ENABLE(unit), 1));
+			BEGIN_RING(chan, eng3d, NV30_3D_TEX_ENABLE(unit), 1);
 			OUT_RING(chan, 0);
 			nvfx->hw_samplers &= ~(1 << unit);
 		}
diff --git a/src/gallium/drivers/nvfx/nvfx_push.c b/src/gallium/drivers/nvfx/nvfx_push.c
index ebf47e6ed3..6391741a2e 100644
--- a/src/gallium/drivers/nvfx/nvfx_push.c
+++ b/src/gallium/drivers/nvfx/nvfx_push.c
@@ -10,6 +10,7 @@
 
 struct push_context {
 	struct nouveau_channel* chan;
+	struct nouveau_grobj *eng3d;
 
 	void *idxbuf;
 	int32_t idxbias;
@@ -27,9 +28,10 @@ static void
 emit_edgeflag(void *priv, boolean enabled)
 {
 	struct push_context* ctx = priv;
+	struct nouveau_grobj *eng3d = ctx->eng3d;
 	struct nouveau_channel *chan = ctx->chan;
 
-	OUT_RING(chan, RING_3D(NV30_3D_EDGEFLAG, 1));
+	BEGIN_RING(chan, eng3d, NV30_3D_EDGEFLAG, 1);
 	OUT_RING(chan, enabled ? 1 : 0);
 }
 
@@ -37,6 +39,7 @@ static void
 emit_vertices_lookup8(void *priv, unsigned start, unsigned count)
 {
         struct push_context *ctx = priv;
+        struct nouveau_grobj *eng3d = ctx->eng3d;
         uint8_t* elts = (uint8_t*)ctx->idxbuf + start;
 
         while(count)
@@ -44,7 +47,7 @@ emit_vertices_lookup8(void *priv, unsigned start, unsigned count)
                 unsigned push = MIN2(count, ctx->max_vertices_per_packet);
                 unsigned length = push * ctx->vertex_length;
 
-                OUT_RING(ctx->chan, RING_3D_NI(NV30_3D_VERTEX_DATA, length));
+                BEGIN_RING_NI(ctx->chan, eng3d, NV30_3D_VERTEX_DATA, length);
                 ctx->translate->run_elts8(ctx->translate, elts, push, 0, ctx->chan->cur);
                 ctx->chan->cur += length;
 
@@ -57,6 +60,7 @@ static void
 emit_vertices_lookup16(void *priv, unsigned start, unsigned count)
 {
 	struct push_context *ctx = priv;
+	struct nouveau_grobj *eng3d = ctx->eng3d;
         uint16_t* elts = (uint16_t*)ctx->idxbuf + start;
 
         while(count)
@@ -64,7 +68,7 @@ emit_vertices_lookup16(void *priv, unsigned start, unsigned count)
                 unsigned push = MIN2(count, ctx->max_vertices_per_packet);
                 unsigned length = push * ctx->vertex_length;
 
-                OUT_RING(ctx->chan, RING_3D_NI(NV30_3D_VERTEX_DATA, length));
+                BEGIN_RING_NI(ctx->chan, eng3d, NV30_3D_VERTEX_DATA, length);
                 ctx->translate->run_elts16(ctx->translate, elts, push, 0, ctx->chan->cur);
                 ctx->chan->cur += length;
 
@@ -77,6 +81,7 @@ static void
 emit_vertices_lookup32(void *priv, unsigned start, unsigned count)
 {
         struct push_context *ctx = priv;
+        struct nouveau_grobj *eng3d = ctx->eng3d;
         uint32_t* elts = (uint32_t*)ctx->idxbuf + start;
 
         while(count)
@@ -84,7 +89,7 @@ emit_vertices_lookup32(void *priv, unsigned start, unsigned count)
                 unsigned push = MIN2(count, ctx->max_vertices_per_packet);
                 unsigned length = push * ctx->vertex_length;
 
-                OUT_RING(ctx->chan, RING_3D_NI(NV30_3D_VERTEX_DATA, length));
+                BEGIN_RING_NI(ctx->chan, eng3d, NV30_3D_VERTEX_DATA, length);
                 ctx->translate->run_elts(ctx->translate, elts, push, 0, ctx->chan->cur);
                 ctx->chan->cur += length;
 
@@ -97,13 +102,14 @@ static void
 emit_vertices(void *priv, unsigned start, unsigned count)
 {
         struct push_context *ctx = priv;
+        struct nouveau_grobj *eng3d = ctx->eng3d;
 
         while(count)
         {
 		unsigned push = MIN2(count, ctx->max_vertices_per_packet);
 		unsigned length = push * ctx->vertex_length;
 
-		OUT_RING(ctx->chan, RING_3D_NI(NV30_3D_VERTEX_DATA, length));
+		BEGIN_RING_NI(ctx->chan, eng3d, NV30_3D_VERTEX_DATA, length);
 		ctx->translate->run(ctx->translate, start, push, 0, ctx->chan->cur);
 		ctx->chan->cur += length;
 
@@ -116,10 +122,11 @@ static void
 emit_ranges(void* priv, unsigned start, unsigned vc, unsigned reg)
 {
 	struct push_context* ctx = priv;
+	struct nouveau_grobj *eng3d = ctx->eng3d;
 	struct nouveau_channel *chan = ctx->chan;
 	unsigned nr = (vc & 0xff);
 	if (nr) {
-		OUT_RING(chan, RING_3D(reg, 1));
+		BEGIN_RING(chan, eng3d, reg, 1);
 		OUT_RING  (chan, ((nr - 1) << 24) | start);
 		start += nr;
 	}
@@ -130,7 +137,7 @@ emit_ranges(void* priv, unsigned start, unsigned vc, unsigned reg)
 
 		nr -= push;
 
-		OUT_RING(chan, RING_3D_NI(reg, push));
+		BEGIN_RING_NI(chan, eng3d, reg, push);
 		while (push--) {
 			OUT_RING(chan, ((0x100 - 1) << 24) | start);
 			start += 0x100;
@@ -154,12 +161,13 @@ static INLINE void
 emit_elt8(void* priv, unsigned start, unsigned vc)
 {
 	struct push_context* ctx = priv;
+	struct nouveau_grobj *eng3d = ctx->eng3d;
 	struct nouveau_channel *chan = ctx->chan;
 	uint8_t *elts = (uint8_t *)ctx->idxbuf + start;
 	int idxbias = ctx->idxbias;
 
 	if (vc & 1) {
-		OUT_RING(chan, RING_3D(NV30_3D_VB_ELEMENT_U32, 1));
+		BEGIN_RING(chan, eng3d, NV30_3D_VB_ELEMENT_U32, 1);
 		OUT_RING  (chan, elts[0]);
 		elts++; vc--;
 	}
@@ -168,7 +176,7 @@ emit_elt8(void* priv, unsigned start, unsigned vc)
 		unsigned i;
 		unsigned push = MIN2(vc, 2047 * 2);
 
-		OUT_RING(chan, RING_3D_NI(NV30_3D_VB_ELEMENT_U16, push >> 1));
+		BEGIN_RING_NI(chan, eng3d, NV30_3D_VB_ELEMENT_U16, push >> 1);
 		for (i = 0; i < push; i+=2)
 			OUT_RING(chan, ((elts[i+1] + idxbias) << 16) | (elts[i] + idxbias));
 
@@ -181,12 +189,13 @@ static INLINE void
 emit_elt16(void* priv, unsigned start, unsigned vc)
 {
 	struct push_context* ctx = priv;
+	struct nouveau_grobj *eng3d = ctx->eng3d;
 	struct nouveau_channel *chan = ctx->chan;
 	uint16_t *elts = (uint16_t *)ctx->idxbuf + start;
 	int idxbias = ctx->idxbias;
 
 	if (vc & 1) {
-		OUT_RING(chan, RING_3D(NV30_3D_VB_ELEMENT_U32, 1));
+		BEGIN_RING(chan, eng3d, NV30_3D_VB_ELEMENT_U32, 1);
 		OUT_RING  (chan, elts[0]);
 		elts++; vc--;
 	}
@@ -195,7 +204,7 @@ emit_elt16(void* priv, unsigned start, unsigned vc)
 		unsigned i;
 		unsigned push = MIN2(vc, 2047 * 2);
 
-		OUT_RING(chan, RING_3D_NI(NV30_3D_VB_ELEMENT_U16, push >> 1));
+		BEGIN_RING_NI(chan, eng3d, NV30_3D_VB_ELEMENT_U16, push >> 1);
 		for (i = 0; i < push; i+=2)
 			OUT_RING(chan, ((elts[i+1] + idxbias) << 16) | (elts[i] + idxbias));
 
@@ -208,6 +217,7 @@ static INLINE void
 emit_elt32(void* priv, unsigned start, unsigned vc)
 {
 	struct push_context* ctx = priv;
+	struct nouveau_grobj *eng3d = ctx->eng3d;
 	struct nouveau_channel *chan = ctx->chan;
 	uint32_t *elts = (uint32_t *)ctx->idxbuf + start;
 	int idxbias = ctx->idxbias;
@@ -215,8 +225,7 @@ emit_elt32(void* priv, unsigned start, unsigned vc)
 	while (vc) {
 		unsigned push = MIN2(vc, 2047);
 
-		OUT_RING(chan, RING_3D_NI(NV30_3D_VB_ELEMENT_U32, push));
-		assert(AVAIL_RING(chan) >= push);
+		BEGIN_RING_NI(chan, eng3d, NV30_3D_VB_ELEMENT_U32, push);
 		if(idxbias)
 		{
 			for(unsigned i = 0; i < push; ++i)
@@ -235,6 +244,7 @@ nvfx_push_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
 {
 	struct nvfx_context *nvfx = nvfx_context(pipe);
 	struct nouveau_channel *chan = nvfx->screen->base.channel;
+	struct nouveau_grobj *eng3d = nvfx->screen->eng3d;
 	struct push_context ctx;
 	struct util_split_prim s;
 	unsigned instances_left = info->instance_count;
@@ -251,6 +261,7 @@ nvfx_push_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
 			+ 4; /* potential edgeflag enable/disable */
 
 	ctx.chan = nvfx->screen->base.channel;
+	ctx.eng3d = nvfx->screen->eng3d;
 	ctx.translate = nvfx->vtxelt->translate;
 	ctx.idxbuf = NULL;
 	ctx.vertex_length = nvfx->vtxelt->vertex_length;
@@ -333,8 +344,9 @@ nvfx_push_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
 
 		nvfx->vtxelt->per_instance[i].base.fetch_rgba_float(v, per_instance[i].map, 0, 0);
 
-		WAIT_RING(chan, 5);
-		nvfx_emit_vtx_attr(chan, nvfx->vtxelt->per_instance[i].base.idx, v, nvfx->vtxelt->per_instance[i].base.ncomp);
+		nvfx_emit_vtx_attr(chan, eng3d,
+				   nvfx->vtxelt->per_instance[i].base.idx, v,
+				   nvfx->vtxelt->per_instance[i].base.ncomp);
 	}
 
 	/* per-instance loop */
@@ -374,15 +386,18 @@ nvfx_push_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
 					int i;
 					for(i = 0; i < 32; ++i)
 					{
-						OUT_RING(chan, RING_3D(0x1dac, 1));
+						BEGIN_RING(chan, eng3d,
+							   0x1dac, 1);
 						OUT_RING(chan, 0);
 					}
 				}
 
-				OUT_RING(chan, RING_3D(NV30_3D_VERTEX_BEGIN_END, 1));
+				BEGIN_RING(chan, eng3d,
+					   NV30_3D_VERTEX_BEGIN_END, 1);
 				OUT_RING(chan, hw_mode);
 				done = util_split_prim_next(&s, max_verts);
-				OUT_RING(chan, RING_3D(NV30_3D_VERTEX_BEGIN_END, 1));
+				BEGIN_RING(chan, eng3d,
+					   NV30_3D_VERTEX_BEGIN_END, 1);
 				OUT_RING(chan, 0);
 
 				if(done)
@@ -406,8 +421,10 @@ nvfx_push_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
 				per_instance[i].step = 0;
 
 				nvfx->vtxelt->per_instance[i].base.fetch_rgba_float(v, per_instance[i].map, 0, 0);
-				WAIT_RING(chan, 5);
-				nvfx_emit_vtx_attr(chan, nvfx->vtxelt->per_instance[i].base.idx, v, nvfx->vtxelt->per_instance[i].base.ncomp);
+				nvfx_emit_vtx_attr(chan, eng3d,
+						   nvfx->vtxelt->per_instance[i].base.idx,
+						   v,
+						   nvfx->vtxelt->per_instance[i].base.ncomp);
 			}
 		}
 	}
diff --git a/src/gallium/drivers/nvfx/nvfx_query.c b/src/gallium/drivers/nvfx/nvfx_query.c
index 3935ffd7f9..3cd6bf1e47 100644
--- a/src/gallium/drivers/nvfx/nvfx_query.c
+++ b/src/gallium/drivers/nvfx/nvfx_query.c
@@ -49,6 +49,7 @@ nvfx_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
 	struct nvfx_query *q = nvfx_query(pq);
 	struct nvfx_screen *screen = nvfx->screen;
 	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *eng3d = screen->eng3d;
 	uint64_t tmp;
 
 	assert(!nvfx->query);
@@ -72,10 +73,9 @@ nvfx_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
 
 	nouveau_notifier_reset(nvfx->screen->query, q->object->start);
 
-	WAIT_RING(chan, 4);
-	OUT_RING(chan, RING_3D(NV30_3D_QUERY_RESET, 1));
+	BEGIN_RING(chan, eng3d, NV30_3D_QUERY_RESET, 1);
 	OUT_RING(chan, 1);
-	OUT_RING(chan, RING_3D(NV30_3D_QUERY_ENABLE, 1));
+	BEGIN_RING(chan, eng3d, NV30_3D_QUERY_ENABLE, 1);
 	OUT_RING(chan, 1);
 
 	q->ready = FALSE;
@@ -88,15 +88,15 @@ nvfx_query_end(struct pipe_context *pipe, struct pipe_query *pq)
 {
 	struct nvfx_context *nvfx = nvfx_context(pipe);
 	struct nouveau_channel *chan = nvfx->screen->base.channel;
+	struct nouveau_grobj *eng3d = nvfx->screen->eng3d;
 	struct nvfx_query *q = nvfx_query(pq);
 
 	assert(nvfx->query == pq);
 
-	WAIT_RING(chan, 4);
-	OUT_RING(chan, RING_3D(NV30_3D_QUERY_GET, 1));
+	BEGIN_RING(chan, eng3d, NV30_3D_QUERY_GET, 1);
 	OUT_RING  (chan, (0x01 << NV30_3D_QUERY_GET_UNK24__SHIFT) |
 		   ((q->object->start * 32) << NV30_3D_QUERY_GET_OFFSET__SHIFT));
-	OUT_RING(chan, RING_3D(NV30_3D_QUERY_ENABLE, 1));
+	BEGIN_RING(chan, eng3d, NV30_3D_QUERY_ENABLE, 1);
 	OUT_RING(chan, 0);
 	FIRE_RING(chan);
 
diff --git a/src/gallium/drivers/nvfx/nvfx_screen.c b/src/gallium/drivers/nvfx/nvfx_screen.c
index 92e1d33090..aa1e9567d3 100644
--- a/src/gallium/drivers/nvfx/nvfx_screen.c
+++ b/src/gallium/drivers/nvfx/nvfx_screen.c
@@ -301,98 +301,100 @@ nvfx_screen_destroy(struct pipe_screen *pscreen)
 static void nv30_screen_init(struct nvfx_screen *screen)
 {
 	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *eng3d = screen->eng3d;
 	int i;
 
 	/* TODO: perhaps we should do some of this on nv40 too? */
 	for (i=1; i<8; i++) {
-		OUT_RING(chan, RING_3D(NV30_3D_VIEWPORT_CLIP_HORIZ(i), 1));
+		BEGIN_RING(chan, eng3d, NV30_3D_VIEWPORT_CLIP_HORIZ(i), 1);
 		OUT_RING(chan, 0);
-		OUT_RING(chan, RING_3D(NV30_3D_VIEWPORT_CLIP_VERT(i), 1));
+		BEGIN_RING(chan, eng3d, NV30_3D_VIEWPORT_CLIP_VERT(i), 1);
 		OUT_RING(chan, 0);
 	}
 
-	OUT_RING(chan, RING_3D(0x220, 1));
+	BEGIN_RING(chan, eng3d, 0x220, 1);
 	OUT_RING(chan, 1);
 
-	OUT_RING(chan, RING_3D(0x03b0, 1));
+	BEGIN_RING(chan, eng3d, 0x03b0, 1);
 	OUT_RING(chan, 0x00100000);
-	OUT_RING(chan, RING_3D(0x1454, 1));
+	BEGIN_RING(chan, eng3d, 0x1454, 1);
 	OUT_RING(chan, 0);
-	OUT_RING(chan, RING_3D(0x1d80, 1));
+	BEGIN_RING(chan, eng3d, 0x1d80, 1);
 	OUT_RING(chan, 3);
-	OUT_RING(chan, RING_3D(0x1450, 1));
+	BEGIN_RING(chan, eng3d, 0x1450, 1);
 	OUT_RING(chan, 0x00030004);
 
 	/* NEW */
-	OUT_RING(chan, RING_3D(0x1e98, 1));
+	BEGIN_RING(chan, eng3d, 0x1e98, 1);
 	OUT_RING(chan, 0);
-	OUT_RING(chan, RING_3D(0x17e0, 3));
+	BEGIN_RING(chan, eng3d, 0x17e0, 3);
 	OUT_RING(chan, fui(0.0));
 	OUT_RING(chan, fui(0.0));
 	OUT_RING(chan, fui(1.0));
-	OUT_RING(chan, RING_3D(0x1f80, 16));
+	BEGIN_RING(chan, eng3d, 0x1f80, 16);
 	for (i=0; i<16; i++) {
 		OUT_RING(chan, (i==8) ? 0x0000ffff : 0);
 	}
 
-	OUT_RING(chan, RING_3D(0x120, 3));
+	BEGIN_RING(chan, eng3d, 0x120, 3);
 	OUT_RING(chan, 0);
 	OUT_RING(chan, 1);
 	OUT_RING(chan, 2);
 
-	OUT_RING(chan, RING_3D(0x1d88, 1));
+	BEGIN_RING(chan, eng3d, 0x1d88, 1);
 	OUT_RING(chan, 0x00001200);
 
-	OUT_RING(chan, RING_3D(NV30_3D_RC_ENABLE, 1));
+	BEGIN_RING(chan, eng3d, NV30_3D_RC_ENABLE, 1);
 	OUT_RING(chan, 0);
 
-	OUT_RING(chan, RING_3D(NV30_3D_DEPTH_RANGE_NEAR, 2));
+	BEGIN_RING(chan, eng3d, NV30_3D_DEPTH_RANGE_NEAR, 2);
 	OUT_RING(chan, fui(0.0));
 	OUT_RING(chan, fui(1.0));
 
-	OUT_RING(chan, RING_3D(NV30_3D_MULTISAMPLE_CONTROL, 1));
+	BEGIN_RING(chan, eng3d, NV30_3D_MULTISAMPLE_CONTROL, 1);
 	OUT_RING(chan, 0xffff0000);
 
 	/* enables use of vp rather than fixed-function somehow */
-	OUT_RING(chan, RING_3D(0x1e94, 1));
+	BEGIN_RING(chan, eng3d, 0x1e94, 1);
 	OUT_RING(chan, 0x13);
 }
 
 static void nv40_screen_init(struct nvfx_screen *screen)
 {
 	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *eng3d = screen->eng3d;
 
-	OUT_RING(chan, RING_3D(NV40_3D_DMA_COLOR2, 2));
+	BEGIN_RING(chan, eng3d, NV40_3D_DMA_COLOR2, 2);
 	OUT_RING(chan, screen->base.channel->vram->handle);
 	OUT_RING(chan, screen->base.channel->vram->handle);
 
-	OUT_RING(chan, RING_3D(0x1450, 1));
+	BEGIN_RING(chan, eng3d, 0x1450, 1);
 	OUT_RING(chan, 0x00000004);
 
-	OUT_RING(chan, RING_3D(0x1ea4, 3));
+	BEGIN_RING(chan, eng3d, 0x1ea4, 3);
 	OUT_RING(chan, 0x00000010);
 	OUT_RING(chan, 0x01000100);
 	OUT_RING(chan, 0xff800006);
 
 	/* vtxprog output routing */
-	OUT_RING(chan, RING_3D(0x1fc4, 1));
+	BEGIN_RING(chan, eng3d, 0x1fc4, 1);
 	OUT_RING(chan, 0x06144321);
-	OUT_RING(chan, RING_3D(0x1fc8, 2));
+	BEGIN_RING(chan, eng3d, 0x1fc8, 2);
 	OUT_RING(chan, 0xedcba987);
 	OUT_RING(chan, 0x0000006f);
-	OUT_RING(chan, RING_3D(0x1fd0, 1));
+	BEGIN_RING(chan, eng3d, 0x1fd0, 1);
 	OUT_RING(chan, 0x00171615);
-	OUT_RING(chan, RING_3D(0x1fd4, 1));
+	BEGIN_RING(chan, eng3d, 0x1fd4, 1);
 	OUT_RING(chan, 0x001b1a19);
 
-	OUT_RING(chan, RING_3D(0x1ef8, 1));
+	BEGIN_RING(chan, eng3d, 0x1ef8, 1);
 	OUT_RING(chan, 0x0020ffff);
-	OUT_RING(chan, RING_3D(0x1d64, 1));
+	BEGIN_RING(chan, eng3d, 0x1d64, 1);
 	OUT_RING(chan, 0x01d300d4);
-	OUT_RING(chan, RING_3D(0x1e94, 1));
+	BEGIN_RING(chan, eng3d, 0x1e94, 1);
 	OUT_RING(chan, 0x00000001);
 
-	OUT_RING(chan, RING_3D(NV40_3D_MIPMAP_ROUNDING, 1));
+	BEGIN_RING(chan, eng3d, NV40_3D_MIPMAP_ROUNDING, 1);
 	OUT_RING(chan, NV40_3D_MIPMAP_ROUNDING_MODE_DOWN);
 }
 
@@ -571,25 +573,25 @@ nvfx_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 
 	/* Static eng3d initialisation */
 	/* note that we just started using the channel, so we must have space in the pushbuffer */
-	OUT_RING(chan, RING_3D(NV30_3D_DMA_NOTIFY, 1));
+	BEGIN_RING(chan, screen->eng3d, NV30_3D_DMA_NOTIFY, 1);
 	OUT_RING(chan, screen->sync->handle);
-	OUT_RING(chan, RING_3D(NV30_3D_DMA_TEXTURE0, 2));
+	BEGIN_RING(chan, screen->eng3d, NV30_3D_DMA_TEXTURE0, 2);
 	OUT_RING(chan, chan->vram->handle);
 	OUT_RING(chan, chan->gart->handle);
-	OUT_RING(chan, RING_3D(NV30_3D_DMA_COLOR1, 1));
+	BEGIN_RING(chan, screen->eng3d, NV30_3D_DMA_COLOR1, 1);
 	OUT_RING(chan, chan->vram->handle);
-	OUT_RING(chan, RING_3D(NV30_3D_DMA_COLOR0, 2));
+	BEGIN_RING(chan, screen->eng3d, NV30_3D_DMA_COLOR0, 2);
 	OUT_RING(chan, chan->vram->handle);
 	OUT_RING(chan, chan->vram->handle);
-	OUT_RING(chan, RING_3D(NV30_3D_DMA_VTXBUF0, 2));
+	BEGIN_RING(chan, screen->eng3d, NV30_3D_DMA_VTXBUF0, 2);
 	OUT_RING(chan, chan->vram->handle);
 	OUT_RING(chan, chan->gart->handle);
 
-	OUT_RING(chan, RING_3D(NV30_3D_DMA_FENCE, 2));
+	BEGIN_RING(chan, screen->eng3d, NV30_3D_DMA_FENCE, 2);
 	OUT_RING(chan, 0);
 	OUT_RING(chan, screen->query->handle);
 
-	OUT_RING(chan, RING_3D(NV30_3D_DMA_UNK1AC, 2));
+	BEGIN_RING(chan, screen->eng3d, NV30_3D_DMA_UNK1AC, 2);
 	OUT_RING(chan, chan->vram->handle);
 	OUT_RING(chan, chan->vram->handle);
 
diff --git a/src/gallium/drivers/nvfx/nvfx_state_emit.c b/src/gallium/drivers/nvfx/nvfx_state_emit.c
index 501fdd4430..40ae4f5bd2 100644
--- a/src/gallium/drivers/nvfx/nvfx_state_emit.c
+++ b/src/gallium/drivers/nvfx/nvfx_state_emit.c
@@ -7,11 +7,11 @@ void
 nvfx_state_viewport_validate(struct nvfx_context *nvfx)
 {
 	struct nouveau_channel *chan = nvfx->screen->base.channel;
+	struct nouveau_grobj *eng3d = nvfx->screen->eng3d;
 	struct pipe_viewport_state *vpt = &nvfx->viewport;
 
-	WAIT_RING(chan, 11);
 	if(nvfx->render_mode == HW) {
-		OUT_RING(chan, RING_3D(NV30_3D_VIEWPORT_TRANSLATE_X, 8));
+		BEGIN_RING(chan, eng3d, NV30_3D_VIEWPORT_TRANSLATE_X, 8);
 		OUT_RINGf(chan, vpt->translate[0]);
 		OUT_RINGf(chan, vpt->translate[1]);
 		OUT_RINGf(chan, vpt->translate[2]);
@@ -20,10 +20,10 @@ nvfx_state_viewport_validate(struct nvfx_context *nvfx)
 		OUT_RINGf(chan, vpt->scale[1]);
 		OUT_RINGf(chan, vpt->scale[2]);
 		OUT_RINGf(chan, vpt->scale[3]);
-		OUT_RING(chan, RING_3D(0x1d78, 1));
+		BEGIN_RING(chan, eng3d, 0x1d78, 1);
 		OUT_RING(chan, 1);
 	} else {
-		OUT_RING(chan, RING_3D(NV30_3D_VIEWPORT_TRANSLATE_X, 8));
+		BEGIN_RING(chan, eng3d, NV30_3D_VIEWPORT_TRANSLATE_X, 8);
 		OUT_RINGf(chan, 0.0f);
 		OUT_RINGf(chan, 0.0f);
 		OUT_RINGf(chan, 0.0f);
@@ -32,7 +32,7 @@ nvfx_state_viewport_validate(struct nvfx_context *nvfx)
 		OUT_RINGf(chan, 1.0f);
 		OUT_RINGf(chan, 1.0f);
 		OUT_RINGf(chan, 1.0f);
-		OUT_RING(chan, RING_3D(0x1d78, 1));
+		BEGIN_RING(chan, eng3d, 0x1d78, 1);
 		OUT_RING(chan, nvfx->is_nv4x ? 0x110 : 1);
 	}
 }
@@ -41,6 +41,7 @@ void
 nvfx_state_scissor_validate(struct nvfx_context *nvfx)
 {
 	struct nouveau_channel *chan = nvfx->screen->base.channel;
+	struct nouveau_grobj *eng3d = nvfx->screen->eng3d;
 	struct pipe_rasterizer_state *rast = &nvfx->rasterizer->pipe;
 	struct pipe_scissor_state *s = &nvfx->scissor;
 
@@ -48,8 +49,7 @@ nvfx_state_scissor_validate(struct nvfx_context *nvfx)
 		return;
 	nvfx->state.scissor_enabled = rast->scissor;
 
-	WAIT_RING(chan, 3);
-	OUT_RING(chan, RING_3D(NV30_3D_SCISSOR_HORIZ, 2));
+	BEGIN_RING(chan, eng3d, NV30_3D_SCISSOR_HORIZ, 2);
 	if (nvfx->state.scissor_enabled) {
 		OUT_RING(chan, ((s->maxx - s->minx) << 16) | s->minx);
 		OUT_RING(chan, ((s->maxy - s->miny) << 16) | s->miny);
@@ -63,12 +63,12 @@ void
 nvfx_state_sr_validate(struct nvfx_context *nvfx)
 {
 	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	struct nouveau_grobj *eng3d = nvfx->screen->eng3d;
 	struct pipe_stencil_ref *sr = &nvfx->stencil_ref;
 
-	WAIT_RING(chan, 4);
-	OUT_RING(chan, RING_3D(NV30_3D_STENCIL_FUNC_REF(0), 1));
+	BEGIN_RING(chan, eng3d, NV30_3D_STENCIL_FUNC_REF(0), 1);
 	OUT_RING(chan, sr->ref_value[0]);
-	OUT_RING(chan, RING_3D(NV30_3D_STENCIL_FUNC_REF(1), 1));
+	BEGIN_RING(chan, eng3d, NV30_3D_STENCIL_FUNC_REF(1), 1);
 	OUT_RING(chan, sr->ref_value[1]);
 }
 
@@ -76,10 +76,10 @@ void
 nvfx_state_blend_colour_validate(struct nvfx_context *nvfx)
 {
 	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	struct nouveau_grobj *eng3d = nvfx->screen->eng3d;
 	struct pipe_blend_color *bcol = &nvfx->blend_colour;
 
-	WAIT_RING(chan, 2);
-	OUT_RING(chan, RING_3D(NV30_3D_BLEND_COLOR, 1));
+	BEGIN_RING(chan, eng3d, NV30_3D_BLEND_COLOR, 1);
 	OUT_RING(chan, ((float_to_ubyte(bcol->color[3]) << 24) |
 		       (float_to_ubyte(bcol->color[0]) << 16) |
 		       (float_to_ubyte(bcol->color[1]) <<  8) |
@@ -90,9 +90,9 @@ void
 nvfx_state_stipple_validate(struct nvfx_context *nvfx)
 {
 	struct nouveau_channel *chan = nvfx->screen->base.channel;
+	struct nouveau_grobj *eng3d = nvfx->screen->eng3d;
 
-	WAIT_RING(chan, 33);
-	OUT_RING(chan, RING_3D(NV30_3D_POLYGON_STIPPLE_PATTERN(0), 32));
+	BEGIN_RING(chan, eng3d, NV30_3D_POLYGON_STIPPLE_PATTERN(0), 32);
 	OUT_RINGp(chan, nvfx->stipple, 32);
 }
 
@@ -100,12 +100,12 @@ static void
 nvfx_coord_conventions_validate(struct nvfx_context* nvfx)
 {
 	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	struct nouveau_grobj *eng3d = nvfx->screen->eng3d;
 	unsigned value = nvfx->hw_fragprog->coord_conventions;
 	if(value & NV30_3D_COORD_CONVENTIONS_ORIGIN_INVERTED)
 		value |= nvfx->framebuffer.height << NV30_3D_COORD_CONVENTIONS_HEIGHT__SHIFT;
 
-	WAIT_RING(chan, 2);
-	OUT_RING(chan, RING_3D(NV30_3D_COORD_CONVENTIONS, 1));
+	BEGIN_RING(chan, eng3d, NV30_3D_COORD_CONVENTIONS, 1);
 	OUT_RING(chan, value);
 }
 
@@ -113,6 +113,7 @@ static void
 nvfx_ucp_validate(struct nvfx_context* nvfx)
 {
 	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	struct nouveau_grobj *eng3d = nvfx->screen->eng3d;
 	unsigned enables[7] =
 	{
 			0,
@@ -126,17 +127,15 @@ nvfx_ucp_validate(struct nvfx_context* nvfx)
 
 	if(!nvfx->use_vp_clipping)
 	{
-		WAIT_RING(chan, 2);
-		OUT_RING(chan, RING_3D(NV30_3D_VP_CLIP_PLANES_ENABLE, 1));
+		BEGIN_RING(chan, eng3d, NV30_3D_VP_CLIP_PLANES_ENABLE, 1);
 		OUT_RING(chan, 0);
 
-		WAIT_RING(chan, 6 * 4 + 1);
-		OUT_RING(chan, RING_3D(NV30_3D_VP_CLIP_PLANE(0, 0), nvfx->clip.nr * 4));
+		BEGIN_RING(chan, eng3d, NV30_3D_VP_CLIP_PLANE(0, 0),
+			   nvfx->clip.nr * 4);
 		OUT_RINGp(chan, &nvfx->clip.ucp[0][0], nvfx->clip.nr * 4);
 	}
 
-	WAIT_RING(chan, 2);
-	OUT_RING(chan, RING_3D(NV30_3D_VP_CLIP_PLANES_ENABLE, 1));
+	BEGIN_RING(chan, eng3d, NV30_3D_VP_CLIP_PLANES_ENABLE, 1);
 	OUT_RING(chan, enables[nvfx->clip.nr]);
 }
 
@@ -144,38 +143,37 @@ static void
 nvfx_vertprog_ucp_validate(struct nvfx_context* nvfx)
 {
 	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	struct nouveau_grobj *eng3d = nvfx->screen->eng3d;
 	unsigned i;
 	struct nvfx_vertex_program* vp = nvfx->hw_vertprog;
 	if(nvfx->clip.nr != vp->clip_nr)
 	{
 		unsigned idx;
-		WAIT_RING(chan, 14);
 
 		/* remove last instruction bit */
 		if(vp->clip_nr >= 0)
 		{
 			idx = vp->nr_insns - 7 + vp->clip_nr;
-			OUT_RING(chan, RING_3D(NV30_3D_VP_UPLOAD_FROM_ID, 1));
+			BEGIN_RING(chan, eng3d, NV30_3D_VP_UPLOAD_FROM_ID, 1);
 			OUT_RING(chan,  vp->exec->start + idx);
-			OUT_RING(chan, RING_3D(NV30_3D_VP_UPLOAD_INST(0), 4));
+			BEGIN_RING(chan, eng3d, NV30_3D_VP_UPLOAD_INST(0), 4);
 			OUT_RINGp (chan, vp->insns[idx].data, 4);
 		}
 
 		 /* set last instruction bit */
 		idx = vp->nr_insns - 7 + nvfx->clip.nr;
-		OUT_RING(chan, RING_3D(NV30_3D_VP_UPLOAD_FROM_ID, 1));
+		BEGIN_RING(chan, eng3d, NV30_3D_VP_UPLOAD_FROM_ID, 1);
 		OUT_RING(chan,  vp->exec->start + idx);
-		OUT_RING(chan, RING_3D(NV30_3D_VP_UPLOAD_INST(0), 4));
+		BEGIN_RING(chan, eng3d, NV30_3D_VP_UPLOAD_INST(0), 4);
 		OUT_RINGp(chan, vp->insns[idx].data, 3);
 		OUT_RING(chan, vp->insns[idx].data[3] | 1);
 		vp->clip_nr = nvfx->clip.nr;
 	}
 
 	// TODO: only do this for the ones changed
-	WAIT_RING(chan, 6 * 6);
 	for(i = 0; i < nvfx->clip.nr; ++i)
 	{
-		OUT_RING(chan, RING_3D(NV30_3D_VP_UPLOAD_CONST_ID, 5));
+		BEGIN_RING(chan, eng3d, NV30_3D_VP_UPLOAD_CONST_ID, 5);
 		OUT_RING(chan, vp->data->start + i);
 		OUT_RINGp (chan, nvfx->clip.ucp[i], 4);
 	}
@@ -185,6 +183,7 @@ static boolean
 nvfx_state_validate_common(struct nvfx_context *nvfx)
 {
 	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	struct nouveau_grobj *eng3d = nvfx->screen->eng3d;
 	unsigned dirty;
 	unsigned still_dirty = 0;
 	int new_fb_mode = -1; /* 1 = all swizzled, 0 = make all linear */
@@ -287,8 +286,7 @@ nvfx_state_validate_common(struct nvfx_context *nvfx)
 
 		if(vp_output != nvfx->hw_vp_output)
 		{
-			WAIT_RING(chan, 2);
-			OUT_RING(chan, RING_3D(NV40_3D_VP_RESULT_EN, 1));
+			BEGIN_RING(chan, eng3d, NV40_3D_VP_RESULT_EN, 1);
 			OUT_RING(chan, vp_output);
 			nvfx->hw_vp_output = vp_output;
 		}
@@ -320,8 +318,7 @@ nvfx_state_validate_common(struct nvfx_context *nvfx)
 
 	if(dirty & NVFX_NEW_ZSA || (new_fb_mode >= 0))
 	{
-		WAIT_RING(chan, 3);
-		OUT_RING(chan, RING_3D(NV30_3D_DEPTH_WRITE_ENABLE, 2));
+		BEGIN_RING(chan, eng3d, NV30_3D_DEPTH_WRITE_ENABLE, 2);
 		OUT_RING(chan, nvfx->framebuffer.zsbuf && nvfx->zsa->pipe.depth.writemask);
 	        OUT_RING(chan, nvfx->framebuffer.zsbuf && nvfx->zsa->pipe.depth.enabled);
 	}
@@ -334,10 +331,9 @@ nvfx_state_validate_common(struct nvfx_context *nvfx)
 		// TODO: what about nv30?
 		if(nvfx->is_nv4x)
 		{
-			WAIT_RING(chan, 4);
-			OUT_RING(chan, RING_3D(NV40_3D_TEX_CACHE_CTL, 1));
+			BEGIN_RING(chan, eng3d, NV40_3D_TEX_CACHE_CTL, 1);
 			OUT_RING(chan, 2);
-			OUT_RING(chan, RING_3D(NV40_3D_TEX_CACHE_CTL, 1));
+			BEGIN_RING(chan, eng3d, NV40_3D_TEX_CACHE_CTL, 1);
 			OUT_RING(chan, 1);
 		}
 	}
diff --git a/src/gallium/drivers/nvfx/nvfx_state_fb.c b/src/gallium/drivers/nvfx/nvfx_state_fb.c
index 816bb89f2c..f9fed94044 100644
--- a/src/gallium/drivers/nvfx/nvfx_state_fb.c
+++ b/src/gallium/drivers/nvfx/nvfx_state_fb.c
@@ -96,6 +96,7 @@ nvfx_framebuffer_validate(struct nvfx_context *nvfx, unsigned prepare_result)
 {
 	struct pipe_framebuffer_state *fb = &nvfx->framebuffer;
 	struct nouveau_channel *chan = nvfx->screen->base.channel;
+	struct nouveau_grobj *eng3d = nvfx->screen->eng3d;
 	uint32_t rt_enable, rt_format;
 	int i;
 	unsigned rt_flags = NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM;
@@ -204,11 +205,11 @@ nvfx_framebuffer_validate(struct nvfx_context *nvfx, unsigned prepare_result)
 
 		//printf("rendering to bo %p [%i] at offset %i with pitch %i\n", rt0->bo, rt0->bo->handle, rt0->offset, pitch);
 
-		OUT_RING(chan, RING_3D(NV30_3D_DMA_COLOR0, 1));
+		BEGIN_RING(chan, eng3d, NV30_3D_DMA_COLOR0, 1);
 		OUT_RELOC(chan, rt0->bo, 0,
 			      rt_flags | NOUVEAU_BO_OR,
 			      chan->vram->handle, chan->gart->handle);
-		OUT_RING(chan, RING_3D(NV30_3D_COLOR0_PITCH, 2));
+		BEGIN_RING(chan, eng3d, NV30_3D_COLOR0_PITCH, 2);
 		OUT_RING(chan, pitch);
 		OUT_RELOC(chan, rt0->bo,
 			      rt0->offset, rt_flags | NOUVEAU_BO_LOW,
@@ -216,11 +217,11 @@ nvfx_framebuffer_validate(struct nvfx_context *nvfx, unsigned prepare_result)
 	}
 
 	if (rt_enable & NV30_3D_RT_ENABLE_COLOR1) {
-		OUT_RING(chan, RING_3D(NV30_3D_DMA_COLOR1, 1));
+		BEGIN_RING(chan, eng3d, NV30_3D_DMA_COLOR1, 1);
 		OUT_RELOC(chan, nvfx->hw_rt[1].bo, 0,
 			      rt_flags | NOUVEAU_BO_OR,
 			      chan->vram->handle, chan->gart->handle);
-		OUT_RING(chan, RING_3D(NV30_3D_COLOR1_OFFSET, 2));
+		BEGIN_RING(chan, eng3d, NV30_3D_COLOR1_OFFSET, 2);
 		OUT_RELOC(chan, nvfx->hw_rt[1].bo,
 				nvfx->hw_rt[1].offset, rt_flags | NOUVEAU_BO_LOW,
 			      0, 0);
@@ -230,68 +231,68 @@ nvfx_framebuffer_validate(struct nvfx_context *nvfx, unsigned prepare_result)
 	if(nvfx->is_nv4x)
 	{
 		if (rt_enable & NV40_3D_RT_ENABLE_COLOR2) {
-			OUT_RING(chan, RING_3D(NV40_3D_DMA_COLOR2, 1));
+			BEGIN_RING(chan, eng3d, NV40_3D_DMA_COLOR2, 1);
 			OUT_RELOC(chan, nvfx->hw_rt[2].bo, 0,
 				      rt_flags | NOUVEAU_BO_OR,
 				      chan->vram->handle, chan->gart->handle);
-			OUT_RING(chan, RING_3D(NV40_3D_COLOR2_OFFSET, 1));
+			BEGIN_RING(chan, eng3d, NV40_3D_COLOR2_OFFSET, 1);
 			OUT_RELOC(chan, nvfx->hw_rt[2].bo,
 				      nvfx->hw_rt[2].offset, rt_flags | NOUVEAU_BO_LOW,
 				      0, 0);
-			OUT_RING(chan, RING_3D(NV40_3D_COLOR2_PITCH, 1));
+			BEGIN_RING(chan, eng3d, NV40_3D_COLOR2_PITCH, 1);
 			OUT_RING(chan, nvfx->hw_rt[2].pitch);
 		}
 
 		if (rt_enable & NV40_3D_RT_ENABLE_COLOR3) {
-			OUT_RING(chan, RING_3D(NV40_3D_DMA_COLOR3, 1));
+			BEGIN_RING(chan, eng3d, NV40_3D_DMA_COLOR3, 1);
 			OUT_RELOC(chan, nvfx->hw_rt[3].bo, 0,
 				      rt_flags | NOUVEAU_BO_OR,
 				      chan->vram->handle, chan->gart->handle);
-			OUT_RING(chan, RING_3D(NV40_3D_COLOR3_OFFSET, 1));
+			BEGIN_RING(chan, eng3d, NV40_3D_COLOR3_OFFSET, 1);
 			OUT_RELOC(chan, nvfx->hw_rt[3].bo,
 					nvfx->hw_rt[3].offset, rt_flags | NOUVEAU_BO_LOW,
 				      0, 0);
-			OUT_RING(chan, RING_3D(NV40_3D_COLOR3_PITCH, 1));
+			BEGIN_RING(chan, eng3d, NV40_3D_COLOR3_PITCH, 1);
 			OUT_RING(chan, nvfx->hw_rt[3].pitch);
 		}
 	}
 
 	if (fb->zsbuf) {
-		OUT_RING(chan, RING_3D(NV30_3D_DMA_ZETA, 1));
+		BEGIN_RING(chan, eng3d, NV30_3D_DMA_ZETA, 1);
 		OUT_RELOC(chan, nvfx->hw_zeta.bo, 0,
 			      rt_flags | NOUVEAU_BO_OR,
 			      chan->vram->handle, chan->gart->handle);
-		OUT_RING(chan, RING_3D(NV30_3D_ZETA_OFFSET, 1));
+		BEGIN_RING(chan, eng3d, NV30_3D_ZETA_OFFSET, 1);
 		/* TODO: reverse engineer LMA */
 		OUT_RELOC(chan, nvfx->hw_zeta.bo,
 			     nvfx->hw_zeta.offset, rt_flags | NOUVEAU_BO_LOW, 0, 0);
 	        if(nvfx->is_nv4x) {
-			OUT_RING(chan, RING_3D(NV40_3D_ZETA_PITCH, 1));
+			BEGIN_RING(chan, eng3d, NV40_3D_ZETA_PITCH, 1);
 			OUT_RING(chan, nvfx->hw_zeta.pitch);
 		}
 	}
 	else if(nvfx->is_nv4x) {
-		OUT_RING(chan, RING_3D(NV40_3D_ZETA_PITCH, 1));
+		BEGIN_RING(chan, eng3d, NV40_3D_ZETA_PITCH, 1);
 		OUT_RING(chan, 64);
 	}
 
-	OUT_RING(chan, RING_3D(NV30_3D_RT_ENABLE, 1));
+	BEGIN_RING(chan, eng3d, NV30_3D_RT_ENABLE, 1);
 	OUT_RING(chan, rt_enable);
-	OUT_RING(chan, RING_3D(NV30_3D_RT_HORIZ, 3));
+	BEGIN_RING(chan, eng3d, NV30_3D_RT_HORIZ, 3);
 	OUT_RING(chan, (w << 16) | 0);
 	OUT_RING(chan, (h << 16) | 0);
 	OUT_RING(chan, rt_format);
-	OUT_RING(chan, RING_3D(NV30_3D_VIEWPORT_HORIZ, 2));
+	BEGIN_RING(chan, eng3d, NV30_3D_VIEWPORT_HORIZ, 2);
 	OUT_RING(chan, (w << 16) | 0);
 	OUT_RING(chan, (h << 16) | 0);
-	OUT_RING(chan, RING_3D(NV30_3D_VIEWPORT_CLIP_HORIZ(0), 2));
+	BEGIN_RING(chan, eng3d, NV30_3D_VIEWPORT_CLIP_HORIZ(0), 2);
 	OUT_RING(chan, ((w - 1) << 16) | 0);
 	OUT_RING(chan, ((h - 1) << 16) | 0);
 
 	if(!nvfx->is_nv4x) {
 		/* Wonder why this is needed, context should all be set to zero on init */
 		/* TODO: we can most likely remove this, after putting it in context init */
-		OUT_RING(chan, RING_3D(NV30_3D_VIEWPORT_TX_ORIGIN, 1));
+		BEGIN_RING(chan, eng3d, NV30_3D_VIEWPORT_TX_ORIGIN, 1);
 		OUT_RING(chan, 0);
 	}
 	nvfx->relocs_needed &=~ NVFX_RELOCATE_FRAMEBUFFER;
diff --git a/src/gallium/drivers/nvfx/nvfx_surface.c b/src/gallium/drivers/nvfx/nvfx_surface.c
index 6fd6c47081..be31853d71 100644
--- a/src/gallium/drivers/nvfx/nvfx_surface.c
+++ b/src/gallium/drivers/nvfx/nvfx_surface.c
@@ -168,8 +168,8 @@ nvfx_get_blitter(struct pipe_context* pipe, int copy)
 	if(nvfx->query && !nvfx->blitters_in_use)
 	{
 		struct nouveau_channel* chan = nvfx->screen->base.channel;
-		WAIT_RING(chan, 2);
-		OUT_RING(chan, RING_3D(NV30_3D_QUERY_ENABLE, 1));
+		struct nouveau_grobj *eng3d = nvfx->screen->eng3d;
+		BEGIN_RING(chan, eng3d, NV30_3D_QUERY_ENABLE, 1);
 		OUT_RING(chan, 0);
 	}
 
@@ -209,8 +209,8 @@ nvfx_put_blitter(struct pipe_context* pipe, struct blitter_context* blitter)
 	if(nvfx->query && !nvfx->blitters_in_use)
 	{
 		struct nouveau_channel* chan = nvfx->screen->base.channel;
-		WAIT_RING(chan, 2);
-		OUT_RING(chan, RING_3D(NV30_3D_QUERY_ENABLE, 1));
+		struct nouveau_grobj *eng3d = nvfx->screen->eng3d;
+		BEGIN_RING(chan, eng3d, NV30_3D_QUERY_ENABLE, 1);
 		OUT_RING(chan, 1);
 	}
 }
diff --git a/src/gallium/drivers/nvfx/nvfx_vbo.c b/src/gallium/drivers/nvfx/nvfx_vbo.c
index 339b31786d..1c88f5f016 100644
--- a/src/gallium/drivers/nvfx/nvfx_vbo.c
+++ b/src/gallium/drivers/nvfx/nvfx_vbo.c
@@ -246,6 +246,7 @@ boolean
 nvfx_vbo_validate(struct nvfx_context *nvfx)
 {
 	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	struct nouveau_grobj *eng3d = nvfx->screen->eng3d;
 	int i;
 	int elements = MAX2(nvfx->vtxelt->num_elements, nvfx->hw_vtxelt_nr);
 	unsigned vb_flags = nvfx->screen->vertex_buffer_reloc_flags | NOUVEAU_BO_RD;
@@ -261,11 +262,11 @@ nvfx_vbo_validate(struct nvfx_context *nvfx)
 		struct nvfx_buffer* buffer = nvfx_buffer(vb->buffer);
 		float v[4];
 		ve->fetch_rgba_float(v, buffer->data + vb->buffer_offset + ve->src_offset, 0, 0);
-		nvfx_emit_vtx_attr(chan, ve->idx, v, ve->ncomp);
+		nvfx_emit_vtx_attr(chan, eng3d, ve->idx, v, ve->ncomp);
 	}
 
 
-	OUT_RING(chan, RING_3D(NV30_3D_VTXFMT(0), elements));
+	BEGIN_RING(chan, eng3d, NV30_3D_VTXFMT(0), elements);
 	if(nvfx->use_vertex_buffers)
 	{
 		unsigned idx = 0;
@@ -296,12 +297,12 @@ nvfx_vbo_validate(struct nvfx_context *nvfx)
 		unsigned i;
 		/* seems to be some kind of cache flushing */
 		for(i = 0; i < 3; ++i) {
-			OUT_RING(chan, RING_3D(0x1718, 1));
+			BEGIN_RING(chan, eng3d, 0x1718, 1);
 			OUT_RING(chan, 0);
 		}
 	}
 
-	OUT_RING(chan, RING_3D(NV30_3D_VTXBUF(0), elements));
+	BEGIN_RING(chan, eng3d, NV30_3D_VTXBUF(0), elements);
 	if(nvfx->use_vertex_buffers)
 	{
 		unsigned idx = 0;
@@ -329,7 +330,7 @@ nvfx_vbo_validate(struct nvfx_context *nvfx)
 			OUT_RING(chan, 0);
 	}
 
-	OUT_RING(chan, RING_3D(0x1710, 1));
+	BEGIN_RING(chan, eng3d, 0x1710, 1);
 	OUT_RING(chan, 0);
 
 	nvfx->hw_vtxelt_nr = nvfx->vtxelt->num_elements;
@@ -341,15 +342,14 @@ void
 nvfx_vbo_swtnl_validate(struct nvfx_context *nvfx)
 {
 	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	struct nouveau_grobj *eng3d = nvfx->screen->eng3d;
 	unsigned num_outputs = nvfx->vertprog->draw_elements;
 	int elements = MAX2(num_outputs, nvfx->hw_vtxelt_nr);
 
 	if (!elements)
 		return;
 
-	WAIT_RING(chan, (1 + 6 + 1 + 2) + elements * 2);
-
-	OUT_RING(chan, RING_3D(NV30_3D_VTXFMT(0), elements));
+	BEGIN_RING(chan, eng3d, NV30_3D_VTXFMT(0), elements);
 	for(unsigned i = 0; i < num_outputs; ++i)
 		OUT_RING(chan, (4 << NV30_3D_VTXFMT_SIZE__SHIFT) | NV30_3D_VTXFMT_TYPE_V32_FLOAT);
 	for(unsigned i = num_outputs; i < elements; ++i)
@@ -359,16 +359,16 @@ nvfx_vbo_swtnl_validate(struct nvfx_context *nvfx)
 		unsigned i;
 		/* seems to be some kind of cache flushing */
 		for(i = 0; i < 3; ++i) {
-			OUT_RING(chan, RING_3D(0x1718, 1));
+			BEGIN_RING(chan, eng3d, 0x1718, 1);
 			OUT_RING(chan, 0);
 		}
 	}
 
-	OUT_RING(chan, RING_3D(NV30_3D_VTXBUF(0), elements));
+	BEGIN_RING(chan, eng3d, NV30_3D_VTXBUF(0), elements);
 	for (unsigned i = 0; i < elements; i++)
 		OUT_RING(chan, 0);
 
-	OUT_RING(chan, RING_3D(0x1710, 1));
+	BEGIN_RING(chan, eng3d, 0x1710, 1);
 	OUT_RING(chan, 0);
 
 	nvfx->hw_vtxelt_nr = num_outputs;
diff --git a/src/gallium/drivers/nvfx/nvfx_vertprog.c b/src/gallium/drivers/nvfx/nvfx_vertprog.c
index e543fda50e..a11941f3d5 100644
--- a/src/gallium/drivers/nvfx/nvfx_vertprog.c
+++ b/src/gallium/drivers/nvfx/nvfx_vertprog.c
@@ -1182,6 +1182,7 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx)
 {
 	struct nvfx_screen *screen = nvfx->screen;
 	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *eng3d = screen->eng3d;
 	struct nvfx_pipe_vertex_program *pvp = nvfx->vertprog;
 	struct nvfx_vertex_program* vp;
 	struct pipe_resource *constbuf;
@@ -1341,7 +1342,6 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx)
 		}
 		*/
 
-		WAIT_RING(chan, 6 * vp->nr_consts);
 		for (i = nvfx->use_vp_clipping ? 6 : 0; i < vp->nr_consts; i++) {
 			struct nvfx_vertex_program_data *vpd = &vp->consts[i];
 
@@ -1356,7 +1356,7 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx)
 
 			//printf("upload into %i + %i: %f %f %f %f\n", vp->data->start, i, vpd->value[0], vpd->value[1], vpd->value[2], vpd->value[3]);
 
-			OUT_RING(chan, RING_3D(NV30_3D_VP_UPLOAD_CONST_ID, 5));
+			BEGIN_RING(chan, eng3d, NV30_3D_VP_UPLOAD_CONST_ID, 5);
 			OUT_RING(chan, i + vp->data->start);
 			OUT_RINGp(chan, (uint32_t *)vpd->value, 4);
 		}
@@ -1364,11 +1364,10 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx)
 
 	/* Upload vtxprog */
 	if (upload_code) {
-		WAIT_RING(chan, 2 + 5 * vp->nr_insns);
-		OUT_RING(chan, RING_3D(NV30_3D_VP_UPLOAD_FROM_ID, 1));
+		BEGIN_RING(chan, eng3d, NV30_3D_VP_UPLOAD_FROM_ID, 1);
 		OUT_RING(chan, vp->exec->start);
 		for (i = 0; i < vp->nr_insns; i++) {
-			OUT_RING(chan, RING_3D(NV30_3D_VP_UPLOAD_INST(0), 4));
+			BEGIN_RING(chan, eng3d, NV30_3D_VP_UPLOAD_INST(0), 4);
 			//printf("%08x %08x %08x %08x\n", vp->insns[i].data[0], vp->insns[i].data[1], vp->insns[i].data[2], vp->insns[i].data[3]);
 			OUT_RINGp(chan, vp->insns[i].data, 4);
 		}
@@ -1377,11 +1376,10 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx)
 
 	if(nvfx->dirty & (NVFX_NEW_VERTPROG))
 	{
-		WAIT_RING(chan, 6);
-		OUT_RING(chan, RING_3D(NV30_3D_VP_START_FROM_ID, 1));
+		BEGIN_RING(chan, eng3d, NV30_3D_VP_START_FROM_ID, 1);
 		OUT_RING(chan, vp->exec->start);
 		if(nvfx->is_nv4x) {
-			OUT_RING(chan, RING_3D(NV40_3D_VP_ATTRIB_EN, 1));
+			BEGIN_RING(chan, eng3d, NV40_3D_VP_ATTRIB_EN, 1);
 			OUT_RING(chan, vp->ir);
 		}
 	}
diff --git a/src/gallium/drivers/r300/r300_chipset.c b/src/gallium/drivers/r300/r300_chipset.c
index 583e981a4d..2b183f62c5 100644
--- a/src/gallium/drivers/r300/r300_chipset.c
+++ b/src/gallium/drivers/r300/r300_chipset.c
@@ -366,7 +366,7 @@ void r300_parse_chipset(struct r300_capabilities* caps)
             caps->family = CHIP_FAMILY_RV530;
             caps->num_vert_fpus = 5;
             caps->is_r500 = TRUE;
-            /*caps->hiz_ram = RV530_HIZ_LIMIT;*/
+            caps->hiz_ram = RV530_HIZ_LIMIT;
             caps->zmask_ram = PIPE_ZMASK_SIZE;
             break;
 
diff --git a/src/gallium/drivers/r300/r300_context.c b/src/gallium/drivers/r300/r300_context.c
index 67b011a145..7ba8e71055 100644
--- a/src/gallium/drivers/r300/r300_context.c
+++ b/src/gallium/drivers/r300/r300_context.c
@@ -35,7 +35,9 @@
 #include "r300_screen_buffer.h"
 #include "r300_winsys.h"
 
-#include <inttypes.h>
+#ifdef HAVE_LLVM
+#include "gallivm/lp_bld_init.h"
+#endif
 
 static void r300_update_num_contexts(struct r300_screen *r300screen,
                                      int diff)
@@ -103,9 +105,14 @@ static void r300_destroy_context(struct pipe_context* context)
 
     if (r300->blitter)
         util_blitter_destroy(r300->blitter);
-    if (r300->draw)
+    if (r300->draw) {
         draw_destroy(r300->draw);
 
+#ifdef HAVE_LLVM
+        gallivm_destroy(r300->gallivm);
+#endif
+    }
+
     if (r300->upload_vb)
         u_upload_destroy(r300->upload_vb);
     if (r300->upload_ib)
@@ -424,7 +431,12 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
 
     if (!r300screen->caps.has_tcl) {
         /* Create a Draw. This is used for SW TCL. */
+#ifdef HAVE_LLVM
+        r300->gallivm = gallivm_create();
+        r300->draw = draw_create_gallivm(&r300->context, r300->gallivm);
+#else
         r300->draw = draw_create(&r300->context);
+#endif
         if (r300->draw == NULL)
             goto fail;
         /* Enable our renderer. */
@@ -458,14 +470,14 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
             goto fail;
 
     r300->upload_ib = u_upload_create(&r300->context,
-				      32 * 1024, 16,
+				      64 * 1024, 16,
 				      PIPE_BIND_INDEX_BUFFER);
 
     if (r300->upload_ib == NULL)
         goto fail;
 
     r300->upload_vb = u_upload_create(&r300->context,
-				      128 * 1024, 16,
+				      1024 * 1024, 16,
 				      PIPE_BIND_VERTEX_BUFFER);
     if (r300->upload_vb == NULL)
         goto fail;
diff --git a/src/gallium/drivers/r300/r300_context.h b/src/gallium/drivers/r300/r300_context.h
index 39dcde0610..f3a3df08bc 100644
--- a/src/gallium/drivers/r300/r300_context.h
+++ b/src/gallium/drivers/r300/r300_context.h
@@ -459,6 +459,7 @@ struct r300_context {
     struct r300_screen *screen;
 
     /* Draw module. Used mostly for SW TCL. */
+    struct gallivm_state *gallivm;
     struct draw_context* draw;
     /* Vertex buffer for SW TCL. */
     struct pipe_resource* vbo;
@@ -612,8 +613,8 @@ struct r300_context {
     int vs_const_base;
 
     /* AOS (PACKET3_3D_LOAD_VBPNTR) command buffer for the case offset=0. */
-    uint32_t aos_cb[(16 * 3 + 1) / 2];
-    boolean aos_dirty;
+    uint32_t vertex_arrays_cb[(16 * 3 + 1) / 2];
+    boolean vertex_arrays_dirty;
 
     /* Whether any buffer (FB, textures, VBOs) has been set, but buffers
      * haven't been validated yet. */
@@ -697,7 +698,8 @@ void r500_emit_index_bias(struct r300_context *r300, int index_bias);
 enum r300_fb_state_change {
     R300_CHANGED_FB_STATE = 0,
     R300_CHANGED_CBZB_FLAG,
-    R300_CHANGED_ZCLEAR_FLAG
+    R300_CHANGED_ZCLEAR_FLAG,
+    R300_CHANGED_MULTIWRITE /* NOTE(review): presumably "COLOR0 multiwrite use changed"; confirm in r300_mark_fb_state_dirty */
 };
 
 void r300_mark_fb_state_dirty(struct r300_context *r300,
diff --git a/src/gallium/drivers/r300/r300_emit.c b/src/gallium/drivers/r300/r300_emit.c
index 04a5bd92d1..e1a3714aac 100644
--- a/src/gallium/drivers/r300/r300_emit.c
+++ b/src/gallium/drivers/r300/r300_emit.c
@@ -26,7 +26,6 @@
 #include "util/u_format.h"
 #include "util/u_math.h"
 #include "util/u_mm.h"
-#include "util/u_simple_list.h"
 
 #include "r300_context.h"
 #include "r300_cb.h"
@@ -370,6 +369,8 @@ void r300_emit_fb_state(struct r300_context* r300, unsigned size, void* state)
     struct r300_surface* surf;
     unsigned i;
     boolean can_hyperz = r300->rws->get_value(r300->rws, R300_CAN_HYPERZ);
+    uint32_t rb3d_cctl = 0;
+
     CS_LOCALS(r300);
 
     BEGIN_CS(size);
@@ -377,11 +378,13 @@ void r300_emit_fb_state(struct r300_context* r300, unsigned size, void* state)
     /* NUM_MULTIWRITES replicates COLOR[0] to all colorbuffers, which is not
      * what we usually want. */
     if (r300->screen->caps.is_r500) {
-        OUT_CS_REG(R300_RB3D_CCTL,
-            R300_RB3D_CCTL_INDEPENDENT_COLORFORMAT_ENABLE_ENABLE);
-    } else {
-        OUT_CS_REG(R300_RB3D_CCTL, 0);
+        rb3d_cctl = R300_RB3D_CCTL_INDEPENDENT_COLORFORMAT_ENABLE_ENABLE;
     }
+    if (r300_fragment_shader_writes_all(r300_fs(r300))) {
+        rb3d_cctl |= R300_RB3D_CCTL_NUM_MULTIWRITES(fb->nr_cbufs);
+    }
+
+    OUT_CS_REG(R300_RB3D_CCTL, rb3d_cctl);
 
     /* Set up colorbuffers. */
     for (i = 0; i < fb->nr_cbufs; i++) {
@@ -483,15 +486,21 @@ void r300_emit_fb_state_pipelined(struct r300_context *r300,
 {
     struct pipe_framebuffer_state* fb =
             (struct pipe_framebuffer_state*)r300->fb_state.state;
-    unsigned i;
+    unsigned i, num_cbufs = fb->nr_cbufs;
     CS_LOCALS(r300);
 
+    /* If we use the multiwrite feature, the colorbuffers 2,3,4 must be
+     * marked as UNUSED in the US block. */
+    if (r300_fragment_shader_writes_all(r300_fs(r300))) {
+        num_cbufs = MIN2(num_cbufs, 1);
+    }
+
     BEGIN_CS(size);
 
     /* Colorbuffer format in the US block.
      * (must be written after unpipelined regs) */
     OUT_CS_REG_SEQ(R300_US_OUT_FMT_0, 4);
-    for (i = 0; i < fb->nr_cbufs; i++) {
+    for (i = 0; i < num_cbufs; i++) {
         OUT_CS(r300_surface(fb->cbufs[i])->format);
     }
     for (; i < 4; i++) {
@@ -807,17 +816,17 @@ void r300_emit_textures_state(struct r300_context *r300,
     END_CS;
 }
 
-static void r300_update_aos_cb(struct r300_context *r300, unsigned packet_size)
+static void r300_update_vertex_arrays_cb(struct r300_context *r300, unsigned packet_size)
 {
     struct pipe_vertex_buffer *vb1, *vb2, *vbuf = r300->vertex_buffer;
     struct pipe_vertex_element *velem = r300->velems->velem;
     unsigned *hw_format_size = r300->velems->hw_format_size;
-    unsigned size1, size2, aos_count = r300->velems->count;
+    unsigned size1, size2, vertex_array_count = r300->velems->count;
     int i;
     CB_LOCALS;
 
-    BEGIN_CB(r300->aos_cb, packet_size);
-    for (i = 0; i < aos_count - 1; i += 2) {
+    BEGIN_CB(r300->vertex_arrays_cb, packet_size);
+    for (i = 0; i < vertex_array_count - 1; i += 2) {
         vb1 = &vbuf[velem[i].vertex_buffer_index];
         vb2 = &vbuf[velem[i+1].vertex_buffer_index];
         size1 = hw_format_size[i];
@@ -829,7 +838,7 @@ static void r300_update_aos_cb(struct r300_context *r300, unsigned packet_size)
         OUT_CB(vb2->buffer_offset + velem[i+1].src_offset);
     }
 
-    if (aos_count & 1) {
+    if (vertex_array_count & 1) {
         vb1 = &vbuf[velem[i].vertex_buffer_index];
         size1 = hw_format_size[i];
 
@@ -838,34 +847,34 @@ static void r300_update_aos_cb(struct r300_context *r300, unsigned packet_size)
     }
     END_CB;
 
-    r300->aos_dirty = FALSE;
+    r300->vertex_arrays_dirty = FALSE;
 }
 
-void r300_emit_aos(struct r300_context* r300, int offset, boolean indexed)
+void r300_emit_vertex_arrays(struct r300_context* r300, int offset, boolean indexed)
 {
     struct pipe_vertex_buffer *vbuf = r300->vertex_buffer;
     struct pipe_vertex_element *velem = r300->velems->velem;
     struct r300_buffer *buf;
     int i;
-    unsigned aos_count = r300->velems->count;
-    unsigned packet_size = (aos_count * 3 + 1) / 2;
+    unsigned vertex_array_count = r300->velems->count;
+    unsigned packet_size = (vertex_array_count * 3 + 1) / 2;
     CS_LOCALS(r300);
 
-    BEGIN_CS(2 + packet_size + aos_count * 2);
+    BEGIN_CS(2 + packet_size + vertex_array_count * 2);
     OUT_CS_PKT3(R300_PACKET3_3D_LOAD_VBPNTR, packet_size);
-    OUT_CS(aos_count | (!indexed ? R300_VC_FORCE_PREFETCH : 0));
+    OUT_CS(vertex_array_count | (!indexed ? R300_VC_FORCE_PREFETCH : 0));
 
     if (!offset) {
-        if (r300->aos_dirty) {
-            r300_update_aos_cb(r300, packet_size);
+        if (r300->vertex_arrays_dirty) {
+            r300_update_vertex_arrays_cb(r300, packet_size);
         }
-        OUT_CS_TABLE(r300->aos_cb, packet_size);
+        OUT_CS_TABLE(r300->vertex_arrays_cb, packet_size);
     } else {
         struct pipe_vertex_buffer *vb1, *vb2;
         unsigned *hw_format_size = r300->velems->hw_format_size;
         unsigned size1, size2;
 
-        for (i = 0; i < aos_count - 1; i += 2) {
+        for (i = 0; i < vertex_array_count - 1; i += 2) {
             vb1 = &vbuf[velem[i].vertex_buffer_index];
             vb2 = &vbuf[velem[i+1].vertex_buffer_index];
             size1 = hw_format_size[i];
@@ -877,7 +886,7 @@ void r300_emit_aos(struct r300_context* r300, int offset, boolean indexed)
             OUT_CS(vb2->buffer_offset + velem[i+1].src_offset + offset * vb2->stride);
         }
 
-        if (aos_count & 1) {
+        if (vertex_array_count & 1) {
             vb1 = &vbuf[velem[i].vertex_buffer_index];
             size1 = hw_format_size[i];
 
@@ -886,14 +895,14 @@ void r300_emit_aos(struct r300_context* r300, int offset, boolean indexed)
         }
     }
 
-    for (i = 0; i < aos_count; i++) {
+    for (i = 0; i < vertex_array_count; i++) {
         buf = r300_buffer(vbuf[velem[i].vertex_buffer_index].buffer);
         OUT_CS_BUF_RELOC_NO_OFFSET(&buf->b.b, buf->domain, 0);
     }
     END_CS;
 }
 
-void r300_emit_aos_swtcl(struct r300_context *r300, boolean indexed)
+void r300_emit_vertex_arrays_swtcl(struct r300_context *r300, boolean indexed)
 {
     CS_LOCALS(r300);
 
diff --git a/src/gallium/drivers/r300/r300_emit.h b/src/gallium/drivers/r300/r300_emit.h
index 278dbcb4c7..acea51d942 100644
--- a/src/gallium/drivers/r300/r300_emit.h
+++ b/src/gallium/drivers/r300/r300_emit.h
@@ -31,7 +31,7 @@ struct r300_vertex_program_code;
 
 uint32_t pack_float24(float f);
 
-void r300_emit_aos(struct r300_context* r300, int offset, boolean indexed);
+void r300_emit_vertex_arrays(struct r300_context* r300, int offset, boolean indexed);
 
 void r300_emit_blend_state(struct r300_context* r300,
                            unsigned size, void* state);
@@ -86,7 +86,7 @@ void r300_emit_scissor_state(struct r300_context* r300,
 void r300_emit_textures_state(struct r300_context *r300,
                               unsigned size, void *state);
 
-void r300_emit_aos_swtcl(struct r300_context *r300, boolean indexed);
+void r300_emit_vertex_arrays_swtcl(struct r300_context *r300, boolean indexed);
 
 void r300_emit_vap_invariant_state(struct r300_context *r300,
                                    unsigned size, void *state);
diff --git a/src/gallium/drivers/r300/r300_fs.c b/src/gallium/drivers/r300/r300_fs.c
index 2936c3486e..6d4091dc87 100644
--- a/src/gallium/drivers/r300/r300_fs.c
+++ b/src/gallium/drivers/r300/r300_fs.c
@@ -395,6 +395,13 @@ static void r300_translate_fragment_shader(
 
     find_output_registers(&compiler, shader);
 
+    shader->write_all = FALSE;
+    for (i = 0; i < shader->info.num_properties; i++) {
+        if (shader->info.properties[i].name == TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS) {
+            shader->write_all = TRUE;
+        }
+    }
+
     if (compiler.Base.Debug & RC_DBG_LOG) {
         DBG(r300, DBG_FP, "r300: Initial fragment program\n");
         tgsi_dump(tokens, 0);
diff --git a/src/gallium/drivers/r300/r300_fs.h b/src/gallium/drivers/r300/r300_fs.h
index 51bfa88c5e..c86a90b85a 100644
--- a/src/gallium/drivers/r300/r300_fs.h
+++ b/src/gallium/drivers/r300/r300_fs.h
@@ -54,6 +54,9 @@ struct r300_fragment_shader_code {
     uint32_t *cb_code;
 
     struct r300_fragment_shader_code* next;
+
+    boolean write_all;
+
 };
 
 struct r300_fragment_shader {
@@ -81,4 +84,10 @@ static INLINE boolean r300_fragment_shader_writes_depth(struct r300_fragment_sha
     return (fs->shader->code.writes_depth) ? TRUE : FALSE;
 }
 
+/* TRUE when the shader's current code has write_all set; FALSE for a NULL fs. */
+static INLINE boolean r300_fragment_shader_writes_all(struct r300_fragment_shader *fs)
+{
+    /* The NULL test short-circuits, so fs->shader is only read when fs is set. */
+    return (fs && fs->shader->write_all) ? TRUE : FALSE;
+}
 #endif /* R300_FS_H */
diff --git a/src/gallium/drivers/r300/r300_reg.h b/src/gallium/drivers/r300/r300_reg.h
index 613186e815..d1154dee40 100644
--- a/src/gallium/drivers/r300/r300_reg.h
+++ b/src/gallium/drivers/r300/r300_reg.h
@@ -2631,8 +2631,8 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define R300_ZB_BW_CNTL                     0x4f1c
 #	define R300_HIZ_DISABLE                              (0 << 0)
 #	define R300_HIZ_ENABLE                               (1 << 0)
-#	define R300_HIZ_MIN                                  (0 << 1)
-#	define R300_HIZ_MAX                                  (1 << 1)
+#	define R300_HIZ_MAX                                  (0 << 1)
+#	define R300_HIZ_MIN                                  (1 << 1)
 #	define R300_FAST_FILL_DISABLE                        (0 << 2)
 #	define R300_FAST_FILL_ENABLE                         (1 << 2)
 #	define R300_RD_COMP_DISABLE                          (0 << 3)
diff --git a/src/gallium/drivers/r300/r300_render.c b/src/gallium/drivers/r300/r300_render.c
index f58d511e11..dd3b3c430a 100644
--- a/src/gallium/drivers/r300/r300_render.c
+++ b/src/gallium/drivers/r300/r300_render.c
@@ -130,7 +130,7 @@ void r500_emit_index_bias(struct r300_context *r300, int index_bias)
 
 /* This function splits the index bias value into two parts:
  * - buffer_offset: the value that can be safely added to buffer offsets
- *   in r300_emit_aos (it must yield a positive offset when added to
+ *   in r300_emit_vertex_arrays (it must yield a positive offset when added to
  *   a vertex buffer offset)
  * - index_offset: the value that must be manually subtracted from indices
  *   in an index buffer to achieve negative offsets. */
@@ -166,8 +166,8 @@ static void r300_split_index_bias(struct r300_context *r300, int index_bias,
 enum r300_prepare_flags {
     PREP_FIRST_DRAW     = (1 << 0), /* call emit_dirty_state and friends? */
     PREP_VALIDATE_VBOS  = (1 << 1), /* validate VBOs? */
-    PREP_EMIT_AOS       = (1 << 2), /* call emit_aos? */
-    PREP_EMIT_AOS_SWTCL = (1 << 3), /* call emit_aos_swtcl? */
+    PREP_EMIT_AOS       = (1 << 2), /* call emit_vertex_arrays? */
+    PREP_EMIT_AOS_SWTCL = (1 << 3), /* call emit_vertex_arrays_swtcl? */
     PREP_INDEXED        = (1 << 4)  /* is this draw_elements? */
 };
 
@@ -185,8 +185,8 @@ static boolean r300_reserve_cs_dwords(struct r300_context *r300,
 {
     boolean flushed        = FALSE;
     boolean first_draw     = flags & PREP_FIRST_DRAW;
-    boolean emit_aos       = flags & PREP_EMIT_AOS;
-    boolean emit_aos_swtcl = flags & PREP_EMIT_AOS_SWTCL;
+    boolean emit_vertex_arrays       = flags & PREP_EMIT_AOS;
+    boolean emit_vertex_arrays_swtcl = flags & PREP_EMIT_AOS_SWTCL;
 
     /* Add dirty state, index offset, and AOS. */
     if (first_draw) {
@@ -195,11 +195,11 @@ static boolean r300_reserve_cs_dwords(struct r300_context *r300,
         if (r300->screen->caps.index_bias_supported)
             cs_dwords += 2; /* emit_index_offset */
 
-        if (emit_aos)
-            cs_dwords += 55; /* emit_aos */
+        if (emit_vertex_arrays)
+            cs_dwords += 55; /* emit_vertex_arrays */
 
-        if (emit_aos_swtcl)
-            cs_dwords += 7; /* emit_aos_swtcl */
+        if (emit_vertex_arrays_swtcl)
+            cs_dwords += 7; /* emit_vertex_arrays_swtcl */
     }
 
     cs_dwords += r300_get_num_cs_end_dwords(r300);
@@ -218,19 +218,19 @@ static boolean r300_reserve_cs_dwords(struct r300_context *r300,
  * \param r300          The context.
  * \param flags         See r300_prepare_flags.
  * \param index_buffer  The index buffer to validate. The parameter may be NULL.
- * \param aos_offset    The offset passed to emit_aos.
+ * \param buffer_offset The offset passed to emit_vertex_arrays.
  * \param index_bias    The index bias to emit.
  * \return TRUE if rendering should be skipped
  */
 static boolean r300_emit_states(struct r300_context *r300,
                                 enum r300_prepare_flags flags,
                                 struct pipe_resource *index_buffer,
-                                int aos_offset,
+                                int buffer_offset,
                                 int index_bias)
 {
     boolean first_draw     = flags & PREP_FIRST_DRAW;
-    boolean emit_aos       = flags & PREP_EMIT_AOS;
-    boolean emit_aos_swtcl = flags & PREP_EMIT_AOS_SWTCL;
+    boolean emit_vertex_arrays       = flags & PREP_EMIT_AOS;
+    boolean emit_vertex_arrays_swtcl = flags & PREP_EMIT_AOS_SWTCL;
     boolean indexed        = flags & PREP_INDEXED;
     boolean validate_vbos  = flags & PREP_VALIDATE_VBOS;
 
@@ -264,11 +264,11 @@ static boolean r300_emit_states(struct r300_context *r300,
                 r500_emit_index_bias(r300, 0);
         }
 
-        if (emit_aos)
-            r300_emit_aos(r300, aos_offset, indexed);
+        if (emit_vertex_arrays)
+            r300_emit_vertex_arrays(r300, buffer_offset, indexed);
 
-        if (emit_aos_swtcl)
-            r300_emit_aos_swtcl(r300, indexed);
+        if (emit_vertex_arrays_swtcl)
+            r300_emit_vertex_arrays_swtcl(r300, indexed);
     }
 
     return TRUE;
@@ -281,7 +281,7 @@ static boolean r300_emit_states(struct r300_context *r300,
  * \param flags         See r300_prepare_flags.
  * \param index_buffer  The index buffer to validate. The parameter may be NULL.
  * \param cs_dwords     The number of dwords to reserve in CS.
- * \param aos_offset    The offset passed to emit_aos.
+ * \param buffer_offset The offset passed to emit_vertex_arrays.
  * \param index_bias    The index bias to emit.
  * \return TRUE if rendering should be skipped
  */
@@ -289,13 +289,13 @@ static boolean r300_prepare_for_rendering(struct r300_context *r300,
                                           enum r300_prepare_flags flags,
                                           struct pipe_resource *index_buffer,
                                           unsigned cs_dwords,
-                                          int aos_offset,
+                                          int buffer_offset,
                                           int index_bias)
 {
     if (r300_reserve_cs_dwords(r300, flags, cs_dwords))
         flags |= PREP_FIRST_DRAW;
 
-    return r300_emit_states(r300, flags, index_buffer, aos_offset, index_bias);
+    return r300_emit_states(r300, flags, index_buffer, buffer_offset, index_bias);
 }
 
 static boolean immd_is_good_idea(struct r300_context *r300,
@@ -467,10 +467,10 @@ static void r300_emit_draw_elements(struct r300_context *r300,
                                     unsigned maxIndex,
                                     unsigned mode,
                                     unsigned start,
-                                    unsigned count)
+                                    unsigned count,
+                                    uint16_t *imm_indices3)
 {
-    uint32_t count_dwords;
-    uint32_t offset_dwords = indexSize * start / sizeof(uint32_t);
+    uint32_t count_dwords, offset_dwords;
     boolean alt_num_verts = count > 65535;
     CS_LOCALS(r300);
 
@@ -485,15 +485,39 @@ static void r300_emit_draw_elements(struct r300_context *r300,
     DBG(r300, DBG_DRAW, "r300: Indexbuf of %u indices, min %u max %u\n",
         count, minIndex, maxIndex);
 
-    BEGIN_CS(13 + (alt_num_verts ? 2 : 0));
-    if (alt_num_verts) {
-        OUT_CS_REG(R500_VAP_ALT_NUM_VERTICES, count);
-    }
+    BEGIN_CS(5);
     OUT_CS_REG(R300_GA_COLOR_CONTROL,
             r300_provoking_vertex_fixes(r300, mode));
     OUT_CS_REG_SEQ(R300_VAP_VF_MAX_VTX_INDX, 2);
     OUT_CS(maxIndex);
     OUT_CS(minIndex);
+    END_CS;
+
+    /* If start is odd, render the first triangle with indices embedded
+     * in the command stream. This will increase start by 3 and make it
+     * even. We can then proceed without a fallback. */
+    if (indexSize == 2 && (start & 1) &&
+        mode == PIPE_PRIM_TRIANGLES) {
+        BEGIN_CS(4);
+        OUT_CS_PKT3(R300_PACKET3_3D_DRAW_INDX_2, 2);
+        OUT_CS(R300_VAP_VF_CNTL__PRIM_WALK_INDICES | (3 << 16) |
+               R300_VAP_VF_CNTL__PRIM_TRIANGLES);
+        /* Widen before shifting: a promoted int overflows for indices >= 0x8000. */
+        OUT_CS(((uint32_t)imm_indices3[1] << 16) | imm_indices3[0]);
+        OUT_CS(imm_indices3[2]);
+        END_CS;
+        start += 3;
+        count -= 3;
+        if (!count)
+            return;
+    }
+
+    offset_dwords = indexSize * start / sizeof(uint32_t);
+
+    BEGIN_CS(8 + (alt_num_verts ? 2 : 0));
+    if (alt_num_verts) {
+        OUT_CS_REG(R500_VAP_ALT_NUM_VERTICES, count);
+    }
     OUT_CS_PKT3(R300_PACKET3_3D_DRAW_INDX_2, 0);
     if (indexSize == 4) {
         count_dwords = count;
@@ -541,7 +565,7 @@ static void r300_draw_range_elements(struct pipe_context* pipe,
                             r300->rws->get_value(r300->rws, R300_VID_DRM_2_3_0);
     unsigned short_count;
     int buffer_offset = 0, index_offset = 0; /* for index bias emulation */
-    unsigned new_offset;
+    uint16_t indices3[3];
 
     if (indexBias && !r300->screen->caps.index_bias_supported) {
         r300_split_index_bias(r300, indexBias, &buffer_offset, &index_offset);
@@ -553,43 +577,48 @@ static void r300_draw_range_elements(struct pipe_context* pipe,
     r300_update_derived_state(r300);
 
     /* Fallback for misaligned ushort indices. */
-    if (indexSize == 2 && start % 2 == 1) {
+    if (indexSize == 2 && (start & 1) &&
+        !r300_buffer_is_user_buffer(indexBuffer)) {
         struct pipe_transfer *transfer;
         struct pipe_resource *userbuf;
+
         uint16_t *ptr = pipe_buffer_map(pipe, indexBuffer,
                                         PIPE_TRANSFER_READ, &transfer);
 
-        /* Copy the mapped index buffer directly to the upload buffer.
-         * The start index will be aligned simply from the fact that
-         * every sub-buffer in u_upload_mgr is aligned. */
-        userbuf = pipe->screen->user_buffer_create(pipe->screen,
-                                                   ptr + start, count * 2,
-                                                   PIPE_BIND_INDEX_BUFFER);
-        indexBuffer = userbuf;
-        r300_upload_index_buffer(r300, &indexBuffer, indexSize, 0, count, &new_offset);
-        pipe_resource_reference(&userbuf, NULL);
+        if (mode == PIPE_PRIM_TRIANGLES) {
+            memcpy(indices3, ptr + start, sizeof(indices3));
+        } else {
+            /* Copy the mapped index buffer directly to the upload buffer.
+             * The start index will be aligned simply from the fact that
+             * every sub-buffer in u_upload_mgr is aligned. */
+            userbuf = pipe->screen->user_buffer_create(pipe->screen,
+                                                       ptr, count * 2,
+                                                       PIPE_BIND_INDEX_BUFFER);
+            indexBuffer = userbuf;
+            r300_upload_index_buffer(r300, &indexBuffer, indexSize, &start, count);
+            pipe_resource_reference(&userbuf, NULL);
+        }
         pipe_buffer_unmap(pipe, transfer);
     } else {
-        r300_upload_index_buffer(r300, &indexBuffer, indexSize, start, count, &new_offset);
+        if (r300_buffer_is_user_buffer(indexBuffer))
+            r300_upload_index_buffer(r300, &indexBuffer, indexSize, &start, count);
     }
 
-    start = new_offset;
-
-    /* 15 dwords for emit_draw_elements. Give up if the function fails. */
+    /* 19 dwords for emit_draw_elements. Give up if the function fails. */
     if (!r300_prepare_for_rendering(r300,
             PREP_FIRST_DRAW | PREP_VALIDATE_VBOS | PREP_EMIT_AOS |
-            PREP_INDEXED, indexBuffer, 15, buffer_offset, indexBias))
+            PREP_INDEXED, indexBuffer, 19, buffer_offset, indexBias))
         goto done;
 
     if (alt_num_verts || count <= 65535) {
         r300_emit_draw_elements(r300, indexBuffer, indexSize,
-				minIndex, maxIndex, mode, start, count);
+				minIndex, maxIndex, mode, start, count, indices3);
     } else {
         do {
             short_count = MIN2(count, 65534);
             r300_emit_draw_elements(r300, indexBuffer, indexSize,
                                      minIndex, maxIndex,
-                                     mode, start, short_count);
+                                     mode, start, short_count, indices3);
 
             start += short_count;
             count -= short_count;
@@ -598,7 +627,7 @@ static void r300_draw_range_elements(struct pipe_context* pipe,
             if (count) {
                 if (!r300_prepare_for_rendering(r300,
                         PREP_VALIDATE_VBOS | PREP_EMIT_AOS | PREP_INDEXED,
-                        indexBuffer, 15, buffer_offset, indexBias))
+                        indexBuffer, 19, buffer_offset, indexBias))
                     goto done;
             }
         } while (count);
@@ -1041,8 +1070,7 @@ static struct vbuf_render* r300_render_create(struct r300_context* r300)
 
     r300render->r300 = r300;
 
-    /* XXX find real numbers plz */
-    r300render->base.max_vertex_buffer_bytes = 128 * 1024;
+    r300render->base.max_vertex_buffer_bytes = 1024 * 1024;
     r300render->base.max_indices = 16 * 1024;
 
     r300render->base.get_vertex_info = r300_render_get_vertex_info;
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index 921d6f1e67..c75aeaa10a 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -34,6 +34,10 @@
 
 #include "draw/draw_context.h"
 
+#ifdef HAVE_LLVM
+#include "gallivm/lp_bld_init.h"
+#endif
+
 /* Return the identifier behind whom the brave coders responsible for this
  * amalgamation of code, sweat, and duct tape, routinely obscure their names.
  *
@@ -309,7 +313,9 @@ static boolean r300_is_format_supported(struct pipe_screen* screen,
                                         unsigned usage,
                                         unsigned geom_flags)
 {
+    struct r300_winsys_screen *rws = r300_screen(screen)->rws;
     uint32_t retval = 0;
+    boolean drm_2_8_0 = rws->get_value(rws, R300_VID_DRM_2_8_0);
     boolean is_r500 = r300_screen(screen)->caps.is_r500;
     boolean is_r400 = r300_screen(screen)->caps.is_r400;
     boolean is_color2101010 = format == PIPE_FORMAT_R10G10B10A2_UNORM ||
@@ -363,7 +369,7 @@ static boolean r300_is_format_supported(struct pipe_screen* screen,
                   PIPE_BIND_SCANOUT |
                   PIPE_BIND_SHARED)) &&
         /* 2101010 cannot be rendered to on non-r5xx. */
-        (is_r500 || !is_color2101010) &&
+        (!is_color2101010 || (is_r500 && drm_2_8_0)) &&
         r300_is_colorbuffer_format_supported(format)) {
         retval |= usage &
             (PIPE_BIND_RENDER_TARGET |
@@ -484,5 +490,9 @@ struct pipe_screen* r300_screen_create(struct r300_winsys_screen *rws)
 
     util_format_s3tc_init();
 
+#ifdef HAVE_LLVM
+    lp_build_init();
+#endif
+
     return &r300screen->screen;
 }
diff --git a/src/gallium/drivers/r300/r300_screen_buffer.c b/src/gallium/drivers/r300/r300_screen_buffer.c
index 4436443522..f96998195a 100644
--- a/src/gallium/drivers/r300/r300_screen_buffer.c
+++ b/src/gallium/drivers/r300/r300_screen_buffer.c
@@ -56,72 +56,43 @@ static unsigned r300_buffer_is_referenced_by_cs(struct pipe_context *context,
     return r300_buffer_is_referenced(context, buf, R300_REF_CS);
 }
 
-/* External helper, not required to implent u_resource_vtbl:
- */
-int r300_upload_index_buffer(struct r300_context *r300,
-			     struct pipe_resource **index_buffer,
-			     unsigned index_size,
-			     unsigned start,
-			     unsigned count,
-			     unsigned *out_offset)
+void r300_upload_index_buffer(struct r300_context *r300,
+			      struct pipe_resource **index_buffer,
+			      unsigned index_size, unsigned *start,
+			      unsigned count)
 {
-   struct pipe_resource *upload_buffer = NULL;
-   unsigned index_offset = start * index_size;
-   int ret = 0;
-
-    if (r300_buffer_is_user_buffer(*index_buffer)) {
-	ret = u_upload_buffer(r300->upload_ib,
-			      index_offset,
-			      count * index_size,
-			      *index_buffer,
-			      &index_offset,
-			      &upload_buffer);
-	if (ret) {
-	    goto done;
-	}
-	*index_buffer = upload_buffer;
-	*out_offset = index_offset / index_size;
-    } else
-        *out_offset = start;
-
- done:
-    //    if (upload_buffer)
-    //	pipe_resource_reference(&upload_buffer, NULL);
-    return ret;
+    unsigned index_offset;
+    uint8_t *ptr = r300_buffer(*index_buffer)->user_buffer;
+
+    *index_buffer = NULL;
+
+    u_upload_data(r300->upload_ib,
+                  count * index_size,
+                  ptr + (*start * index_size),
+                  &index_offset,
+                  index_buffer);
+
+    *start = index_offset / index_size;
 }
 
-/* External helper, not required to implement u_resource_vtbl:
- */
-int r300_upload_user_buffers(struct r300_context *r300)
+void r300_upload_user_buffers(struct r300_context *r300)
 {
-    enum pipe_error ret = PIPE_OK;
-    int i, nr;
-
-    nr = r300->velems->count;
+    int i, nr = r300->velems->count;
 
     for (i = 0; i < nr; i++) {
         struct pipe_vertex_buffer *vb =
             &r300->vertex_buffer[r300->velems->velem[i].vertex_buffer_index];
 
         if (r300_buffer_is_user_buffer(vb->buffer)) {
-            struct pipe_resource *upload_buffer = NULL;
-            unsigned offset = 0; /*vb->buffer_offset * 4;*/
-            unsigned size = vb->buffer->width0;
-            unsigned upload_offset;
-            ret = u_upload_buffer(r300->upload_vb,
-                                  offset, size,
-                                  vb->buffer,
-                                  &upload_offset, &upload_buffer);
-            if (ret)
-                return ret;
-
-            pipe_resource_reference(&vb->buffer, NULL);
-            vb->buffer = upload_buffer;
-            vb->buffer_offset = upload_offset;
+            u_upload_data(r300->upload_vb,
+                          vb->buffer->width0,
+                          r300_buffer(vb->buffer)->user_buffer,
+                          &vb->buffer_offset, &vb->buffer);
+
             r300->validate_buffers = TRUE;
+            r300->vertex_arrays_dirty = TRUE;
         }
     }
-    return ret;
 }
 
 static void r300_buffer_destroy(struct pipe_screen *screen,
@@ -278,26 +249,26 @@ static void r300_buffer_transfer_inline_write(struct pipe_context *pipe,
                                               unsigned stride,
                                               unsigned layer_stride)
 {
+    struct r300_context *r300 = r300_context(pipe);
+    struct r300_winsys_screen *rws = r300->screen->rws;
     struct r300_buffer *rbuf = r300_buffer(resource);
-    struct pipe_transfer *transfer = NULL;
     uint8_t *map = NULL;
 
     if (rbuf->constant_buffer) {
         memcpy(rbuf->constant_buffer + box->x, data, box->width);
         return;
     }
+    assert(rbuf->user_buffer == NULL);
 
-    transfer = r300_buffer_get_transfer(pipe, resource, 0,
-                        PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD, box);
-    map = r300_buffer_transfer_map(pipe, transfer);
+    map = rws->buffer_map(rws, rbuf->buf, r300->cs,
+                          PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD | usage);
 
-    memcpy(map, data, box->width);
+    memcpy(map + box->x, data, box->width);
 
-    r300_buffer_transfer_unmap(pipe, transfer);
-    r300_buffer_transfer_destroy(pipe, transfer);
+    rws->buffer_unmap(rws, rbuf->buf);
 }
 
-struct u_resource_vtbl r300_buffer_vtbl = 
+struct u_resource_vtbl r300_buffer_vtbl =
 {
    u_default_resource_get_handle,      /* get_handle */
    r300_buffer_destroy,                /* resource_destroy */
diff --git a/src/gallium/drivers/r300/r300_screen_buffer.h b/src/gallium/drivers/r300/r300_screen_buffer.h
index 0b3555dd81..fb0033c0f5 100644
--- a/src/gallium/drivers/r300/r300_screen_buffer.h
+++ b/src/gallium/drivers/r300/r300_screen_buffer.h
@@ -63,13 +63,12 @@ struct r300_buffer
 
 /* Functions. */
 
-int r300_upload_user_buffers(struct r300_context *r300);
+void r300_upload_user_buffers(struct r300_context *r300);
 
-int r300_upload_index_buffer(struct r300_context *r300,
-			     struct pipe_resource **index_buffer,
-			     unsigned index_size,
-			     unsigned start,
-			     unsigned count, unsigned *out_offset);
+void r300_upload_index_buffer(struct r300_context *r300,
+			      struct pipe_resource **index_buffer,
+			      unsigned index_size, unsigned *start,
+			      unsigned count);
 
 struct pipe_resource *r300_buffer_create(struct pipe_screen *screen,
 					 const struct pipe_resource *templ);
diff --git a/src/gallium/drivers/r300/r300_state.c b/src/gallium/drivers/r300/r300_state.c
index f902db54cc..f748fe5997 100644
--- a/src/gallium/drivers/r300/r300_state.c
+++ b/src/gallium/drivers/r300/r300_state.c
@@ -686,13 +686,22 @@ void r300_mark_fb_state_dirty(struct r300_context *r300,
     struct pipe_framebuffer_state *state = r300->fb_state.state;
     boolean can_hyperz = r300->rws->get_value(r300->rws, R300_CAN_HYPERZ);
 
-    /* What is marked as dirty depends on the enum r300_fb_state_change. */
     r300_mark_atom_dirty(r300, &r300->gpu_flush);
     r300_mark_atom_dirty(r300, &r300->fb_state);
-    r300_mark_atom_dirty(r300, &r300->hyperz_state);
 
+    /* What is marked as dirty depends on the enum r300_fb_state_change. */
     if (change == R300_CHANGED_FB_STATE) {
         r300_mark_atom_dirty(r300, &r300->aa_state);
+    }
+
+    if (change == R300_CHANGED_FB_STATE ||
+        change == R300_CHANGED_CBZB_FLAG ||
+        change == R300_CHANGED_ZCLEAR_FLAG) {
+        r300_mark_atom_dirty(r300, &r300->hyperz_state);
+    }
+
+    if (change == R300_CHANGED_FB_STATE ||
+        change == R300_CHANGED_MULTIWRITE) {
         r300_mark_atom_dirty(r300, &r300->fb_state_pipelined);
     }
 
@@ -876,16 +885,25 @@ static void r300_bind_fs_state(struct pipe_context* pipe, void* shader)
 {
     struct r300_context* r300 = r300_context(pipe);
     struct r300_fragment_shader* fs = (struct r300_fragment_shader*)shader;
+    struct pipe_framebuffer_state *fb = r300->fb_state.state;
+    boolean last_multi_write;
 
     if (fs == NULL) {
         r300->fs.state = NULL;
         return;
     }
 
+    last_multi_write = r300_fragment_shader_writes_all(r300_fs(r300));
+
     r300->fs.state = fs;
     r300_pick_fragment_shader(r300);
     r300_mark_fs_code_dirty(r300);
 
+    if (fb->nr_cbufs > 1 &&
+        last_multi_write != r300_fragment_shader_writes_all(fs)) {
+        r300_mark_fb_state_dirty(r300, R300_CHANGED_MULTIWRITE);
+    }
+
     r300_mark_atom_dirty(r300, &r300->rs_block_state); /* Will be updated before the emission. */
 }
 
@@ -1509,7 +1527,7 @@ static void r300_set_vertex_buffers(struct pipe_context* pipe,
 
         r300->any_user_vbs = any_user_buffer;
         r300->vertex_buffer_max_index = max_index;
-        r300->aos_dirty = TRUE;
+        r300->vertex_arrays_dirty = TRUE;
         r300->validate_buffers = TRUE;
     } else {
         /* SW TCL. */
@@ -1717,7 +1735,7 @@ static void r300_bind_vertex_elements_state(struct pipe_context *pipe,
 
     UPDATE_STATE(&velems->vertex_stream, r300->vertex_stream_state);
     r300->vertex_stream_state.size = (1 + velems->vertex_stream.count) * 2;
-    r300->aos_dirty = TRUE;
+    r300->vertex_arrays_dirty = TRUE;
 }
 
 static void r300_delete_vertex_elements_state(struct pipe_context *pipe, void *state)
@@ -1809,6 +1827,7 @@ static void r300_set_constant_buffer(struct pipe_context *pipe,
 {
     struct r300_context* r300 = r300_context(pipe);
     struct r300_constant_buffer *cbuf;
+    struct r300_buffer *rbuf = r300_buffer(buf);
     uint32_t *mapped;
 
     switch (shader) {
@@ -1822,10 +1841,15 @@ static void r300_set_constant_buffer(struct pipe_context *pipe,
             return;
     }
 
-    if (buf == NULL || buf->width0 == 0 ||
-        (mapped = (uint32_t*)r300_buffer(buf)->constant_buffer) == NULL) {
+    if (buf == NULL || buf->width0 == 0)
+        return;
+
+    if (rbuf->user_buffer)
+        mapped = (uint32_t*)rbuf->user_buffer;
+    else if (rbuf->constant_buffer)
+        mapped = (uint32_t*)rbuf->constant_buffer;
+    else
         return;
-    }
 
     if (shader == PIPE_SHADER_FRAGMENT ||
         (shader == PIPE_SHADER_VERTEX && r300->screen->caps.has_tcl)) {
diff --git a/src/gallium/drivers/r300/r300_state_derived.c b/src/gallium/drivers/r300/r300_state_derived.c
index d5fc8ece25..d3985c11aa 100644
--- a/src/gallium/drivers/r300/r300_state_derived.c
+++ b/src/gallium/drivers/r300/r300_state_derived.c
@@ -645,6 +645,10 @@ static uint32_t r300_get_border_color(enum pipe_format format,
     }
 
     switch (desc->channel[0].size) {
+        case 2:
+            util_pack_color(border_swizzled, PIPE_FORMAT_B2G3R3_UNORM, &uc);
+            break;
+
         case 4:
             util_pack_color(border_swizzled, PIPE_FORMAT_B4G4R4A4_UNORM, &uc);
             break;
diff --git a/src/gallium/drivers/r300/r300_texture.c b/src/gallium/drivers/r300/r300_texture.c
index 6d86bc282f..6fdc504ed5 100644
--- a/src/gallium/drivers/r300/r300_texture.c
+++ b/src/gallium/drivers/r300/r300_texture.c
@@ -244,6 +244,11 @@ uint32_t r300_translate_texformat(enum pipe_format format,
                     desc->channel[2].size == 6) {
                     return R300_TX_FORMAT_Z6Y5X5 | result;
                 }
+                if (desc->channel[0].size == 2 &&
+                    desc->channel[1].size == 3 &&
+                    desc->channel[2].size == 3) {
+                    return R300_TX_FORMAT_Z3Y3X2 | result;
+                }
                 return ~0; /* Unsupported/unknown. */
 
             case 4:
@@ -481,6 +486,8 @@ static uint32_t r300_translate_out_fmt(enum pipe_format format)
     } else {
         if (desc->channel[i].size == 16) {
             modifier |= R300_US_OUT_FMT_C4_16;
+        } else if (desc->channel[i].size == 10) {
+            modifier |= R300_US_OUT_FMT_C4_10;
         } else {
             /* C4_8 seems to be used for the formats whose pixel size
              * is <= 32 bits. */
diff --git a/src/gallium/drivers/r300/r300_winsys.h b/src/gallium/drivers/r300/r300_winsys.h
index 0dd330d101..b8324afe51 100644
--- a/src/gallium/drivers/r300/r300_winsys.h
+++ b/src/gallium/drivers/r300/r300_winsys.h
@@ -51,8 +51,9 @@ enum r300_value_id {
     R300_VID_GB_PIPES,
     R300_VID_Z_PIPES,
     R300_VID_SQUARE_TILING_SUPPORT,
-    R300_VID_DRM_2_3_0,
-    R300_VID_DRM_2_6_0,
+    R300_VID_DRM_2_3_0, /* R500 VAP regs, MSPOS regs, fixed tex3D size checking */
+    R300_VID_DRM_2_6_0, /* Hyper-Z, GB_Z_PEQ_CONFIG on rv350->r4xx, R500 FG_ALPHA_VALUE */
+    R300_VID_DRM_2_8_0, /* R500 US_FORMAT regs, R500 ARGB2101010 colorbuffer */
     R300_CAN_HYPERZ,
 };
 
diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
index af19beb6f3..c6f3669c9a 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -770,8 +770,6 @@ static void evergreen_set_framebuffer_state(struct pipe_context *ctx,
 
 	util_copy_framebuffer_state(&rctx->framebuffer, state);
 
-	rctx->pframebuffer = &rctx->framebuffer;
-
 	/* build states */
 	for (int i = 0; i < state->nr_cbufs; i++) {
 		evergreen_cb(rctx, rstate, state, i);
@@ -1334,7 +1332,7 @@ void evergreen_vertex_buffer_update(struct r600_pipe_context *rctx)
 			rbuffer = (struct r600_resource*)vertex_buffer->buffer;
 			offset = 0;
 		}
-		if (vertex_buffer == NULL)
+		if (vertex_buffer == NULL || rbuffer == NULL)
 			continue;
 		offset += vertex_buffer->buffer_offset + r600_bo_offset(rbuffer->bo);
 
@@ -1359,7 +1357,7 @@ void evergreen_vertex_buffer_update(struct r600_pipe_context *rctx)
 					0x00000000, 0xFFFFFFFF, NULL);
 		r600_pipe_state_add_reg(rstate, R_03001C_RESOURCE0_WORD7,
 					0xC0000000, 0xFFFFFFFF, NULL);
-		evergreen_fs_resource_set(&rctx->ctx, rstate, i);
+		evergreen_context_pipe_state_set_fs_resource(&rctx->ctx, rstate, i);
 	}
 }
 
diff --git a/src/gallium/drivers/r600/r600.h b/src/gallium/drivers/r600/r600.h
index aa456d493f..578ac40ba9 100644
--- a/src/gallium/drivers/r600/r600.h
+++ b/src/gallium/drivers/r600/r600.h
@@ -35,7 +35,7 @@
 #define RADEON_CTX_MAX_PM4	(64 * 1024 / 4)
 
 #define R600_ERR(fmt, args...) \
-	fprintf(stderr, "EE %s/%s:%d - "fmt, __FILE__, __func__, __LINE__, ##args)
+	fprintf(stderr, "EE %s:%d %s - "fmt, __FILE__, __LINE__, __func__, ##args)
 
 typedef uint64_t		u64;
 typedef uint32_t		u32;
@@ -284,10 +284,6 @@ void r600_context_queries_resume(struct r600_context *ctx);
 
 int evergreen_context_init(struct r600_context *ctx, struct radeon *radeon);
 void evergreen_context_draw(struct r600_context *ctx, const struct r600_draw *draw);
-void evergreen_ps_resource_set(struct r600_context *ctx, struct r600_pipe_state *state, unsigned rid);
-void evergreen_vs_resource_set(struct r600_context *ctx, struct r600_pipe_state *state, unsigned rid);
-void evergreen_fs_resource_set(struct r600_context *ctx, struct r600_pipe_state *state, unsigned rid);
-
 void evergreen_context_pipe_state_set_ps_resource(struct r600_context *ctx, struct r600_pipe_state *state, unsigned rid);
 void evergreen_context_pipe_state_set_vs_resource(struct r600_context *ctx, struct r600_pipe_state *state, unsigned rid);
 void evergreen_context_pipe_state_set_fs_resource(struct r600_context *ctx, struct r600_pipe_state *state, unsigned rid);
diff --git a/src/gallium/drivers/r600/r600_blit.c b/src/gallium/drivers/r600/r600_blit.c
index 0f04136fb2..b9ec9592e3 100644
--- a/src/gallium/drivers/r600/r600_blit.c
+++ b/src/gallium/drivers/r600/r600_blit.c
@@ -78,7 +78,7 @@ static void r600_blitter_end(struct pipe_context *ctx)
 	r600_context_queries_resume(&rctx->ctx);
 }
 
-int r600_blit_uncompress_depth(struct pipe_context *ctx, struct r600_resource_texture *texture)
+void r600_blit_uncompress_depth(struct pipe_context *ctx, struct r600_resource_texture *texture)
 {
 	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
 	struct pipe_surface *zsurf, *cbsurf, surf_tmpl;
@@ -107,9 +107,6 @@ int r600_blit_uncompress_depth(struct pipe_context *ctx, struct r600_resource_te
 
 	pipe_surface_reference(&zsurf, NULL);
 	pipe_surface_reference(&cbsurf, NULL);
-
-
-	return 0;
 }
 
 static void r600_clear(struct pipe_context *ctx, unsigned buffers,
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index 72988b946e..8f6836a573 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -198,8 +198,6 @@ static struct pipe_context *r600_create_context(struct pipe_screen *screen, void
 	else
 		rctx->custom_dsa_flush = evergreen_create_db_flush_dsa(rctx);
 
-	r600_blit_uncompress_depth_ptr = r600_blit_uncompress_depth;
-
 	return &rctx->context;
 }
 
diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
index 43dbee99b0..1cdca9cb3d 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -136,7 +136,6 @@ struct r600_upload;
 struct r600_pipe_context {
 	struct pipe_context		context;
 	struct blitter_context		*blitter;
-	struct pipe_framebuffer_state	*pframebuffer;
 	unsigned			family;
 	void				*custom_dsa_flush;
 	struct r600_screen		*screen;
@@ -197,7 +196,7 @@ void evergreen_vertex_buffer_update(struct r600_pipe_context *rctx);
 
 /* r600_blit.c */
 void r600_init_blit_functions(struct r600_pipe_context *rctx);
-int r600_blit_uncompress_depth(struct pipe_context *ctx, struct r600_resource_texture *texture);
+void r600_blit_uncompress_depth(struct pipe_context *ctx, struct r600_resource_texture *texture);
 
 /* r600_buffer.c */
 struct pipe_resource *r600_buffer_create(struct pipe_screen *screen,
diff --git a/src/gallium/drivers/r600/r600_resource.h b/src/gallium/drivers/r600/r600_resource.h
index 8ca2769920..9b1af5e6f2 100644
--- a/src/gallium/drivers/r600/r600_resource.h
+++ b/src/gallium/drivers/r600/r600_resource.h
@@ -106,7 +106,6 @@ static INLINE boolean r600_buffer_is_user_buffer(struct pipe_resource *buffer)
 }
 
 int r600_texture_depth_flush(struct pipe_context *ctx, struct pipe_resource *texture);
-int (*r600_blit_uncompress_depth_ptr)(struct pipe_context *ctx, struct r600_resource_texture *texture);
 
 /* r600_texture.c texture transfer functions. */
 struct pipe_transfer* r600_texture_get_transfer(struct pipe_context *ctx,
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index 9c7b7f0a57..bb5038c49b 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -589,6 +589,8 @@ int r600_shader_from_tgsi(const struct tgsi_token *tokens, struct r600_shader *s
 			if (r)
 				goto out_err;
 			break;
+		case TGSI_TOKEN_TYPE_PROPERTY:
+			break;
 		default:
 			R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
 			r = -EINVAL;
diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c
index 0d76afd6cd..b68203f78f 100644
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -174,7 +174,7 @@ void r600_vertex_buffer_update(struct r600_pipe_context *rctx)
 			rbuffer = (struct r600_resource*)vertex_buffer->buffer;
 			offset = 0;
 		}
-		if (vertex_buffer == NULL)
+		if (vertex_buffer == NULL || rbuffer == NULL)
 			continue;
 		offset += vertex_buffer->buffer_offset + r600_bo_offset(rbuffer->bo);
 
@@ -1019,8 +1019,6 @@ static void r600_set_framebuffer_state(struct pipe_context *ctx,
 
 	util_copy_framebuffer_state(&rctx->framebuffer, state);
 
-	rctx->pframebuffer = &rctx->framebuffer;
-
 	/* build states */
 	for (int i = 0; i < state->nr_cbufs; i++) {
 		r600_cb(rctx, rstate, state, i);
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index f488cf74ff..3603376f73 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -182,7 +182,7 @@ void r600_set_vertex_buffers(struct pipe_context *ctx, unsigned count,
 	if (rctx->family >= CHIP_CEDAR) {
 		for (int i = 0; i < rctx->nvertex_buffer; i++) {
 			pipe_resource_reference(&rctx->vertex_buffer[i].buffer, NULL);
-			evergreen_fs_resource_set(&rctx->ctx, NULL, i);
+			evergreen_context_pipe_state_set_fs_resource(&rctx->ctx, NULL, i);
 		}
 	} else {
 		for (int i = 0; i < rctx->nvertex_buffer; i++) {
diff --git a/src/gallium/drivers/r600/r600_texture.c b/src/gallium/drivers/r600/r600_texture.c
index d4d9b07c0e..e274562457 100644
--- a/src/gallium/drivers/r600/r600_texture.c
+++ b/src/gallium/drivers/r600/r600_texture.c
@@ -443,8 +443,6 @@ static unsigned int r600_texture_is_referenced(struct pipe_context *context,
 	return PIPE_REFERENCED_FOR_READ | PIPE_REFERENCED_FOR_WRITE;
 }
 
-int (*r600_blit_uncompress_depth_ptr)(struct pipe_context *ctx, struct r600_resource_texture *texture);
-
 int r600_texture_depth_flush(struct pipe_context *ctx,
 			     struct pipe_resource *texture)
 {
@@ -476,7 +474,7 @@ int r600_texture_depth_flush(struct pipe_context *ctx,
 out:
 	/* XXX: only do this if the depth texture has actually changed:
 	 */
-	r600_blit_uncompress_depth_ptr(ctx, rtex);
+	r600_blit_uncompress_depth(ctx, rtex);
 	return 0;
 }
 
diff --git a/src/gallium/drivers/softpipe/sp_quad_blend.c b/src/gallium/drivers/softpipe/sp_quad_blend.c
index 6af1b2d061..76cfc0bf51 100644
--- a/src/gallium/drivers/softpipe/sp_quad_blend.c
+++ b/src/gallium/drivers/softpipe/sp_quad_blend.c
@@ -35,6 +35,7 @@
 #include "util/u_memory.h"
 #include "util/u_format.h"
 #include "sp_context.h"
+#include "sp_state.h"
 #include "sp_quad.h"
 #include "sp_tile_cache.h"
 #include "sp_quad_pipe.h"
@@ -794,6 +795,9 @@ blend_fallback(struct quad_stage *qs,
    struct softpipe_context *softpipe = qs->softpipe;
    const struct pipe_blend_state *blend = softpipe->blend;
    unsigned cbuf;
+   boolean write_all;
+
+   write_all = softpipe->fs->color0_writes_all_cbufs;
 
    for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) 
    {
@@ -806,15 +810,19 @@ blend_fallback(struct quad_stage *qs,
                               quads[0]->input.y0);
       boolean has_dst_alpha
          = util_format_has_alpha(softpipe->framebuffer.cbufs[cbuf]->format);
-      uint q, i, j;
+      uint q, i, j, qbuf;
+
+      qbuf = write_all ? 0 : cbuf;
 
       for (q = 0; q < nr; q++) {
          struct quad_header *quad = quads[q];
-         float (*quadColor)[4] = quad->output.color[cbuf];
+         float (*quadColor)[4];
          const int itx = (quad->input.x0 & (TILE_SIZE-1));
          const int ity = (quad->input.y0 & (TILE_SIZE-1));
 
-         /* get/swizzle dest colors 
+         quadColor = quad->output.color[qbuf];
+
+         /* get/swizzle dest colors
           */
          for (j = 0; j < QUAD_SIZE; j++) {
             int x = itx + (j & 1);
diff --git a/src/gallium/drivers/softpipe/sp_state.h b/src/gallium/drivers/softpipe/sp_state.h
index 525bf23734..bb19f8cff2 100644
--- a/src/gallium/drivers/softpipe/sp_state.h
+++ b/src/gallium/drivers/softpipe/sp_state.h
@@ -74,7 +74,7 @@ struct sp_fragment_shader {
 
    boolean origin_lower_left; /**< fragment shader uses lower left position origin? */
    boolean pixel_center_integer; /**< fragment shader uses integer pixel center? */
-
+   boolean color0_writes_all_cbufs; /**< fragment shader writes color0 to all bound cbufs */
    void (*prepare)( const struct sp_fragment_shader *shader,
 		    struct tgsi_exec_machine *machine,
 		    struct tgsi_sampler **samplers);
diff --git a/src/gallium/drivers/softpipe/sp_state_shader.c b/src/gallium/drivers/softpipe/sp_state_shader.c
index 7fff338cce..66ddc56572 100644
--- a/src/gallium/drivers/softpipe/sp_state_shader.c
+++ b/src/gallium/drivers/softpipe/sp_state_shader.c
@@ -78,6 +78,8 @@ softpipe_create_fs_state(struct pipe_context *pipe,
          state->origin_lower_left = state->info.properties[i].data[0];
       else if (state->info.properties[i].name == TGSI_PROPERTY_FS_COORD_PIXEL_CENTER)
 	 state->pixel_center_integer = state->info.properties[i].data[0];
+      else if (state->info.properties[i].name == TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS)
+	 state->color0_writes_all_cbufs = state->info.properties[i].data[0];
    }
 
    return state;
diff --git a/src/gallium/drivers/svga/svga_draw.c b/src/gallium/drivers/svga/svga_draw.c
index 81dd4778d0..97cbac447d 100644
--- a/src/gallium/drivers/svga/svga_draw.c
+++ b/src/gallium/drivers/svga/svga_draw.c
@@ -315,7 +315,6 @@ enum pipe_error svga_hwtnl_prim( struct svga_hwtnl *hwtnl,
             break;
          }
 
-         assert(!stride || width <= stride);
          if (max_index != ~0) {
             assert(offset + (index_bias + max_index) * stride + width <= size);
          }
diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index 078190342a..d0f42c614c 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -35,7 +35,6 @@
 #include "svga_resource_texture.h"
 #include "svga_resource.h"
 #include "svga_debug.h"
-#include "svga_surface.h"
 
 #include "svga3d_shaderdefs.h"
 
diff --git a/src/gallium/drivers/svga/svga_state_need_swtnl.c b/src/gallium/drivers/svga/svga_state_need_swtnl.c
index 66fea02a4b..99263d82e1 100644
--- a/src/gallium/drivers/svga/svga_state_need_swtnl.c
+++ b/src/gallium/drivers/svga/svga_state_need_swtnl.c
@@ -177,7 +177,7 @@ static int update_need_swtnl( struct svga_context *svga,
 
    if (need_swtnl != svga->state.sw.need_swtnl) {
       SVGA_DBG(DEBUG_SWTNL|DEBUG_PERF,
-               "%s need_swvfetch: %s, need_pipeline %s\n",
+               "%s: need_swvfetch %s, need_pipeline %s\n",
                __FUNCTION__,
                svga->state.sw.need_swvfetch ? "true" : "false",
                svga->state.sw.need_pipeline ? "true" : "false");
diff --git a/src/gallium/drivers/svga/svga_swtnl_backend.c b/src/gallium/drivers/svga/svga_swtnl_backend.c
index 24646b48f6..d5db6bf641 100644
--- a/src/gallium/drivers/svga/svga_swtnl_backend.c
+++ b/src/gallium/drivers/svga/svga_swtnl_backend.c
@@ -158,7 +158,7 @@ svga_vbuf_render_set_primitive( struct vbuf_render *render,
 }
 
 static void
-svga_vbuf_sumbit_state( struct svga_vbuf_render *svga_render )
+svga_vbuf_submit_state( struct svga_vbuf_render *svga_render )
 {
    struct svga_context *svga = svga_render->svga;
    SVGA3dVertexDecl vdecl[PIPE_MAX_ATTRIBS];
@@ -221,7 +221,8 @@ svga_vbuf_render_draw_arrays( struct vbuf_render *render,
    unsigned bias = (svga_render->vbuf_offset - svga_render->vdecl_offset) / svga_render->vertex_size;
    enum pipe_error ret = 0;
 
-   svga_vbuf_sumbit_state(svga_render);
+   /* off to hardware */
+   svga_vbuf_submit_state(svga_render);
 
    /* Need to call update_state() again as the draw module may have
     * altered some of our state behind our backs.  Testcase:
@@ -267,9 +268,8 @@ svga_vbuf_render_draw_elements( struct vbuf_render *render,
    pipe_buffer_write_nooverlap(&svga->pipe, svga_render->ibuf,
 			       svga_render->ibuf_offset, 2 * nr_indices, indices);
 
-
    /* off to hardware */
-   svga_vbuf_sumbit_state(svga_render);
+   svga_vbuf_submit_state(svga_render);
 
    /* Need to call update_state() again as the draw module may have
     * altered some of our state behind our backs.  Testcase: