299 files changed, 7531 insertions, 2375 deletions
diff --git a/src/egl/drivers/glx/egl_glx.c b/src/egl/drivers/glx/egl_glx.c
index 96292b0e9e..7c6e8637a1 100644
--- a/src/egl/drivers/glx/egl_glx.c
+++ b/src/egl/drivers/glx/egl_glx.c
@@ -899,7 +899,7 @@ GLX_eglSwapBuffers(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *draw)
  * Called from eglGetProcAddress() via drv->API.GetProcAddress().
  */
 static _EGLProc
-GLX_eglGetProcAddress(const char *procname)
+GLX_eglGetProcAddress(_EGLDriver *drv, const char *procname)
 {
    return (_EGLProc) glXGetProcAddress((const GLubyte *) procname);
 }
diff --git a/src/egl/drivers/xdri/egl_xdri.c b/src/egl/drivers/xdri/egl_xdri.c
index d2affc66dd..8425b3d11e 100644
--- a/src/egl/drivers/xdri/egl_xdri.c
+++ b/src/egl/drivers/xdri/egl_xdri.c
@@ -355,7 +355,7 @@ xdri_eglTerminate(_EGLDriver *drv, _EGLDisplay *dpy)
  * Called from eglGetProcAddress() via drv->API.GetProcAddress().
  */
 static _EGLProc
-xdri_eglGetProcAddress(const char *procname)
+xdri_eglGetProcAddress(_EGLDriver *drv, const char *procname)
 {
    /* the symbol is defined in libGL.so */
    return (_EGLProc) _glapi_get_proc_address(procname);
@@ -562,7 +562,7 @@ xdri_eglSwapBuffers(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *draw)
    struct xdri_egl_display *xdri_dpy = lookup_display(dpy);
    struct xdri_egl_surface *xdri_surf = lookup_surface(draw);
 
-   xdri_dpy->psc->driScreen->swapBuffers(xdri_surf->driDrawable);
+   xdri_dpy->psc->driScreen->swapBuffers(xdri_surf->driDrawable, 0, 0, 0);
 
    return EGL_TRUE;
 }
diff --git a/src/egl/main/eglapi.c b/src/egl/main/eglapi.c
index 14cc5fa613..26e0602453 100644
--- a/src/egl/main/eglapi.c
+++ b/src/egl/main/eglapi.c
@@ -716,7 +716,8 @@ void (* EGLAPIENTRY eglGetProcAddress(const char *procname))()
 
    /* now loop over drivers to query their procs */
    for (i = 0; i < _eglGlobal.NumDrivers; i++) {
-      _EGLProc p = _eglGlobal.Drivers[i]->API.GetProcAddress(procname);
+      _EGLDriver *drv = _eglGlobal.Drivers[i];
+      _EGLProc p = drv->API.GetProcAddress(drv, procname);
       if (p)
          return p;
    }
diff --git a/src/egl/main/eglapi.h b/src/egl/main/eglapi.h
index aa0abe3eb6..080f2155e3 100644
--- a/src/egl/main/eglapi.h
+++ b/src/egl/main/eglapi.h
@@ -44,7 +44,7 @@ typedef const char *(*QueryString_t)(_EGLDriver *drv, _EGLDisplay *dpy, EGLint n
 typedef EGLBoolean (*WaitClient_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLContext *ctx);
 typedef EGLBoolean (*WaitNative_t)(_EGLDriver *drv, _EGLDisplay *dpy, EGLint engine);
 
-typedef _EGLProc (*GetProcAddress_t)(const char *procname);
+typedef _EGLProc (*GetProcAddress_t)(_EGLDriver *drv, const char *procname);
 
 
 
diff --git a/src/egl/main/eglcompiler.h b/src/egl/main/eglcompiler.h
index f7c93f14ce..5a3fb49ac2 100644
--- a/src/egl/main/eglcompiler.h
+++ b/src/egl/main/eglcompiler.h
@@ -64,7 +64,8 @@
 /**
  * Function visibility
  */
-#if defined(__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__) >= 303
+#if (defined(__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__) >= 303) \
+	|| (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))
 #  define PUBLIC __attribute__((visibility("default")))
 #else
 #  define PUBLIC
diff --git a/src/gallium/auxiliary/Makefile b/src/gallium/auxiliary/Makefile
index e3af41c6e0..8f937e3b4e 100644
--- a/src/gallium/auxiliary/Makefile
+++ b/src/gallium/auxiliary/Makefile
@@ -111,6 +111,7 @@ C_SOURCES = \
 	util/u_math.c \
 	util/u_mm.c \
 	util/u_rect.c \
+	util/u_ringbuffer.c \
 	util/u_simple_shaders.c \
 	util/u_snprintf.c \
 	util/u_stream_stdc.c \
diff --git a/src/gallium/auxiliary/SConscript b/src/gallium/auxiliary/SConscript
index 782eb53386..f957090b5f 100644
--- a/src/gallium/auxiliary/SConscript
+++ b/src/gallium/auxiliary/SConscript
@@ -147,6 +147,7 @@ source = [
     'util/u_math.c',
     'util/u_mm.c',
     'util/u_rect.c',
+    'util/u_ringbuffer.c',
     'util/u_simple_shaders.c',
     'util/u_snprintf.c',
     'util/u_stream_stdc.c',
diff --git a/src/gallium/auxiliary/cso_cache/cso_context.c b/src/gallium/auxiliary/cso_cache/cso_context.c
index 2b16332e14..fdfb5faa59 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.c
+++ b/src/gallium/auxiliary/cso_cache/cso_context.c
@@ -539,6 +539,38 @@ void cso_restore_samplers(struct cso_context *ctx)
    cso_single_sampler_done( ctx );
 }
 
+/*
+ * If the function encouters any errors it will return the
+ * last one. Done to always try to set as many samplers
+ * as possible.
+ */
+enum pipe_error cso_set_vertex_samplers(struct cso_context *ctx,
+                                        unsigned nr,
+                                        const struct pipe_sampler_state **templates)
+{
+   unsigned i;
+   enum pipe_error temp, error = PIPE_OK;
+
+   /* TODO: fastpath
+    */
+
+   for (i = 0; i < nr; i++) {
+      temp = cso_single_vertex_sampler( ctx, i, templates[i] );
+      if (temp != PIPE_OK)
+         error = temp;
+   }
+
+   for ( ; i < ctx->nr_samplers; i++) {
+      temp = cso_single_vertex_sampler( ctx, i, NULL );
+      if (temp != PIPE_OK)
+         error = temp;
+   }
+
+   cso_single_vertex_sampler_done( ctx );
+
+   return error;
+}
+
 void
 cso_save_vertex_samplers(struct cso_context *ctx)
 {
diff --git a/src/gallium/auxiliary/cso_cache/cso_context.h b/src/gallium/auxiliary/cso_cache/cso_context.h
index b9e313e32d..d2089b1c88 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.h
+++ b/src/gallium/auxiliary/cso_cache/cso_context.h
@@ -84,6 +84,10 @@ enum pipe_error cso_single_sampler( struct cso_context *cso,
 
 void cso_single_sampler_done( struct cso_context *cso );
 
+enum pipe_error cso_set_vertex_samplers(struct cso_context *cso,
+                                        unsigned count,
+                                        const struct pipe_sampler_state **states);
+
 void
 cso_save_vertex_samplers(struct cso_context *cso);
 
diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c
index 667aa46b20..e90dfc5aec 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -95,6 +95,7 @@ void draw_destroy( struct draw_context *draw )
    draw_pipeline_destroy( draw );
    draw_pt_destroy( draw );
    draw_vs_destroy( draw );
+   draw_gs_destroy( draw );
 
    FREE( draw );
 }
diff --git a/src/gallium/auxiliary/draw/draw_context.h b/src/gallium/auxiliary/draw/draw_context.h
index b716209df2..8a64c06efc 100644
--- a/src/gallium/auxiliary/draw/draw_context.h
+++ b/src/gallium/auxiliary/draw/draw_context.h
@@ -164,6 +164,14 @@ void draw_set_mapped_constant_buffer(struct draw_context *draw,
 void draw_arrays(struct draw_context *draw, unsigned prim,
 		 unsigned start, unsigned count);
 
+void
+draw_arrays_instanced(struct draw_context *draw,
+                      unsigned mode,
+                      unsigned start,
+                      unsigned count,
+                      unsigned startInstance,
+                      unsigned instanceCount);
+
 void draw_flush(struct draw_context *draw);
 
 
diff --git a/src/gallium/auxiliary/draw/draw_gs.c b/src/gallium/auxiliary/draw/draw_gs.c
index 5db2e75542..daf8d071f1 100644
--- a/src/gallium/auxiliary/draw/draw_gs.c
+++ b/src/gallium/auxiliary/draw/draw_gs.c
@@ -59,6 +59,15 @@ draw_gs_init( struct draw_context *draw )
    return TRUE;
 }
 
+void draw_gs_destroy( struct draw_context *draw )
+{
+   if (!draw->gs.machine)
+      return;
+
+   align_free(draw->gs.machine->Primitives);
+
+   tgsi_exec_machine_destroy(draw->gs.machine);
+}
 
 void draw_gs_set_constants( struct draw_context *draw,
                             const float (*constants)[4],
diff --git a/src/gallium/auxiliary/draw/draw_pipe_clip.c b/src/gallium/auxiliary/draw/draw_pipe_clip.c
index 205cda5eab..51a6115ebf 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_clip.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_clip.c
@@ -55,7 +55,7 @@
 
 
 
-struct clipper {
+struct clip_stage {
    struct draw_stage stage;      /**< base class */
 
    /* Basically duplicate some of the flatshading logic here:
@@ -70,9 +70,9 @@ struct clipper {
 
 /* This is a bit confusing:
  */
-static INLINE struct clipper *clipper_stage( struct draw_stage *stage )
+static INLINE struct clip_stage *clip_stage( struct draw_stage *stage )
 {
-   return (struct clipper *)stage;
+   return (struct clip_stage *)stage;
 }
 
 
@@ -92,11 +92,12 @@ static void interp_attr( float *fdst,
    fdst[3] = LINTERP( t, fout[3], fin[3] );
 }
 
+
 static void copy_colors( struct draw_stage *stage,
 			 struct vertex_header *dst,
 			 const struct vertex_header *src )
 {
-   const struct clipper *clipper = clipper_stage(stage);
+   const struct clip_stage *clipper = clip_stage(stage);
    uint i;
    for (i = 0; i < clipper->num_color_attribs; i++) {
       const uint attr = clipper->color_attribs[i];
@@ -108,7 +109,7 @@ static void copy_colors( struct draw_stage *stage,
 
 /* Interpolate between two vertices to produce a third.  
  */
-static void interp( const struct clipper *clip,
+static void interp( const struct clip_stage *clip,
 		    struct vertex_header *dst,
 		    float t,
 		    const struct vertex_header *out, 
@@ -179,7 +180,7 @@ static void emit_poly( struct draw_stage *stage,
       header.v[2] = inlist[0];	/* keep in v[2] for flatshading */
 
       if (i == n-1)
-        header.flags |= edge_last;
+         header.flags |= edge_last;
 
       if (0) {
          const struct draw_vertex_shader *vs = stage->draw->vs.vertex_shader;
@@ -200,13 +201,14 @@ static void emit_poly( struct draw_stage *stage,
    }
 }
 
+
 static INLINE float
 dot4(const float *a, const float *b)
 {
-   return (a[0]*b[0] +
-           a[1]*b[1] +
-           a[2]*b[2] +
-           a[3]*b[3]);
+   return (a[0] * b[0] +
+           a[1] * b[1] +
+           a[2] * b[2] +
+           a[3] * b[3]);
 }
 
 
@@ -217,7 +219,7 @@ do_clip_tri( struct draw_stage *stage,
 	     struct prim_header *header,
 	     unsigned clipmask )
 {
-   struct clipper *clipper = clipper_stage( stage );
+   struct clip_stage *clipper = clip_stage( stage );
    struct vertex_header *a[MAX_CLIPPED_VERTICES];
    struct vertex_header *b[MAX_CLIPPED_VERTICES];
    struct vertex_header **inlist = a;
@@ -280,6 +282,7 @@ do_clip_tri( struct draw_stage *stage,
 	 dp_prev = dp;
       }
 
+      /* swap in/out lists */
       {
 	 struct vertex_header **tmp = inlist;
 	 inlist = outlist;
@@ -291,15 +294,11 @@ do_clip_tri( struct draw_stage *stage,
    /* If flat-shading, copy color to new provoking vertex.
     */
    if (clipper->flat && inlist[0] != header->v[2]) {
-      if (1) {
-	 inlist[0] = dup_vert(stage, inlist[0], tmpnr++);
-      }
+      inlist[0] = dup_vert(stage, inlist[0], tmpnr++);
 
       copy_colors(stage, inlist[0], header->v[2]);
    }
 
-
-
    /* Emit the polygon as triangles to the setup stage:
     */
    if (n >= 3)
@@ -314,7 +313,7 @@ do_clip_line( struct draw_stage *stage,
 	      struct prim_header *header,
 	      unsigned clipmask )
 {
-   const struct clipper *clipper = clipper_stage( stage );
+   const struct clip_stage *clipper = clip_stage( stage );
    struct vertex_header *v0 = header->v[0];
    struct vertex_header *v1 = header->v[1];
    const float *pos0 = v0->clip;
@@ -416,13 +415,14 @@ clip_tri( struct draw_stage *stage,
    }
 }
 
+
 /* Update state.  Could further delay this until we hit the first
  * primitive that really requires clipping.
  */
 static void 
 clip_init_state( struct draw_stage *stage )
 {
-   struct clipper *clipper = clipper_stage( stage );
+   struct clip_stage *clipper = clip_stage( stage );
 
    clipper->flat = stage->draw->rasterizer->flatshade ? TRUE : FALSE;
 
@@ -488,7 +488,7 @@ static void clip_destroy( struct draw_stage *stage )
  */
 struct draw_stage *draw_clip_stage( struct draw_context *draw )
 {
-   struct clipper *clipper = CALLOC_STRUCT(clipper);
+   struct clip_stage *clipper = CALLOC_STRUCT(clip_stage);
    if (clipper == NULL)
       goto fail;
 
diff --git a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
index 1a5269c0de..d40c035240 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
@@ -138,7 +138,7 @@ emit_vertex( struct vbuf_stage *vbuf,
       /* Note: we really do want data[0] here, not data[pos]: 
        */
       vbuf->translate->set_buffer(vbuf->translate, 0, vertex->data[0], 0);
-      vbuf->translate->run(vbuf->translate, 0, 1, vbuf->vertex_ptr);
+      vbuf->translate->run(vbuf->translate, 0, 1, 0, vbuf->vertex_ptr);
 
       if (0) draw_dump_emitted_vertex(vbuf->vinfo, (uint8_t *)vbuf->vertex_ptr);
       
@@ -271,10 +271,12 @@ vbuf_start_prim( struct vbuf_stage *vbuf, uint prim )
 	 emit_sz = 0;
 	 break;
       }
-      
+
+      hw_key.element[i].type = TRANSLATE_ELEMENT_NORMAL;
       hw_key.element[i].input_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
       hw_key.element[i].input_buffer = src_buffer;
       hw_key.element[i].input_offset = src_offset;
+      hw_key.element[i].instance_divisor = 0;
       hw_key.element[i].output_format = output_format;
       hw_key.element[i].output_offset = dst_offset;
 
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index e49041556b..ef49e57536 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -172,6 +172,8 @@ struct draw_context
 
    boolean force_passthrough; /**< never clip or shade */
 
+   boolean dump_vs;
+
    double mrd;  /**< minimum resolvable depth value, for polygon offset */
 
    /* pipe state that we need: */
@@ -239,6 +241,8 @@ struct draw_context
 
    unsigned reduced_prim;
 
+   unsigned instance_id;
+
    void *driver_private;
 };
 
@@ -265,6 +269,7 @@ boolean draw_gs_init( struct draw_context *draw );
 void draw_gs_set_constants( struct draw_context *,
                             const float (*constants)[4],
                             unsigned size );
+void draw_gs_destroy( struct draw_context *draw );
 
 /*******************************************************************************
  * Common shading code:
diff --git a/src/gallium/auxiliary/draw/draw_pt.c b/src/gallium/auxiliary/draw/draw_pt.c
index 2801dbafe4..a5ddec5286 100644
--- a/src/gallium/auxiliary/draw/draw_pt.c
+++ b/src/gallium/auxiliary/draw/draw_pt.c
@@ -280,20 +280,33 @@ void
 draw_arrays(struct draw_context *draw, unsigned prim,
             unsigned start, unsigned count)
 {
-   unsigned reduced_prim = u_reduced_prim(prim);
+   draw_arrays_instanced(draw, prim, start, count, 0, 1);
+}
+
+void
+draw_arrays_instanced(struct draw_context *draw,
+                      unsigned mode,
+                      unsigned start,
+                      unsigned count,
+                      unsigned startInstance,
+                      unsigned instanceCount)
+{
+   unsigned reduced_prim = u_reduced_prim(mode);
+   unsigned instance;
+
    if (reduced_prim != draw->reduced_prim) {
-      draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
+      draw_do_flush(draw, DRAW_FLUSH_STATE_CHANGE);
       draw->reduced_prim = reduced_prim;
    }
 
    if (0)
-      draw_print_arrays(draw, prim, start, MIN2(count, 20));
+      draw_print_arrays(draw, mode, start, MIN2(count, 20));
 
 #if 0
    {
       int i;
-      debug_printf("draw_arrays(prim=%u start=%u count=%u):\n",
-                   prim, start, count);
+      debug_printf("draw_arrays(mode=%u start=%u count=%u):\n",
+                   mode, start, count);
       tgsi_dump(draw->vs.vertex_shader->state.tokens, 0);
       debug_printf("Elements:\n");
       for (i = 0; i < draw->pt.nr_vertex_elements; i++) {
@@ -311,6 +324,8 @@ draw_arrays(struct draw_context *draw, unsigned prim,
    }
 #endif
 
-   /* drawing done here: */
-   draw_pt_arrays(draw, prim, start, count);
+   for (instance = 0; instance < instanceCount; instance++) {
+      draw->instance_id = instance + startInstance;
+      draw_pt_arrays(draw, mode, start, count);
+   }
 }
diff --git a/src/gallium/auxiliary/draw/draw_pt.h b/src/gallium/auxiliary/draw/draw_pt.h
index 20edf7a227..d5e0d92a60 100644
--- a/src/gallium/auxiliary/draw/draw_pt.h
+++ b/src/gallium/auxiliary/draw/draw_pt.h
@@ -183,7 +183,8 @@ struct pt_emit *draw_pt_emit_create( struct draw_context *draw );
 struct pt_fetch;
 void draw_pt_fetch_prepare( struct pt_fetch *fetch,
                             unsigned vertex_input_count,
-			    unsigned vertex_size );
+                            unsigned vertex_size,
+                            unsigned instance_id_index );
 
 void draw_pt_fetch_run( struct pt_fetch *fetch,
 			const unsigned *elts,
diff --git a/src/gallium/auxiliary/draw/draw_pt_emit.c b/src/gallium/auxiliary/draw/draw_pt_emit.c
index 064e16c295..4fb53276bb 100644
--- a/src/gallium/auxiliary/draw/draw_pt_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_emit.c
@@ -121,10 +121,12 @@ void draw_pt_emit_prepare( struct pt_emit *emit,
 	 emit_sz = 0;
 	 break;
       }
-      
+
+      hw_key.element[i].type = TRANSLATE_ELEMENT_NORMAL;
       hw_key.element[i].input_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
       hw_key.element[i].input_buffer = src_buffer;
       hw_key.element[i].input_offset = src_offset;
+      hw_key.element[i].instance_divisor = 0;
       hw_key.element[i].output_format = output_format;
       hw_key.element[i].output_offset = dst_offset;
 
@@ -204,6 +206,7 @@ void draw_pt_emit( struct pt_emit *emit,
    translate->run( translate,
 		   0, 
 		   vertex_count,
+                   draw->instance_id,
 		   hw_verts );
 
    render->unmap_vertices( render, 
@@ -263,6 +266,7 @@ void draw_pt_emit_linear(struct pt_emit *emit,
    translate->run(translate,
                   0,
                   count,
+                  draw->instance_id,
                   hw_verts);
 
    if (0) {
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch.c b/src/gallium/auxiliary/draw/draw_pt_fetch.c
index 305bfef435..55e7a7b81a 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch.c
@@ -58,12 +58,14 @@ struct pt_fetch {
  */
 void draw_pt_fetch_prepare( struct pt_fetch *fetch,
                             unsigned vs_input_count,
-			    unsigned vertex_size )
+                            unsigned vertex_size,
+                            unsigned instance_id_index )
 {
    struct draw_context *draw = fetch->draw;
    unsigned nr_inputs;
-   unsigned i, nr = 0;
+   unsigned i, nr = 0, ei = 0;
    unsigned dst_offset = 0;
+   unsigned num_extra_inputs = 0;
    struct translate_key key;
 
    fetch->vertex_size = vertex_size;
@@ -78,9 +80,11 @@ void draw_pt_fetch_prepare( struct pt_fetch *fetch,
    {
       /* Need to set header->vertex_id = 0xffff somehow.
        */
+      key.element[nr].type = TRANSLATE_ELEMENT_NORMAL;
       key.element[nr].input_format = PIPE_FORMAT_R32_FLOAT;
       key.element[nr].input_buffer = draw->pt.nr_vertex_buffers;
       key.element[nr].input_offset = 0;
+      key.element[nr].instance_divisor = 0;
       key.element[nr].output_format = PIPE_FORMAT_R32_FLOAT;
       key.element[nr].output_offset = dst_offset;
       dst_offset += 1 * sizeof(float);
@@ -91,19 +95,36 @@ void draw_pt_fetch_prepare( struct pt_fetch *fetch,
        */
       dst_offset += 4 * sizeof(float);
    }
-      
-   assert( draw->pt.nr_vertex_elements >= vs_input_count );
 
-   nr_inputs = MIN2( vs_input_count, draw->pt.nr_vertex_elements );
+   if (instance_id_index != ~0) {
+      num_extra_inputs++;
+   }
+
+   assert(draw->pt.nr_vertex_elements + num_extra_inputs >= vs_input_count);
+
+   nr_inputs = MIN2(vs_input_count, draw->pt.nr_vertex_elements + num_extra_inputs);
 
    for (i = 0; i < nr_inputs; i++) {
-      key.element[nr].input_format = draw->pt.vertex_element[i].src_format;
-      key.element[nr].input_buffer = draw->pt.vertex_element[i].vertex_buffer_index;
-      key.element[nr].input_offset = draw->pt.vertex_element[i].src_offset;
-      key.element[nr].output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
-      key.element[nr].output_offset = dst_offset;
+      if (i == instance_id_index) {
+         key.element[nr].type = TRANSLATE_ELEMENT_INSTANCE_ID;
+         key.element[nr].input_format = PIPE_FORMAT_R32_USCALED;
+         key.element[nr].output_format = PIPE_FORMAT_R32_USCALED;
+         key.element[nr].output_offset = dst_offset;
+
+         dst_offset += sizeof(uint);
+      } else {
+         key.element[nr].type = TRANSLATE_ELEMENT_NORMAL;
+         key.element[nr].input_format = draw->pt.vertex_element[ei].src_format;
+         key.element[nr].input_buffer = draw->pt.vertex_element[ei].vertex_buffer_index;
+         key.element[nr].input_offset = draw->pt.vertex_element[ei].src_offset;
+         key.element[nr].instance_divisor = draw->pt.vertex_element[ei].instance_divisor;
+         key.element[nr].output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
+         key.element[nr].output_offset = dst_offset;
+
+         ei++;
+         dst_offset += 4 * sizeof(float);
+      }
 
-      dst_offset += 4 * sizeof(float);
       nr++;
    }
 
@@ -158,6 +179,7 @@ void draw_pt_fetch_run( struct pt_fetch *fetch,
    translate->run_elts( translate,
 			elts, 
 			count,
+                        draw->instance_id,
 			verts );
 
 }
@@ -183,6 +205,7 @@ void draw_pt_fetch_run_linear( struct pt_fetch *fetch,
    translate->run( translate,
                    start,
                    count,
+                   draw->instance_id,
                    verts );
 }
 
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
index e7fe6b3b76..2a604470e9 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
@@ -166,9 +166,11 @@ static void fetch_emit_prepare( struct draw_pt_middle_end *middle,
 	 continue;
       }
 
+      key.element[i].type = TRANSLATE_ELEMENT_NORMAL;
       key.element[i].input_format = input_format;
       key.element[i].input_buffer = input_buffer;
       key.element[i].input_offset = input_offset;
+      key.element[i].instance_divisor = src->instance_divisor;
       key.element[i].output_format = output_format;
       key.element[i].output_offset = dst_offset;
       
@@ -256,6 +258,7 @@ static void fetch_emit_run( struct draw_pt_middle_end *middle,
    feme->translate->run_elts( feme->translate, 
 			      fetch_elts,
 			      fetch_count,
+                              draw->instance_id,
 			      hw_verts );
 
    if (0) {
@@ -314,6 +317,7 @@ static void fetch_emit_run_linear( struct draw_pt_middle_end *middle,
    feme->translate->run( feme->translate,
                          start,
                          count,
+                         draw->instance_id,
                          hw_verts );
 
    if (0) {
@@ -374,6 +378,7 @@ static boolean fetch_emit_run_linear_elts( struct draw_pt_middle_end *middle,
    feme->translate->run( feme->translate,
                          start,
                          count,
+                         draw->instance_id,
                          hw_verts );
 
    draw->render->unmap_vertices( draw->render, 0, (ushort)(count - 1) );
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
index 1a9df4cac5..23da556f79 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
@@ -59,6 +59,8 @@ static void fetch_pipeline_prepare( struct draw_pt_middle_end *middle,
    struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle;
    struct draw_context *draw = fpme->draw;
    struct draw_vertex_shader *vs = draw->vs.vertex_shader;
+   unsigned i;
+   unsigned instance_id_index = ~0;
 
    /* Add one to num_outputs because the pipeline occasionally tags on
     * an additional texcoord, eg for AA lines.
@@ -66,6 +68,15 @@ static void fetch_pipeline_prepare( struct draw_pt_middle_end *middle,
    unsigned nr = MAX2( vs->info.num_inputs,
 		       vs->info.num_outputs + 1 );
 
+   /* Scan for instanceID system value.
+    */
+   for (i = 0; i < vs->info.num_inputs; i++) {
+      if (vs->info.input_semantic_name[i] == TGSI_SEMANTIC_INSTANCEID) {
+         instance_id_index = i;
+         break;
+      }
+   }
+
    fpme->prim = prim;
    fpme->opt = opt;
 
@@ -79,7 +90,8 @@ static void fetch_pipeline_prepare( struct draw_pt_middle_end *middle,
 
    draw_pt_fetch_prepare( fpme->fetch, 
                           vs->info.num_inputs,
-			  fpme->vertex_size );
+                          fpme->vertex_size,
+                          instance_id_index );
    /* XXX: it's not really gl rasterization rules we care about here,
     * but gl vs dx9 clip spaces.
     */
diff --git a/src/gallium/auxiliary/draw/draw_vs.c b/src/gallium/auxiliary/draw/draw_vs.c
index 3553689532..e03ac8c229 100644
--- a/src/gallium/auxiliary/draw/draw_vs.c
+++ b/src/gallium/auxiliary/draw/draw_vs.c
@@ -43,11 +43,11 @@
 #include "translate/translate.h"
 #include "translate/translate_cache.h"
 
+#include "tgsi/tgsi_dump.h"
 #include "tgsi/tgsi_exec.h"
 
 
 
-
 void draw_vs_set_constants( struct draw_context *draw,
                             const float (*constants)[4],
                             unsigned size )
@@ -83,6 +83,10 @@ draw_create_vertex_shader(struct draw_context *draw,
 {
    struct draw_vertex_shader *vs;
 
+   if (draw->dump_vs) {
+      tgsi_dump(shader->tokens, 0);
+   }
+
    vs = draw_create_vs_llvm( draw, shader );
    if (!vs) {
       vs = draw_create_vs_sse( draw, shader );
@@ -152,6 +156,8 @@ draw_delete_vertex_shader(struct draw_context *draw,
 boolean 
 draw_vs_init( struct draw_context *draw )
 {
+   draw->dump_vs = debug_get_bool_option("GALLIUM_DUMP_VS", FALSE);
+
    draw->vs.machine = tgsi_exec_machine_create();
    if (!draw->vs.machine)
       return FALSE;
diff --git a/src/gallium/auxiliary/draw/draw_vs.h b/src/gallium/auxiliary/draw/draw_vs.h
index e3b807ebd0..00036cfe68 100644
--- a/src/gallium/auxiliary/draw/draw_vs.h
+++ b/src/gallium/auxiliary/draw/draw_vs.h
@@ -43,6 +43,7 @@ struct draw_varient_input
    enum pipe_format format;
    unsigned buffer;
    unsigned offset; 
+   unsigned instance_divisor;
 };
 
 struct draw_varient_output
diff --git a/src/gallium/auxiliary/draw/draw_vs_ppc.c b/src/gallium/auxiliary/draw/draw_vs_ppc.c
index ad184bd696..da9f3e3d35 100644
--- a/src/gallium/auxiliary/draw/draw_vs_ppc.c
+++ b/src/gallium/auxiliary/draw/draw_vs_ppc.c
@@ -98,9 +98,9 @@ vs_ppc_run_linear( struct draw_vertex_shader *base,
    /* loop over verts */
    for (i = 0; i < count; i += MAX_VERTICES) {
       const uint max_vertices = MIN2(MAX_VERTICES, count - i);
-      float inputs_soa[PIPE_MAX_SHADER_INPUTS][4][4] ALIGN16_ATTRIB;
-      float outputs_soa[PIPE_MAX_SHADER_OUTPUTS][4][4] ALIGN16_ATTRIB;
-      float temps_soa[TGSI_EXEC_NUM_TEMPS][4][4] ALIGN16_ATTRIB;
+      PIPE_ALIGN_VAR(16) float inputs_soa[PIPE_MAX_SHADER_INPUTS][4][4];
+      PIPE_ALIGN_VAR(16) float outputs_soa[PIPE_MAX_SHADER_OUTPUTS][4][4];
+      PIPE_ALIGN_VAR(16) float temps_soa[TGSI_EXEC_NUM_TEMPS][4][4];
       uint attr;
 
       /* convert (up to) four input verts to SoA format */
diff --git a/src/gallium/auxiliary/draw/draw_vs_varient.c b/src/gallium/auxiliary/draw/draw_vs_varient.c
index d16692584e..9f40030f39 100644
--- a/src/gallium/auxiliary/draw/draw_vs_varient.c
+++ b/src/gallium/auxiliary/draw/draw_vs_varient.c
@@ -142,6 +142,7 @@ static void PIPE_CDECL vsvg_run_elts( struct draw_vs_varient *varient,
    vsvg->fetch->run_elts( vsvg->fetch, 
                           elts,
                           count,
+                          vsvg->draw->instance_id,
                           temp_buffer );
 
    vsvg->base.vs->run_linear( vsvg->base.vs, 
@@ -181,6 +182,7 @@ static void PIPE_CDECL vsvg_run_elts( struct draw_vs_varient *varient,
 
    vsvg->emit->run( vsvg->emit,
                     0, count,
+                    vsvg->draw->instance_id,
                     output_buffer );
 
    FREE(temp_buffer);
@@ -203,6 +205,7 @@ static void PIPE_CDECL vsvg_run_linear( struct draw_vs_varient *varient,
    vsvg->fetch->run( vsvg->fetch, 
                      start,
                      count,
+                     vsvg->draw->instance_id,
                      temp_buffer );
 
    vsvg->base.vs->run_linear( vsvg->base.vs, 
@@ -239,6 +242,7 @@ static void PIPE_CDECL vsvg_run_linear( struct draw_vs_varient *varient,
    
    vsvg->emit->run( vsvg->emit,
                     0, count,
+                    vsvg->draw->instance_id,
                     output_buffer );
 
    FREE(temp_buffer);
@@ -281,9 +285,11 @@ struct draw_vs_varient *draw_vs_varient_generic( struct draw_vertex_shader *vs,
    fetch.nr_elements = key->nr_inputs;
    fetch.output_stride = vsvg->temp_vertex_stride;
    for (i = 0; i < key->nr_inputs; i++) {
+      fetch.element[i].type = TRANSLATE_ELEMENT_NORMAL;
       fetch.element[i].input_format = key->element[i].in.format;
       fetch.element[i].input_buffer = key->element[i].in.buffer;
       fetch.element[i].input_offset = key->element[i].in.offset;
+      fetch.element[i].instance_divisor = 0;
       fetch.element[i].output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
       fetch.element[i].output_offset = i * 4 * sizeof(float);
       assert(fetch.element[i].output_offset < fetch.output_stride);
@@ -295,17 +301,21 @@ struct draw_vs_varient *draw_vs_varient_generic( struct draw_vertex_shader *vs,
    for (i = 0; i < key->nr_outputs; i++) {
       if (key->element[i].out.format != EMIT_1F_PSIZE)
       {      
+         emit.element[i].type = TRANSLATE_ELEMENT_NORMAL;
          emit.element[i].input_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
          emit.element[i].input_buffer = 0;
          emit.element[i].input_offset = key->element[i].out.vs_output * 4 * sizeof(float);
+         emit.element[i].instance_divisor = 0;
          emit.element[i].output_format = draw_translate_vinfo_format(key->element[i].out.format);
          emit.element[i].output_offset = key->element[i].out.offset;
          assert(emit.element[i].input_offset <= fetch.output_stride);
       }
       else {
+         emit.element[i].type = TRANSLATE_ELEMENT_NORMAL;
          emit.element[i].input_format = PIPE_FORMAT_R32_FLOAT;
          emit.element[i].input_buffer = 1;
          emit.element[i].input_offset = 0;
+         emit.element[i].instance_divisor = 0;
          emit.element[i].output_format = PIPE_FORMAT_R32_FLOAT;
          emit.element[i].output_offset = key->element[i].out.offset;
       }
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
index ba6f7b15f9..a4b78f1494 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
@@ -80,27 +80,11 @@ struct fenced_buffer_list
  */
 struct fenced_buffer
 {
-   /*
-    * Immutable members.
-    */
-
    struct pb_buffer base;
+   
    struct pb_buffer *buffer;
-   struct fenced_buffer_list *list;
-
-   /**
-    * Protected by fenced_buffer_list::mutex
-    */
-   struct list_head head;
 
-   /**
-    * Following members are mutable and protected by this mutex.
-    * 
-    * You may lock this mutex alone, or lock it with fenced_buffer_list::mutex
-    * held, but in order to prevent deadlocks you must never lock 
-    * fenced_buffer_list::mutex with this mutex held.
-    */
-   pipe_mutex mutex;
+   /* FIXME: protect access with mutex */
 
    /**
     * A bitmask of PIPE_BUFFER_USAGE_CPU/GPU_READ/WRITE describing the current
@@ -112,6 +96,9 @@ struct fenced_buffer
    struct pb_validate *vl;
    unsigned validation_flags;
    struct pipe_fence_handle *fence;
+
+   struct list_head head;
+   struct fenced_buffer_list *list;
 };
 
 
@@ -123,24 +110,15 @@ fenced_buffer(struct pb_buffer *buf)
 }
 
 
-/**
- * Add the buffer to the fenced list.
- * 
- * fenced_buffer_list::mutex and fenced_buffer::mutex must be held, in this
- * order, before calling this function.
- * 
- * Reference count should be incremented before calling this function.
- */
 static INLINE void
-fenced_buffer_add_locked(struct fenced_buffer_list *fenced_list, 
-                         struct fenced_buffer *fenced_buf)
+_fenced_buffer_add(struct fenced_buffer *fenced_buf)
 {
+   struct fenced_buffer_list *fenced_list = fenced_buf->list;
+
    assert(pipe_is_referenced(&fenced_buf->base.base.reference));
    assert(fenced_buf->flags & PIPE_BUFFER_USAGE_GPU_READ_WRITE);
    assert(fenced_buf->fence);
 
-   /* TODO: Move the reference count increment here */
-   
 #ifdef DEBUG
    LIST_DEL(&fenced_buf->head);
    assert(fenced_list->numUnfenced);
@@ -152,16 +130,32 @@ fenced_buffer_add_locked(struct fenced_buffer_list *fenced_list,
 
 
 /**
- * Remove the buffer from the fenced list.
- * 
- * fenced_buffer_list::mutex and fenced_buffer::mutex must be held, in this 
- * order before calling this function.
- * 
- * Reference count should be decremented after calling this function.
+ * Actually destroy the buffer.
  */
 static INLINE void
-fenced_buffer_remove_locked(struct fenced_buffer_list *fenced_list,
-                            struct fenced_buffer *fenced_buf)
+_fenced_buffer_destroy(struct fenced_buffer *fenced_buf)
+{
+   struct fenced_buffer_list *fenced_list = fenced_buf->list;
+   
+   assert(!pipe_is_referenced(&fenced_buf->base.base.reference));
+   assert(!fenced_buf->fence);
+#ifdef DEBUG
+   assert(fenced_buf->head.prev);
+   assert(fenced_buf->head.next);
+   LIST_DEL(&fenced_buf->head);
+   assert(fenced_list->numUnfenced);
+   --fenced_list->numUnfenced;
+#else
+   (void)fenced_list;
+#endif
+   pb_reference(&fenced_buf->buffer, NULL);
+   FREE(fenced_buf);
+}
+
+
+static INLINE void
+_fenced_buffer_remove(struct fenced_buffer_list *fenced_list,
+                      struct fenced_buffer *fenced_buf)
 {
    struct pb_fence_ops *ops = fenced_list->ops;
 
@@ -183,56 +177,37 @@ fenced_buffer_remove_locked(struct fenced_buffer_list *fenced_list,
    ++fenced_list->numUnfenced;
 #endif
    
-   /* TODO: Move the reference count decrement and destruction here */
+   /**
+    * FIXME!!!
+    */
+
+   if(!pipe_is_referenced(&fenced_buf->base.base.reference))
+      _fenced_buffer_destroy(fenced_buf);
 }
 
 
-/**
- * Wait for the fence to expire, and remove it from the fenced list.
- * 
- * fenced_buffer::mutex must be held. fenced_buffer_list::mutex must not be 
- * held -- it will be acquired internally.
- */
 static INLINE enum pipe_error
-fenced_buffer_finish_locked(struct fenced_buffer_list *fenced_list,
-                              struct fenced_buffer *fenced_buf)
+_fenced_buffer_finish(struct fenced_buffer *fenced_buf)
 {
+   struct fenced_buffer_list *fenced_list = fenced_buf->list;
    struct pb_fence_ops *ops = fenced_list->ops;
-   enum pipe_error ret = PIPE_ERROR;
 
 #if 0
    debug_warning("waiting for GPU");
 #endif
 
-   assert(pipe_is_referenced(&fenced_buf->base.base.reference));
    assert(fenced_buf->fence);
-
-   /*
-    * Acquire the global lock. Must release buffer mutex first to preserve
-    * lock order.
-    */
-   pipe_mutex_unlock(fenced_buf->mutex);
-   pipe_mutex_lock(fenced_list->mutex);
-   pipe_mutex_lock(fenced_buf->mutex);
-
    if(fenced_buf->fence) {
-      if(ops->fence_finish(ops, fenced_buf->fence, 0) == 0) {
-         /* Remove from the fenced list */
-         /* TODO: remove consequents */
-         fenced_buffer_remove_locked(fenced_list, fenced_buf);
-
-         p_atomic_dec(&fenced_buf->base.base.reference.count);
-         assert(pipe_is_referenced(&fenced_buf->base.base.reference));
-
-         fenced_buf->flags &= ~PIPE_BUFFER_USAGE_GPU_READ_WRITE;
-
-         ret = PIPE_OK;
+      if(ops->fence_finish(ops, fenced_buf->fence, 0) != 0) {
+	 return PIPE_ERROR;
       }
+      /* Remove from the fenced list */
+      /* TODO: remove consequents */
+      _fenced_buffer_remove(fenced_list, fenced_buf);
    }
 
-   pipe_mutex_unlock(fenced_list->mutex);
-
-   return ret;
+   fenced_buf->flags &= ~PIPE_BUFFER_USAGE_GPU_READ_WRITE;
+   return PIPE_OK;
 }
 
 
@@ -240,8 +215,8 @@ fenced_buffer_finish_locked(struct fenced_buffer_list *fenced_list,
  * Free as many fenced buffers from the list head as possible. 
  */
 static void
-fenced_buffer_list_check_free_locked(struct fenced_buffer_list *fenced_list, 
-                                     int wait)
+_fenced_buffer_list_check_free(struct fenced_buffer_list *fenced_list, 
+                               int wait)
 {
    struct pb_fence_ops *ops = fenced_list->ops;
    struct list_head *curr, *next;
@@ -254,29 +229,21 @@ fenced_buffer_list_check_free_locked(struct fenced_buffer_list *fenced_list,
    while(curr != &fenced_list->delayed) {
       fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head);
 
-      pipe_mutex_lock(fenced_buf->mutex);
-
       if(fenced_buf->fence != prev_fence) {
 	 int signaled;
 	 if (wait)
 	    signaled = ops->fence_finish(ops, fenced_buf->fence, 0);
 	 else
 	    signaled = ops->fence_signalled(ops, fenced_buf->fence, 0);
-	 if (signaled != 0) {
-            pipe_mutex_unlock(fenced_buf->mutex);
+	 if (signaled != 0)
 	    break;
-         }
 	 prev_fence = fenced_buf->fence;
       }
       else {
 	 assert(ops->fence_signalled(ops, fenced_buf->fence, 0) == 0);
       }
 
-      fenced_buffer_remove_locked(fenced_list, fenced_buf);
-      pipe_mutex_unlock(fenced_buf->mutex);
-
-      pb_buf = &fenced_buf->base;
-      pb_reference(&pb_buf, NULL);
+      _fenced_buffer_remove(fenced_list, fenced_buf);
 
       curr = next; 
       next = curr->next;
@@ -290,25 +257,30 @@ fenced_buffer_destroy(struct pb_buffer *buf)
    struct fenced_buffer *fenced_buf = fenced_buffer(buf);   
    struct fenced_buffer_list *fenced_list = fenced_buf->list;
 
-   assert(!pipe_is_referenced(&fenced_buf->base.base.reference));
-   assert(!fenced_buf->fence);
-
-#ifdef DEBUG
    pipe_mutex_lock(fenced_list->mutex);
-   assert(fenced_buf->head.prev);
-   assert(fenced_buf->head.next);
-   LIST_DEL(&fenced_buf->head);
-   assert(fenced_list->numUnfenced);
-   --fenced_list->numUnfenced;
+   assert(!pipe_is_referenced(&fenced_buf->base.base.reference));
+   if (fenced_buf->fence) {
+      struct pb_fence_ops *ops = fenced_list->ops;
+      if(ops->fence_signalled(ops, fenced_buf->fence, 0) == 0) {
+	 struct list_head *curr, *prev;
+	 curr = &fenced_buf->head;
+	 prev = curr->prev;
+	 do {
+	    fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head);
+	    assert(ops->fence_signalled(ops, fenced_buf->fence, 0) == 0);
+	    _fenced_buffer_remove(fenced_list, fenced_buf);
+	    curr = prev;
+	    prev = curr->prev;
+	 } while (curr != &fenced_list->delayed);
+      }	  
+      else {
+	 /* delay destruction */
+      }
+   }
+   else {
+      _fenced_buffer_destroy(fenced_buf);
+   }
    pipe_mutex_unlock(fenced_list->mutex);
-#else
-   (void)fenced_list;
-#endif
-
-   pb_reference(&fenced_buf->buffer, NULL);
-
-   pipe_mutex_destroy(fenced_buf->mutex);
-   FREE(fenced_buf);
 }
 
 
@@ -319,23 +291,24 @@ fenced_buffer_map(struct pb_buffer *buf,
    struct fenced_buffer *fenced_buf = fenced_buffer(buf);
    struct fenced_buffer_list *fenced_list = fenced_buf->list;
    struct pb_fence_ops *ops = fenced_list->ops;
-   void *map = NULL;
-
-   pipe_mutex_lock(fenced_buf->mutex);
+   void *map;
 
    assert(!(flags & PIPE_BUFFER_USAGE_GPU_READ_WRITE));
    
    /* Serialize writes */
    if((fenced_buf->flags & PIPE_BUFFER_USAGE_GPU_WRITE) ||
       ((fenced_buf->flags & PIPE_BUFFER_USAGE_GPU_READ) && (flags & PIPE_BUFFER_USAGE_CPU_WRITE))) {
-      if((flags & PIPE_BUFFER_USAGE_DONTBLOCK) &&
-          ops->fence_signalled(ops, fenced_buf->fence, 0) == 0) {
+      if(flags & PIPE_BUFFER_USAGE_DONTBLOCK) {
          /* Don't wait for the GPU to finish writing */
-         goto done;
+         if(ops->fence_signalled(ops, fenced_buf->fence, 0) == 0)
+            _fenced_buffer_remove(fenced_list, fenced_buf);
+         else
+            return NULL;
+      }
+      else {
+         /* Wait for the GPU to finish writing */
+         _fenced_buffer_finish(fenced_buf);
       }
-
-      /* Wait for the GPU to finish writing */
-      fenced_buffer_finish_locked(fenced_list, fenced_buf);
    }
 
 #if 0
@@ -352,9 +325,6 @@ fenced_buffer_map(struct pb_buffer *buf,
       fenced_buf->flags |= flags & PIPE_BUFFER_USAGE_CPU_READ_WRITE;
    }
 
-done:
-   pipe_mutex_unlock(fenced_buf->mutex);
-   
    return map;
 }
 
@@ -363,9 +333,6 @@ static void
 fenced_buffer_unmap(struct pb_buffer *buf)
 {
    struct fenced_buffer *fenced_buf = fenced_buffer(buf);
-   
-   pipe_mutex_lock(fenced_buf->mutex);
-   
    assert(fenced_buf->mapcount);
    if(fenced_buf->mapcount) {
       pb_unmap(fenced_buf->buffer);
@@ -373,8 +340,6 @@ fenced_buffer_unmap(struct pb_buffer *buf)
       if(!fenced_buf->mapcount)
 	 fenced_buf->flags &= ~PIPE_BUFFER_USAGE_CPU_READ_WRITE;
    }
-   
-   pipe_mutex_unlock(fenced_buf->mutex);
 }
 
 
@@ -386,14 +351,11 @@ fenced_buffer_validate(struct pb_buffer *buf,
    struct fenced_buffer *fenced_buf = fenced_buffer(buf);
    enum pipe_error ret;
    
-   pipe_mutex_lock(fenced_buf->mutex);
-
    if(!vl) {
       /* invalidate */
       fenced_buf->vl = NULL;
       fenced_buf->validation_flags = 0;
-      ret = PIPE_OK;
-      goto done;
+      return PIPE_OK;
    }
    
    assert(flags & PIPE_BUFFER_USAGE_GPU_READ_WRITE);
@@ -401,17 +363,14 @@ fenced_buffer_validate(struct pb_buffer *buf,
    flags &= PIPE_BUFFER_USAGE_GPU_READ_WRITE;
 
    /* Buffer cannot be validated in two different lists */ 
-   if(fenced_buf->vl && fenced_buf->vl != vl) {
-      ret = PIPE_ERROR_RETRY;
-      goto done;
-   }
+   if(fenced_buf->vl && fenced_buf->vl != vl)
+      return PIPE_ERROR_RETRY;
    
 #if 0
    /* Do not validate if buffer is still mapped */
    if(fenced_buf->flags & PIPE_BUFFER_USAGE_CPU_READ_WRITE) {
       /* TODO: wait for the thread that mapped the buffer to unmap it */
-      ret = PIPE_ERROR_RETRY;
-      goto done;
+      return PIPE_ERROR_RETRY;
    }
    /* Final sanity checking */
    assert(!(fenced_buf->flags & PIPE_BUFFER_USAGE_CPU_READ_WRITE));
@@ -421,21 +380,17 @@ fenced_buffer_validate(struct pb_buffer *buf,
    if(fenced_buf->vl == vl &&
       (fenced_buf->validation_flags & flags) == flags) {
       /* Nothing to do -- buffer already validated */
-      ret = PIPE_OK;
-      goto done;
+      return PIPE_OK;
    }
    
    ret = pb_validate(fenced_buf->buffer, vl, flags);
    if (ret != PIPE_OK)
-      goto done;
+      return ret;
    
    fenced_buf->vl = vl;
    fenced_buf->validation_flags |= flags;
    
-done:
-   pipe_mutex_unlock(fenced_buf->mutex);
-
-   return ret;
+   return PIPE_OK;
 }
 
 
@@ -450,36 +405,29 @@ fenced_buffer_fence(struct pb_buffer *buf,
    fenced_buf = fenced_buffer(buf);
    fenced_list = fenced_buf->list;
    ops = fenced_list->ops;
-
-   pipe_mutex_lock(fenced_list->mutex);
-   pipe_mutex_lock(fenced_buf->mutex);
-
-   assert(pipe_is_referenced(&fenced_buf->base.base.reference));
-
-   if(fence != fenced_buf->fence) {
-      assert(fenced_buf->vl);
-      assert(fenced_buf->validation_flags);
-      
-      if (fenced_buf->fence) {
-         fenced_buffer_remove_locked(fenced_list, fenced_buf);
-         p_atomic_dec(&fenced_buf->base.base.reference.count);
-         assert(pipe_is_referenced(&fenced_buf->base.base.reference));
-      }
-      if (fence) {
-         ops->fence_reference(ops, &fenced_buf->fence, fence);
-         fenced_buf->flags |= fenced_buf->validation_flags;
-         p_atomic_inc(&fenced_buf->base.base.reference.count);
-         fenced_buffer_add_locked(fenced_list, fenced_buf);
-      }
-
-      pb_fence(fenced_buf->buffer, fence);
    
-      fenced_buf->vl = NULL;
-      fenced_buf->validation_flags = 0;
+   if(fence == fenced_buf->fence) {
+      /* Nothing to do */
+      return;
    }
 
-   pipe_mutex_unlock(fenced_buf->mutex);
+   assert(fenced_buf->vl);
+   assert(fenced_buf->validation_flags);
+   
+   pipe_mutex_lock(fenced_list->mutex);
+   if (fenced_buf->fence)
+      _fenced_buffer_remove(fenced_list, fenced_buf);
+   if (fence) {
+      ops->fence_reference(ops, &fenced_buf->fence, fence);
+      fenced_buf->flags |= fenced_buf->validation_flags;
+      _fenced_buffer_add(fenced_buf);
+   }
    pipe_mutex_unlock(fenced_list->mutex);
+   
+   pb_fence(fenced_buf->buffer, fence);
+
+   fenced_buf->vl = NULL;
+   fenced_buf->validation_flags = 0;
 }
 
 
@@ -489,7 +437,6 @@ fenced_buffer_get_base_buffer(struct pb_buffer *buf,
                               pb_size *offset)
 {
    struct fenced_buffer *fenced_buf = fenced_buffer(buf);
-   /* NOTE: accesses immutable members only -- mutex not necessary */
    pb_get_base_buffer(fenced_buf->buffer, base_buf, offset);
 }
 
@@ -529,8 +476,6 @@ fenced_buffer_create(struct fenced_buffer_list *fenced_list,
    buf->buffer = buffer;
    buf->list = fenced_list;
    
-   pipe_mutex_init(buf->mutex);
-
 #ifdef DEBUG
    pipe_mutex_lock(fenced_list->mutex);
    LIST_ADDTAIL(&buf->head, &fenced_list->unfenced);
@@ -572,7 +517,7 @@ fenced_buffer_list_check_free(struct fenced_buffer_list *fenced_list,
                               int wait)
 {
    pipe_mutex_lock(fenced_list->mutex);
-   fenced_buffer_list_check_free_locked(fenced_list, wait);
+   _fenced_buffer_list_check_free(fenced_list, wait);
    pipe_mutex_unlock(fenced_list->mutex);
 }
 
@@ -594,13 +539,11 @@ fenced_buffer_list_dump(struct fenced_buffer_list *fenced_list)
    next = curr->next;
    while(curr != &fenced_list->unfenced) {
       fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head);
-      pipe_mutex_lock(fenced_buf->mutex);
       assert(!fenced_buf->fence);
       debug_printf("%10p %7u %7u\n",
                    (void *) fenced_buf,
                    fenced_buf->base.base.size,
                    p_atomic_read(&fenced_buf->base.base.reference.count));
-      pipe_mutex_unlock(fenced_buf->mutex);
       curr = next; 
       next = curr->next;
    }
@@ -610,7 +553,6 @@ fenced_buffer_list_dump(struct fenced_buffer_list *fenced_list)
    while(curr != &fenced_list->delayed) {
       int signaled;
       fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head);
-      pipe_mutex_lock(fenced_buf->mutex);
       signaled = ops->fence_signalled(ops, fenced_buf->fence, 0);
       debug_printf("%10p %7u %7u %10p %s\n",
                    (void *) fenced_buf,
@@ -618,7 +560,6 @@ fenced_buffer_list_dump(struct fenced_buffer_list *fenced_list)
                    p_atomic_read(&fenced_buf->base.base.reference.count),
                    (void *) fenced_buf->fence,
                    signaled == 0 ? "y" : "n");
-      pipe_mutex_unlock(fenced_buf->mutex);
       curr = next; 
       next = curr->next;
    }
@@ -639,8 +580,8 @@ fenced_buffer_list_destroy(struct fenced_buffer_list *fenced_list)
 #if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS)
       sched_yield();
 #endif
+      _fenced_buffer_list_check_free(fenced_list, 1);
       pipe_mutex_lock(fenced_list->mutex);
-      fenced_buffer_list_check_free_locked(fenced_list, 1);
    }
 
 #ifdef DEBUG
@@ -648,7 +589,6 @@ fenced_buffer_list_destroy(struct fenced_buffer_list *fenced_list)
 #endif
       
    pipe_mutex_unlock(fenced_list->mutex);
-   pipe_mutex_destroy(fenced_list->mutex);
    
    fenced_list->ops->destroy(fenced_list->ops);
    
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index 1acf3c373e..f675427d98 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -673,6 +673,13 @@ void x86_and( struct x86_function *p,
    emit_op_modrm( p, 0x23, 0x21, dst, src );
 }
 
+void x86_div( struct x86_function *p,
+              struct x86_reg src )
+{
+   assert(src.file == file_REG32 && src.mod == mod_REG);
+   emit_op_modrm(p, 0xf7, 0, x86_make_reg(file_REG32, 6), src);
+}
+
 
 
 /***********************************************************************
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
index 731a651796..f7612d416a 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
@@ -244,6 +244,7 @@ void x86_sub( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void x86_test( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void x86_xor( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void x86_sahf( struct x86_function *p );
+void x86_div( struct x86_function *p, struct x86_reg src );
 
 
 void x86_cdecl_caller_push_regs( struct x86_function *p );
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.c b/src/gallium/auxiliary/tgsi/tgsi_dump.c
index e2e5394f86..c254a7274f 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump.c
@@ -123,7 +123,8 @@ static const char *semantic_names[] =
    "NORMAL",
    "FACE",
    "EDGEFLAG",
-   "PRIM_ID"
+   "PRIM_ID",
+   "INSTANCEID"
 };
 
 static const char *immediate_type_names[] =
@@ -218,8 +219,13 @@ _dump_register_src(
    struct dump_ctx *ctx,
    const struct tgsi_full_src_register *src )
 {
+   ENM(src->Register.File, file_names);
+   if (src->Register.Dimension) {
+      CHR('[');
+      SID(src->Dimension.Index);
+      CHR(']');
+   }
    if (src->Register.Indirect) {
-      ENM( src->Register.File, file_names );
       CHR( '[' );
       ENM( src->Indirect.File, file_names );
       CHR( '[' );
@@ -233,16 +239,10 @@ _dump_register_src(
       }
       CHR( ']' );
    } else {
-      ENM( src->Register.File, file_names );
       CHR( '[' );
       SID( src->Register.Index );
       CHR( ']' );
    }
-   if (src->Register.Dimension) {
-      CHR( '[' );
-      SID( src->Dimension.Index );
-      CHR( ']' );
-   }
 }
 
 static void
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index 2bcb33392a..83646b73c1 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -1041,11 +1041,19 @@ fetch_src_file_channel(
 
       default:
          assert( 0 );
+         chan->u[0] = 0;
+         chan->u[1] = 0;
+         chan->u[2] = 0;
+         chan->u[3] = 0;
       }
       break;
 
    default:
       assert( 0 );
+      chan->u[0] = 0;
+      chan->u[1] = 0;
+      chan->u[2] = 0;
+      chan->u[3] = 0;
    }
 }
 
@@ -1121,11 +1129,14 @@ fetch_source(const struct tgsi_exec_machine *mach,
     * subscript to a register file. Effectively it means that
     * the register file is actually a 2D array of registers.
     *
-    *    file[1][3] == file[1*sizeof(file[1])+3],
+    *    file[3][1] == file[3*sizeof(file[1])+1],
     *    where:
     *       [3] = Dimension.Index
     */
    if (reg->Register.Dimension) {
+      int array_size;
+      union tgsi_exec_channel dim_index;
+
       /* The size of the first-order array depends on the register file type.
        * We need to multiply the index to the first array to get an effective,
        * "flat" index that points to the beginning of the second-order array.
@@ -1133,32 +1144,27 @@ fetch_source(const struct tgsi_exec_machine *mach,
       switch (reg->Register.File) {
       case TGSI_FILE_INPUT:
       case TGSI_FILE_SYSTEM_VALUE:
-         index.i[0] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
-         index.i[1] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
-         index.i[2] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
-         index.i[3] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
+         array_size = TGSI_EXEC_MAX_INPUT_ATTRIBS;
          break;
       case TGSI_FILE_CONSTANT:
-         index.i[0] *= TGSI_EXEC_MAX_CONST_BUFFER;
-         index.i[1] *= TGSI_EXEC_MAX_CONST_BUFFER;
-         index.i[2] *= TGSI_EXEC_MAX_CONST_BUFFER;
-         index.i[3] *= TGSI_EXEC_MAX_CONST_BUFFER;
+         array_size = TGSI_EXEC_MAX_CONST_BUFFER;
          break;
       default:
          assert( 0 );
+         array_size = 0;
       }
 
-      index.i[0] += reg->Dimension.Index;
-      index.i[1] += reg->Dimension.Index;
-      index.i[2] += reg->Dimension.Index;
-      index.i[3] += reg->Dimension.Index;
+      dim_index.i[0] =
+      dim_index.i[1] =
+      dim_index.i[2] =
+      dim_index.i[3] = reg->Dimension.Index;
 
       /* Again, the second subscript index can be addressed indirectly
        * identically to the first one.
        * Nothing stops us from indirectly addressing the indirect register,
        * but there is no need for that, so we won't exercise it.
        *
-       *    file[1][ind[4].y+3],
+       *    file[ind[4].y+3][1],
        *    where:
        *       ind = DimIndirect.File
        *       [4] = DimIndirect.Index
@@ -1183,20 +1189,25 @@ fetch_source(const struct tgsi_exec_machine *mach,
             &index2,
             &indir_index );
 
-         index.i[0] += indir_index.i[0];
-         index.i[1] += indir_index.i[1];
-         index.i[2] += indir_index.i[2];
-         index.i[3] += indir_index.i[3];
+         dim_index.i[0] += indir_index.i[0];
+         dim_index.i[1] += indir_index.i[1];
+         dim_index.i[2] += indir_index.i[2];
+         dim_index.i[3] += indir_index.i[3];
 
          /* for disabled execution channels, zero-out the index to
           * avoid using a potential garbage value.
           */
          for (i = 0; i < QUAD_SIZE; i++) {
             if ((execmask & (1 << i)) == 0)
-               index.i[i] = 0;
+               dim_index.i[i] = 0;
          }
       }
 
+      index.i[0] += dim_index.i[0] * array_size;
+      index.i[1] += dim_index.i[1] * array_size;
+      index.i[2] += dim_index.i[2] * array_size;
+      index.i[3] += dim_index.i[3] * array_size;
+
       /* If by any chance there was a need for a 3D array of register
        * files, we would have to check whether Dimension is followed
        * by a dimension register and continue the saga.
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index 138d2d095b..ad553c71a5 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -51,7 +51,8 @@
  * Since it's pretty much impossible to form PPC vector immediates, load
  * them from memory here:
  */
-const float ppc_builtin_constants[] ALIGN16_ATTRIB = {
+PIPE_ALIGN_VAR(16) const float 
+ppc_builtin_constants[] = {
    1.0f, -128.0f, 128.0, 0.0
 };
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sanity.c b/src/gallium/auxiliary/tgsi/tgsi_sanity.c
index 7f1c8e5dd6..431c3ffb14 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sanity.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sanity.c
@@ -335,13 +335,9 @@ iter_instruction(
          fill_scan_register1d(ind_reg,
                               inst->Src[i].Indirect.File,
                               inst->Src[i].Indirect.Index);
-         if (!(reg->file == TGSI_FILE_ADDRESS || reg->file == TGSI_FILE_LOOP) ||
-             reg->indices[0] != 0) {
-            report_warning(ctx, "Indirect register neither ADDR[0] nor LOOP[0]");
-         }
          check_register_usage(
             ctx,
-            reg,
+            ind_reg,
             "indirect",
             FALSE );
       }
@@ -412,7 +408,7 @@ iter_declaration(
          uint vert;
          for (vert = 0; vert < ctx->implied_array_size; ++vert) {
             scan_register *reg = MALLOC(sizeof(scan_register));
-            fill_scan_register2d(reg, file, vert, i);
+            fill_scan_register2d(reg, file, i, vert);
             check_and_declare(ctx, reg);
          }
       } else {
diff --git a/src/gallium/auxiliary/tgsi/tgsi_text.c b/src/gallium/auxiliary/tgsi/tgsi_text.c
index 9fcffeda36..7fe5dad5ff 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_text.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_text.c
@@ -933,7 +933,8 @@ static const char *semantic_names[TGSI_SEMANTIC_COUNT] =
    "NORMAL",
    "FACE",
    "EDGEFLAG",
-   "PRIM_ID"
+   "PRIM_ID",
+   "INSTANCEID"
 };
 
 static const char *interpolate_names[TGSI_INTERPOLATE_COUNT] =
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.c b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
index e64e2b731d..ab557a23f9 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
@@ -40,6 +40,8 @@ union tgsi_any_token {
    struct tgsi_header header;
    struct tgsi_processor processor;
    struct tgsi_token token;
+   struct tgsi_property prop;
+   struct tgsi_property_data prop_data;
    struct tgsi_declaration decl;
    struct tgsi_declaration_range decl_range;
    struct tgsi_declaration_semantic decl_semantic;
@@ -64,6 +66,7 @@ struct ureg_tokens {
 };
 
 #define UREG_MAX_INPUT PIPE_MAX_ATTRIBS
+#define UREG_MAX_SYSTEM_VALUE PIPE_MAX_ATTRIBS
 #define UREG_MAX_OUTPUT PIPE_MAX_ATTRIBS
 #define UREG_MAX_CONSTANT_RANGE 32
 #define UREG_MAX_IMMEDIATE 32
@@ -95,6 +98,13 @@ struct ureg_program
    unsigned nr_gs_inputs;
 
    struct {
+      unsigned index;
+      unsigned semantic_name;
+      unsigned semantic_index;
+   } system_value[UREG_MAX_SYSTEM_VALUE];
+   unsigned nr_system_values;
+
+   struct {
       unsigned semantic_name;
       unsigned semantic_index;
    } output[UREG_MAX_OUTPUT];
@@ -123,6 +133,8 @@ struct ureg_program
    } constant_range[UREG_MAX_CONSTANT_RANGE];
    unsigned nr_constant_ranges;
 
+   unsigned property_gs_input_prim;
+
    unsigned nr_addrs;
    unsigned nr_preds;
    unsigned nr_loops;
@@ -234,19 +246,29 @@ ureg_src_register( unsigned file,
    src.SwizzleY = TGSI_SWIZZLE_Y;
    src.SwizzleZ = TGSI_SWIZZLE_Z;
    src.SwizzleW = TGSI_SWIZZLE_W;
-   src.Pad      = 0;
    src.Indirect = 0;
+   src.IndirectFile = TGSI_FILE_NULL;
    src.IndirectIndex = 0;
    src.IndirectSwizzle = 0;
    src.Absolute = 0;
    src.Index    = index;
    src.Negate   = 0;
+   src.Dimension = 0;
+   src.DimensionIndex = 0;
 
    return src;
 }
 
 
 
+void
+ureg_property_gs_input_prim(struct ureg_program *ureg,
+                            unsigned gs_input_prim)
+{
+   ureg->property_gs_input_prim = gs_input_prim;
+}
+
+
 
 struct ureg_src 
 ureg_DECL_fs_input( struct ureg_program *ureg,
@@ -304,6 +326,25 @@ ureg_DECL_gs_input(struct ureg_program *ureg,
 }
 
 
+struct ureg_src
+ureg_DECL_system_value(struct ureg_program *ureg,
+                       unsigned index,
+                       unsigned semantic_name,
+                       unsigned semantic_index)
+{
+   if (ureg->nr_system_values < UREG_MAX_SYSTEM_VALUE) {
+      ureg->system_value[ureg->nr_system_values].index = index;
+      ureg->system_value[ureg->nr_system_values].semantic_name = semantic_name;
+      ureg->system_value[ureg->nr_system_values].semantic_index = semantic_index;
+      ureg->nr_system_values++;
+   } else {
+      set_bad(ureg);
+   }
+
+   return ureg_src_register(TGSI_FILE_SYSTEM_VALUE, index);
+}
+
+
 struct ureg_dst 
 ureg_DECL_output( struct ureg_program *ureg,
                   unsigned name,
@@ -616,6 +657,35 @@ ureg_DECL_immediate_uint( struct ureg_program *ureg,
 
 
 struct ureg_src
+ureg_DECL_immediate_block_uint( struct ureg_program *ureg,
+                                const unsigned *v,
+                                unsigned nr )
+{
+   uint index;
+   uint i;
+
+   if (ureg->nr_immediates + (nr + 3) / 4 > UREG_MAX_IMMEDIATE) {
+      set_bad(ureg);
+      return ureg_src_register(TGSI_FILE_IMMEDIATE, 0);
+   }
+
+   index = ureg->nr_immediates;
+   ureg->nr_immediates += (nr + 3) / 4;
+
+   for (i = index; i < ureg->nr_immediates; i++) {
+      ureg->immediate[i].type = TGSI_IMM_UINT32;
+      ureg->immediate[i].nr = nr > 4 ? 4 : nr;
+      memcpy(ureg->immediate[i].value.u,
+             &v[(i - index) * 4],
+             ureg->immediate[i].nr * sizeof(uint));
+      nr -= 4;
+   }
+
+   return ureg_src_register(TGSI_FILE_IMMEDIATE, index);
+}
+
+
+struct ureg_src
 ureg_DECL_immediate_int( struct ureg_program *ureg,
                          const int *v,
                          unsigned nr )
@@ -628,7 +698,7 @@ void
 ureg_emit_src( struct ureg_program *ureg,
                struct ureg_src src )
 {
-   unsigned size = 1 + (src.Indirect ? 1 : 0);
+   unsigned size = 1 + (src.Indirect ? 1 : 0) + (src.Dimension ? 1 : 0);
 
    union tgsi_any_token *out = get_tokens( ureg, DOMAIN_INSN, size );
    unsigned n = 0;
@@ -651,7 +721,7 @@ ureg_emit_src( struct ureg_program *ureg,
    if (src.Indirect) {
       out[0].src.Indirect = 1;
       out[n].value = 0;
-      out[n].src.File = TGSI_FILE_ADDRESS;
+      out[n].src.File = src.IndirectFile;
       out[n].src.SwizzleX = src.IndirectSwizzle;
       out[n].src.SwizzleY = src.IndirectSwizzle;
       out[n].src.SwizzleZ = src.IndirectSwizzle;
@@ -660,6 +730,15 @@ ureg_emit_src( struct ureg_program *ureg,
       n++;
    }
 
+   if (src.Dimension) {
+      out[0].src.Dimension = 1;
+      out[n].dim.Indirect = 0;
+      out[n].dim.Dimension = 0;
+      out[n].dim.Padding = 0;
+      out[n].dim.Index = src.DimensionIndex;
+      n++;
+   }
+
    assert(n == size);
 }
 
@@ -1027,13 +1106,34 @@ emit_immediate( struct ureg_program *ureg,
    out[4].imm_data.Uint = v[3];
 }
 
+static void
+emit_property(struct ureg_program *ureg,
+              unsigned name,
+              unsigned data)
+{
+   union tgsi_any_token *out = get_tokens(ureg, DOMAIN_DECL, 2);
+
+   out[0].value = 0;
+   out[0].prop.Type = TGSI_TOKEN_TYPE_PROPERTY;
+   out[0].prop.NrTokens = 2;
+   out[0].prop.PropertyName = name;
 
+   out[1].prop_data.Data = data;
+}
 
 
 static void emit_decls( struct ureg_program *ureg )
 {
    unsigned i;
 
+   if (ureg->property_gs_input_prim != ~0) {
+      assert(ureg->processor == TGSI_PROCESSOR_GEOMETRY);
+
+      emit_property(ureg,
+                    TGSI_PROPERTY_GS_INPUT_PRIM,
+                    ureg->property_gs_input_prim);
+   }
+
    if (ureg->processor == TGSI_PROCESSOR_VERTEX) {
       for (i = 0; i < UREG_MAX_INPUT; i++) {
          if (ureg->vs_inputs[i/32] & (1 << (i%32))) {
@@ -1058,6 +1158,15 @@ static void emit_decls( struct ureg_program *ureg )
       }
    }
 
+   for (i = 0; i < ureg->nr_system_values; i++) {
+      emit_decl(ureg,
+                TGSI_FILE_SYSTEM_VALUE,
+                ureg->system_value[i].index,
+                ureg->system_value[i].semantic_name,
+                ureg->system_value[i].semantic_index,
+                TGSI_INTERPOLATE_CONSTANT);
+   }
+
    for (i = 0; i < ureg->nr_outputs; i++) {
       emit_decl( ureg, 
                  TGSI_FILE_OUTPUT, 
@@ -1234,6 +1343,7 @@ struct ureg_program *ureg_create( unsigned processor )
       return NULL;
 
    ureg->processor = processor;
+   ureg->property_gs_input_prim = ~0;
    return ureg;
 }
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.h b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
index 6f11273320..8c8a6bbce6 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
@@ -47,13 +47,15 @@ struct ureg_src
    unsigned SwizzleY    : 2;  /* TGSI_SWIZZLE_ */
    unsigned SwizzleZ    : 2;  /* TGSI_SWIZZLE_ */
    unsigned SwizzleW    : 2;  /* TGSI_SWIZZLE_ */
-   unsigned Pad         : 1;  /* BOOL */
    unsigned Indirect    : 1;  /* BOOL */
+   unsigned Dimension   : 1;  /* BOOL */
    unsigned Absolute    : 1;  /* BOOL */
-   int      Index       : 16; /* SINT */
    unsigned Negate      : 1;  /* BOOL */
+   int      Index       : 16; /* SINT */
+   unsigned IndirectFile    : 4;  /* TGSI_FILE_ */
    int      IndirectIndex   : 16; /* SINT */
-   int      IndirectSwizzle : 2;  /* TGSI_SWIZZLE_ */
+   unsigned IndirectSwizzle : 2;  /* TGSI_SWIZZLE_ */
+   int      DimensionIndex  : 16; /* SINT */
 };
 
 /* Very similar to a tgsi_dst_register, removing unsupported fields
@@ -118,6 +120,14 @@ ureg_create_shader_and_destroy( struct ureg_program *p,
 }
 
 
+/***********************************************************************
+ * Build shader properties:
+ */
+
+void
+ureg_property_gs_input_prim(struct ureg_program *ureg,
+                            unsigned gs_input_prim);
+
 
 /***********************************************************************
  * Build shader declarations:
@@ -137,6 +147,12 @@ struct ureg_src
 ureg_DECL_gs_input(struct ureg_program *,
                    unsigned index);
 
+struct ureg_src
+ureg_DECL_system_value(struct ureg_program *,
+                       unsigned index,
+                       unsigned semantic_name,
+                       unsigned semantic_index);
+
 struct ureg_dst
 ureg_DECL_output( struct ureg_program *,
                   unsigned semantic_name,
@@ -153,6 +169,11 @@ ureg_DECL_immediate_uint( struct ureg_program *,
                           unsigned nr );
 
 struct ureg_src
+ureg_DECL_immediate_block_uint( struct ureg_program *,
+                                const unsigned *v,
+                                unsigned nr );
+
+struct ureg_src
 ureg_DECL_immediate_int( struct ureg_program *,
                          const int *v,
                          unsigned nr );
@@ -753,18 +774,30 @@ static INLINE struct ureg_src
 ureg_src_indirect( struct ureg_src reg, struct ureg_src addr )
 {
    assert(reg.File != TGSI_FILE_NULL);
-   assert(addr.File == TGSI_FILE_ADDRESS);
+   assert(addr.File == TGSI_FILE_ADDRESS || addr.File == TGSI_FILE_TEMPORARY);
    reg.Indirect = 1;
+   reg.IndirectFile = addr.File;
    reg.IndirectIndex = addr.Index;
    reg.IndirectSwizzle = addr.SwizzleX;
    return reg;
 }
 
+static INLINE struct ureg_src 
+ureg_src_dimension( struct ureg_src reg, int index )
+{
+   assert(reg.File != TGSI_FILE_NULL);
+   reg.Dimension = 1;
+   reg.DimensionIndex = index;
+   return reg;
+}
+
 static INLINE struct ureg_dst
 ureg_dst( struct ureg_src src )
 {
    struct ureg_dst dst;
 
+   assert(!src.Indirect || src.IndirectFile == TGSI_FILE_ADDRESS);
+
    dst.File      = src.File;
    dst.WriteMask = TGSI_WRITEMASK_XYZW;
    dst.Indirect  = src.Indirect;
@@ -792,13 +825,15 @@ ureg_src( struct ureg_dst dst )
    src.SwizzleY  = TGSI_SWIZZLE_Y;
    src.SwizzleZ  = TGSI_SWIZZLE_Z;
    src.SwizzleW  = TGSI_SWIZZLE_W;
-   src.Pad       = 0;
    src.Indirect  = dst.Indirect;
+   src.IndirectFile = TGSI_FILE_ADDRESS;
    src.IndirectIndex = dst.IndirectIndex;
    src.IndirectSwizzle = dst.IndirectSwizzle;
    src.Absolute  = 0;
    src.Index     = dst.Index;
    src.Negate    = 0;
+   src.Dimension = 0;
+   src.DimensionIndex = 0;
 
    return src;
 }
@@ -837,13 +872,15 @@ ureg_src_undef( void )
    src.SwizzleY  = 0;
    src.SwizzleZ  = 0;
    src.SwizzleW  = 0;
-   src.Pad       = 0;
    src.Indirect  = 0;
+   src.IndirectFile = TGSI_FILE_NULL;
    src.IndirectIndex = 0;
    src.IndirectSwizzle = 0;
    src.Absolute  = 0;
    src.Index     = 0;
    src.Negate    = 0;
+   src.Dimension = 0;
+   src.DimensionIndex = 0;
    
    return src;
 }
diff --git a/src/gallium/auxiliary/translate/translate.h b/src/gallium/auxiliary/translate/translate.h
index 34526eb061..54ed2c1a4b 100644
--- a/src/gallium/auxiliary/translate/translate.h
+++ b/src/gallium/auxiliary/translate/translate.h
@@ -44,12 +44,19 @@
 #include "pipe/p_format.h"
 #include "pipe/p_state.h"
 
+enum translate_element_type {
+   TRANSLATE_ELEMENT_NORMAL,
+   TRANSLATE_ELEMENT_INSTANCE_ID
+};
+
 struct translate_element 
 {
+   enum translate_element_type type;
    enum pipe_format input_format;
    enum pipe_format output_format;
    unsigned input_buffer:8;
    unsigned input_offset:24;
+   unsigned instance_divisor;
    unsigned output_offset;
 };
 
@@ -74,11 +81,13 @@ struct translate {
    void (PIPE_CDECL *run_elts)( struct translate *,
                                 const unsigned *elts,
                                 unsigned count,
+                                unsigned instance_id,
                                 void *output_buffer);
 
    void (PIPE_CDECL *run)( struct translate *,
                            unsigned start,
                            unsigned count,
+                           unsigned instance_id,
                            void *output_buffer);
 };
 
@@ -103,8 +112,13 @@ static INLINE int translate_keysize( const struct translate_key *key )
 static INLINE int translate_key_compare( const struct translate_key *a,
                                          const struct translate_key *b )
 {
-   int keysize = translate_keysize(a);
-   return memcmp(a, b, keysize);
+   int keysize_a = translate_keysize(a);
+   int keysize_b = translate_keysize(b);
+
+   if (keysize_a != keysize_b) {
+      return keysize_a - keysize_b;
+   }
+   return memcmp(a, b, keysize_a);
 }
 
 
diff --git a/src/gallium/auxiliary/translate/translate_generic.c b/src/gallium/auxiliary/translate/translate_generic.c
index 266e7ee81e..24727d4988 100644
--- a/src/gallium/auxiliary/translate/translate_generic.c
+++ b/src/gallium/auxiliary/translate/translate_generic.c
@@ -46,9 +46,12 @@ struct translate_generic {
    struct translate translate;
 
    struct {
+      enum translate_element_type type;
+
       fetch_func fetch;
       unsigned buffer;
       unsigned input_offset;
+      unsigned instance_divisor;
 
       emit_func emit;
       unsigned output_offset;
@@ -568,6 +571,7 @@ static emit_func get_emit_func( enum pipe_format format )
 static void PIPE_CDECL generic_run_elts( struct translate *translate,
                                          const unsigned *elts,
                                          unsigned count,
+                                         unsigned instance_id,
                                          void *output_buffer )
 {
    struct translate_generic *tg = translate_generic(translate);
@@ -583,13 +587,20 @@ static void PIPE_CDECL generic_run_elts( struct translate *translate,
 
       for (attr = 0; attr < nr_attrs; attr++) {
 	 float data[4];
-
-	 const char *src = (tg->attrib[attr].input_ptr + 
-			    tg->attrib[attr].input_stride * elt);
+         const char *src;
 
 	 char *dst = (vert + 
 		      tg->attrib[attr].output_offset);
 
+         if (tg->attrib[attr].instance_divisor) {
+            src = tg->attrib[attr].input_ptr +
+                  tg->attrib[attr].input_stride *
+                  (instance_id / tg->attrib[attr].instance_divisor);
+         } else {
+            src = tg->attrib[attr].input_ptr +
+                  tg->attrib[attr].input_stride * elt;
+         }
+
 	 tg->attrib[attr].fetch( src, data );
 
          if (0) debug_printf("vert %d/%d attr %d: %f %f %f %f\n",
@@ -607,6 +618,7 @@ static void PIPE_CDECL generic_run_elts( struct translate *translate,
 static void PIPE_CDECL generic_run( struct translate *translate,
                                     unsigned start,
                                     unsigned count,
+                                    unsigned instance_id,
                                     void *output_buffer )
 {
    struct translate_generic *tg = translate_generic(translate);
@@ -623,13 +635,25 @@ static void PIPE_CDECL generic_run( struct translate *translate,
       for (attr = 0; attr < nr_attrs; attr++) {
 	 float data[4];
 
-	 const char *src = (tg->attrib[attr].input_ptr + 
-			    tg->attrib[attr].input_stride * elt);
-
 	 char *dst = (vert + 
 		      tg->attrib[attr].output_offset);
 
-	 tg->attrib[attr].fetch( src, data );
+         if (tg->attrib[attr].type == TRANSLATE_ELEMENT_NORMAL) {
+            const char *src;
+
+            if (tg->attrib[attr].instance_divisor) {
+               src = tg->attrib[attr].input_ptr +
+                     tg->attrib[attr].input_stride *
+                     (instance_id / tg->attrib[attr].instance_divisor);
+            } else {
+               src = tg->attrib[attr].input_ptr +
+                     tg->attrib[attr].input_stride * elt;
+            }
+
+            tg->attrib[attr].fetch( src, data );
+         } else {
+            data[0] = (float)instance_id;
+         }
 
          if (0) debug_printf("vert %d attr %d: %f %f %f %f\n",
                              i, attr, data[0], data[1], data[2], data[3]);
@@ -683,10 +707,12 @@ struct translate *translate_generic_create( const struct translate_key *key )
    tg->translate.run = generic_run;
 
    for (i = 0; i < key->nr_elements; i++) {
+      tg->attrib[i].type = key->element[i].type;
 
       tg->attrib[i].fetch = get_fetch_func(key->element[i].input_format);
       tg->attrib[i].buffer = key->element[i].input_buffer;
       tg->attrib[i].input_offset = key->element[i].input_offset;
+      tg->attrib[i].instance_divisor = key->element[i].instance_divisor;
 
       tg->attrib[i].emit = get_emit_func(key->element[i].output_format);
       tg->attrib[i].output_offset = key->element[i].output_offset;
diff --git a/src/gallium/auxiliary/translate/translate_sse.c b/src/gallium/auxiliary/translate/translate_sse.c
index b62db8d8f3..c13e742738 100644
--- a/src/gallium/auxiliary/translate/translate_sse.c
+++ b/src/gallium/auxiliary/translate/translate_sse.c
@@ -49,19 +49,29 @@
 typedef void (PIPE_CDECL *run_func)( struct translate *translate,
                                      unsigned start,
                                      unsigned count,
-                                     void *output_buffer );
+                                     unsigned instance_id,
+                                     void *output_buffer);
 
 typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate,
                                           const unsigned *elts,
                                           unsigned count,
-                                          void *output_buffer );
+                                          unsigned instance_id,
+                                          void *output_buffer);
 
 struct translate_buffer {
    const void *base_ptr;
    unsigned stride;
-   void *ptr;                   /* updated per vertex */
 };
 
+struct translate_buffer_varient {
+   unsigned buffer_index;
+   unsigned instance_divisor;
+   void *ptr;                    /* updated either per vertex or per instance */
+};
+
+
+#define ELEMENT_BUFFER_INSTANCE_ID  1001
+
 
 struct translate_sse {
    struct translate translate;
@@ -81,6 +91,16 @@ struct translate_sse {
    struct translate_buffer buffer[PIPE_MAX_ATTRIBS];
    unsigned nr_buffers;
 
+   /* Multiple buffer varients can map to a single buffer. */
+   struct translate_buffer_varient buffer_varient[PIPE_MAX_ATTRIBS];
+   unsigned nr_buffer_varients;
+
+   /* Multiple elements can map to a single buffer varient. */
+   unsigned element_to_buffer_varient[PIPE_MAX_ATTRIBS];
+
+   boolean use_instancing;
+   unsigned instance_id;
+
    run_func      gen_run;
    run_elts_func gen_run_elts;
 
@@ -359,32 +379,61 @@ static boolean init_inputs( struct translate_sse *p,
                             boolean linear )
 {
    unsigned i;
-   if (linear) {
-      for (i = 0; i < p->nr_buffers; i++) {
+   struct x86_reg instance_id = x86_make_disp(p->machine_EDX,
+                                              get_offset(p, &p->instance_id));
+
+   for (i = 0; i < p->nr_buffer_varients; i++) {
+      struct translate_buffer_varient *varient = &p->buffer_varient[i];
+      struct translate_buffer *buffer = &p->buffer[varient->buffer_index];
+
+      if (linear || varient->instance_divisor) {
          struct x86_reg buf_stride   = x86_make_disp(p->machine_EDX,
-                                                     get_offset(p, &p->buffer[i].stride));
+                                                     get_offset(p, &buffer->stride));
          struct x86_reg buf_ptr      = x86_make_disp(p->machine_EDX,
-                                                     get_offset(p, &p->buffer[i].ptr));
+                                                     get_offset(p, &varient->ptr));
          struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDX,
-                                                     get_offset(p, &p->buffer[i].base_ptr));
+                                                     get_offset(p, &buffer->base_ptr));
          struct x86_reg elt = p->idx_EBX;
-         struct x86_reg tmp = p->tmp_EAX;
-
+         struct x86_reg tmp_EAX = p->tmp_EAX;
 
          /* Calculate pointer to first attrib:
+          *   base_ptr + stride * index, where index depends on instance divisor
           */
-         x86_mov(p->func, tmp, buf_stride);
-         x86_imul(p->func, tmp, elt);
-         x86_add(p->func, tmp, buf_base_ptr);
+         if (varient->instance_divisor) {
+            /* Our index is instance ID divided by instance divisor.
+             */
+            x86_mov(p->func, tmp_EAX, instance_id);
+
+            if (varient->instance_divisor != 1) {
+               struct x86_reg tmp_EDX = p->machine_EDX;
+               struct x86_reg tmp_ECX = p->outbuf_ECX;
+
+               /* TODO: Add x86_shr() to rtasm and use it whenever
+                *       instance divisor is power of two.
+                */
+
+               x86_push(p->func, tmp_EDX);
+               x86_push(p->func, tmp_ECX);
+               x86_xor(p->func, tmp_EDX, tmp_EDX);
+               x86_mov_reg_imm(p->func, tmp_ECX, varient->instance_divisor);
+               x86_div(p->func, tmp_ECX);    /* EAX = EDX:EAX / ECX */
+               x86_pop(p->func, tmp_ECX);
+               x86_pop(p->func, tmp_EDX);
+            }
+         } else {
+            x86_mov(p->func, tmp_EAX, elt);
+         }
+         x86_imul(p->func, tmp_EAX, buf_stride);
+         x86_add(p->func, tmp_EAX, buf_base_ptr);
 
 
          /* In the linear case, keep the buffer pointer instead of the
           * index number.
           */
-         if (p->nr_buffers == 1) 
-            x86_mov( p->func, elt, tmp );
+         if (linear && p->nr_buffer_varients == 1)
+            x86_mov(p->func, elt, tmp_EAX);
          else
-            x86_mov( p->func, buf_ptr, tmp );
+            x86_mov(p->func, buf_ptr, tmp_EAX);
       }
    }
 
@@ -394,31 +443,36 @@ static boolean init_inputs( struct translate_sse *p,
 
 static struct x86_reg get_buffer_ptr( struct translate_sse *p,
                                       boolean linear,
-                                      unsigned buf_idx,
+                                      unsigned var_idx,
                                       struct x86_reg elt )
 {
-   if (linear && p->nr_buffers == 1) {
+   if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) {
+      return x86_make_disp(p->machine_EDX,
+                           get_offset(p, &p->instance_id));
+   }
+   if (linear && p->nr_buffer_varients == 1) {
       return p->idx_EBX;
    }
-   else if (linear) {
+   else if (linear || p->buffer_varient[var_idx].instance_divisor) {
       struct x86_reg ptr = p->tmp_EAX;
       struct x86_reg buf_ptr = 
          x86_make_disp(p->machine_EDX, 
-                       get_offset(p, &p->buffer[buf_idx].ptr));
+                       get_offset(p, &p->buffer_varient[var_idx].ptr));
       
       x86_mov(p->func, ptr, buf_ptr);
       return ptr;
    }
    else {
       struct x86_reg ptr = p->tmp_EAX;
+      const struct translate_buffer_varient *varient = &p->buffer_varient[var_idx];
 
       struct x86_reg buf_stride = 
          x86_make_disp(p->machine_EDX, 
-                       get_offset(p, &p->buffer[buf_idx].stride));
+                       get_offset(p, &p->buffer[varient->buffer_index].stride));
 
       struct x86_reg buf_base_ptr = 
          x86_make_disp(p->machine_EDX, 
-                       get_offset(p, &p->buffer[buf_idx].base_ptr));
+                       get_offset(p, &p->buffer[varient->buffer_index].base_ptr));
 
 
 
@@ -436,28 +490,33 @@ static struct x86_reg get_buffer_ptr( struct translate_sse *p,
 static boolean incr_inputs( struct translate_sse *p, 
                             boolean linear )
 {
-   if (linear && p->nr_buffers == 1) {
+   if (linear && p->nr_buffer_varients == 1) {
       struct x86_reg stride = x86_make_disp(p->machine_EDX,
                                             get_offset(p, &p->buffer[0].stride));
 
-      x86_add(p->func, p->idx_EBX, stride);
-      sse_prefetchnta(p->func, x86_make_disp(p->idx_EBX, 192));
+      if (p->buffer_varient[0].instance_divisor == 0) {
+         x86_add(p->func, p->idx_EBX, stride);
+         sse_prefetchnta(p->func, x86_make_disp(p->idx_EBX, 192));
+      }
    }
    else if (linear) {
       unsigned i;
 
       /* Is this worthwhile??
        */
-      for (i = 0; i < p->nr_buffers; i++) {
+      for (i = 0; i < p->nr_buffer_varients; i++) {
+         struct translate_buffer_varient *varient = &p->buffer_varient[i];
          struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX,
-                                                get_offset(p, &p->buffer[i].ptr));
+                                                get_offset(p, &varient->ptr));
          struct x86_reg buf_stride = x86_make_disp(p->machine_EDX,
-                                                   get_offset(p, &p->buffer[i].stride));
+                                                   get_offset(p, &p->buffer[varient->buffer_index].stride));
 
-         x86_mov(p->func, p->tmp_EAX, buf_ptr);
-         x86_add(p->func, p->tmp_EAX, buf_stride);
-         if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
-         x86_mov(p->func, buf_ptr, p->tmp_EAX);
+         if (varient->instance_divisor == 0) {
+            x86_mov(p->func, p->tmp_EAX, buf_ptr);
+            x86_add(p->func, p->tmp_EAX, buf_stride);
+            if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
+            x86_mov(p->func, buf_ptr, p->tmp_EAX);
+         }
       }
    } 
    else {
@@ -514,7 +573,18 @@ static boolean build_vertex_emit( struct translate_sse *p,
    x86_mov(p->func, p->machine_EDX, x86_fn_arg(p->func, 1));
    x86_mov(p->func, p->idx_EBX, x86_fn_arg(p->func, 2));
    x86_mov(p->func, p->count_ESI, x86_fn_arg(p->func, 3));
-   x86_mov(p->func, p->outbuf_ECX, x86_fn_arg(p->func, 4));
+   x86_mov(p->func, p->outbuf_ECX, x86_fn_arg(p->func, 5));
+
+   /* Load instance ID.
+    */
+   if (p->use_instancing) {
+      x86_mov(p->func,
+              p->tmp_EAX,
+              x86_fn_arg(p->func, 4));
+      x86_mov(p->func,
+              x86_make_disp(p->machine_EDX, get_offset(p, &p->instance_id)),
+              p->tmp_EAX);
+   }
 
    /* Get vertex count, compare to zero
     */
@@ -531,17 +601,18 @@ static boolean build_vertex_emit( struct translate_sse *p,
    label = x86_get_label(p->func);
    {
       struct x86_reg elt = linear ? p->idx_EBX : x86_deref(p->idx_EBX);
-      int last_vb = -1;
+      int last_varient = -1;
       struct x86_reg vb;
 
       for (j = 0; j < p->translate.key.nr_elements; j++) {
          const struct translate_element *a = &p->translate.key.element[j];
+         unsigned varient = p->element_to_buffer_varient[j];
 
          /* Figure out source pointer address:
           */
-         if (a->input_buffer != last_vb) {
-            last_vb = a->input_buffer;
-            vb = get_buffer_ptr(p, linear, a->input_buffer, elt);
+         if (varient != last_varient) {
+            last_varient = varient;
+            vb = get_buffer_ptr(p, linear, varient, elt);
          }
          
          if (!translate_attr( p, a, 
@@ -624,6 +695,7 @@ static void translate_sse_release( struct translate *translate )
 static void PIPE_CDECL translate_sse_run_elts( struct translate *translate,
 			      const unsigned *elts,
 			      unsigned count,
+                              unsigned instance_id,
 			      void *output_buffer )
 {
    struct translate_sse *p = (struct translate_sse *)translate;
@@ -631,12 +703,14 @@ static void PIPE_CDECL translate_sse_run_elts( struct translate *translate,
    p->gen_run_elts( translate,
 		    elts,
 		    count,
-		    output_buffer );
+                    instance_id,
+                    output_buffer);
 }
 
 static void PIPE_CDECL translate_sse_run( struct translate *translate,
 			 unsigned start,
 			 unsigned count,
+                         unsigned instance_id,
 			 void *output_buffer )
 {
    struct translate_sse *p = (struct translate_sse *)translate;
@@ -644,7 +718,8 @@ static void PIPE_CDECL translate_sse_run( struct translate *translate,
    p->gen_run( translate,
 	       start,
 	       count,
-	       output_buffer );
+               instance_id,
+               output_buffer);
 }
 
 
@@ -666,8 +741,37 @@ struct translate *translate_sse2_create( const struct translate_key *key )
    p->translate.run_elts = translate_sse_run_elts;
    p->translate.run = translate_sse_run;
 
-   for (i = 0; i < key->nr_elements; i++) 
-      p->nr_buffers = MAX2( p->nr_buffers, key->element[i].input_buffer + 1 );
+   for (i = 0; i < key->nr_elements; i++) {
+      if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) {
+         unsigned j;
+
+         p->nr_buffers = MAX2(p->nr_buffers, key->element[i].input_buffer + 1);
+
+         if (key->element[i].instance_divisor) {
+            p->use_instancing = TRUE;
+         }
+
+         /*
+          * Map vertex element to vertex buffer varient.
+          */
+         for (j = 0; j < p->nr_buffer_varients; j++) {
+            if (p->buffer_varient[j].buffer_index == key->element[i].input_buffer &&
+                p->buffer_varient[j].instance_divisor == key->element[i].instance_divisor) {
+               break;
+            }
+         }
+         if (j == p->nr_buffer_varients) {
+            p->buffer_varient[j].buffer_index = key->element[i].input_buffer;
+            p->buffer_varient[j].instance_divisor = key->element[i].instance_divisor;
+            p->nr_buffer_varients++;
+         }
+         p->element_to_buffer_varient[i] = j;
+      } else {
+         assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID);
+
+         p->element_to_buffer_varient[i] = ELEMENT_BUFFER_INSTANCE_ID;
+      }
+   }
 
    if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers);
 
diff --git a/src/gallium/auxiliary/util/u_blit.c b/src/gallium/auxiliary/util/u_blit.c
index 3f74e2aa8b..9725890bd4 100644
--- a/src/gallium/auxiliary/util/u_blit.c
+++ b/src/gallium/auxiliary/util/u_blit.c
@@ -262,6 +262,10 @@ regions_overlap(int srcX0, int srcY0,
  * Copy pixel block from src surface to dst surface.
  * Overlapping regions are acceptable.
  * Flipping and stretching are supported.
+ * \param filter  one of PIPE_TEX_MIPFILTER_NEAREST/LINEAR
+ * \param writemask  controls which channels in the dest surface are sourced
+ *                   from the src surface.  Disabled channels are sourced
+ *                   from (0,0,0,1).
  * XXX what about clipping???
  * XXX need some control over blitting Z and/or stencil.
  */
diff --git a/src/gallium/auxiliary/util/u_blitter.c b/src/gallium/auxiliary/util/u_blitter.c
index 46b4706b76..249a0375fc 100644
--- a/src/gallium/auxiliary/util/u_blitter.c
+++ b/src/gallium/auxiliary/util/u_blitter.c
@@ -379,9 +379,16 @@ static void blitter_set_texcoords_cube(struct blitter_context_priv *ctx,
    float t1 = y1 / (float)surf->height;
    float s2 = x2 / (float)surf->width;
    float t2 = y2 / (float)surf->height;
-   const float st[4][2] = {
-      {s1, t1}, {s2, t1}, {s2, t2}, {s1, t2}
-   };
+   float st[4][2];
+
+   st[0][0] = s1;
+   st[0][1] = t1;
+   st[1][0] = s2;
+   st[1][1] = t1;
+   st[2][0] = s2;
+   st[2][1] = t2;
+   st[3][0] = s1;
+   st[3][1] = t2;
 
    util_map_texcoords2d_onto_cubemap(surf->face,
                                      /* pointer, stride in floats */
diff --git a/src/gallium/auxiliary/util/u_draw_quad.c b/src/gallium/auxiliary/util/u_draw_quad.c
index 4110485fb1..e2e23c3cdd 100644
--- a/src/gallium/auxiliary/util/u_draw_quad.c
+++ b/src/gallium/auxiliary/util/u_draw_quad.c
@@ -61,6 +61,7 @@ util_draw_vertex_buffer(struct pipe_context *pipe,
    /* tell pipe about the vertex attributes */
    for (i = 0; i < num_attribs; i++) {
       velements[i].src_offset = i * 4 * sizeof(float);
+      velements[i].instance_divisor = 0;
       velements[i].vertex_buffer_index = 0;
       velements[i].src_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
       velements[i].nr_components = 4;
diff --git a/src/gallium/auxiliary/util/u_format.csv b/src/gallium/auxiliary/util/u_format.csv
index 9f16b42944..01f7931aed 100644
--- a/src/gallium/auxiliary/util/u_format.csv
+++ b/src/gallium/auxiliary/util/u_format.csv
@@ -62,10 +62,10 @@ PIPE_FORMAT_R16G16_SSCALED        , array , 1, 1, s16 , s16 ,     ,     , xy01,
 PIPE_FORMAT_R16G16B16_SSCALED     , array , 1, 1, s16 , s16 , s16 ,     , xyz1, rgb
 PIPE_FORMAT_R16G16B16A16_SSCALED  , array , 1, 1, s16 , s16 , s16 , s16 , xyzw, rgb
 PIPE_FORMAT_R8_UNORM              , array , 1, 1, un8 ,     ,     ,     , x001, rgb
-PIPE_FORMAT_R8G8_UNORM            , array , 1, 1, un8 , un8 ,     ,     , xy01, rgb
-PIPE_FORMAT_R8G8B8_UNORM          , array , 1, 1, un8 , un8 , un8 ,     , xyz1, rgb
-PIPE_FORMAT_R8G8B8A8_UNORM        , array , 1, 1, un8 , un8 , un8 , un8 , xyzw, rgb
-PIPE_FORMAT_R8G8B8X8_UNORM        , array , 1, 1, un8 , un8 , un8 , un8 , xyz1, rgb
+PIPE_FORMAT_R8G8_UNORM            , array , 1, 1, un8 , un8 ,     ,     , yx01, rgb
+PIPE_FORMAT_R8G8B8_UNORM          , array , 1, 1, un8 , un8 , un8 ,     , zyx1, rgb
+PIPE_FORMAT_R8G8B8A8_UNORM        , array , 1, 1, un8 , un8 , un8 , un8 , wzyx, rgb
+PIPE_FORMAT_R8G8B8X8_UNORM        , array , 1, 1, un8 , un8 , un8 , un8 , wzy1, rgb
 PIPE_FORMAT_R8_USCALED            , array , 1, 1, u8  ,     ,     ,     , x001, rgb
 PIPE_FORMAT_R8G8_USCALED          , array , 1, 1, u8  , u8  ,     ,     , xy01, rgb
 PIPE_FORMAT_R8G8B8_USCALED        , array , 1, 1, u8  , u8  , u8  ,     , xyz1, rgb
diff --git a/src/gallium/auxiliary/util/u_pack_color.h b/src/gallium/auxiliary/util/u_pack_color.h
index 43eb0153ee..0ab53c75dd 100644
--- a/src/gallium/auxiliary/util/u_pack_color.h
+++ b/src/gallium/auxiliary/util/u_pack_color.h
@@ -425,6 +425,8 @@ util_pack_z(enum pipe_format format, double z)
       if (z == 1.0)
          return 0xffffffff;
       return (uint) (z * 0xffffffff);
+   case PIPE_FORMAT_Z32_FLOAT:
+      return (uint)z;
    case PIPE_FORMAT_S8Z24_UNORM:
    case PIPE_FORMAT_X8Z24_UNORM:
       if (z == 1.0)
diff --git a/src/gallium/auxiliary/util/u_ringbuffer.c b/src/gallium/auxiliary/util/u_ringbuffer.c
new file mode 100644
index 0000000000..3f43a19e01
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_ringbuffer.c
@@ -0,0 +1,145 @@
+
+#include "pipe/p_thread.h"
+#include "pipe/p_defines.h"
+#include "util/u_ringbuffer.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+/* Generic ringbuffer: 
+ */
+struct util_ringbuffer 
+{
+   struct util_packet *buf;
+   unsigned mask;
+
+   /* Can this be done with atomic variables??
+    */
+   unsigned head;
+   unsigned tail;
+   pipe_condvar change;
+   pipe_mutex mutex;
+};
+
+
+struct util_ringbuffer *util_ringbuffer_create( unsigned dwords )
+{
+   struct util_ringbuffer *ring = CALLOC_STRUCT(util_ringbuffer);
+   if (ring == NULL)
+      return NULL;
+
+   assert(util_is_power_of_two(dwords));
+   
+   ring->buf = MALLOC( dwords * sizeof(unsigned) );
+   if (ring->buf == NULL)
+      goto fail;
+
+   ring->mask = dwords - 1;
+
+   pipe_condvar_init(ring->change);
+   pipe_mutex_init(ring->mutex);
+   return ring;
+
+fail:
+   FREE(ring->buf);
+   FREE(ring);
+   return NULL;
+}
+
+void util_ringbuffer_destroy( struct util_ringbuffer *ring )
+{
+   pipe_condvar_destroy(ring->change);
+   pipe_mutex_destroy(ring->mutex);
+   FREE(ring->buf);
+   FREE(ring);
+}
+
+static INLINE unsigned util_ringbuffer_space( const struct util_ringbuffer *ring )
+{
+   return (ring->tail - (ring->head + 1)) & ring->mask;
+}
+
+void util_ringbuffer_enqueue( struct util_ringbuffer *ring,
+                              const struct util_packet *packet )
+{
+   unsigned i;
+
+   /* XXX: over-reliance on mutexes, etc:
+    */
+   pipe_mutex_lock(ring->mutex);
+
+   /* Wait for free space:
+    */
+   while (util_ringbuffer_space(ring) < packet->dwords)
+      pipe_condvar_wait(ring->change, ring->mutex);
+
+   /* Copy data to ring:
+    */
+   for (i = 0; i < packet->dwords; i++) {
+
+      /* Copy all dwords of the packet.  Note we're abusing the
+       * typesystem a little - we're being passed a pointer to
+       * something, but probably not an array of packet structs:
+       */
+      ring->buf[ring->head] = packet[i];
+      ring->head++;
+      ring->head &= ring->mask;
+   }
+
+   /* Signal change:
+    */
+   pipe_condvar_signal(ring->change);
+   pipe_mutex_unlock(ring->mutex);
+}
+
+enum pipe_error util_ringbuffer_dequeue( struct util_ringbuffer *ring,
+                                         struct util_packet *packet,
+                                         unsigned max_dwords,
+                                         boolean wait )
+{
+   const struct util_packet *ring_packet;
+   unsigned i;
+   int ret = PIPE_OK;
+
+   /* XXX: over-reliance on mutexes, etc:
+    */
+   pipe_mutex_lock(ring->mutex);
+
+   /* Wait for free space:
+    */
+   if (wait) {
+      while (util_ringbuffer_space(ring) == 0)
+         pipe_condvar_wait(ring->change, ring->mutex);
+   }
+   else {
+      if (util_ringbuffer_space(ring) == 0) {
+         ret = PIPE_ERROR_OUT_OF_MEMORY;
+         goto out;
+      }
+   }
+
+   ring_packet = &ring->buf[ring->tail];
+
+   /* Both of these are considered bugs.  Raise an assert on debug builds.
+    */
+   if (ring_packet->dwords > ring->mask + 1 - util_ringbuffer_space(ring) ||
+       ring_packet->dwords > max_dwords) {
+      assert(0);
+      ret = PIPE_ERROR_BAD_INPUT;
+      goto out;
+   }
+
+   /* Copy data from ring:
+    */
+   for (i = 0; i < ring_packet->dwords; i++) {
+      packet[i] = ring->buf[ring->tail];
+      ring->tail++;
+      ring->tail &= ring->mask;
+   }
+
+out:
+   /* Signal change:
+    */
+   pipe_condvar_signal(ring->change);
+   pipe_mutex_unlock(ring->mutex);
+   return ret;
+}
diff --git a/src/gallium/auxiliary/util/u_ringbuffer.h b/src/gallium/auxiliary/util/u_ringbuffer.h
new file mode 100644
index 0000000000..85f0ad6c1f
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_ringbuffer.h
@@ -0,0 +1,29 @@
+
+#ifndef UTIL_RINGBUFFER_H
+#define UTIL_RINGBUFFER_H
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_defines.h"       /* only for pipe_error! */
+
+/* Generic header
+ */
+struct util_packet {
+   unsigned dwords:8;
+   unsigned data24:24;
+};
+
+struct util_ringbuffer;
+
+struct util_ringbuffer *util_ringbuffer_create( unsigned dwords );
+
+void util_ringbuffer_destroy( struct util_ringbuffer *ring );
+
+void util_ringbuffer_enqueue( struct util_ringbuffer *ring,
+                              const struct util_packet *packet );
+
+enum pipe_error util_ringbuffer_dequeue( struct util_ringbuffer *ring,
+                                         struct util_packet *packet,
+                                         unsigned max_dwords,
+                                         boolean wait );
+
+#endif
diff --git a/src/gallium/auxiliary/util/u_tile.c b/src/gallium/auxiliary/util/u_tile.c
index 1ba82bb21f..f9936eb1cb 100644
--- a/src/gallium/auxiliary/util/u_tile.c
+++ b/src/gallium/auxiliary/util/u_tile.c
@@ -1357,7 +1357,10 @@ pipe_put_tile_rgba(struct pipe_transfer *pt,
       /*z24s8_put_tile_rgba((unsigned *) packed, w, h, p, src_stride);*/
       break;
    default:
-      debug_printf("%s: unsupported format %s\n", __FUNCTION__, pf_name(format));
+      util_format_write_4f(format,
+                           p, src_stride * sizeof(float),
+                           packed, util_format_get_stride(format, w),
+                           0, 0, w, h);
    }
 
    pipe_put_tile_raw(pt, x, y, w, h, packed, 0);
diff --git a/src/gallium/auxiliary/vl/vl_compositor.c b/src/gallium/auxiliary/vl/vl_compositor.c
index fc2a1c59a6..a524e2fdfb 100644
--- a/src/gallium/auxiliary/vl/vl_compositor.c
+++ b/src/gallium/auxiliary/vl/vl_compositor.c
@@ -316,6 +316,7 @@ init_buffers(struct vl_compositor *c)
    pipe_buffer_unmap(c->pipe->screen, c->vertex_bufs[0].buffer);
 
    c->vertex_elems[0].src_offset = 0;
+   c->vertex_elems[0].instance_divisor = 0;
    c->vertex_elems[0].vertex_buffer_index = 0;
    c->vertex_elems[0].nr_components = 2;
    c->vertex_elems[0].src_format = PIPE_FORMAT_R32G32_FLOAT;
@@ -345,6 +346,7 @@ init_buffers(struct vl_compositor *c)
    pipe_buffer_unmap(c->pipe->screen, c->vertex_bufs[1].buffer);
 
    c->vertex_elems[1].src_offset = 0;
+   c->vertex_elems[1].instance_divisor = 0;
    c->vertex_elems[1].vertex_buffer_index = 1;
    c->vertex_elems[1].nr_components = 2;
    c->vertex_elems[1].src_format = PIPE_FORMAT_R32G32_FLOAT;
@@ -353,7 +355,7 @@ init_buffers(struct vl_compositor *c)
     * Create our vertex shader's constant buffer
     * Const buffer contains scaling and translation vectors
     */
-   c->vs_const_buf.buffer = pipe_buffer_create
+   c->vs_const_buf = pipe_buffer_create
    (
       c->pipe->screen,
       1,
@@ -365,7 +367,7 @@ init_buffers(struct vl_compositor *c)
     * Create our fragment shader's constant buffer
     * Const buffer contains the color conversion matrix and bias vectors
     */
-   c->fs_const_buf.buffer = pipe_buffer_create
+   c->fs_const_buf = pipe_buffer_create
    (
       c->pipe->screen,
       1,
@@ -390,8 +392,8 @@ cleanup_buffers(struct vl_compositor *c)
    for (i = 0; i < 2; ++i)
       pipe_buffer_reference(&c->vertex_bufs[i].buffer, NULL);
 
-   pipe_buffer_reference(&c->vs_const_buf.buffer, NULL);
-   pipe_buffer_reference(&c->fs_const_buf.buffer, NULL);
+   pipe_buffer_reference(&c->vs_const_buf, NULL);
+   pipe_buffer_reference(&c->fs_const_buf, NULL);
 }
 
 bool vl_compositor_init(struct vl_compositor *compositor, struct pipe_context *pipe)
@@ -483,13 +485,13 @@ void vl_compositor_render(struct vl_compositor          *compositor,
    compositor->pipe->bind_fs_state(compositor->pipe, compositor->fragment_shader);
    compositor->pipe->set_vertex_buffers(compositor->pipe, 2, compositor->vertex_bufs);
    compositor->pipe->set_vertex_elements(compositor->pipe, 2, compositor->vertex_elems);
-   compositor->pipe->set_constant_buffer(compositor->pipe, PIPE_SHADER_VERTEX, 0, &compositor->vs_const_buf);
-   compositor->pipe->set_constant_buffer(compositor->pipe, PIPE_SHADER_FRAGMENT, 0, &compositor->fs_const_buf);
+   compositor->pipe->set_constant_buffer(compositor->pipe, PIPE_SHADER_VERTEX, 0, compositor->vs_const_buf);
+   compositor->pipe->set_constant_buffer(compositor->pipe, PIPE_SHADER_FRAGMENT, 0, compositor->fs_const_buf);
 
    vs_consts = pipe_buffer_map
    (
       compositor->pipe->screen,
-      compositor->vs_const_buf.buffer,
+      compositor->vs_const_buf,
       PIPE_BUFFER_USAGE_CPU_WRITE | PIPE_BUFFER_USAGE_DISCARD
    );
 
@@ -511,7 +513,7 @@ void vl_compositor_render(struct vl_compositor          *compositor,
    vs_consts->src_trans.z = 0;
    vs_consts->src_trans.w = 0;
 
-   pipe_buffer_unmap(compositor->pipe->screen, compositor->vs_const_buf.buffer);
+   pipe_buffer_unmap(compositor->pipe->screen, compositor->vs_const_buf);
 
    compositor->pipe->draw_arrays(compositor->pipe, PIPE_PRIM_TRIANGLE_STRIP, 0, 4);
    compositor->pipe->flush(compositor->pipe, PIPE_FLUSH_RENDER_CACHE, fence);
@@ -525,10 +527,10 @@ void vl_compositor_set_csc_matrix(struct vl_compositor *compositor, const float
 
    memcpy
    (
-      pipe_buffer_map(compositor->pipe->screen, compositor->fs_const_buf.buffer, PIPE_BUFFER_USAGE_CPU_WRITE),
+      pipe_buffer_map(compositor->pipe->screen, compositor->fs_const_buf, PIPE_BUFFER_USAGE_CPU_WRITE),
       mat,
       sizeof(struct fragment_shader_consts)
    );
 
-   pipe_buffer_unmap(compositor->pipe->screen, compositor->fs_const_buf.buffer);
+   pipe_buffer_unmap(compositor->pipe->screen, compositor->fs_const_buf);
 }
diff --git a/src/gallium/auxiliary/vl/vl_compositor.h b/src/gallium/auxiliary/vl/vl_compositor.h
index f441901a75..6a9a3fd7af 100644
--- a/src/gallium/auxiliary/vl/vl_compositor.h
+++ b/src/gallium/auxiliary/vl/vl_compositor.h
@@ -47,7 +47,7 @@ struct vl_compositor
    struct pipe_scissor_state scissor;
    struct pipe_vertex_buffer vertex_bufs[2];
    struct pipe_vertex_element vertex_elems[2];
-   struct pipe_constant_buffer vs_const_buf, fs_const_buf;
+   struct pipe_buffer *vs_const_buf, *fs_const_buf;
 };
 
 bool vl_compositor_init(struct vl_compositor *compositor, struct pipe_context *pipe);
diff --git a/src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.c b/src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.c
index caf581aca6..e43187545c 100644
--- a/src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.c
+++ b/src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.c
@@ -891,53 +891,61 @@ init_buffers(struct vl_mpeg12_mc_renderer *r)
 
    /* Position element */
    r->vertex_elems[0].src_offset = 0;
+   r->vertex_elems[0].instance_divisor = 0;
    r->vertex_elems[0].vertex_buffer_index = 0;
    r->vertex_elems[0].nr_components = 2;
    r->vertex_elems[0].src_format = PIPE_FORMAT_R32G32_FLOAT;
 
    /* Luma, texcoord element */
    r->vertex_elems[1].src_offset = sizeof(struct vertex2f);
+   r->vertex_elems[1].instance_divisor = 0;
    r->vertex_elems[1].vertex_buffer_index = 0;
    r->vertex_elems[1].nr_components = 2;
    r->vertex_elems[1].src_format = PIPE_FORMAT_R32G32_FLOAT;
 
    /* Chroma Cr texcoord element */
    r->vertex_elems[2].src_offset = sizeof(struct vertex2f) * 2;
+   r->vertex_elems[2].instance_divisor = 0;
    r->vertex_elems[2].vertex_buffer_index = 0;
    r->vertex_elems[2].nr_components = 2;
    r->vertex_elems[2].src_format = PIPE_FORMAT_R32G32_FLOAT;
 
    /* Chroma Cb texcoord element */
    r->vertex_elems[3].src_offset = sizeof(struct vertex2f) * 3;
+   r->vertex_elems[3].instance_divisor = 0;
    r->vertex_elems[3].vertex_buffer_index = 0;
    r->vertex_elems[3].nr_components = 2;
    r->vertex_elems[3].src_format = PIPE_FORMAT_R32G32_FLOAT;
 
    /* First ref surface top field texcoord element */
    r->vertex_elems[4].src_offset = 0;
+   r->vertex_elems[4].instance_divisor = 0;
    r->vertex_elems[4].vertex_buffer_index = 1;
    r->vertex_elems[4].nr_components = 2;
    r->vertex_elems[4].src_format = PIPE_FORMAT_R32G32_FLOAT;
 
    /* First ref surface bottom field texcoord element */
    r->vertex_elems[5].src_offset = sizeof(struct vertex2f);
+   r->vertex_elems[5].instance_divisor = 0;
    r->vertex_elems[5].vertex_buffer_index = 1;
    r->vertex_elems[5].nr_components = 2;
    r->vertex_elems[5].src_format = PIPE_FORMAT_R32G32_FLOAT;
 
    /* Second ref surface top field texcoord element */
    r->vertex_elems[6].src_offset = 0;
+   r->vertex_elems[6].instance_divisor = 0;
    r->vertex_elems[6].vertex_buffer_index = 2;
    r->vertex_elems[6].nr_components = 2;
    r->vertex_elems[6].src_format = PIPE_FORMAT_R32G32_FLOAT;
 
    /* Second ref surface bottom field texcoord element */
    r->vertex_elems[7].src_offset = sizeof(struct vertex2f);
+   r->vertex_elems[7].instance_divisor = 0;
    r->vertex_elems[7].vertex_buffer_index = 2;
    r->vertex_elems[7].nr_components = 2;
    r->vertex_elems[7].src_format = PIPE_FORMAT_R32G32_FLOAT;
 
-   r->vs_const_buf.buffer = pipe_buffer_create
+   r->vs_const_buf = pipe_buffer_create
    (
       r->pipe->screen,
       DEFAULT_BUF_ALIGNMENT,
@@ -945,7 +953,7 @@ init_buffers(struct vl_mpeg12_mc_renderer *r)
       sizeof(struct vertex_shader_consts)
    );
 
-   r->fs_const_buf.buffer = pipe_buffer_create
+   r->fs_const_buf = pipe_buffer_create
    (
       r->pipe->screen,
       DEFAULT_BUF_ALIGNMENT,
@@ -954,11 +962,11 @@ init_buffers(struct vl_mpeg12_mc_renderer *r)
 
    memcpy
    (
-      pipe_buffer_map(r->pipe->screen, r->fs_const_buf.buffer, PIPE_BUFFER_USAGE_CPU_WRITE),
+      pipe_buffer_map(r->pipe->screen, r->fs_const_buf, PIPE_BUFFER_USAGE_CPU_WRITE),
       &fs_consts, sizeof(struct fragment_shader_consts)
    );
 
-   pipe_buffer_unmap(r->pipe->screen, r->fs_const_buf.buffer);
+   pipe_buffer_unmap(r->pipe->screen, r->fs_const_buf);
 
    return true;
 }
@@ -970,8 +978,8 @@ cleanup_buffers(struct vl_mpeg12_mc_renderer *r)
 
    assert(r);
 
-   pipe_buffer_reference(&r->vs_const_buf.buffer, NULL);
-   pipe_buffer_reference(&r->fs_const_buf.buffer, NULL);
+   pipe_buffer_reference(&r->vs_const_buf, NULL);
+   pipe_buffer_reference(&r->fs_const_buf, NULL);
 
    for (i = 0; i < 3; ++i)
       pipe_buffer_reference(&r->vertex_bufs.all[i].buffer, NULL);
@@ -1284,19 +1292,19 @@ flush(struct vl_mpeg12_mc_renderer *r)
 
    vs_consts = pipe_buffer_map
    (
-      r->pipe->screen, r->vs_const_buf.buffer,
+      r->pipe->screen, r->vs_const_buf,
       PIPE_BUFFER_USAGE_CPU_WRITE | PIPE_BUFFER_USAGE_DISCARD
    );
 
    vs_consts->denorm.x = r->surface->width0;
    vs_consts->denorm.y = r->surface->height0;
 
-   pipe_buffer_unmap(r->pipe->screen, r->vs_const_buf.buffer);
+   pipe_buffer_unmap(r->pipe->screen, r->vs_const_buf);
 
    r->pipe->set_constant_buffer(r->pipe, PIPE_SHADER_VERTEX, 0,
-                                &r->vs_const_buf);
+                                r->vs_const_buf);
    r->pipe->set_constant_buffer(r->pipe, PIPE_SHADER_FRAGMENT, 0,
-                                &r->fs_const_buf);
+                                r->fs_const_buf);
 
    if (num_macroblocks[MACROBLOCK_TYPE_INTRA] > 0) {
       r->pipe->set_vertex_buffers(r->pipe, 1, r->vertex_bufs.all);
diff --git a/src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.h b/src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.h
index 64184337a0..f00b8c7b8b 100644
--- a/src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.h
+++ b/src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.h
@@ -63,8 +63,8 @@ struct vl_mpeg12_mc_renderer
 
    struct pipe_viewport_state viewport;
    struct pipe_scissor_state scissor;
-   struct pipe_constant_buffer vs_const_buf;
-   struct pipe_constant_buffer fs_const_buf;
+   struct pipe_buffer *vs_const_buf;
+   struct pipe_buffer *fs_const_buf;
    struct pipe_framebuffer_state fb_state;
    struct pipe_vertex_element vertex_elems[8];
 	
diff --git a/src/gallium/docs/source/context.rst b/src/gallium/docs/source/context.rst
index 21f5f9111a..d394f5b4f1 100644
--- a/src/gallium/docs/source/context.rst
+++ b/src/gallium/docs/source/context.rst
@@ -33,7 +33,11 @@ This state describes how resources in various flavours (textures,
 buffers, surfaces) are bound to the driver.
 
 
-* ``set_constant_buffer``
+* ``set_constant_buffer`` sets a constant buffer to be used for a given shader
+  type. index is used to indicate which buffer to set (some apis may allow
+  multiple ones to be set, and binding a specific one later, though drivers
+  are mostly restricted to the first one right now).
+
 * ``set_framebuffer_state``
 * ``set_fragment_sampler_textures``
 * ``set_vertex_sampler_textures``
@@ -47,7 +51,6 @@ These pieces of state are too small, variable, and/or trivial to have CSO
 objects. They all follow simple, one-method binding calls, e.g.
 ``set_edgeflags``.
 
-* ``set_edgeflags``
 * ``set_blend_color``
 * ``set_clip_state``
 * ``set_polygon_stipple``
@@ -72,12 +75,67 @@ stencil-only clears of packed depth-stencil buffers.
 Drawing
 ^^^^^^^
 
-``draw_arrays``
+``draw_arrays`` draws a specified primitive.
+
+This command is equivalent to calling ``draw_arrays_instanced``
+with ``startInstance`` set to 0 and ``instanceCount`` set to 1.
+
+``draw_elements`` draws a specified primitive using an optional
+index buffer.
 
-``draw_elements``
+This command is equivalent to calling ``draw_elements_instanced``
+with ``startInstance`` set to 0 and ``instanceCount`` set to 1.
 
 ``draw_range_elements``
 
+XXX: this is (probably) a temporary entrypoint, as the range
+information should be available from the vertex_buffer state.
+Using this to quickly evaluate a specialized path in the draw
+module.
+
+``draw_arrays_instanced`` draws multiple instances of the same primitive.
+
+This command is equivalent to calling ``draw_elements_instanced``
+with ``indexBuffer`` set to NULL and ``indexSize`` set to 0.
+
+``draw_elements_instanced`` draws multiple instances of the same primitive
+using an optional index buffer.
+
+For instanceID in the range between ``startInstance``
+and ``startInstance``+``instanceCount``-1, inclusive, draw a primitive
+specified by ``mode`` and sequential numbers in the range between ``start``
+and ``start``+``count``-1, inclusive.
+
+If ``indexBuffer`` is not NULL, it specifies an index buffer with index
+byte size of ``indexSize``. The sequential numbers are used to lookup
+the index buffer and the resulting indices in turn are used to fetch
+vertex attributes.
+
+If ``indexBuffer`` is NULL, the sequential numbers are used directly
+as indices to fetch vertex attributes.
+
+If a given vertex element has ``instance_divisor`` set to 0, it is said
+it contains per-vertex data and effective vertex attribute address needs
+to be recalculated for every index.
+
+  attribAddr = ``stride`` * index + ``src_offset``
+
+If a given vertex element has ``instance_divisor`` set to non-zero,
+it is said it contains per-instance data and effective vertex attribute
+address needs to recalculated for every ``instance_divisor``-th instance.
+
+  attribAddr = ``stride`` * instanceID / ``instance_divisor`` + ``src_offset``
+
+In the above formulas, ``src_offset`` is taken from the given vertex element
+and ``stride`` is taken from a vertex buffer associated with the given
+vertex element.
+
+The calculated attribAddr is used as an offset into the vertex buffer to
+fetch the attribute data.
+
+The value of ``instanceID`` can be read in a vertex shader through a system
+value register declared with INSTANCEID semantic name.
+
 
 Queries
 ^^^^^^^
diff --git a/src/gallium/docs/source/cso/rasterizer.rst b/src/gallium/docs/source/cso/rasterizer.rst
index 4d8e1708e7..bfa4a1170a 100644
--- a/src/gallium/docs/source/cso/rasterizer.rst
+++ b/src/gallium/docs/source/cso/rasterizer.rst
@@ -7,32 +7,69 @@ The rasterizer state controls the rendering of points, lines and triangles.
 Attributes include polygon culling state, line width, line stipple,
 multisample state, scissoring and flat/smooth shading.
 
-
 Members
 -------
 
+bypass_vs_clip_and_viewport
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Whether the entire TCL pipeline should be bypassed. This implies that
+vertices are pre-transformed for the viewport, and will not be run
+through the vertex shader.
+
+.. note::
+   
+   Implementations may still clip away vertices that are not in the viewport
+   when this is set.
+
 flatshade
-    If set, the provoking vertex of each polygon is used to determine the
-    color of the entire polygon.  If not set, fragment colors will be
-    interpolated between the vertex colors.
-    Note that this is separate from the fragment shader input attributes
-    CONSTANT, LINEAR and PERSPECTIVE.  We need the flatshade state at
+^^^^^^^^^
+
+If set, the provoking vertex of each polygon is used to determine the color
+of the entire polygon.  If not set, fragment colors will be interpolated
+between the vertex colors.
+
+The actual interpolated shading algorithm is obviously
+implementation-dependent, but will usually be Gourard for most hardware.
+
+.. note::
+
+    This is separate from the fragment shader input attributes
+    CONSTANT, LINEAR and PERSPECTIVE. The flatshade state is needed at
     clipping time to determine how to set the color of new vertices.
-    Also note that the draw module can implement flat shading by copying
-    the provoking vertex color to all the other vertices in the primitive.
+
+    :ref:`Draw` can implement flat shading by copying the provoking vertex
+    color to all the other vertices in the primitive.
 
 flatshade_first
-    Whether the first vertex should be the provoking vertex, for most
-    primitives. If not set, the last vertex is the provoking vertex.
+^^^^^^^^^^^^^^^
+
+Whether the first vertex should be the provoking vertex, for most primitives.
+If not set, the last vertex is the provoking vertex.
+
+There are several important exceptions to the specification of this rule.
+
+* ``PIPE_PRIMITIVE_POLYGON``: The provoking vertex is always the first
+  vertex. If the caller wishes to change the provoking vertex, they merely
+  need to rotate the vertices themselves.
+* ``PIPE_PRIMITIVE_QUAD``, ``PIPE_PRIMITIVE_QUAD_STRIP``: This option has no
+  effect; the provoking vertex is always the last vertex.
+* ``PIPE_PRIMITIVE_TRIANGLE_FAN``: When set, the provoking vertex is the
+  second vertex, not the first. This permits each segment of the fan to have
+  a different color.
+
+Other Members
+^^^^^^^^^^^^^
 
 light_twoside
-    If set, there are per-vertex back-facing colors.  The draw module
+    If set, there are per-vertex back-facing colors. :ref:`Draw`
     uses this state along with the front/back information to set the
     final vertex colors prior to rasterization.
 
 front_winding
     Indicates the window order of front-facing polygons, either
     PIPE_WINDING_CW or PIPE_WINDING_CCW
+
 cull_mode
     Indicates which polygons to cull, either PIPE_WINDING_NONE (cull no
     polygons), PIPE_WINDING_CW (cull clockwise-winding polygons),
@@ -68,7 +105,7 @@ line_stipple_enable
 line_stipple_pattern
     16-bit bitfield of on/off flags, used to pattern the line stipple.
 line_stipple_factor
-    When drawinga stippled line, each bit in the stipple pattern is
+    When drawing a stippled line, each bit in the stipple pattern is
     repeated N times, where N = line_stipple_factor + 1.
 line_last_pixel
     Controls whether the last pixel in a line is drawn or not.  OpenGL
@@ -96,7 +133,7 @@ sprite_coord_mode
     lower left vertex will have coordinate (0,0,0,1).
     For PIPE_SPRITE_COORD_UPPER_LEFT, the upper-left vertex will have
     coordinate (0,0,0,1).
-    This state is needed by the 'draw' module because that's where each
+    This state is needed by :ref:`Draw` because that's where each
     point vertex is converted into four quad vertices.  There's no other
     place to emit the new vertex texture coordinates which are required for
     sprite rendering.
@@ -108,45 +145,9 @@ scissor
     Whether the scissor test is enabled.
 
 multisample
-    Whether :ref:`MSAA` is enabled.
-
-bypass_vs_clip_and_viewport
-    Whether the entire TCL pipeline should be bypassed. This implies that
-    vertices are pre-transformed for the viewport, and will not be run
-    through the vertex shader. Note that implementations may still clip away
-    vertices that are not in the viewport.
+    Whether :term:`MSAA` is enabled.
 
 gl_rasterization_rules
     Whether the rasterizer should use (0.5, 0.5) pixel centers. When not set,
     the rasterizer will use (0, 0) for pixel centers.
 
-
-Notes
------
-
-flatshade
-^^^^^^^^^
-
-The actual interpolated shading algorithm is obviously
-implementation-dependent, but will usually be Gourard for most hardware.
-
-bypass_vs_clip_and_viewport
-^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-When set, this implies that vertices are pre-transformed for the viewport, and
-will not be run through the vertex shader. Note that implementations may still
-clip away vertices that are not visible.
-
-flatshade_first
-^^^^^^^^^^^^^^^
-
-There are several important exceptions to the specification of this rule.
-
-* ``PIPE_PRIMITIVE_POLYGON``: The provoking vertex is always the first
-  vertex. If the caller wishes to change the provoking vertex, they merely
-  need to rotate the vertices themselves.
-* ``PIPE_PRIMITIVE_QUAD``, ``PIPE_PRIMITIVE_QUAD_STRIP``: This option has no
-  effect; the provoking vertex is always the last vertex.
-* ``PIPE_PRIMITIVE_TRIANGLE_FAN``: When set, the provoking vertex is the
-  second vertex, not the first. This permits each segment of the fan to have
-  a different color.
diff --git a/src/gallium/docs/source/distro.rst b/src/gallium/docs/source/distro.rst
index 33e846e33d..0ef9fe2645 100644
--- a/src/gallium/docs/source/distro.rst
+++ b/src/gallium/docs/source/distro.rst
@@ -61,10 +61,7 @@ VMWare SVGA
 ATI r300
 ^^^^^^^^
 
-AMD/ATI r600
-^^^^^^^^^^^^
-
-Highly experimental.
+Testing-quality.
 
 Softpipe
 ^^^^^^^^
@@ -109,17 +106,31 @@ Auxiliary
 CSO Cache
 ^^^^^^^^^
 
+The CSO cache is used to accelerate preparation of state by saving
+driver-specific state structures for later use.
+
+.. _draw:
+
 Draw
 ^^^^
 
+Draw is a software :term:`TCL` pipeline for hardware that lacks vertex shaders
+or other essential parts of pre-rasterization vertex preparation.
+
 Gallivm
 ^^^^^^^
 
 Indices
 ^^^^^^^
 
-Pipe Buffer Manager
-^^^^^^^^^^^^^^^^^^^
+Indices provides tools for translating or generating element indices for
+use with element-based rendering.
+
+Pipe Buffer Managers
+^^^^^^^^^^^^^^^^^^^^
+
+Each of these managers provides various services to drivers that are not
+fully utilizing a memory manager.
 
 Remote Debugger
 ^^^^^^^^^^^^^^^
@@ -127,12 +138,12 @@ Remote Debugger
 Runtime Assembly Emission
 ^^^^^^^^^^^^^^^^^^^^^^^^^
 
-Surface Context Tracker
-^^^^^^^^^^^^^^^^^^^^^^^
-
 TGSI
 ^^^^
 
+The TGSI auxiliary module provides basic utilities for manipulating TGSI
+streams.
+
 Translate
 ^^^^^^^^^
 
diff --git a/src/gallium/docs/source/glossary.rst b/src/gallium/docs/source/glossary.rst
index 6a9110ce78..aec89f8b5c 100644
--- a/src/gallium/docs/source/glossary.rst
+++ b/src/gallium/docs/source/glossary.rst
@@ -8,3 +8,8 @@ Glossary
       Multi-Sampled Anti-Aliasing. A basic anti-aliasing technique that takes
       multiple samples of the depth buffer, and uses this information to
       smooth the edges of polygons.
+
+   TCL
+      Transform, Clipping, & Lighting. The three stages of preparation in a
+      rasterizing pipeline prior to the actual rasterization of vertices into
+      fragments.
diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst
index 9631e6967e..72bb75a55d 100644
--- a/src/gallium/docs/source/screen.rst
+++ b/src/gallium/docs/source/screen.rst
@@ -3,6 +3,28 @@ Screen
 
 A screen is an object representing the context-independent part of a device.
 
+Useful Flags
+------------
+
+.. _pipe_texture_usage:
+
+PIPE_TEXTURE_USAGE
+^^^^^^^^^^^^^^^^^^
+
+These flags determine the possible roles a texture may be used for during its
+lifetime. Texture usage flags are cumulative and may be combined to create a
+texture that can be used as multiple things.
+
+* ``RENDER_TARGET``: A colorbuffer or pixelbuffer.
+* ``DISPLAY_TARGET``: A sharable buffer that can be given to another process.
+* ``PRIMARY``: A frontbuffer or scanout buffer.
+* ``DEPTH_STENCIL``: A depthbuffer, stencilbuffer, or Z buffer. Gallium does
+  not explicitly provide for stencil-only buffers, so any stencilbuffer
+  validated here is implicitly also a depthbuffer.
+* ``SAMPLER``: A texture that may be sampled from in a fragment or vertex
+  shader.
+* ``DYNAMIC``: A texture that will be mapped frequently.
+
 Methods
 -------
 
@@ -33,6 +55,14 @@ is_format_supported
 
 See if a format can be used in a specific manner.
 
+**usage** is a bitmask of :ref:`PIPE_TEXTURE_USAGE` flags.
+
+Returns TRUE if all usages can be satisfied.
+
+.. note::
+
+   ``PIPE_TEXTURE_USAGE_DYNAMIC`` is not a valid usage.
+
 texture_create
 ^^^^^^^^^^^^^^
 
diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst
index ebee4902b0..65a669d8cf 100644
--- a/src/gallium/docs/source/tgsi.rst
+++ b/src/gallium/docs/source/tgsi.rst
@@ -516,8 +516,11 @@ SEQ - Set On Equal
 .. math::
 
   dst.x = (src0.x == src1.x) ? 1 : 0
+
   dst.y = (src0.y == src1.y) ? 1 : 0
+
   dst.z = (src0.z == src1.z) ? 1 : 0
+
   dst.w = (src0.w == src1.w) ? 1 : 0
 
 
@@ -526,8 +529,11 @@ SFL - Set On False
 .. math::
 
   dst.x = 0
+
   dst.y = 0
+
   dst.z = 0
+
   dst.w = 0
 
 Considered for removal.
@@ -537,8 +543,11 @@ SGT - Set On Greater Than
 .. math::
 
   dst.x = (src0.x > src1.x) ? 1 : 0
+
   dst.y = (src0.y > src1.y) ? 1 : 0
+
   dst.z = (src0.z > src1.z) ? 1 : 0
+
   dst.w = (src0.w > src1.w) ? 1 : 0
 
 
@@ -560,8 +569,11 @@ SLE - Set On Less Equal Than
 .. math::
 
   dst.x = (src0.x <= src1.x) ? 1 : 0
+
   dst.y = (src0.y <= src1.y) ? 1 : 0
+
   dst.z = (src0.z <= src1.z) ? 1 : 0
+
   dst.w = (src0.w <= src1.w) ? 1 : 0
 
 
@@ -570,8 +582,11 @@ SNE - Set On Not Equal
 .. math::
 
   dst.x = (src0.x != src1.x) ? 1 : 0
+
   dst.y = (src0.y != src1.y) ? 1 : 0
+
   dst.z = (src0.z != src1.z) ? 1 : 0
+
   dst.w = (src0.w != src1.w) ? 1 : 0
 
 
@@ -580,8 +595,11 @@ STR - Set On True
 .. math::
 
   dst.x = 1
+
   dst.y = 1
+
   dst.z = 1
+
   dst.w = 1
 
 
@@ -629,8 +647,11 @@ X2D - 2D Coordinate Transformation
 .. math::
 
   dst.x = src0.x + src1.x \times src2.x + src1.y \times src2.y
+
   dst.y = src0.y + src1.x \times src2.z + src1.y \times src2.w
+
   dst.z = src0.x + src1.x \times src2.x + src1.y \times src2.y
+
   dst.w = src0.y + src1.x \times src2.z + src1.y \times src2.w
 
 Considered for removal.
@@ -979,13 +1000,13 @@ XOR - Bitwise Xor
 
 .. math::
 
-  dst.x = src0.x ^ src1.x
+  dst.x = src0.x \oplus src1.x
 
-  dst.y = src0.y ^ src1.y
+  dst.y = src0.y \oplus src1.y
 
-  dst.z = src0.z ^ src1.z
+  dst.z = src0.z \oplus src1.z
 
-  dst.w = src0.w ^ src1.w
+  dst.w = src0.w \oplus src1.w
 
 
 SAD - Sum Of Absolute Differences
diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index d5f5c7bbba..aa29dcb394 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -358,6 +358,7 @@ struct cell_spu_function_info
 
 
 /** This is the object passed to spe_create_thread() */
+PIPE_ALIGN_TYPE(16,
 struct cell_init_info
 {
    unsigned id;
@@ -370,7 +371,7 @@ struct cell_init_info
    uint *buffer_status;  /**< points at cell_context->buffer_status */
 
    struct cell_spu_function_info *spu_functions;
-} ALIGN16_ATTRIB;
+});
 
 
 #endif /* CELL_COMMON_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index 5c3188e7f9..e402ed2922 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -89,7 +89,7 @@ struct cell_buffer_node;
  */
 struct cell_buffer_list
 {
-   struct cell_fence fence ALIGN16_ATTRIB;
+   PIPE_ALIGN_VAR(16) struct cell_fence fence;
    struct cell_buffer_node *head;
 };
 
@@ -115,7 +115,7 @@ struct cell_context
 
    struct pipe_blend_color blend_color;
    struct pipe_clip_state clip;
-   struct pipe_constant_buffer constants[2];
+   struct pipe_buffer *constants[2];
    struct pipe_framebuffer_state framebuffer;
    struct pipe_poly_stipple poly_stipple;
    struct pipe_scissor_state scissor;
@@ -150,18 +150,18 @@ struct cell_context
    /** Mapped constant buffers */
    void *mapped_constants[PIPE_SHADER_TYPES];
 
-   struct cell_spu_function_info spu_functions ALIGN16_ATTRIB;
+   PIPE_ALIGN_VAR(16) struct cell_spu_function_info spu_functions;
 
    uint num_cells, num_spus;
 
    /** Buffers for command batches, vertex/index data */
    uint buffer_size[CELL_NUM_BUFFERS];
-   ubyte buffer[CELL_NUM_BUFFERS][CELL_BUFFER_SIZE] ALIGN16_ATTRIB;
+   PIPE_ALIGN_VAR(16) ubyte buffer[CELL_NUM_BUFFERS][CELL_BUFFER_SIZE];
 
    int cur_batch;  /**< which buffer is being filled w/ commands */
 
    /** [4] to ensure 16-byte alignment for each status word */
-   uint buffer_status[CELL_MAX_SPUS][CELL_NUM_BUFFERS][4] ALIGN16_ATTRIB;
+   PIPE_ALIGN_VAR(16) uint buffer_status[CELL_MAX_SPUS][CELL_NUM_BUFFERS][4];
 
 
    /** Associated with each command/batch buffer is a list of pipe_buffers
diff --git a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
index 3fa8b975d3..0a4da8ecc8 100644
--- a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
+++ b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
@@ -51,17 +51,17 @@ cell_map_constant_buffers(struct cell_context *sp)
    struct pipe_winsys *ws = sp->pipe.winsys;
    uint i;
    for (i = 0; i < 2; i++) {
-      if (sp->constants[i].buffer && sp->constants[i].buffer->size) {
-         sp->mapped_constants[i] = ws->buffer_map(ws, sp->constants[i].buffer,
+      if (sp->constants[i] && sp->constants[i]->size) {
+         sp->mapped_constants[i] = ws->buffer_map(ws, sp->constants[i],
                                                    PIPE_BUFFER_USAGE_CPU_READ);
          cell_flush_buffer_range(sp, sp->mapped_constants[i], 
-                                 sp->constants[i].buffer->size);
+                                 sp->constants[i]->size);
       }
    }
 
    draw_set_mapped_constant_buffer(sp->draw, PIPE_SHADER_VERTEX,
                                    sp->mapped_constants[PIPE_SHADER_VERTEX],
-                                   sp->constants[PIPE_SHADER_VERTEX].buffer->size);
+                                   sp->constants[PIPE_SHADER_VERTEX]->size);
 }
 
 static void
@@ -70,8 +70,8 @@ cell_unmap_constant_buffers(struct cell_context *sp)
    struct pipe_winsys *ws = sp->pipe.winsys;
    uint i;
    for (i = 0; i < 2; i++) {
-      if (sp->constants[i].buffer && sp->constants[i].buffer->size)
-         ws->buffer_unmap(ws, sp->constants[i].buffer);
+      if (sp->constants[i] && sp->constants[i]->size)
+         ws->buffer_unmap(ws, sp->constants[i]);
       sp->mapped_constants[i] = NULL;
    }
 }
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 5b87286d4c..f1e1dcb9eb 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -240,12 +240,12 @@ cell_emit_state(struct cell_context *cell)
 
    if (cell->dirty & (CELL_NEW_FS_CONSTANTS)) {
       const uint shader = PIPE_SHADER_FRAGMENT;
-      const uint num_const = cell->constants[shader].buffer->size / sizeof(float);
+      const uint num_const = cell->constants[shader]->size / sizeof(float);
       uint i, j;
       float *buf = cell_batch_alloc16(cell, ROUNDUP16(32 + num_const * sizeof(float)));
       uint32_t *ibuf = (uint32_t *) buf;
       const float *constants = pipe_buffer_map(cell->pipe.screen,
-                                               cell->constants[shader].buffer,
+                                               cell->constants[shader],
                                                PIPE_BUFFER_USAGE_CPU_READ);
       ibuf[0] = CELL_CMD_STATE_FS_CONSTANTS;
       ibuf[4] = num_const;
@@ -253,7 +253,7 @@ cell_emit_state(struct cell_context *cell)
       for (i = 0; i < num_const; i++) {
          buf[j++] = constants[i];
       }
-      pipe_buffer_unmap(cell->pipe.screen, cell->constants[shader].buffer);
+      pipe_buffer_unmap(cell->pipe.screen, cell->constants[shader]);
    }
 
    if (cell->dirty & (CELL_NEW_FRAMEBUFFER |
diff --git a/src/gallium/drivers/cell/ppu/cell_state_shader.c b/src/gallium/drivers/cell/ppu/cell_state_shader.c
index 6568c784fe..1b09cf7f7d 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_shader.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_shader.c
@@ -183,7 +183,7 @@ cell_delete_vs_state(struct pipe_context *pipe, void *vs)
 static void
 cell_set_constant_buffer(struct pipe_context *pipe,
                          uint shader, uint index,
-                         const struct pipe_constant_buffer *buf)
+                         struct pipe_buffer *buf)
 {
    struct cell_context *cell = cell_context(pipe);
 
@@ -193,7 +193,7 @@ cell_set_constant_buffer(struct pipe_context *pipe,
    draw_flush(cell->draw);
 
    /* note: reference counting */
-   pipe_buffer_reference(&cell->constants[shader].buffer, buf->buffer);
+   pipe_buffer_reference(&cell->constants[shader], buf);
 
    if (shader == PIPE_SHADER_VERTEX)
       cell->dirty |= CELL_NEW_VS_CONSTANTS;
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index 12b855a3db..55bd85bde2 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -53,8 +53,7 @@ struct spu_vs_context draw;
 /**
  * Buffers containing dynamically generated SPU code:
  */
-static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS]
-    ALIGN16_ATTRIB;
+PIPE_ALIGN_VAR(16) static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS];
 
 
 
@@ -543,7 +542,7 @@ cmd_batch(uint opcode)
 {
    const uint buf = (opcode >> 8) & 0xff;
    uint size = (opcode >> 16);
-   qword buffer[CELL_BUFFER_SIZE / 16] ALIGN16_ATTRIB;
+   PIPE_ALIGN_VAR(16) qword buffer[CELL_BUFFER_SIZE / 16];
    const unsigned usize = ROUNDUP16(size) / sizeof(buffer[0]);
    uint pos;
 
diff --git a/src/gallium/drivers/cell/spu/spu_exec.c b/src/gallium/drivers/cell/spu/spu_exec.c
index d86d8e09a5..d2166a4901 100644
--- a/src/gallium/drivers/cell/spu/spu_exec.c
+++ b/src/gallium/drivers/cell/spu/spu_exec.c
@@ -1839,10 +1839,11 @@ spu_exec_machine_run( struct spu_exec_machine *mach )
    /* execute declarations (interpolants) */
    if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) {
       for (i = 0; i < mach->NumDeclarations; i++) {
+         PIPE_ALIGN_VAR(16)
          union {
             struct tgsi_full_declaration decl;
             qword buffer[ROUNDUP16(sizeof(struct tgsi_full_declaration)) / 16];
-         } d ALIGN16_ATTRIB;
+         } d;
          unsigned ea = (unsigned) (mach->Declarations + pc);
 
          spu_dcache_fetch_unaligned(d.buffer, ea, sizeof(d.decl));
@@ -1853,10 +1854,11 @@ spu_exec_machine_run( struct spu_exec_machine *mach )
 
    /* execute instructions, until pc is set to -1 */
    while (pc != -1) {
+      PIPE_ALIGN_VAR(16)
       union {
          struct tgsi_full_instruction inst;
          qword buffer[ROUNDUP16(sizeof(struct tgsi_full_instruction)) / 16];
-      } i ALIGN16_ATTRIB;
+      } i;
       unsigned ea = (unsigned) (mach->Instructions + pc);
 
       spu_dcache_fetch_unaligned(i.buffer, ea, sizeof(i.inst));
diff --git a/src/gallium/drivers/cell/spu/spu_exec.h b/src/gallium/drivers/cell/spu/spu_exec.h
index 8605679940..0ca92af248 100644
--- a/src/gallium/drivers/cell/spu/spu_exec.h
+++ b/src/gallium/drivers/cell/spu/spu_exec.h
@@ -98,9 +98,9 @@ struct spu_exec_machine
     * 4  internal temporaries
     * 1  address
     */
+   PIPE_ALIGN_VAR(16)
    struct spu_exec_vector       Temps[TGSI_EXEC_NUM_TEMPS 
-                                      + TGSI_EXEC_NUM_TEMP_EXTRAS + 1]
-       ALIGN16_ATTRIB;
+                                      + TGSI_EXEC_NUM_TEMP_EXTRAS + 1];
 
    struct spu_exec_vector       *Addrs;
 
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index ff3d609d25..98919c43ff 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -144,7 +144,7 @@ export_func(struct cell_spu_function_info *spu_functions,
 void
 return_function_info(void)
 {
-   struct cell_spu_function_info funcs ALIGN16_ATTRIB;
+   PIPE_ALIGN_VAR(16) struct cell_spu_function_info funcs;
    int tag = TAG_MISC;
 
    ASSERT(sizeof(funcs) == 256); /* must be multiple of 16 bytes */
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 33767e7c51..b18f4c22ef 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -93,6 +93,7 @@ typedef vector unsigned int (*spu_fragment_program_func)(vector float *inputs,
                                                          vector float *constants);
 
 
+PIPE_ALIGN_TYPE(16,
 struct spu_framebuffer
 {
    void *color_start;              /**< addr of color surface in main memory */
@@ -107,10 +108,11 @@ struct spu_framebuffer
 
    uint zsize;                     /**< 0, 2 or 4 bytes per Z */
    float zscale;                   /**< 65535.0, 2^24-1 or 2^32-1 */
-} ALIGN16_ATTRIB;
+});
 
 
 /** per-texture level info */
+PIPE_ALIGN_TYPE(16,
 struct spu_texture_level
 {
    void *start;
@@ -123,20 +125,22 @@ struct spu_texture_level
    vector signed int mask_s, mask_t, mask_r;
    /** texcoord clamp limits */
    vector signed int max_s, max_t, max_r;
-} ALIGN16_ATTRIB;
+});
 
 
+PIPE_ALIGN_TYPE(16,
 struct spu_texture
 {
    struct spu_texture_level level[CELL_MAX_TEXTURE_LEVELS];
    uint max_level;
    uint target;  /**< PIPE_TEXTURE_x */
-} ALIGN16_ATTRIB;
+});
 
 
 /**
  * All SPU global/context state will be in a singleton object of this type:
  */
+PIPE_ALIGN_TYPE(16,
 struct spu_global
 {
    /** One-time init/constant info */
@@ -155,8 +159,8 @@ struct spu_global
    struct vertex_info vertex_info;
 
    /** Current color and Z tiles */
-   tile_t ctile ALIGN16_ATTRIB;
-   tile_t ztile ALIGN16_ATTRIB;
+   PIPE_ALIGN_VAR(16) tile_t ctile;
+   PIPE_ALIGN_VAR(16) tile_t ztile;
 
    /** Read depth/stencil tiles? */
    boolean read_depth_stencil;
@@ -165,8 +169,8 @@ struct spu_global
    ubyte cur_ctile_status, cur_ztile_status;
 
    /** Status of all tiles in framebuffer */
-   ubyte ctile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
-   ubyte ztile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
+   PIPE_ALIGN_VAR(16) ubyte ctile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE];
+   PIPE_ALIGN_VAR(16) ubyte ztile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE];
 
    /** Current fragment ops machine code, at 8-byte boundary */
    uint *fragment_ops_code;
@@ -175,7 +179,7 @@ struct spu_global
    spu_fragment_ops_func fragment_ops[2];
 
    /** Current fragment program machine code, at 8-byte boundary */
-   uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS] ALIGN8_ATTRIB;
+   PIPE_ALIGN_VAR(8) uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS];
    /** Current fragment ops function */
    spu_fragment_program_func fragment_program;
 
@@ -187,7 +191,7 @@ struct spu_global
    /** Fragment program constants */
    vector float constants[4 * CELL_MAX_CONSTANTS];
 
-} ALIGN16_ATTRIB;
+});
 
 
 extern struct spu_global spu;
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index 5ffb7073ab..14987e3c3a 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -169,7 +169,7 @@ void
 cmd_render(const struct cell_command_render *render, uint *pos_incr)
 {
    /* we'll DMA into these buffers */
-   ubyte vertex_data[CELL_BUFFER_SIZE] ALIGN16_ATTRIB;
+   PIPE_ALIGN_VAR(16) ubyte vertex_data[CELL_BUFFER_SIZE];
    const uint vertex_size = render->vertex_size; /* in bytes */
    /*const*/ uint total_vertex_bytes = render->num_verts * vertex_size;
    uint index_bytes;
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
index 03375d84a5..087963960d 100644
--- a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
+++ b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
@@ -43,7 +43,8 @@ typedef void (*spu_fetch_func)(qword *out, const qword *in,
 			       const qword *shuffle_data);
 
 
-static const qword fetch_shuffle_data[5] ALIGN16_ATTRIB = {
+PIPE_ALIGN_VAR(16) static const qword
+fetch_shuffle_data[5] = {
    /* Shuffle used by CVT_64_FLOAT
     */
    {
@@ -110,7 +111,7 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
       unsigned idx;
       const unsigned bytes_per_entry = draw->vertex_fetch.size[attr];
       const unsigned quads_per_entry = (bytes_per_entry + 15) / 16;
-      qword in[2 * 4] ALIGN16_ATTRIB;
+      PIPE_ALIGN_VAR(16) qword in[2 * 4];
 
 
       /* Fetch four attributes for four vertices.  
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_shader.c b/src/gallium/drivers/cell/spu/spu_vertex_shader.c
index fbe5b34d39..3e9804bf8e 100644
--- a/src/gallium/drivers/cell/spu/spu_vertex_shader.c
+++ b/src/gallium/drivers/cell/spu/spu_vertex_shader.c
@@ -107,8 +107,8 @@ run_vertex_program(struct spu_vs_context *draw,
    struct spu_exec_machine *machine = &draw->machine;
    unsigned int j;
 
-   ALIGN16_DECL(struct spu_exec_vector, inputs, PIPE_MAX_ATTRIBS);
-   ALIGN16_DECL(struct spu_exec_vector, outputs, PIPE_MAX_ATTRIBS);
+   PIPE_ALIGN_VAR(16) struct spu_exec_vector inputs[PIPE_MAX_ATTRIBS];
+   PIPE_ALIGN_VAR(16) struct spu_exec_vector outputs[PIPE_MAX_ATTRIBS];
    const float *scale = draw->viewport.scale;
    const float *trans = draw->viewport.translate;
 
@@ -119,8 +119,8 @@ run_vertex_program(struct spu_vs_context *draw,
    ASSERT_ALIGN16(draw->constants);
    machine->Consts = (float (*)[4]) draw->constants;
 
-   machine->Inputs = ALIGN16_ASSIGN(inputs);
-   machine->Outputs = ALIGN16_ASSIGN(outputs);
+   machine->Inputs = inputs;
+   machine->Outputs = outputs;
 
    spu_vertex_fetch( draw, machine, elts, count );
 
@@ -132,8 +132,9 @@ run_vertex_program(struct spu_vs_context *draw,
    for (j = 0; j < count; j++) {
       unsigned slot;
       float x, y, z, w;
+      PIPE_ALIGN_VAR(16)
       unsigned char buffer[sizeof(struct vertex_header)
-          + MAX_VERTEX_SIZE] ALIGN16_ATTRIB;
+          + MAX_VERTEX_SIZE];
       struct vertex_header *const tmpOut =
           (struct vertex_header *) buffer;
       const unsigned vert_size = ROUNDUP16(sizeof(struct vertex_header)
@@ -186,8 +187,8 @@ run_vertex_program(struct spu_vs_context *draw,
 }
 
 
-unsigned char immediates[(sizeof(float) * 4 * TGSI_EXEC_NUM_IMMEDIATES) + 32]
-    ALIGN16_ATTRIB;
+PIPE_ALIGN_VAR(16) unsigned char
+immediates[(sizeof(float) * 4 * TGSI_EXEC_NUM_IMMEDIATES) + 32]);
 
 
 void
diff --git a/src/gallium/drivers/failover/fo_context.h b/src/gallium/drivers/failover/fo_context.h
index 149393712a..191a44c3df 100644
--- a/src/gallium/drivers/failover/fo_context.h
+++ b/src/gallium/drivers/failover/fo_context.h
@@ -125,7 +125,7 @@ failover_context( struct pipe_context *pipe )
 void
 failover_set_constant_buffer(struct pipe_context *pipe,
                              uint shader, uint index,
-                             const struct pipe_constant_buffer *buf);
+                             struct pipe_buffer *buf);
 
 
 #endif /* FO_CONTEXT_H */
diff --git a/src/gallium/drivers/failover/fo_state.c b/src/gallium/drivers/failover/fo_state.c
index 3f5f556032..d6ec4d1313 100644
--- a/src/gallium/drivers/failover/fo_state.c
+++ b/src/gallium/drivers/failover/fo_state.c
@@ -495,7 +495,7 @@ failover_set_vertex_elements(struct pipe_context *pipe,
 void
 failover_set_constant_buffer(struct pipe_context *pipe,
                              uint shader, uint index,
-                             const struct pipe_constant_buffer *buf)
+                             struct pipe_buffer *buf)
 {
    struct failover_context *failover = failover_context(pipe);
 
diff --git a/src/gallium/drivers/i915/i915_context.h b/src/gallium/drivers/i915/i915_context.h
index 234b441ce6..37cbd56036 100644
--- a/src/gallium/drivers/i915/i915_context.h
+++ b/src/gallium/drivers/i915/i915_context.h
@@ -233,7 +233,8 @@ struct i915_context
 
    struct pipe_blend_color blend_color;
    struct pipe_clip_state clip;
-   struct pipe_constant_buffer constants[PIPE_SHADER_TYPES];
+   /* XXX unneded */
+   struct pipe_buffer *constants[PIPE_SHADER_TYPES];
    struct pipe_framebuffer_state framebuffer;
    struct pipe_poly_stipple poly_stipple;
    struct pipe_scissor_state scissor;
diff --git a/src/gallium/drivers/i915/i915_state.c b/src/gallium/drivers/i915/i915_state.c
index 5f5b6f8e18..0fab6e1bc3 100644
--- a/src/gallium/drivers/i915/i915_state.c
+++ b/src/gallium/drivers/i915/i915_state.c
@@ -518,7 +518,7 @@ static void i915_delete_vs_state(struct pipe_context *pipe, void *shader)
 
 static void i915_set_constant_buffer(struct pipe_context *pipe,
                                      uint shader, uint index,
-                                     const struct pipe_constant_buffer *buf)
+                                     struct pipe_buffer *buf)
 {
    struct i915_context *i915 = i915_context(pipe);
    struct pipe_screen *screen = pipe->screen;
@@ -538,13 +538,13 @@ static void i915_set_constant_buffer(struct pipe_context *pipe,
     */
    if (buf) {
       void *mapped;
-      if (buf->buffer && buf->buffer->size &&
-          (mapped = pipe_buffer_map(screen, buf->buffer,
+      if (buf->size &&
+          (mapped = pipe_buffer_map(screen, buf,
                                     PIPE_BUFFER_USAGE_CPU_READ))) {
-         memcpy(i915->current.constants[shader], mapped, buf->buffer->size);
-         pipe_buffer_unmap(screen, buf->buffer);
+         memcpy(i915->current.constants[shader], mapped, buf->size);
+         pipe_buffer_unmap(screen, buf);
          i915->current.num_user_constants[shader]
-            = buf->buffer->size / (4 * sizeof(float));
+            = buf->size / (4 * sizeof(float));
       }
       else {
          i915->current.num_user_constants[shader] = 0;
diff --git a/src/gallium/drivers/i965/brw_pipe_shader.c b/src/gallium/drivers/i965/brw_pipe_shader.c
index bb32d90e33..e389587f3e 100644
--- a/src/gallium/drivers/i965/brw_pipe_shader.c
+++ b/src/gallium/drivers/i965/brw_pipe_shader.c
@@ -262,7 +262,7 @@ static void brw_delete_vs_state( struct pipe_context *pipe, void *prog )
 
 static void brw_set_constant_buffer(struct pipe_context *pipe,
                                      uint shader, uint index,
-                                     const struct pipe_constant_buffer *buf)
+                                     struct pipe_buffer *buf)
 {
    struct brw_context *brw = brw_context(pipe);
 
@@ -270,13 +270,13 @@ static void brw_set_constant_buffer(struct pipe_context *pipe,
 
    if (shader == PIPE_SHADER_FRAGMENT) {
       pipe_buffer_reference( &brw->curr.fragment_constants,
-                             buf->buffer );
+                             buf );
 
       brw->state.dirty.mesa |= PIPE_NEW_FRAGMENT_CONSTANTS;
    }
    else {
       pipe_buffer_reference( &brw->curr.vertex_constants,
-                             buf->buffer );
+                             buf );
 
       brw->state.dirty.mesa |= PIPE_NEW_VERTEX_CONSTANTS;
    }
diff --git a/src/gallium/drivers/identity/id_context.c b/src/gallium/drivers/identity/id_context.c
index 9f5b4e6323..f9063d90fb 100644
--- a/src/gallium/drivers/identity/id_context.c
+++ b/src/gallium/drivers/identity/id_context.c
@@ -404,17 +404,17 @@ static void
 identity_set_constant_buffer(struct pipe_context *_pipe,
                              uint shader,
                              uint index,
-                             const struct pipe_constant_buffer *_buffer)
+                             struct pipe_buffer *_buffer)
 {
    struct identity_context *id_pipe = identity_context(_pipe);
    struct pipe_context *pipe = id_pipe->pipe;
-   struct pipe_constant_buffer unwrapped_buffer;
-   struct pipe_constant_buffer *buffer = NULL;
+   struct pipe_buffer *unwrapped_buffer;
+   struct pipe_buffer *buffer = NULL;
 
-   /* unwrap the input state */
+   /* XXX hmm? unwrap the input state */
    if (_buffer) {
-      unwrapped_buffer.buffer = identity_buffer_unwrap(_buffer->buffer);
-      buffer = &unwrapped_buffer;
+      unwrapped_buffer = identity_buffer_unwrap(_buffer);
+      buffer = unwrapped_buffer;
    }
 
    pipe->set_constant_buffer(pipe,
diff --git a/src/gallium/drivers/llvmpipe/lp_context.c b/src/gallium/drivers/llvmpipe/lp_context.c
index 1cc3c9227c..aaa675aec7 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.c
+++ b/src/gallium/drivers/llvmpipe/lp_context.c
@@ -124,8 +124,8 @@ static void llvmpipe_destroy( struct pipe_context *pipe )
    }
 
    for (i = 0; i < Elements(llvmpipe->constants); i++) {
-      if (llvmpipe->constants[i].buffer) {
-         pipe_buffer_reference(&llvmpipe->constants[i].buffer, NULL);
+      if (llvmpipe->constants[i]) {
+         pipe_buffer_reference(&llvmpipe->constants[i], NULL);
       }
    }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_context.h b/src/gallium/drivers/llvmpipe/lp_context.h
index 6411797cf5..426d6eb4a1 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_context.h
@@ -64,7 +64,7 @@ struct llvmpipe_context {
    /** Other rendering state */
    struct pipe_blend_color blend_color[4][16];
    struct pipe_clip_state clip;
-   struct pipe_constant_buffer constants[PIPE_SHADER_TYPES];
+   struct pipe_buffer *constants[PIPE_SHADER_TYPES];
    struct pipe_framebuffer_state framebuffer;
    struct pipe_poly_stipple poly_stipple;
    struct pipe_scissor_state scissor;
diff --git a/src/gallium/drivers/llvmpipe/lp_quad.h b/src/gallium/drivers/llvmpipe/lp_quad.h
index 7eb05de77a..c3a48700a4 100644
--- a/src/gallium/drivers/llvmpipe/lp_quad.h
+++ b/src/gallium/drivers/llvmpipe/lp_quad.h
@@ -31,6 +31,7 @@
 #ifndef LP_QUAD_H
 #define LP_QUAD_H
 
+#include "pipe/p_compiler.h"
 #include "pipe/p_state.h"
 #include "tgsi/tgsi_exec.h"
 
@@ -83,7 +84,7 @@ struct quad_header_inout
 struct quad_header_output
 {
    /** colors in SOA format (rrrr, gggg, bbbb, aaaa) */
-   float ALIGN16_ATTRIB color[PIPE_MAX_COLOR_BUFS][NUM_CHANNELS][QUAD_SIZE];
+   PIPE_ALIGN_VAR(16) float color[PIPE_MAX_COLOR_BUFS][NUM_CHANNELS][QUAD_SIZE];
 };
 
 
@@ -92,9 +93,9 @@ struct quad_header_output
  */
 struct quad_interp_coef
 {
-   float ALIGN16_ATTRIB a0[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
-   float ALIGN16_ATTRIB dadx[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
-   float ALIGN16_ATTRIB dady[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
+   PIPE_ALIGN_VAR(16) float a0[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
+   PIPE_ALIGN_VAR(16) float dadx[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
+   PIPE_ALIGN_VAR(16) float dady[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
 };
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
index b18f17c0cd..0b2d3a2801 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -117,7 +117,7 @@ struct setup_context {
 /**
  * Execute fragment shader for the four fragments in the quad.
  */
-ALIGN_STACK
+PIPE_ALIGN_STACK
 static void
 shade_quads(struct llvmpipe_context *llvmpipe,
             struct quad_header *quads[],
@@ -130,7 +130,7 @@ shade_quads(struct llvmpipe_context *llvmpipe,
    uint8_t *tile;
    uint8_t *color;
    void *depth;
-   uint32_t ALIGN16_ATTRIB mask[4][NUM_CHANNELS];
+   PIPE_ALIGN_VAR(16) uint32_t mask[4][NUM_CHANNELS];
    unsigned chan_index;
    unsigned q;
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state.h b/src/gallium/drivers/llvmpipe/lp_state.h
index 7020da145f..e16793186b 100644
--- a/src/gallium/drivers/llvmpipe/lp_state.h
+++ b/src/gallium/drivers/llvmpipe/lp_state.h
@@ -154,7 +154,7 @@ void llvmpipe_set_clip_state( struct pipe_context *,
 
 void llvmpipe_set_constant_buffer(struct pipe_context *,
                                   uint shader, uint index,
-                                  const struct pipe_constant_buffer *buf);
+                                  struct pipe_buffer *buf);
 
 void *llvmpipe_create_fs_state(struct pipe_context *,
                                const struct pipe_shader_state *);
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index b73ca2d41e..9f4bbef73f 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -714,12 +714,11 @@ llvmpipe_delete_fs_state(struct pipe_context *pipe, void *fs)
 void
 llvmpipe_set_constant_buffer(struct pipe_context *pipe,
                              uint shader, uint index,
-                             const struct pipe_constant_buffer *constants)
+                             struct pipe_buffer *constants)
 {
    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
-   struct pipe_buffer *buffer = constants ? constants->buffer : NULL;
-   unsigned size = buffer ? buffer->size : 0;
-   const void *data = buffer ? llvmpipe_buffer(buffer)->data : NULL;
+   unsigned size = constants ? constants->size : 0;
+   const void *data = constants ? llvmpipe_buffer(constants)->data : NULL;
 
    assert(shader < PIPE_SHADER_TYPES);
    assert(index == 0);
@@ -727,7 +726,7 @@ llvmpipe_set_constant_buffer(struct pipe_context *pipe,
    draw_flush(llvmpipe->draw);
 
    /* note: reference counting */
-   pipe_buffer_reference(&llvmpipe->constants[shader].buffer, buffer);
+   pipe_buffer_reference(&llvmpipe->constants[shader], constants);
 
    if(shader == PIPE_SHADER_FRAGMENT) {
       llvmpipe->jit_context.constants = data;
diff --git a/src/gallium/drivers/llvmpipe/lp_test_blend.c b/src/gallium/drivers/llvmpipe/lp_test_blend.c
index 29fff91981..6c29e8d8ac 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_blend.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_blend.c
@@ -462,7 +462,7 @@ compute_blend_ref(const struct pipe_blend_state *blend,
 }
 
 
-ALIGN_STACK
+PIPE_ALIGN_STACK
 static boolean
 test_one(unsigned verbose,
          FILE *fp,
@@ -531,11 +531,11 @@ test_one(unsigned verbose,
    success = TRUE;
    for(i = 0; i < n && success; ++i) {
       if(mode == AoS) {
-         ALIGN16_ATTRIB uint8_t src[LP_NATIVE_VECTOR_WIDTH/8];
-         ALIGN16_ATTRIB uint8_t dst[LP_NATIVE_VECTOR_WIDTH/8];
-         ALIGN16_ATTRIB uint8_t con[LP_NATIVE_VECTOR_WIDTH/8];
-         ALIGN16_ATTRIB uint8_t res[LP_NATIVE_VECTOR_WIDTH/8];
-         ALIGN16_ATTRIB uint8_t ref[LP_NATIVE_VECTOR_WIDTH/8];
+         PIPE_ALIGN_VAR(16) uint8_t src[LP_NATIVE_VECTOR_WIDTH/8];
+         PIPE_ALIGN_VAR(16) uint8_t dst[LP_NATIVE_VECTOR_WIDTH/8];
+         PIPE_ALIGN_VAR(16) uint8_t con[LP_NATIVE_VECTOR_WIDTH/8];
+         PIPE_ALIGN_VAR(16) uint8_t res[LP_NATIVE_VECTOR_WIDTH/8];
+         PIPE_ALIGN_VAR(16) uint8_t ref[LP_NATIVE_VECTOR_WIDTH/8];
          int64_t start_counter = 0;
          int64_t end_counter = 0;
 
@@ -596,11 +596,11 @@ test_one(unsigned verbose,
 
       if(mode == SoA) {
          const unsigned stride = type.length*type.width/8;
-         ALIGN16_ATTRIB uint8_t src[4*LP_NATIVE_VECTOR_WIDTH/8];
-         ALIGN16_ATTRIB uint8_t dst[4*LP_NATIVE_VECTOR_WIDTH/8];
-         ALIGN16_ATTRIB uint8_t con[4*LP_NATIVE_VECTOR_WIDTH/8];
-         ALIGN16_ATTRIB uint8_t res[4*LP_NATIVE_VECTOR_WIDTH/8];
-         ALIGN16_ATTRIB uint8_t ref[4*LP_NATIVE_VECTOR_WIDTH/8];
+         PIPE_ALIGN_VAR(16) uint8_t src[4*LP_NATIVE_VECTOR_WIDTH/8];
+         PIPE_ALIGN_VAR(16) uint8_t dst[4*LP_NATIVE_VECTOR_WIDTH/8];
+         PIPE_ALIGN_VAR(16) uint8_t con[4*LP_NATIVE_VECTOR_WIDTH/8];
+         PIPE_ALIGN_VAR(16) uint8_t res[4*LP_NATIVE_VECTOR_WIDTH/8];
+         PIPE_ALIGN_VAR(16) uint8_t ref[4*LP_NATIVE_VECTOR_WIDTH/8];
          int64_t start_counter = 0;
          int64_t end_counter = 0;
          boolean mismatch;
diff --git a/src/gallium/drivers/llvmpipe/lp_test_conv.c b/src/gallium/drivers/llvmpipe/lp_test_conv.c
index faddfb9677..c1abee424c 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_conv.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_conv.c
@@ -142,7 +142,7 @@ add_conv_test(LLVMModuleRef module,
 }
 
 
-ALIGN_STACK
+PIPE_ALIGN_STACK
 static boolean
 test_one(unsigned verbose,
          FILE *fp,
@@ -230,8 +230,8 @@ test_one(unsigned verbose,
    for(i = 0; i < n && success; ++i) {
       unsigned src_stride = src_type.length*src_type.width/8;
       unsigned dst_stride = dst_type.length*dst_type.width/8;
-      ALIGN16_ATTRIB uint8_t src[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
-      ALIGN16_ATTRIB uint8_t dst[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
+      PIPE_ALIGN_VAR(16) uint8_t src[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
+      PIPE_ALIGN_VAR(16) uint8_t dst[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
       double fref[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
       uint8_t ref[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
       int64_t start_counter = 0;
diff --git a/src/gallium/drivers/llvmpipe/lp_test_format.c b/src/gallium/drivers/llvmpipe/lp_test_format.c
index 23ea9ebbe7..2b258f1052 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_format.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_format.c
@@ -199,7 +199,7 @@ add_store_rgba_test(LLVMModuleRef module,
 }
 
 
-ALIGN_STACK
+PIPE_ALIGN_STACK
 static boolean
 test_format(unsigned verbose, FILE *fp, const struct pixel_test_case *test)
 {
diff --git a/src/gallium/drivers/nv04/nv04_state.c b/src/gallium/drivers/nv04/nv04_state.c
index e3dc4c5bf4..b67f1e16b1 100644
--- a/src/gallium/drivers/nv04/nv04_state.c
+++ b/src/gallium/drivers/nv04/nv04_state.c
@@ -332,7 +332,7 @@ nv04_set_clip_state(struct pipe_context *pipe,
 
 static void
 nv04_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
-			 const struct pipe_constant_buffer *buf )
+			 struct pipe_buffer *buf )
 {
 	struct nv04_context *nv04 = nv04_context(pipe);
 	struct pipe_screen *pscreen = pipe->screen;
@@ -342,13 +342,13 @@ nv04_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
 
 	if (buf) {
 		void *mapped;
-		if (buf->buffer && buf->buffer->size &&
-                    (mapped = pipe_buffer_map(pscreen, buf->buffer, PIPE_BUFFER_USAGE_CPU_READ)))
+		if (buf && buf->size &&
+                    (mapped = pipe_buffer_map(pscreen, buf, PIPE_BUFFER_USAGE_CPU_READ)))
 		{
-			memcpy(nv04->constbuf[shader], mapped, buf->buffer->size);
+			memcpy(nv04->constbuf[shader], mapped, buf->size);
 			nv04->constbuf_nr[shader] =
-				buf->buffer->size / (4 * sizeof(float));
-			pipe_buffer_unmap(pscreen, buf->buffer);
+				buf->size / (4 * sizeof(float));
+			pipe_buffer_unmap(pscreen, buf);
 		}
 	}
 }
diff --git a/src/gallium/drivers/nv10/nv10_state.c b/src/gallium/drivers/nv10/nv10_state.c
index ffc6be3c40..ad7def53b1 100644
--- a/src/gallium/drivers/nv10/nv10_state.c
+++ b/src/gallium/drivers/nv10/nv10_state.c
@@ -458,7 +458,7 @@ nv10_set_clip_state(struct pipe_context *pipe,
 
 static void
 nv10_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
-			 const struct pipe_constant_buffer *buf )
+			 struct pipe_buffer *buf )
 {
 	struct nv10_context *nv10 = nv10_context(pipe);
 	struct pipe_screen *pscreen = pipe->screen;
@@ -468,13 +468,13 @@ nv10_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
 
 	if (buf) {
 		void *mapped;
-		if (buf->buffer && buf->buffer->size &&
-                    (mapped = pipe_buffer_map(pscreen, buf->buffer, PIPE_BUFFER_USAGE_CPU_READ)))
+		if (buf->size &&
+                    (mapped = pipe_buffer_map(pscreen, buf, PIPE_BUFFER_USAGE_CPU_READ)))
 		{
-			memcpy(nv10->constbuf[shader], mapped, buf->buffer->size);
+			memcpy(nv10->constbuf[shader], mapped, buf->size);
 			nv10->constbuf_nr[shader] =
-				buf->buffer->size / (4 * sizeof(float));
-			pipe_buffer_unmap(pscreen, buf->buffer);
+				buf->size / (4 * sizeof(float));
+			pipe_buffer_unmap(pscreen, buf);
 		}
 	}
 }
diff --git a/src/gallium/drivers/nv20/nv20_state.c b/src/gallium/drivers/nv20/nv20_state.c
index 3a82e63423..45697a60ef 100644
--- a/src/gallium/drivers/nv20/nv20_state.c
+++ b/src/gallium/drivers/nv20/nv20_state.c
@@ -451,7 +451,7 @@ nv20_set_clip_state(struct pipe_context *pipe,
 
 static void
 nv20_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
-			 const struct pipe_constant_buffer *buf )
+			 struct pipe_buffer *buf )
 {
 	struct nv20_context *nv20 = nv20_context(pipe);
 	struct pipe_screen *pscreen = pipe->screen;
@@ -461,13 +461,13 @@ nv20_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
 
 	if (buf) {
 		void *mapped;
-		if (buf->buffer && buf->buffer->size &&
-                    (mapped = pipe_buffer_map(pscreen, buf->buffer, PIPE_BUFFER_USAGE_CPU_READ)))
+		if (buf->size &&
+                    (mapped = pipe_buffer_map(pscreen, buf, PIPE_BUFFER_USAGE_CPU_READ)))
 		{
-			memcpy(nv20->constbuf[shader], mapped, buf->buffer->size);
+			memcpy(nv20->constbuf[shader], mapped, buf->size);
 			nv20->constbuf_nr[shader] =
-				buf->buffer->size / (4 * sizeof(float));
-			pipe_buffer_unmap(pscreen, buf->buffer);
+				buf->size / (4 * sizeof(float));
+			pipe_buffer_unmap(pscreen, buf);
 		}
 	}
 }
diff --git a/src/gallium/drivers/nv30/nv30_fragtex.c b/src/gallium/drivers/nv30/nv30_fragtex.c
index 9893567891..0cc3172dcd 100644
--- a/src/gallium/drivers/nv30/nv30_fragtex.c
+++ b/src/gallium/drivers/nv30/nv30_fragtex.c
@@ -43,7 +43,6 @@ static struct nv30_texture_format *
 nv30_fragtex_format(uint pipe_format)
 {
 	struct nv30_texture_format *tf = nv30_texture_formats;
-	char fs[128];
 
 	while (tf->defined) {
 		if (tf->pipe == pipe_format)
@@ -65,7 +64,7 @@ nv30_fragtex_build(struct nv30_context *nv30, int unit)
 	struct nouveau_bo *bo = nouveau_bo(nv30mt->buffer);
 	struct nv30_texture_format *tf;
 	struct nouveau_stateobj *so;
-	uint32_t txf, txs , txp;
+	uint32_t txf, txs;
 	unsigned tex_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD;
 
 	tf = nv30_fragtex_format(pt->format);
@@ -97,13 +96,6 @@ nv30_fragtex_build(struct nv30_context *nv30, int unit)
 		return NULL;
 	}
 
-	if (!(pt->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) {
-		txp = 0;
-	} else {
-		txp  = nv30mt->level[0].pitch;
-		txf |= (1<<13) /*FIXME: NV34TCL_TX_FORMAT_LINEAR ? */;
-	}
-
 	txs = tf->swizzle;
 
 	so = so_new(1, 8, 2);
diff --git a/src/gallium/drivers/nv30/nv30_state.c b/src/gallium/drivers/nv30/nv30_state.c
index a80dfb0488..065c927a10 100644
--- a/src/gallium/drivers/nv30/nv30_state.c
+++ b/src/gallium/drivers/nv30/nv30_state.c
@@ -590,12 +590,12 @@ nv30_set_clip_state(struct pipe_context *pipe,
 
 static void
 nv30_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
-			 const struct pipe_constant_buffer *buf )
+			 struct pipe_buffer *buf )
 {
 	struct nv30_context *nv30 = nv30_context(pipe);
 
-	nv30->constbuf[shader] = buf->buffer;
-	nv30->constbuf_nr[shader] = buf->buffer->size / (4 * sizeof(float));
+	nv30->constbuf[shader] = buf;
+	nv30->constbuf_nr[shader] = buf->size / (4 * sizeof(float));
 
 	if (shader == PIPE_SHADER_VERTEX) {
 		nv30->dirty |= NV30_NEW_VERTPROG;
diff --git a/src/gallium/drivers/nv40/nv40_state.c b/src/gallium/drivers/nv40/nv40_state.c
index ed0ca9e02c..7d990f7d56 100644
--- a/src/gallium/drivers/nv40/nv40_state.c
+++ b/src/gallium/drivers/nv40/nv40_state.c
@@ -605,12 +605,12 @@ nv40_set_clip_state(struct pipe_context *pipe,
 
 static void
 nv40_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
-			 const struct pipe_constant_buffer *buf )
+			 struct pipe_buffer *buf )
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
 
-	nv40->constbuf[shader] = buf->buffer;
-	nv40->constbuf_nr[shader] = buf->buffer->size / (4 * sizeof(float));
+	nv40->constbuf[shader] = buf;
+	nv40->constbuf_nr[shader] = buf->size / (4 * sizeof(float));
 
 	if (shader == PIPE_SHADER_VERTEX) {
 		nv40->dirty |= NV40_NEW_VERTPROG;
diff --git a/src/gallium/drivers/nv50/nv50_context.c b/src/gallium/drivers/nv50/nv50_context.c
index 5997456e4c..1e69746322 100644
--- a/src/gallium/drivers/nv50/nv50_context.c
+++ b/src/gallium/drivers/nv50/nv50_context.c
@@ -67,8 +67,12 @@ nv50_destroy(struct pipe_context *pipe)
 		so_ref(NULL, &nv50->state.vertprog);
 	if (nv50->state.fragprog)
 		so_ref(NULL, &nv50->state.fragprog);
-	if (nv50->state.programs)
-		so_ref(NULL, &nv50->state.programs);
+	if (nv50->state.geomprog)
+		so_ref(NULL, &nv50->state.geomprog);
+	if (nv50->state.fp_linkage)
+		so_ref(NULL, &nv50->state.fp_linkage);
+	if (nv50->state.gp_linkage)
+		so_ref(NULL, &nv50->state.gp_linkage);
 	if (nv50->state.vtxfmt)
 		so_ref(NULL, &nv50->state.vtxfmt);
 	if (nv50->state.vtxbuf)
@@ -100,7 +104,9 @@ nv50_create(struct pipe_screen *pscreen, unsigned pctx_id)
 	nv50->pipe.destroy = nv50_destroy;
 
 	nv50->pipe.draw_arrays = nv50_draw_arrays;
+	nv50->pipe.draw_arrays_instanced = nv50_draw_arrays_instanced;
 	nv50->pipe.draw_elements = nv50_draw_elements;
+	nv50->pipe.draw_elements_instanced = nv50_draw_elements_instanced;
 	nv50->pipe.clear = nv50_clear;
 
 	nv50->pipe.flush = nv50_flush;
diff --git a/src/gallium/drivers/nv50/nv50_context.h b/src/gallium/drivers/nv50/nv50_context.h
index cbd4c3ff86..bebcd95054 100644
--- a/src/gallium/drivers/nv50/nv50_context.h
+++ b/src/gallium/drivers/nv50/nv50_context.h
@@ -29,9 +29,7 @@
 #define NV50_CB_PVP		1
 #define NV50_CB_PFP		2
 #define NV50_CB_PGP		3
-#define NV50_CB_TIC		4
-#define NV50_CB_TSC		5
-#define NV50_CB_PUPLOAD         6
+#define NV50_CB_AUX		4
 
 #define NV50_NEW_BLEND		(1 << 0)
 #define NV50_NEW_ZSA		(1 << 1)
@@ -45,9 +43,11 @@
 #define NV50_NEW_VERTPROG_CB	(1 << 9)
 #define NV50_NEW_FRAGPROG	(1 << 10)
 #define NV50_NEW_FRAGPROG_CB	(1 << 11)
-#define NV50_NEW_ARRAYS		(1 << 12)
-#define NV50_NEW_SAMPLER	(1 << 13)
-#define NV50_NEW_TEXTURE	(1 << 14)
+#define NV50_NEW_GEOMPROG	(1 << 12)
+#define NV50_NEW_GEOMPROG_CB	(1 << 13)
+#define NV50_NEW_ARRAYS		(1 << 14)
+#define NV50_NEW_SAMPLER	(1 << 15)
+#define NV50_NEW_TEXTURE	(1 << 16)
 
 struct nv50_blend_stateobj {
 	struct pipe_blend_state pipe;
@@ -129,10 +129,13 @@ struct nv50_state {
 	unsigned miptree_nr[PIPE_SHADER_TYPES];
 	struct nouveau_stateobj *vertprog;
 	struct nouveau_stateobj *fragprog;
-	struct nouveau_stateobj *programs;
+	struct nouveau_stateobj *geomprog;
+	struct nouveau_stateobj *fp_linkage;
+	struct nouveau_stateobj *gp_linkage;
 	struct nouveau_stateobj *vtxfmt;
 	struct nouveau_stateobj *vtxbuf;
 	struct nouveau_stateobj *vtxattr;
+	struct nouveau_stateobj *instbuf;
 	unsigned vtxelt_nr;
 };
 
@@ -157,6 +160,7 @@ struct nv50_context {
 	struct pipe_framebuffer_state framebuffer;
 	struct nv50_program *vertprog;
 	struct nv50_program *fragprog;
+	struct nv50_program *geomprog;
 	struct pipe_buffer *constbuf[PIPE_SHADER_TYPES];
 	struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS];
 	unsigned vtxbuf_nr;
@@ -193,11 +197,22 @@ extern struct draw_stage *nv50_draw_render_stage(struct nv50_context *nv50);
 /* nv50_vbo.c */
 extern void nv50_draw_arrays(struct pipe_context *, unsigned mode,
 				unsigned start, unsigned count);
+extern void nv50_draw_arrays_instanced(struct pipe_context *, unsigned mode,
+					unsigned start, unsigned count,
+					unsigned startInstance,
+					unsigned instanceCount);
 extern void nv50_draw_elements(struct pipe_context *pipe,
 				  struct pipe_buffer *indexBuffer,
 				  unsigned indexSize,
 				  unsigned mode, unsigned start,
 				  unsigned count);
+extern void nv50_draw_elements_instanced(struct pipe_context *pipe,
+					 struct pipe_buffer *indexBuffer,
+					 unsigned indexSize,
+					 unsigned mode, unsigned start,
+					 unsigned count,
+					 unsigned startInstance,
+					 unsigned instanceCount);
 extern void nv50_vbo_validate(struct nv50_context *nv50);
 
 /* nv50_clear.c */
@@ -207,7 +222,9 @@ extern void nv50_clear(struct pipe_context *pipe, unsigned buffers,
 /* nv50_program.c */
 extern void nv50_vertprog_validate(struct nv50_context *nv50);
 extern void nv50_fragprog_validate(struct nv50_context *nv50);
-extern void nv50_linkage_validate(struct nv50_context *nv50);
+extern void nv50_geomprog_validate(struct nv50_context *nv50);
+extern void nv50_fp_linkage_validate(struct nv50_context *nv50);
+extern void nv50_gp_linkage_validate(struct nv50_context *nv50);
 extern void nv50_program_destroy(struct nv50_context *nv50,
 				 struct nv50_program *p);
 
diff --git a/src/gallium/drivers/nv50/nv50_miptree.c b/src/gallium/drivers/nv50/nv50_miptree.c
index 3f1edf0a13..dc8364ced7 100644
--- a/src/gallium/drivers/nv50/nv50_miptree.c
+++ b/src/gallium/drivers/nv50/nv50_miptree.c
@@ -92,12 +92,23 @@ nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *tmp)
 	case PIPE_FORMAT_Z24S8_UNORM:
 		tile_flags = 0x1800;
 		break;
+	case PIPE_FORMAT_Z16_UNORM:
+		tile_flags = 0x6c00;
+		break;
 	case PIPE_FORMAT_X8Z24_UNORM:
 	case PIPE_FORMAT_S8Z24_UNORM:
 		tile_flags = 0x2800;
 		break;
+	case PIPE_FORMAT_R32G32B32A32_FLOAT:
+	case PIPE_FORMAT_R32G32B32_FLOAT:
+		tile_flags = 0x7400;
+		break;
 	default:
-		tile_flags = 0x7000;
+		if ((pt->tex_usage & PIPE_TEXTURE_USAGE_PRIMARY) &&
+		    util_format_get_blocksizebits(pt->format) == 32)
+			tile_flags = 0x7a00;
+		else
+			tile_flags = 0x7000;
 		break;
 	}
 
@@ -145,7 +156,7 @@ nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *tmp)
 				  mt->level[0].tile_mode, tile_flags,
 				  &mt->base.bo);
 	if (ret) {
-		for (l = 0; l < pt->last_level; ++l)
+		for (l = 0; l <= pt->last_level; ++l)
 			FREE(mt->level[l].image_offset);
 		FREE(mt);
 		return NULL;
@@ -188,7 +199,7 @@ nv50_miptree_destroy(struct pipe_texture *pt)
 	struct nv50_miptree *mt = nv50_miptree(pt);
 	unsigned l;
 
-	for (l = 0; l < pt->last_level; ++l)
+	for (l = 0; l <= pt->last_level; ++l)
 		FREE(mt->level[l].image_offset);
 
 	nouveau_bo_ref(NULL, &mt->base.bo);
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c
index e16fa479e5..20db51070f 100644
--- a/src/gallium/drivers/nv50/nv50_program.c
+++ b/src/gallium/drivers/nv50/nv50_program.c
@@ -92,6 +92,11 @@ struct nv50_reg {
 
 	int rhw; /* result hw for FP outputs, or interpolant index */
 	int acc; /* instruction where this reg is last read (first insn == 1) */
+
+	int vtx; /* vertex index, for GP inputs (TGSI Dimension.Index) */
+	int indirect[2]; /* index into pc->addr, or -1 */
+
+	ubyte buf_index; /* c{0 .. 15}[] or g{0 .. 15}[] */
 };
 
 #define NV50_MOD_NEG 1
@@ -135,7 +140,8 @@ struct nv50_pc {
 	int immd_nr;
 	struct nv50_reg **addr;
 	int addr_nr;
-	uint8_t addr_alloc; /* set bit indicates used for TGSI_FILE_ADDRESS */
+	struct nv50_reg *sysval;
+	int sysval_nr;
 
 	struct nv50_reg *temp_temp[16];
 	struct nv50_program_exec *temp_temp_exec[16];
@@ -171,6 +177,8 @@ struct nv50_pc {
 	uint8_t edgeflag_out;
 };
 
+static struct nv50_reg *get_address_reg(struct nv50_pc *, struct nv50_reg *);
+
 static INLINE void
 ctor_reg(struct nv50_reg *reg, unsigned type, int index, int hw)
 {
@@ -179,7 +187,10 @@ ctor_reg(struct nv50_reg *reg, unsigned type, int index, int hw)
 	reg->hw = hw;
 	reg->mod = 0;
 	reg->rhw = -1;
+	reg->vtx = -1;
 	reg->acc = 0;
+	reg->indirect[0] = reg->indirect[1] = -1;
+	reg->buf_index = (type == P_CONST) ? 1 : 0;
 }
 
 static INLINE unsigned
@@ -197,7 +208,8 @@ terminate_mbb(struct nv50_pc *pc)
 
 	/* remove records of temporary address register values */
 	for (i = 0; i < NV50_SU_MAX_ADDR; ++i)
-		pc->r_addr[i].rhw = -1;
+		if (pc->r_addr[i].index < 0)
+			pc->r_addr[i].acc = 0;
 }
 
 static void
@@ -260,6 +272,7 @@ reg_instance(struct nv50_pc *pc, struct nv50_reg *reg)
 	if (reg) {
 		alloc_reg(pc, reg);
 		*ri = *reg;
+		reg->indirect[0] = reg->indirect[1] = -1;
 		reg->mod = 0;
 	}
 	return ri;
@@ -464,6 +477,12 @@ is_join(struct nv50_program_exec *e)
 	return FALSE;
 }
 
+static INLINE boolean
+is_control_flow(struct nv50_program_exec *e)
+{
+	return (e->inst[0] & 2);
+}
+
 static INLINE void
 set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx,
 	 struct nv50_program_exec *e)
@@ -525,11 +544,33 @@ set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
 static INLINE void
 set_addr(struct nv50_program_exec *e, struct nv50_reg *a)
 {
+	assert(a->type == P_ADDR);
+
 	assert(!(e->inst[0] & 0x0c000000));
 	assert(!(e->inst[1] & 0x00000004));
 
 	e->inst[0] |= (a->hw & 3) << 26;
-	e->inst[1] |= (a->hw >> 2) << 2;
+	e->inst[1] |= a->hw & 4;
+}
+
+static void
+emit_arl(struct nv50_pc *, struct nv50_reg *, struct nv50_reg *, uint8_t);
+
+static void
+emit_shl_imm(struct nv50_pc *, struct nv50_reg *, struct nv50_reg *, int);
+
+static void
+emit_mov_from_addr(struct nv50_pc *pc, struct nv50_reg *dst,
+		   struct nv50_reg *src)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	e->inst[1] = 0x40000000;
+	set_long(pc, e);
+	set_dst(pc, dst, e);
+	set_addr(e, src);
+
+	emit(pc, e);
 }
 
 static void
@@ -548,72 +589,6 @@ emit_add_addr_imm(struct nv50_pc *pc, struct nv50_reg *dst,
 	emit(pc, e);
 }
 
-static struct nv50_reg *
-alloc_addr(struct nv50_pc *pc, struct nv50_reg *ref)
-{
-	struct nv50_reg *a_tgsi = NULL, *a = NULL;
-	int i;
-	uint8_t avail = ~pc->addr_alloc;
-
-	if (!ref) {
-		/* allocate for TGSI_FILE_ADDRESS */
-		while (avail) {
-			i = ffs(avail) - 1;
-
-			if (pc->r_addr[i].rhw < 0 ||
-			    pc->r_addr[i].acc != pc->insn_cur) {
-				pc->addr_alloc |= (1 << i);
-
-				pc->r_addr[i].rhw = -1;
-				pc->r_addr[i].index = i;
-				return &pc->r_addr[i];
-			}
-			avail &= ~(1 << i);
-		}
-		assert(0);
-		return NULL;
-	}
-
-	/* Allocate and set an address reg so we can access 'ref'.
-	 *
-	 * If and r_addr->index will be -1 or the hw index the value
-	 * value in rhw is relative to. If rhw < 0, the reg has not
-	 * been initialized or is in use for TGSI_FILE_ADDRESS.
-	 */
-	while (avail) { /* only consider regs that are not TGSI */
-		i = ffs(avail) - 1;
-		avail &= ~(1 << i);
-
-		if ((!a || a->rhw >= 0) && pc->r_addr[i].rhw < 0) {
-			/* prefer an usused reg with low hw index */
-			a = &pc->r_addr[i];
-			continue;
-		}
-		if (!a && pc->r_addr[i].acc != pc->insn_cur)
-			a = &pc->r_addr[i];
-
-		if (ref->hw - pc->r_addr[i].rhw >= 128)
-			continue;
-
-		if ((ref->acc >= 0 && pc->r_addr[i].index < 0) ||
-		    (ref->acc < 0 && pc->r_addr[i].index == ref->index)) {
-			pc->r_addr[i].acc = pc->insn_cur;
-			return &pc->r_addr[i];
-		}
-	}
-	assert(a);
-
-	if (ref->acc < 0)
-		a_tgsi = pc->addr[ref->index];
-
-	emit_add_addr_imm(pc, a, a_tgsi, (ref->hw & ~0x7f) * 4);
-
-	a->rhw = ref->hw & ~0x7f;
-	a->acc = pc->insn_cur;
-	a->index = a_tgsi ? ref->index : -1;
-	return a;
-}
-
 #define INTERP_LINEAR		0
 #define INTERP_FLAT		1
 #define INTERP_PERSPECTIVE	2
@@ -657,15 +632,15 @@ set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,
 	e->param.shift = s;
 	e->param.mask = m << (s % 32);
 
-	if (src->hw > 127)
-		set_addr(e, alloc_addr(pc, src));
+	if (src->hw < 0 || src->hw > 127) /* need (additional) address reg */
+		set_addr(e, get_address_reg(pc, src));
 	else
 	if (src->acc < 0) {
 		assert(src->type == P_CONST);
-		set_addr(e, pc->addr[src->index]);
+		set_addr(e, pc->addr[src->indirect[0]]);
 	}
 
-	e->inst[1] |= (((src->type == P_IMMD) ? 0 : 1) << 22);
+	e->inst[1] |= (src->buf_index << 22);
 }
 
 /* Never apply nv50_reg::mod in emit_mov, or carefully check the code !!! */
@@ -694,6 +669,12 @@ emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 		if (src->type == P_ATTR) {
 			set_long(pc, e);
 			e->inst[1] |= 0x00200000;
+
+			if (src->vtx >= 0) {
+				/* indirect (vertex base + c) load from p[] */
+				e->inst[0] |= 0x01800000;
+				set_addr(e, get_address_reg(pc, src));
+			}
 		}
 
 		alloc_reg(pc, src);
@@ -808,6 +789,11 @@ set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
 	if (src->type == P_ATTR) {
 		set_long(pc, e);
 		e->inst[1] |= 0x00200000;
+
+		if (src->vtx >= 0) {
+			e->inst[0] |= 0x01800000; /* src from p[] */
+			set_addr(e, get_address_reg(pc, src));
+		}
 	} else
 	if (src->type == P_CONST || src->type == P_IMMD) {
 		struct nv50_reg *temp = temp_temp(pc, e);
@@ -832,13 +818,13 @@ set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
 		src = temp;
 	} else
 	if (src->type == P_CONST || src->type == P_IMMD) {
-		assert(!(e->inst[0] & 0x00800000));
-		if (e->inst[0] & 0x01000000) {
+		if (e->inst[0] & 0x01800000) {
 			struct nv50_reg *temp = temp_temp(pc, e);
 
 			emit_mov(pc, temp, src);
 			src = temp;
 		} else {
+			assert(!(e->inst[0] & 0x00800000));
 			set_data(pc, src, 0x7f, 16, e);
 			e->inst[0] |= 0x00800000;
 		}
@@ -862,13 +848,13 @@ set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
 		src = temp;
 	} else
 	if (src->type == P_CONST || src->type == P_IMMD) {
-		assert(!(e->inst[0] & 0x01000000));
-		if (e->inst[0] & 0x00800000) {
+		if (e->inst[0] & 0x01800000) {
 			struct nv50_reg *temp = temp_temp(pc, e);
 
 			emit_mov(pc, temp, src);
 			src = temp;
 		} else {
+			assert(!(e->inst[0] & 0x01000000));
 			set_data(pc, src, 0x7f, 32+14, e);
 			e->inst[0] |= 0x01000000;
 		}
@@ -997,11 +983,125 @@ emit_arl(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
 
 	e->inst[0] |= dst->hw << 2;
 	e->inst[0] |= s << 16; /* shift left */
-	set_src_0_restricted(pc, src, e);
+	set_src_0(pc, src, e);
 
 	emit(pc, e);
 }
 
+static boolean
+address_reg_suitable(struct nv50_reg *a, struct nv50_reg *r)
+{
+	if (!r)
+		return FALSE;
+
+	if (r->vtx != a->vtx)
+		return FALSE;
+	if (r->vtx >= 0)
+		return (r->indirect[1] == a->indirect[1]);
+
+	if (r->hw < a->rhw || (r->hw - a->rhw) >= 128)
+		return FALSE;
+
+	if (a->index >= 0)
+		return (a->index == r->indirect[0]);
+	return (a->indirect[0] == r->indirect[0]);
+}
+
+static void
+load_vertex_base(struct nv50_pc *pc, struct nv50_reg *dst,
+		 struct nv50_reg *a, int shift)
+{
+	struct nv50_reg mem, *temp;
+
+	ctor_reg(&mem, P_ATTR, -1, dst->vtx);
+
+	assert(dst->type == P_ADDR);
+	if (!a) {
+		emit_arl(pc, dst, &mem, 0);
+		return;
+	}
+	temp = alloc_temp(pc, NULL);
+
+	if (shift) {
+		emit_mov_from_addr(pc, temp, a);
+		if (shift < 0)
+			emit_shl_imm(pc, temp, temp, shift);
+		emit_arl(pc, dst, temp, MAX2(shift, 0));
+	}
+	emit_mov(pc, temp, &mem);
+	set_addr(pc->p->exec_tail, dst);
+
+	emit_arl(pc, dst, temp, 0);
+	free_temp(pc, temp);
+}
+
+/* case (ref == NULL): allocate address register for TGSI_FILE_ADDRESS
+ * case (vtx >= 0, acc >= 0): load vertex base from a[vtx * 4] to $aX
+ * case (vtx >= 0, acc < 0): load vertex base from s[$aY + vtx * 4] to $aX
+ * case (vtx < 0, acc >= 0): memory address too high to encode
+ * case (vtx < 0, acc < 0): get source register for TGSI_FILE_ADDRESS
+ */
+static struct nv50_reg *
+get_address_reg(struct nv50_pc *pc, struct nv50_reg *ref)
+{
+	int i;
+	struct nv50_reg *a_ref, *a = NULL;
+
+	for (i = 0; i < NV50_SU_MAX_ADDR; ++i) {
+		if (pc->r_addr[i].acc == 0)
+			a = &pc->r_addr[i]; /* an unused address reg */
+		else
+		if (address_reg_suitable(&pc->r_addr[i], ref)) {
+			pc->r_addr[i].acc = pc->insn_cur;
+			return &pc->r_addr[i];
+		} else
+		if (!a && pc->r_addr[i].index < 0 &&
+		    pc->r_addr[i].acc < pc->insn_cur)
+			a = &pc->r_addr[i];
+	}
+	if (!a) {
+		/* We'll be able to spill address regs when this
+		 * mess is replaced with a proper compiler ...
+		 */
+		NOUVEAU_ERR("out of address regs\n");
+		abort();
+		return NULL;
+	}
+
+	/* initialize and reserve for this TGSI instruction */
+	a->rhw = 0;
+	a->index = a->indirect[0] = a->indirect[1] = -1;
+	a->acc = pc->insn_cur;
+
+	if (!ref) {
+		a->vtx = -1;
+		return a;
+	}
+	a->vtx = ref->vtx;
+
+	/* now put in the correct value ... */
+
+	if (ref->vtx >= 0) {
+		a->indirect[1] = ref->indirect[1];
+
+		/* For an indirect vertex index, we need to shift address right
+		 * by 2, the address register will contain vtx * 16, we need to
+		 * load from a[vtx * 4].
+		 */
+		load_vertex_base(pc, a, (ref->acc < 0) ?
+				 pc->addr[ref->indirect[1]] : NULL, -2);
+	} else {
+		assert(ref->acc < 0 || ref->indirect[0] < 0);
+
+		a->rhw = ref->hw & ~0x7f;
+		a->indirect[0] = ref->indirect[0];
+		a_ref = (ref->acc < 0) ? pc->addr[ref->indirect[0]] : NULL;
+
+		emit_add_addr_imm(pc, a, a_ref, a->rhw * 4);
+	}
+	return a;
+}
+
 #define NV50_MAX_F32 0x880
 #define NV50_MAX_S32 0x08c
 #define NV50_MAX_U32 0x084
@@ -1629,6 +1729,18 @@ emit_ret(struct nv50_pc *pc, int pred, unsigned cc)
 	emit_control_flow(pc, 0x3, pred, cc);
 }
 
+static void
+emit_prim_cmd(struct nv50_pc *pc, unsigned cmd)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	e->inst[0] = 0xf0000000 | (cmd << 9);
+	e->inst[1] = 0xc0000000;
+	set_long(pc, e);
+
+	emit(pc, e);
+}
+
 #define QOP_ADD 0
 #define QOP_SUBR 1
 #define QOP_SUB 2
@@ -2171,14 +2283,19 @@ tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
 	{
 		struct nv50_reg *r = pc->addr[dst->Register.Index * 4 + c];
 		if (!r) {
-			r = alloc_addr(pc, NULL);
-			pc->addr[dst->Register.Index * 4 + c] = r;
+			r = get_address_reg(pc, NULL);
+			r->index = dst->Register.Index * 4 + c;
+			pc->addr[r->index] = r;
 		}
 		assert(r);
 		return r;
 	}
 	case TGSI_FILE_NULL:
 		return NULL;
+	case TGSI_FILE_SYSTEM_VALUE:
+		assert(pc->sysval[dst->Register.Index].type == P_RESULT);
+		assert(c == 0);
+		return &pc->sysval[dst->Register.Index];
 	default:
 		break;
 	}
@@ -2208,6 +2325,18 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
 		switch (src->Register.File) {
 		case TGSI_FILE_INPUT:
 			r = &pc->attr[src->Register.Index * 4 + c];
+
+			if (!src->Dimension.Dimension)
+				break;
+			r = reg_instance(pc, r);
+			r->vtx = src->Dimension.Index;
+
+			if (!src->Dimension.Indirect)
+				break;
+			swz = tgsi_util_get_src_register_swizzle(
+				&src->DimIndirect, 0);
+			r->acc = -1;
+			r->indirect[1] = src->DimIndirect.Index * 4 + swz;
 			break;
 		case TGSI_FILE_TEMPORARY:
 			r = &pc->temp[src->Register.Index * 4 + c];
@@ -2221,12 +2350,12 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
 			 * use the index field to select the address reg.
 			 */
 			r = reg_instance(pc, NULL);
+			ctor_reg(r, P_CONST, -1, src->Register.Index * 4 + c);
+
 			swz = tgsi_util_get_src_register_swizzle(
-						 &src->Indirect, 0);
-			ctor_reg(r, P_CONST,
-				 src->Indirect.Index * 4 + swz,
-				 src->Register.Index * 4 + c);
+				&src->Indirect, 0);
 			r->acc = -1;
+			r->indirect[0] = src->Indirect.Index * 4 + swz;
 			break;
 		case TGSI_FILE_IMMEDIATE:
 			r = &pc->immd[src->Register.Index * 4 + c];
@@ -2237,6 +2366,10 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
 			r = pc->addr[src->Register.Index * 4 + c];
 			assert(r);
 			break;
+		case TGSI_FILE_SYSTEM_VALUE:
+			assert(c == 0);
+			r = &pc->sysval[src->Register.Index];
+			break;
 		default:
 			assert(0);
 			break;
@@ -2273,7 +2406,7 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
 		r->mod |= mod & NV50_MOD_I32;
 
 	assert(r);
-	if (r->acc >= 0 && r != temp)
+	if (r->acc >= 0 && r->vtx < 0 && r != temp)
 		return reg_instance(pc, r); /* will clear r->mod */
 	return r;
 }
@@ -2495,10 +2628,14 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 		}
 		break;
 	case TGSI_OPCODE_ARL:
-		assert(src[0][0]);
 		temp = temp_temp(pc, NULL);
-		emit_cvt(pc, temp, src[0][0], -1, CVT_FLOOR | CVT_S32_F32);
-		emit_arl(pc, dst[0], temp, 4);
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_cvt(pc, temp, src[0][c], -1,
+				 CVT_FLOOR | CVT_S32_F32);
+			emit_arl(pc, dst[c], temp, 4);
+		}
 		break;
 	case TGSI_OPCODE_BGNLOOP:
 		pc->loop_brka[pc->loop_lvl] = emit_breakaddr(pc);
@@ -2605,6 +2742,9 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 		pc->if_insn[pc->if_lvl++] = pc->p->exec_tail;
 		terminate_mbb(pc);
 		break;
+	case TGSI_OPCODE_EMIT:
+		emit_prim_cmd(pc, 1);
+		break;
 	case TGSI_OPCODE_ENDIF:
 		pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size;
 
@@ -2628,8 +2768,12 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 		pc->loop_brka[pc->loop_lvl]->param.index = pc->p->exec_size;
 		terminate_mbb(pc);
 		break;
+	case TGSI_OPCODE_ENDPRIM:
+		emit_prim_cmd(pc, 2);
+		break;
 	case TGSI_OPCODE_ENDSUB:
 		assert(pc->in_subroutine);
+		terminate_mbb(pc);
 		pc->in_subroutine = FALSE;
 		break;
 	case TGSI_OPCODE_EX2:
@@ -3028,10 +3172,14 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 		if (!is_long(pc->p->exec_tail))
 			convert_to_long(pc, pc->p->exec_tail);
 		else
-		if (is_immd(pc->p->exec_tail) || is_join(pc->p->exec_tail))
+		if (is_immd(pc->p->exec_tail) ||
+		    is_join(pc->p->exec_tail) ||
+		    is_control_flow(pc->p->exec_tail))
 			emit_nop(pc);
 
 		pc->p->exec_tail->inst[1] |= 1; /* set exit bit */
+
+		terminate_mbb(pc);
 		break;
 	default:
 		NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
@@ -3327,17 +3475,53 @@ load_interpolant(struct nv50_pc *pc, struct nv50_reg *reg)
  * value of 0 for back-facing, and 0xffffffff for front-facing.
  */
 static void
-load_frontfacing(struct nv50_pc *pc, struct nv50_reg *a)
+load_frontfacing(struct nv50_pc *pc, struct nv50_reg *sv)
+{
+	struct nv50_reg *temp = alloc_temp(pc, NULL);
+	int r_pred = 0;
+
+	temp->rhw = 255;
+	emit_interp(pc, temp, NULL, INTERP_FLAT);
+
+	emit_cvt(pc, sv, temp, r_pred, CVT_ABS | CVT_F32_S32);
+
+	emit_not(pc, temp, temp);
+	set_pred(pc, 0x2, r_pred, pc->p->exec_tail);
+	emit_cvt(pc, sv, temp, -1, CVT_F32_S32);
+	set_pred(pc, 0x2, r_pred, pc->p->exec_tail);
+
+	free_temp(pc, temp);
+}
+
+static void
+load_instance_id(struct nv50_pc *pc, unsigned index)
 {
-	struct nv50_reg *one = alloc_immd(pc, 1.0f);
+	struct nv50_reg reg, mem;
 
-	assert(a->rhw == -1);
-	alloc_reg(pc, a); /* do this before rhw is set */
-	a->rhw = 255;
-	load_interpolant(pc, a);
-	emit_bitop2(pc, a, a, one, TGSI_OPCODE_AND);
+	ctor_reg(&reg, P_TEMP, -1, -1);
+	ctor_reg(&mem, P_CONST, -1, 24); /* startInstance */
+	mem.buf_index = 2;
 
-	FREE(one);
+	emit_add_b32(pc, &reg, &pc->sysval[index], &mem);
+	pc->sysval[index] = reg;
+}
+
+static void
+copy_semantic_info(struct nv50_program *p)
+{
+	unsigned i, id;
+
+	for (i = 0; i < p->cfg.in_nr; ++i) {
+		id = p->cfg.in[i].id;
+		p->cfg.in[i].sn = p->info.input_semantic_name[id];
+		p->cfg.in[i].si = p->info.input_semantic_index[id];
+	}
+
+	for (i = 0; i < p->cfg.out_nr; ++i) {
+		id = p->cfg.out[i].id;
+		p->cfg.out[i].sn = p->info.output_semantic_name[id];
+		p->cfg.out[i].si = p->info.output_semantic_index[id];
+	}
 }
 
 static boolean
@@ -3346,7 +3530,7 @@ nv50_program_tx_prep(struct nv50_pc *pc)
 	struct tgsi_parse_context tp;
 	struct nv50_program *p = pc->p;
 	boolean ret = FALSE;
-	unsigned i, c, flat_nr = 0;
+	unsigned i, c, instance_id, vertex_id, flat_nr = 0;
 
 	tgsi_parse_init(&tp, pc->p->pipe.tokens);
 	while (!tgsi_parse_end_of_tokens(&tp)) {
@@ -3386,13 +3570,13 @@ nv50_program_tx_prep(struct nv50_pc *pc)
 				switch (d->Semantic.Name) {
 				case TGSI_SEMANTIC_BCOLOR:
 					p->cfg.two_side[si].hw = first;
-					if (p->cfg.io_nr > first)
-						p->cfg.io_nr = first;
+					if (p->cfg.out_nr > first)
+						p->cfg.out_nr = first;
 					break;
 				case TGSI_SEMANTIC_PSIZE:
 					p->cfg.psiz = first;
-					if (p->cfg.io_nr > first)
-						p->cfg.io_nr = first;
+					if (p->cfg.out_nr > first)
+						p->cfg.out_nr = first;
 					break;
 				case TGSI_SEMANTIC_EDGEFLAG:
 					pc->edgeflag_out = first;
@@ -3432,6 +3616,37 @@ nv50_program_tx_prep(struct nv50_pc *pc)
 					pc->interp_mode[i] = mode;
 			}
 				break;
+			case TGSI_FILE_SYSTEM_VALUE:
+				assert(d->Declaration.Semantic);
+				switch (d->Semantic.Name) {
+				case TGSI_SEMANTIC_FACE:
+					assert(p->type == PIPE_SHADER_FRAGMENT);
+					load_frontfacing(pc,
+							 &pc->sysval[first]);
+					break;
+				case TGSI_SEMANTIC_INSTANCEID:
+					assert(p->type == PIPE_SHADER_VERTEX);
+					instance_id = first;
+					p->cfg.regs[0] |= (1 << 4);
+					break;
+				case TGSI_SEMANTIC_PRIMID:
+					assert(p->type != PIPE_SHADER_VERTEX);
+					p->cfg.prim_id = first;
+					break;
+					/*
+				case TGSI_SEMANTIC_PRIMIDIN:
+					assert(p->type == PIPE_SHADER_GEOMETRY);
+					pc->sysval[first].hw = 6;
+					p->cfg.regs[0] |= (1 << 8);
+					break;
+				case TGSI_SEMANTIC_VERTEXID:
+					assert(p->type == PIPE_SHADER_VERTEX);
+					vertex_id = first;
+					p->cfg.regs[0] |= (1 << 12) | (1 << 0);
+					break;
+					*/
+				}
+				break;
 			case TGSI_FILE_ADDRESS:
 			case TGSI_FILE_CONSTANT:
 			case TGSI_FILE_SAMPLER:
@@ -3452,68 +3667,98 @@ nv50_program_tx_prep(struct nv50_pc *pc)
 		}
 	}
 
-	if (p->type == PIPE_SHADER_VERTEX) {
+	if (p->type == PIPE_SHADER_VERTEX || p->type == PIPE_SHADER_GEOMETRY) {
 		int rid = 0;
 
-		for (i = 0; i < pc->attr_nr * 4; ++i) {
-			if (pc->attr[i].acc) {
-				pc->attr[i].hw = rid++;
-				p->cfg.attr[i / 32] |= 1 << (i % 32);
+		if (p->type == PIPE_SHADER_GEOMETRY) {
+			for (i = 0; i < pc->attr_nr; ++i) {
+				p->cfg.in[i].hw = rid;
+				p->cfg.in[i].id = i;
+
+				for (c = 0; c < 4; ++c) {
+					int n = i * 4 + c;
+					if (!pc->attr[n].acc)
+						continue;
+					pc->attr[n].hw = rid++;
+					p->cfg.in[i].mask |= 1 << c;
+				}
+			}
+		} else {
+			for (i = 0; i < pc->attr_nr * 4; ++i) {
+				if (pc->attr[i].acc) {
+					pc->attr[i].hw = rid++;
+					p->cfg.attr[i / 32] |= 1 << (i % 32);
+				}
+			}
+			if (p->cfg.regs[0] & (1 << 0))
+				pc->sysval[vertex_id].hw = rid++;
+			if (p->cfg.regs[0] & (1 << 4)) {
+				pc->sysval[instance_id].hw = rid++;
+				load_instance_id(pc, instance_id);
 			}
 		}
 
 		for (i = 0, rid = 0; i < pc->result_nr; ++i) {
-			p->cfg.io[i].hw = rid;
-			p->cfg.io[i].id = i;
+			p->cfg.out[i].hw = rid;
+			p->cfg.out[i].id = i;
 
 			for (c = 0; c < 4; ++c) {
 				int n = i * 4 + c;
 				if (!pc->result[n].acc)
 					continue;
 				pc->result[n].hw = rid++;
-				p->cfg.io[i].mask |= 1 << c;
+				p->cfg.out[i].mask |= 1 << c;
 			}
 		}
+		if (p->cfg.prim_id < 0x40) {
+			/* GP has to write to PrimitiveID */
+			ctor_reg(&pc->sysval[p->cfg.prim_id],
+				 P_RESULT, p->cfg.prim_id, rid);
+			p->cfg.prim_id = rid++;
+		}
 
 		for (c = 0; c < 2; ++c)
 			if (p->cfg.two_side[c].hw < 0x40)
-				p->cfg.two_side[c] = p->cfg.io[
+				p->cfg.two_side[c] = p->cfg.out[
 					p->cfg.two_side[c].hw];
 
 		if (p->cfg.psiz < 0x40)
-			p->cfg.psiz = p->cfg.io[p->cfg.psiz].hw;
+			p->cfg.psiz = p->cfg.out[p->cfg.psiz].hw;
+
+		copy_semantic_info(p);
 	} else
 	if (p->type == PIPE_SHADER_FRAGMENT) {
-		int rid, aid;
+		int rid, aid, base;
 		unsigned n = 0, m = pc->attr_nr - flat_nr;
 
 		pc->allow32 = TRUE;
 
-		int base = (TGSI_SEMANTIC_POSITION ==
-			    p->info.input_semantic_name[0]) ? 0 : 1;
+		base = (TGSI_SEMANTIC_POSITION ==
+			p->info.input_semantic_name[0]) ? 0 : 1;
 
 		/* non-flat interpolants have to be mapped to
 		 * the lower hardware IDs, so sort them:
 		 */
 		for (i = 0; i < pc->attr_nr; i++) {
 			if (pc->interp_mode[i] == INTERP_FLAT)
-				p->cfg.io[m++].id = i;
+				p->cfg.in[m++].id = i;
 			else {
 				if (!(pc->interp_mode[i] & INTERP_PERSPECTIVE))
-					p->cfg.io[n].linear = TRUE;
-				p->cfg.io[n++].id = i;
+					p->cfg.in[n].linear = TRUE;
+				p->cfg.in[n++].id = i;
 			}
 		}
+		copy_semantic_info(p);
 
 		if (!base) /* set w-coordinate mask from perspective interp */
-			p->cfg.io[0].mask |= p->cfg.regs[1] >> 24;
+			p->cfg.in[0].mask |= p->cfg.regs[1] >> 24;
 
 		aid = popcnt4( /* if fcrd isn't contained in cfg.io */
-			base ? (p->cfg.regs[1] >> 24) : p->cfg.io[0].mask);
+			base ? (p->cfg.regs[1] >> 24) : p->cfg.in[0].mask);
 
 		for (n = 0; n < pc->attr_nr; ++n) {
-			p->cfg.io[n].hw = rid = aid;
-			i = p->cfg.io[n].id;
+			p->cfg.in[n].hw = rid = aid;
+			i = p->cfg.in[n].id;
 
 			if (p->info.input_semantic_name[n] ==
 			    TGSI_SEMANTIC_FACE) {
@@ -3525,15 +3770,15 @@ nv50_program_tx_prep(struct nv50_pc *pc)
 				if (!pc->attr[i * 4 + c].acc)
 					continue;
 				pc->attr[i * 4 + c].rhw = rid++;
-				p->cfg.io[n].mask |= 1 << c;
+				p->cfg.in[n].mask |= 1 << c;
 
 				load_interpolant(pc, &pc->attr[i * 4 + c]);
 			}
-			aid += popcnt4(p->cfg.io[n].mask);
+			aid += popcnt4(p->cfg.in[n].mask);
 		}
 
 		if (!base)
-			p->cfg.regs[1] |= p->cfg.io[0].mask << 24;
+			p->cfg.regs[1] |= p->cfg.in[0].mask << 24;
 
 		m = popcnt4(p->cfg.regs[1] >> 24);
 
@@ -3543,32 +3788,33 @@ nv50_program_tx_prep(struct nv50_pc *pc)
 		p->cfg.regs[1] |= aid - m;
 
 		if (flat_nr) {
-			i = p->cfg.io[pc->attr_nr - flat_nr].hw;
+			i = p->cfg.in[pc->attr_nr - flat_nr].hw;
 			p->cfg.regs[1] |= (i - m) << 16;
 		} else
 			p->cfg.regs[1] |= p->cfg.regs[1] << 16;
 
 		/* mark color semantic for light-twoside */
-		n = 0x40;
-		for (i = 0; i < pc->attr_nr; i++) {
-			ubyte si, sn;
-
-			sn = p->info.input_semantic_name[p->cfg.io[i].id];
-			si = p->info.input_semantic_index[p->cfg.io[i].id];
-
-			if (sn == TGSI_SEMANTIC_COLOR) {
-				p->cfg.two_side[si] = p->cfg.io[i];
-
-				/* increase colour count */
-				p->cfg.regs[0] += popcnt4(
-					p->cfg.two_side[si].mask) << 16;
-
-				n = MIN2(n, p->cfg.io[i].hw - m);
+		n = 0x80;
+		for (i = 0; i < p->cfg.in_nr; i++) {
+			if (p->cfg.in[i].sn == TGSI_SEMANTIC_COLOR) {
+				n = MIN2(n, p->cfg.in[i].hw - m);
+				p->cfg.two_side[p->cfg.in[i].si] = p->cfg.in[i];
+
+				p->cfg.regs[0] += /* increase colour count */
+					popcnt4(p->cfg.in[i].mask) << 16;
 			}
 		}
-		if (n < 0x40)
+		if (n < 0x80)
 			p->cfg.regs[0] += n;
 
+		if (p->cfg.prim_id < 0x40) {
+			pc->sysval[p->cfg.prim_id].rhw = rid++;
+			emit_interp(pc, &pc->sysval[p->cfg.prim_id], NULL,
+				    INTERP_FLAT);
+			/* increase FP_INTERPOLANT_CTRL_COUNT */
+			p->cfg.regs[1] += 1;
+		}
+
 		/* Initialize FP results:
 		 * FragDepth is always first TGSI and last hw output
 		 */
@@ -3622,10 +3868,31 @@ free_nv50_pc(struct nv50_pc *pc)
 		FREE(pc->attr);
 	if (pc->temp)
 		FREE(pc->temp);
+	if (pc->sysval)
+		FREE(pc->sysval);
+	if (pc->insn_pos)
+		FREE(pc->insn_pos);
 
 	FREE(pc);
 }
 
+static INLINE uint32_t
+nv50_map_gs_output_prim(unsigned pprim)
+{
+	switch (pprim) {
+	case PIPE_PRIM_POINTS:
+		return NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE_POINTS;
+	case PIPE_PRIM_LINE_STRIP:
+		return NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE_LINE_STRIP;
+	case PIPE_PRIM_TRIANGLE_STRIP:
+		return NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE_TRIANGLE_STRIP;
+	default:
+		NOUVEAU_ERR("invalid GS_OUTPUT_PRIMITIVE: %u\n", pprim);
+		abort();
+		return 0;
+	}
+}
+
 static boolean
 ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p)
 {
@@ -3639,25 +3906,55 @@ ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p)
 	pc->param_nr = p->info.file_max[TGSI_FILE_CONSTANT] + 1;
 	pc->addr_nr = p->info.file_max[TGSI_FILE_ADDRESS] + 1;
 	assert(pc->addr_nr <= 2);
+	pc->sysval_nr = p->info.file_max[TGSI_FILE_SYSTEM_VALUE] + 1;
 
 	p->cfg.high_temp = 4;
 
 	p->cfg.two_side[0].hw = 0x40;
 	p->cfg.two_side[1].hw = 0x40;
+	p->cfg.prim_id = 0x40;
 
 	p->cfg.edgeflag_in = pc->edgeflag_out = 0xff;
 
+	for (i = 0; i < p->info.num_properties; ++i) {
+		unsigned *data = &p->info.properties[i].data[0];
+
+		switch (p->info.properties[i].name) {
+		case TGSI_PROPERTY_GS_OUTPUT_PRIM:
+			p->cfg.prim_type = nv50_map_gs_output_prim(data[0]);
+			break;
+		case TGSI_PROPERTY_GS_MAX_VERTICES:
+			p->cfg.vert_count = data[0];
+			break;
+		default:
+			break;
+		}
+	}
+
 	switch (p->type) {
 	case PIPE_SHADER_VERTEX:
 		p->cfg.psiz = 0x40;
 		p->cfg.clpd = 0x40;
-		p->cfg.io_nr = pc->result_nr;
+		p->cfg.out_nr = pc->result_nr;
+		break;
+	case PIPE_SHADER_GEOMETRY:
+		assert(p->cfg.prim_type);
+		assert(p->cfg.vert_count);
+
+		p->cfg.psiz = 0x80;
+		p->cfg.clpd = 0x80;
+		p->cfg.prim_id = 0x80;
+		p->cfg.out_nr = pc->result_nr;
+		p->cfg.in_nr = pc->attr_nr;
+
+		p->cfg.two_side[0].hw = 0x80;
+		p->cfg.two_side[1].hw = 0x80;
 		break;
 	case PIPE_SHADER_FRAGMENT:
 		rtype[0] = rtype[1] = P_TEMP;
 
 		p->cfg.regs[0] = 0x01000004;
-		p->cfg.io_nr = pc->attr_nr;
+		p->cfg.in_nr = pc->attr_nr;
 
 		if (p->info.writes_z) {
 			p->cfg.regs[2] |= 0x00000100;
@@ -3715,7 +4012,16 @@ ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p)
 			return FALSE;
 	}
 	for (i = 0; i < NV50_SU_MAX_ADDR; ++i)
-		ctor_reg(&pc->r_addr[i], P_ADDR, -256, i + 1);
+		ctor_reg(&pc->r_addr[i], P_ADDR, -1, i + 1);
+
+	if (pc->sysval_nr) {
+		pc->sysval = CALLOC(pc->sysval_nr, sizeof(struct nv50_reg *));
+		if (!pc->sysval)
+			return FALSE;
+		/* will only ever use SYSTEM_VALUE[i].x (hopefully) */
+		for (i = 0; i < pc->sysval_nr; ++i)
+			ctor_reg(&pc->sysval[i], rtype[0], i, -1);
+	}
 
 	return TRUE;
 }
@@ -3877,13 +4183,17 @@ nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
 
 	if (p->param_nr) {
 		unsigned cb;
-		uint32_t *map = pipe_buffer_map(pscreen, nv50->constbuf[p->type],
+		uint32_t *map = pipe_buffer_map(pscreen,
+						nv50->constbuf[p->type],
 						PIPE_BUFFER_USAGE_CPU_READ);
-
-		if (p->type == PIPE_SHADER_VERTEX)
+		switch (p->type) {
+		case PIPE_SHADER_GEOMETRY: cb = NV50_CB_PGP; break;
+		case PIPE_SHADER_FRAGMENT: cb = NV50_CB_PFP; break;
+		default:
 			cb = NV50_CB_PVP;
-		else
-			cb = NV50_CB_PFP;
+			assert(p->type == PIPE_SHADER_VERTEX);
+			break;
+		}
 
 		nv50_program_upload_data(nv50, map, 0, p->param_nr, cb);
 		pipe_buffer_unmap(pscreen, nv50->constbuf[p->type]);
@@ -3977,19 +4287,18 @@ nv50_vertprog_validate(struct nv50_context *nv50)
 	nv50_program_validate_data(nv50, p);
 	nv50_program_validate_code(nv50, p);
 
-	so = so_new(5, 8, 2);
+	so = so_new(5, 7, 2);
 	so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
 	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
-		      NOUVEAU_BO_HIGH, 0, 0);
+		  NOUVEAU_BO_HIGH, 0, 0);
 	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
-		      NOUVEAU_BO_LOW, 0, 0);
+		  NOUVEAU_BO_LOW, 0, 0);
 	so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2);
 	so_data  (so, p->cfg.attr[0]);
 	so_data  (so, p->cfg.attr[1]);
 	so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1);
 	so_data  (so, p->cfg.high_result);
-	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 2);
-	so_data  (so, p->cfg.high_result); //8);
+	so_method(so, tesla, NV50TCL_VP_REG_ALLOC_TEMP, 1);
 	so_data  (so, p->cfg.high_temp);
 	so_method(so, tesla, NV50TCL_VP_START_ID, 1);
 	so_data  (so, 0); /* program start offset */
@@ -4033,42 +4342,74 @@ nv50_fragprog_validate(struct nv50_context *nv50)
 	so_ref(NULL, &so);
 }
 
+void
+nv50_geomprog_validate(struct nv50_context *nv50)
+{
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct nv50_program *p = nv50->geomprog;
+	struct nouveau_stateobj *so;
+
+	if (!p->translated) {
+		nv50_program_validate(nv50, p);
+		if (!p->translated)
+			assert(0);
+	}
+
+	nv50_program_validate_data(nv50, p);
+	nv50_program_validate_code(nv50, p);
+
+	so = so_new(6, 7, 2);
+	so_method(so, tesla, NV50TCL_GP_ADDRESS_HIGH, 2);
+	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
+		  NOUVEAU_BO_HIGH, 0, 0);
+	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
+		  NOUVEAU_BO_LOW, 0, 0);
+	so_method(so, tesla, NV50TCL_GP_REG_ALLOC_TEMP, 1);
+	so_data  (so, p->cfg.high_temp);
+	so_method(so, tesla, NV50TCL_GP_REG_ALLOC_RESULT, 1);
+	so_data  (so, p->cfg.high_result);
+	so_method(so, tesla, NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE, 1);
+	so_data  (so, p->cfg.prim_type);
+	so_method(so, tesla, NV50TCL_GP_VERTEX_OUTPUT_COUNT, 1);
+	so_data  (so, p->cfg.vert_count);
+	so_method(so, tesla, NV50TCL_GP_START_ID, 1);
+	so_data  (so, 0);
+	so_ref(so, &nv50->state.geomprog);
+	so_ref(NULL, &so);
+}
+
 static uint32_t
 nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base)
 {
+	struct nv50_program *vp;
 	struct nv50_program *fp = nv50->fragprog;
-	struct nv50_program *vp = nv50->vertprog;
 	unsigned i, c, m = base;
 	uint32_t origin = 0x00000010;
 
+	vp = nv50->geomprog ? nv50->geomprog : nv50->vertprog;
+
 	/* XXX: this might not work correctly in all cases yet - we'll
 	 * just assume that an FP generic input that is not written in
 	 * the VP is PointCoord.
 	 */
 	memset(pntc, 0, 8 * sizeof(uint32_t));
 
-	for (i = 0; i < fp->cfg.io_nr; i++) {
-		uint8_t sn, si;
-		uint8_t j, k = fp->cfg.io[i].id;
-		unsigned n = popcnt4(fp->cfg.io[i].mask);
+	for (i = 0; i < fp->cfg.in_nr; i++) {
+		unsigned j, n = popcnt4(fp->cfg.in[i].mask);
 
-		if (fp->info.input_semantic_name[k] != TGSI_SEMANTIC_GENERIC) {
+		if (fp->cfg.in[i].sn != TGSI_SEMANTIC_GENERIC) {
 			m += n;
 			continue;
 		}
 
-		for (j = 0; j < vp->info.num_outputs; ++j) {
-			sn = vp->info.output_semantic_name[j];
-			si = vp->info.output_semantic_index[j];
-
-			if (sn == fp->info.input_semantic_name[k] &&
-			    si == fp->info.input_semantic_index[k])
+		for (j = 0; j < vp->cfg.out_nr; ++j)
+			if (vp->cfg.out[j].sn ==  fp->cfg.in[i].sn &&
+			    vp->cfg.out[j].si == fp->cfg.in[i].si)
 				break;
-		}
 
-		if (j < vp->info.num_outputs) {
-			ubyte mode =
-				nv50->rasterizer->pipe.sprite_coord_mode[si];
+		if (j < vp->cfg.out_nr) {
+			ubyte mode = nv50->rasterizer->pipe.sprite_coord_mode[
+				vp->cfg.out[j].si];
 
 			if (mode == PIPE_SPRITE_COORD_NONE) {
 				m += n;
@@ -4080,7 +4421,7 @@ nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base)
 
 		/* this is either PointCoord or replaced by sprite coords */
 		for (c = 0; c < 4; c++) {
-			if (!(fp->cfg.io[i].mask & (1 << c)))
+			if (!(fp->cfg.in[i].mask & (1 << c)))
 				continue;
 			pntc[m / 8] |= (c + 1) << ((m % 8) * 4);
 			++m;
@@ -4090,18 +4431,22 @@ nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base)
 }
 
 static int
-nv50_sreg4_map(uint32_t *p_map, int mid, uint32_t lin[4],
-	       struct nv50_sreg4 *fpi, struct nv50_sreg4 *vpo)
+nv50_vec4_map(uint32_t *map32, int mid, uint8_t zval, uint32_t lin[4],
+	      struct nv50_sreg4 *fpi, struct nv50_sreg4 *vpo)
 {
 	int c;
 	uint8_t mv = vpo->mask, mf = fpi->mask, oid = vpo->hw;
-	uint8_t *map = (uint8_t *)p_map;
+	uint8_t *map = (uint8_t *)map32;
 
 	for (c = 0; c < 4; ++c) {
 		if (mf & 1) {
 			if (fpi->linear == TRUE)
 				lin[mid / 32] |= 1 << (mid % 32);
-			map[mid++] = (mv & 1) ? oid : ((c == 3) ? 0x41 : 0x40);
+			if (mv & 1)
+				map[mid] = oid;
+			else
+				map[mid] = (c == 3) ? (zval + 1) : zval;
+			++mid;
 		}
 
 		oid += mv & 1;
@@ -4113,34 +4458,42 @@ nv50_sreg4_map(uint32_t *p_map, int mid, uint32_t lin[4],
 }
 
 void
-nv50_linkage_validate(struct nv50_context *nv50)
+nv50_fp_linkage_validate(struct nv50_context *nv50)
 {
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
 	struct nv50_program *vp = nv50->vertprog;
 	struct nv50_program *fp = nv50->fragprog;
 	struct nouveau_stateobj *so;
-	struct nv50_sreg4 dummy, *vpo;
+	struct nv50_sreg4 dummy;
 	int i, n, c, m = 0;
-	uint32_t map[16], lin[4], reg[5], pcrd[8];
+	uint32_t map[16], lin[4], reg[6], pcrd[8];
+	uint8_t zval = 0x40;
 
+	if (nv50->geomprog) {
+		vp = nv50->geomprog;
+		zval = 0x80;
+	}
 	memset(map, 0, sizeof(map));
 	memset(lin, 0, sizeof(lin));
 
 	reg[1] = 0x00000004; /* low and high clip distance map ids */
 	reg[2] = 0x00000000; /* layer index map id (disabled, GP only) */
 	reg[3] = 0x00000000; /* point size map id & enable */
+	reg[5] = 0x00000000; /* primitive ID map slot */
 	reg[0] = fp->cfg.regs[0]; /* colour semantic reg */
 	reg[4] = fp->cfg.regs[1]; /* interpolant info */
 
 	dummy.linear = FALSE;
 	dummy.mask = 0xf; /* map all components of HPOS */
-	m = nv50_sreg4_map(map, m, lin, &dummy, &vp->cfg.io[0]);
+	m = nv50_vec4_map(map, m, zval, lin, &dummy, &vp->cfg.out[0]);
 
 	dummy.mask = 0x0;
 
 	if (vp->cfg.clpd < 0x40) {
-		for (c = 0; c < vp->cfg.clpd_nr; ++c)
-			map[m++] = vp->cfg.clpd + c;
+		for (c = 0; c < vp->cfg.clpd_nr; ++c) {
+			map[m / 4] |= (vp->cfg.clpd + c) << ((m % 4) * 8);
+			++m;
+		}
 		reg[1] = (m << 8);
 	}
 
@@ -4148,35 +4501,37 @@ nv50_linkage_validate(struct nv50_context *nv50)
 
 	/* if light_twoside is active, it seems FFC0_ID == BFC0_ID is bad */
 	if (nv50->rasterizer->pipe.light_twoside) {
-		vpo = &vp->cfg.two_side[0];
+		struct nv50_sreg4 *vpo = &vp->cfg.two_side[0];
+		struct nv50_sreg4 *fpi = &fp->cfg.two_side[0];
 
-		m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[0], &vpo[0]);
-		m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[1], &vpo[1]);
+		m = nv50_vec4_map(map, m, zval, lin, &fpi[0], &vpo[0]);
+		m = nv50_vec4_map(map, m, zval, lin, &fpi[1], &vpo[1]);
 	}
 
 	reg[0] += m - 4; /* adjust FFC0 id */
 	reg[4] |= m << 8; /* set mid where 'normal' FP inputs start */
 
-	for (i = 0; i < fp->cfg.io_nr; i++) {
-		ubyte sn = fp->info.input_semantic_name[fp->cfg.io[i].id];
-		ubyte si = fp->info.input_semantic_index[fp->cfg.io[i].id];
-
-		/* position must be mapped first */
-		assert(i == 0 || sn != TGSI_SEMANTIC_POSITION);
-
+	for (i = 0; i < fp->cfg.in_nr; i++) {
 		/* maybe even remove these from cfg.io */
-		if (sn == TGSI_SEMANTIC_POSITION || sn == TGSI_SEMANTIC_FACE)
+		if (fp->cfg.in[i].sn == TGSI_SEMANTIC_POSITION ||
+		    fp->cfg.in[i].sn == TGSI_SEMANTIC_FACE)
 			continue;
 
-		/* VP outputs and vp->cfg.io are in the same order */
-		for (n = 0; n < vp->info.num_outputs; ++n) {
-			if (vp->info.output_semantic_name[n] == sn &&
-			    vp->info.output_semantic_index[n] == si)
+		for (n = 0; n < vp->cfg.out_nr; ++n)
+			if (vp->cfg.out[n].sn == fp->cfg.in[i].sn &&
+			    vp->cfg.out[n].si == fp->cfg.in[i].si)
 				break;
-		}
-		vpo = (n < vp->info.num_outputs) ? &vp->cfg.io[n] : &dummy;
 
-		m = nv50_sreg4_map(map, m, lin, &fp->cfg.io[i], vpo);
+		m = nv50_vec4_map(map, m, zval, lin, &fp->cfg.in[i],
+				  (n < vp->cfg.out_nr) ?
+				  &vp->cfg.out[n] : &dummy);
+	}
+	/* PrimitiveID either is replaced by the system value, or
+	 * written by the geometry shader into an output register
+	 */
+	if (fp->cfg.prim_id < 0x40) {
+		map[m / 4] |= vp->cfg.prim_id << ((m % 4) * 8);
+		reg[5] = m++;
 	}
 
 	if (nv50->rasterizer->pipe.point_size_per_vertex) {
@@ -4184,14 +4539,28 @@ nv50_linkage_validate(struct nv50_context *nv50)
 		reg[3] = (m++ << 4) | 1;
 	}
 
-	/* now fill the stateobj */
-	so = so_new(7, 57, 0);
+	/* now fill the stateobj (at most 28 so_data)  */
+	so = so_new(10, 54, 0);
 
 	n = (m + 3) / 4;
-	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1);
-	so_data  (so, m);
-	so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n);
-	so_datap (so, map, n);
+	assert(m <= 32);
+	if (vp->type == PIPE_SHADER_GEOMETRY) {
+		so_method(so, tesla, NV50TCL_GP_RESULT_MAP_SIZE, 1);
+		so_data  (so, m);
+		so_method(so, tesla, NV50TCL_GP_RESULT_MAP(0), n);
+		so_datap (so, map, n);
+	} else {
+		so_method(so, tesla, NV50TCL_VP_GP_BUILTIN_ATTR_EN, 1);
+		so_data  (so, vp->cfg.regs[0]);
+
+		so_method(so, tesla, NV50TCL_MAP_SEMANTIC_4, 1);
+		so_data  (so, reg[5]);
+
+		so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1);
+		so_data  (so, m);
+		so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n);
+		so_datap (so, map, n);
+	}
 
 	so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4);
 	so_datap (so, reg, 4);
@@ -4211,8 +4580,77 @@ nv50_linkage_validate(struct nv50_context *nv50)
 		so_datap (so, pcrd, 8);
 	}
 
-        so_ref(so, &nv50->state.programs);
-        so_ref(NULL, &so);
+	so_method(so, tesla, NV50TCL_GP_ENABLE, 1);
+	so_data  (so, (vp->type == PIPE_SHADER_GEOMETRY) ? 1 : 0);
+
+	so_ref(so, &nv50->state.fp_linkage);
+	so_ref(NULL, &so);
+}
+
+static int
+construct_vp_gp_mapping(uint32_t *map32, int m,
+			struct nv50_program *vp, struct nv50_program *gp)
+{
+	uint8_t *map = (uint8_t *)map32;
+	int i, j, c;
+
+        for (i = 0; i < gp->cfg.in_nr; ++i) {
+                uint8_t oid, mv = 0, mg = gp->cfg.in[i].mask;
+
+                for (j = 0; j < vp->cfg.out_nr; ++j) {
+                        if (vp->cfg.out[j].sn == gp->cfg.in[i].sn &&
+                            vp->cfg.out[j].si == gp->cfg.in[i].si) {
+				mv = vp->cfg.out[j].mask;
+				oid = vp->cfg.out[j].hw;
+                                break;
+			}
+		}
+
+                for (c = 0; c < 4; ++c, mv >>= 1, mg >>= 1) {
+			if (mg & mv & 1)
+				map[m++] = oid;
+			else
+			if (mg & 1)
+				map[m++] = (c == 3) ? 0x41 : 0x40;
+                        oid += mv & 1;
+                }
+        }
+	return m;
+}
+
+void
+nv50_gp_linkage_validate(struct nv50_context *nv50)
+{
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct nouveau_stateobj *so;
+	struct nv50_program *vp = nv50->vertprog;
+	struct nv50_program *gp = nv50->geomprog;
+	uint32_t map[16];
+	int m = 0;
+
+	if (!gp) {
+		so_ref(NULL, &nv50->state.gp_linkage);
+		return;
+	}
+	memset(map, 0, sizeof(map));
+
+	m = construct_vp_gp_mapping(map, m, vp, gp);
+
+	so = so_new(3, 24 - 3, 0);
+
+	so_method(so, tesla, NV50TCL_VP_GP_BUILTIN_ATTR_EN, 1);
+	so_data  (so, vp->cfg.regs[0] | gp->cfg.regs[0]);
+
+	assert(m <= 32);
+	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1);
+	so_data  (so, m);
+
+	m = (m + 3) / 4;
+	so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), m);
+	so_datap (so, map, m);
+
+	so_ref(so, &nv50->state.gp_linkage);
+	so_ref(NULL, &so);
 }
 
 void
@@ -4229,6 +4667,7 @@ nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
 
 	nouveau_bo_ref(NULL, &p->bo);
 
+	FREE(p->immd);
 	nouveau_resource_free(&p->data[0]);
 
 	p->translated = 0;
diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h
index 461fec1d89..1e3ad6bff0 100644
--- a/src/gallium/drivers/nv50/nv50_program.h
+++ b/src/gallium/drivers/nv50/nv50_program.h
@@ -16,11 +16,13 @@ struct nv50_program_exec {
 };
 
 struct nv50_sreg4 {
-	uint8_t hw;
-	uint8_t id; /* tgsi index, nv50 needs them sorted: flat ones last */
+	uint8_t hw; /* hw index, nv50 wants flat FP inputs last */
+	uint8_t id; /* tgsi index */
 
 	uint8_t mask;
 	boolean linear;
+
+	ubyte sn, si; /* semantic name & index */
 };
 
 struct nv50_program {
@@ -49,16 +51,24 @@ struct nv50_program {
 		uint32_t regs[4];
 
 		/* for VPs, io_nr doesn't count 'private' results (PSIZ etc.) */
-		unsigned io_nr;
-		struct nv50_sreg4 io[PIPE_MAX_SHADER_OUTPUTS];
+		unsigned in_nr, out_nr;
+		struct nv50_sreg4 in[PIPE_MAX_SHADER_INPUTS];
+		struct nv50_sreg4 out[PIPE_MAX_SHADER_OUTPUTS];
 
 		/* FP colour inputs, VP/GP back colour outputs */
 		struct nv50_sreg4 two_side[2];
 
-		/* VP only */
+		/* GP only */
+		unsigned vert_count;
+		uint8_t prim_type;
+
+		/* VP & GP only */
 		uint8_t clpd, clpd_nr;
 		uint8_t psiz;
 		uint8_t edgeflag_in;
+
+		/* FP & GP only */
+		uint8_t prim_id;
 	} cfg;
 };
 
diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c
index 28e2b35dea..9d58f3c965 100644
--- a/src/gallium/drivers/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nv50/nv50_screen.c
@@ -167,7 +167,7 @@ nv50_screen_destroy(struct pipe_screen *pscreen)
 	struct nv50_screen *screen = nv50_screen(pscreen);
 	unsigned i;
 
-	for (i = 0; i < 2; i++) {
+	for (i = 0; i < 3; i++) {
 		if (screen->constbuf_parm[i])
 			nouveau_bo_ref(NULL, &screen->constbuf_parm[i]);
 	}
@@ -329,7 +329,7 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 	so_ref(NULL, &so);
 
 	/* Static tesla init */
-	so = so_new(40, 84, 20);
+	so = so_new(47, 95, 24);
 
 	so_method(so, screen->tesla, NV50TCL_COND_MODE, 1);
 	so_data  (so, NV50TCL_COND_MODE_ALWAYS);
@@ -352,10 +352,11 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 	so_data  (so, 0xf);
 
 	/* max TIC (bits 4:8) & TSC (ignored) bindings, per program type */
-	so_method(so, screen->tesla, NV50TCL_TEX_LIMITS(0), 1);
-	so_data  (so, 0x54);
-	so_method(so, screen->tesla, NV50TCL_TEX_LIMITS(2), 1);
-	so_data  (so, 0x54);
+	for (i = 0; i < 3; ++i) {
+		so_method(so, screen->tesla, NV50TCL_TEX_LIMITS(i), 1);
+		so_data  (so, 0x54);
+	}
+
 	/* origin is top left (set to 1 for bottom left) */
 	so_method(so, screen->tesla, NV50TCL_Y_ORIGIN_BOTTOM, 1);
 	so_data  (so, 0);
@@ -370,8 +371,8 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 		return NULL;
 	}
 
-	for (i = 0; i < 2; i++) {
-		ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, (128 * 4) * 4,
+	for (i = 0; i < 3; i++) {
+		ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, (256 * 4) * 4,
 				     &screen->constbuf_parm[i]);
 		if (ret) {
 			nv50_screen_destroy(pscreen);
@@ -406,22 +407,45 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 	so_method(so, screen->tesla, NV50TCL_SET_PROGRAM_CB, 1);
 	so_data  (so, 0x00000001 | (NV50_CB_PMISC << 12));
 	so_method(so, screen->tesla, NV50TCL_SET_PROGRAM_CB, 1);
+	so_data  (so, 0x00000021 | (NV50_CB_PMISC << 12));
+	so_method(so, screen->tesla, NV50TCL_SET_PROGRAM_CB, 1);
 	so_data  (so, 0x00000031 | (NV50_CB_PMISC << 12));
 
+	/* bind auxiliary constbuf to immediate data bo */
 	so_method(so, screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3);
-	so_reloc (so, screen->constbuf_parm[0], 0, NOUVEAU_BO_VRAM |
-		  NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
-	so_reloc (so, screen->constbuf_parm[0], 0, NOUVEAU_BO_VRAM |
-		  NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
+	so_reloc (so, screen->constbuf_misc[0], (128 * 4) * 4,
+		  NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
+	so_reloc (so, screen->constbuf_misc[0], (128 * 4) * 4,
+		  NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
+	so_data  (so, (NV50_CB_AUX << 16) | 0x00000200);
+	so_method(so, screen->tesla, NV50TCL_SET_PROGRAM_CB, 1);
+	so_data  (so, 0x00000201 | (NV50_CB_AUX << 12));
+	so_method(so, screen->tesla, NV50TCL_SET_PROGRAM_CB, 1);
+	so_data  (so, 0x00000221 | (NV50_CB_AUX << 12));
+
+	so_method(so, screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3);
+	so_reloc (so, screen->constbuf_parm[PIPE_SHADER_VERTEX], 0,
+		  NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
+	so_reloc (so, screen->constbuf_parm[PIPE_SHADER_VERTEX], 0,
+		  NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
 	so_data  (so, (NV50_CB_PVP << 16) | 0x00000800);
 	so_method(so, screen->tesla, NV50TCL_SET_PROGRAM_CB, 1);
 	so_data  (so, 0x00000101 | (NV50_CB_PVP << 12));
 
 	so_method(so, screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3);
-	so_reloc (so, screen->constbuf_parm[1], 0, NOUVEAU_BO_VRAM |
-		  NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
-	so_reloc (so, screen->constbuf_parm[1], 0, NOUVEAU_BO_VRAM |
-		  NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
+	so_reloc (so, screen->constbuf_parm[PIPE_SHADER_GEOMETRY], 0,
+		  NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
+	so_reloc (so, screen->constbuf_parm[PIPE_SHADER_GEOMETRY], 0,
+		  NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
+	so_data  (so, (NV50_CB_PGP << 16) | 0x00000800);
+	so_method(so, screen->tesla, NV50TCL_SET_PROGRAM_CB, 1);
+	so_data  (so, 0x00000121 | (NV50_CB_PGP << 12));
+
+	so_method(so, screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3);
+	so_reloc (so, screen->constbuf_parm[PIPE_SHADER_FRAGMENT], 0,
+		  NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
+	so_reloc (so, screen->constbuf_parm[PIPE_SHADER_FRAGMENT], 0,
+		  NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
 	so_data  (so, (NV50_CB_PFP << 16) | 0x00000800);
 	so_method(so, screen->tesla, NV50TCL_SET_PROGRAM_CB, 1);
 	so_data  (so, 0x00000131 | (NV50_CB_PFP << 12));
diff --git a/src/gallium/drivers/nv50/nv50_screen.h b/src/gallium/drivers/nv50/nv50_screen.h
index a038a4e3c2..0d786b0f2e 100644
--- a/src/gallium/drivers/nv50/nv50_screen.h
+++ b/src/gallium/drivers/nv50/nv50_screen.h
@@ -18,10 +18,12 @@ struct nv50_screen {
 	struct nouveau_notifier *sync;
 
 	struct nouveau_bo *constbuf_misc[1];
-	struct nouveau_bo *constbuf_parm[2];
+	struct nouveau_bo *constbuf_parm[PIPE_SHADER_TYPES];
 
 	struct nouveau_resource *immd_heap[1];
-	struct nouveau_resource *parm_heap[2];
+	struct nouveau_resource *parm_heap[PIPE_SHADER_TYPES];
+
+	struct pipe_buffer *strm_vbuf[16];
 
 	struct nouveau_bo *tic;
 	struct nouveau_bo *tsc;
diff --git a/src/gallium/drivers/nv50/nv50_state.c b/src/gallium/drivers/nv50/nv50_state.c
index 1f67df814b..6ab33be663 100644
--- a/src/gallium/drivers/nv50/nv50_state.c
+++ b/src/gallium/drivers/nv50/nv50_state.c
@@ -531,7 +531,7 @@ nv50_vp_state_delete(struct pipe_context *pipe, void *hwcso)
 	struct nv50_program *p = hwcso;
 
 	nv50_program_destroy(nv50, p);
-	FREE((void*)p->pipe.tokens);
+	FREE((void *)p->pipe.tokens);
 	FREE(p);
 }
 
@@ -563,7 +563,39 @@ nv50_fp_state_delete(struct pipe_context *pipe, void *hwcso)
 	struct nv50_program *p = hwcso;
 
 	nv50_program_destroy(nv50, p);
-	FREE((void*)p->pipe.tokens);
+	FREE((void *)p->pipe.tokens);
+	FREE(p);
+}
+
+static void *
+nv50_gp_state_create(struct pipe_context *pipe,
+		     const struct pipe_shader_state *cso)
+{
+	struct nv50_program *p = CALLOC_STRUCT(nv50_program);
+
+	p->pipe.tokens = tgsi_dup_tokens(cso->tokens);
+	p->type = PIPE_SHADER_GEOMETRY;
+	tgsi_scan_shader(p->pipe.tokens, &p->info);
+	return (void *)p;
+}
+
+static void
+nv50_gp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv50_context *nv50 = nv50_context(pipe);
+
+	nv50->fragprog = hwcso;
+	nv50->dirty |= NV50_NEW_GEOMPROG;
+}
+
+static void
+nv50_gp_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv50_context *nv50 = nv50_context(pipe);
+	struct nv50_program *p = hwcso;
+
+	nv50_program_destroy(nv50, p);
+	FREE((void *)p->pipe.tokens);
 	FREE(p);
 }
 
@@ -585,17 +617,21 @@ nv50_set_clip_state(struct pipe_context *pipe,
 
 static void
 nv50_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
-			 const struct pipe_constant_buffer *buf )
+			 struct pipe_buffer *buf )
 {
 	struct nv50_context *nv50 = nv50_context(pipe);
 
 	if (shader == PIPE_SHADER_VERTEX) {
-		nv50->constbuf[PIPE_SHADER_VERTEX] = buf->buffer;
+		nv50->constbuf[PIPE_SHADER_VERTEX] = buf;
 		nv50->dirty |= NV50_NEW_VERTPROG_CB;
 	} else
 	if (shader == PIPE_SHADER_FRAGMENT) {
-		nv50->constbuf[PIPE_SHADER_FRAGMENT] = buf->buffer;
+		nv50->constbuf[PIPE_SHADER_FRAGMENT] = buf;
 		nv50->dirty |= NV50_NEW_FRAGPROG_CB;
+	} else
+	if (shader == PIPE_SHADER_GEOMETRY) {
+		nv50->constbuf[PIPE_SHADER_GEOMETRY] = buf;
+		nv50->dirty |= NV50_NEW_GEOMPROG_CB;
 	}
 }
 
@@ -696,6 +732,10 @@ nv50_init_state_functions(struct nv50_context *nv50)
 	nv50->pipe.bind_fs_state = nv50_fp_state_bind;
 	nv50->pipe.delete_fs_state = nv50_fp_state_delete;
 
+	nv50->pipe.create_gs_state = nv50_gp_state_create;
+	nv50->pipe.bind_gs_state = nv50_gp_state_bind;
+	nv50->pipe.delete_gs_state = nv50_gp_state_delete;
+
 	nv50->pipe.set_blend_color = nv50_set_blend_color;
 	nv50->pipe.set_clip_state = nv50_set_clip_state;
 	nv50->pipe.set_constant_buffer = nv50_set_constant_buffer;
diff --git a/src/gallium/drivers/nv50/nv50_state_validate.c b/src/gallium/drivers/nv50/nv50_state_validate.c
index f83232f43c..956da9b304 100644
--- a/src/gallium/drivers/nv50/nv50_state_validate.c
+++ b/src/gallium/drivers/nv50/nv50_state_validate.c
@@ -199,6 +199,8 @@ nv50_state_emit(struct nv50_context *nv50)
 			nv50->state.dirty |= NV50_NEW_VERTPROG;
 		if (nv50->state.fragprog)
 			nv50->state.dirty |= NV50_NEW_FRAGPROG;
+		if (nv50->state.geomprog)
+			nv50->state.dirty |= NV50_NEW_GEOMPROG;
 		if (nv50->state.rast)
 			nv50->state.dirty |= NV50_NEW_RASTERIZER;
 		if (nv50->state.blend_colour)
@@ -228,9 +230,14 @@ nv50_state_emit(struct nv50_context *nv50)
 		so_emit(chan, nv50->state.vertprog);
 	if (nv50->state.dirty & NV50_NEW_FRAGPROG)
 		so_emit(chan, nv50->state.fragprog);
+	if (nv50->state.dirty & NV50_NEW_GEOMPROG && nv50->state.geomprog)
+		so_emit(chan, nv50->state.geomprog);
 	if (nv50->state.dirty & (NV50_NEW_FRAGPROG | NV50_NEW_VERTPROG |
-				 NV50_NEW_RASTERIZER))
-		so_emit(chan, nv50->state.programs);
+				 NV50_NEW_GEOMPROG | NV50_NEW_RASTERIZER))
+		so_emit(chan, nv50->state.fp_linkage);
+	if ((nv50->state.dirty & (NV50_NEW_VERTPROG | NV50_NEW_GEOMPROG))
+	    && nv50->state.gp_linkage)
+		so_emit(chan, nv50->state.gp_linkage);
 	if (nv50->state.dirty & NV50_NEW_RASTERIZER)
 		so_emit(chan, nv50->state.rast);
 	if (nv50->state.dirty & NV50_NEW_BLEND_COLOUR)
@@ -267,6 +274,9 @@ nv50_state_flush_notify(struct nouveau_channel *chan)
 	so_emit_reloc_markers(chan, nv50->state.fragprog);
 	so_emit_reloc_markers(chan, nv50->state.vtxbuf);
 	so_emit_reloc_markers(chan, nv50->screen->static_init);
+
+	if (nv50->state.instbuf)
+		so_emit_reloc_markers(chan, nv50->state.instbuf);
 }
 
 boolean
@@ -291,9 +301,15 @@ nv50_state_validate(struct nv50_context *nv50)
 	if (nv50->dirty & (NV50_NEW_FRAGPROG | NV50_NEW_FRAGPROG_CB))
 		nv50_fragprog_validate(nv50);
 
+	if (nv50->dirty & (NV50_NEW_GEOMPROG | NV50_NEW_GEOMPROG_CB))
+		nv50_geomprog_validate(nv50);
+
 	if (nv50->dirty & (NV50_NEW_FRAGPROG | NV50_NEW_VERTPROG |
-			   NV50_NEW_RASTERIZER))
-		nv50_linkage_validate(nv50);
+			   NV50_NEW_GEOMPROG | NV50_NEW_RASTERIZER))
+		nv50_fp_linkage_validate(nv50);
+
+	if (nv50->dirty & (NV50_NEW_GEOMPROG | NV50_NEW_VERTPROG))
+		nv50_gp_linkage_validate(nv50);
 
 	if (nv50->dirty & NV50_NEW_RASTERIZER)
 		so_ref(nv50->rasterizer->so, &nv50->state.rast);
@@ -400,8 +416,9 @@ viewport_uptodate:
 		for (i = 0; i < PIPE_SHADER_TYPES; ++i)
 			nr += nv50->sampler_nr[i];
 
-		so = so_new(1+ 5 * PIPE_SHADER_TYPES, 1+ 19 * PIPE_SHADER_TYPES
-					+ nr * 8, PIPE_SHADER_TYPES * 2);
+		so = so_new(1 + 5 * PIPE_SHADER_TYPES,
+			    1 + 19 * PIPE_SHADER_TYPES + nr * 8,
+			    PIPE_SHADER_TYPES * 2);
 
 		nv50_validate_samplers(nv50, so, PIPE_SHADER_VERTEX);
 		nv50_validate_samplers(nv50, so, PIPE_SHADER_FRAGMENT);
diff --git a/src/gallium/drivers/nv50/nv50_tex.c b/src/gallium/drivers/nv50/nv50_tex.c
index bef548b728..871536dca9 100644
--- a/src/gallium/drivers/nv50/nv50_tex.c
+++ b/src/gallium/drivers/nv50/nv50_tex.c
@@ -155,7 +155,7 @@ static boolean
 nv50_validate_textures(struct nv50_context *nv50, struct nouveau_stateobj *so,
 		       unsigned p)
 {
-	static const unsigned p_remap[PIPE_SHADER_TYPES] = { 0, 2 };
+	static const unsigned p_remap[PIPE_SHADER_TYPES] = { 0, 2, 1 };
 
 	struct nouveau_grobj *eng2d = nv50->screen->eng2d;
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
diff --git a/src/gallium/drivers/nv50/nv50_vbo.c b/src/gallium/drivers/nv50/nv50_vbo.c
index f2e510fba6..bfb1b34d27 100644
--- a/src/gallium/drivers/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nv50/nv50_vbo.c
@@ -40,6 +40,8 @@ nv50_push_elements_u32(struct nv50_context *, uint32_t *, unsigned);
 static boolean
 nv50_push_arrays(struct nv50_context *, unsigned, unsigned);
 
+#define NV50_USING_LOATHED_EDGEFLAG(ctx) ((ctx)->vertprog->cfg.edgeflag_in < 16)
+
 static INLINE unsigned
 nv50_prim(unsigned mode)
 {
@@ -55,6 +57,14 @@ nv50_prim(unsigned mode)
 	case PIPE_PRIM_QUADS: return NV50TCL_VERTEX_BEGIN_QUADS;
 	case PIPE_PRIM_QUAD_STRIP: return NV50TCL_VERTEX_BEGIN_QUAD_STRIP;
 	case PIPE_PRIM_POLYGON: return NV50TCL_VERTEX_BEGIN_POLYGON;
+	case PIPE_PRIM_LINES_ADJACENCY:
+		return NV50TCL_VERTEX_BEGIN_LINES_ADJACENCY;
+	case PIPE_PRIM_LINE_STRIP_ADJACENCY:
+		return NV50TCL_VERTEX_BEGIN_LINE_STRIP_ADJACENCY;
+	case PIPE_PRIM_TRIANGLES_ADJACENCY:
+		return NV50TCL_VERTEX_BEGIN_TRIANGLES_ADJACENCY;
+	case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY:
+		return NV50TCL_VERTEX_BEGIN_TRIANGLE_STRIP_ADJACENCY;
 	default:
 		break;
 	}
@@ -152,6 +162,309 @@ nv50_vbo_vtxelt_to_hw(struct pipe_vertex_element *ve)
 	return (hw_type | hw_size);
 }
 
+/* For instanced drawing from user buffers, hitting the FIFO repeatedly
+ * with the same vertex data is probably worse than uploading all data.
+ */
+static boolean
+nv50_upload_vtxbuf(struct nv50_context *nv50, unsigned i)
+{
+	struct nv50_screen *nscreen = nv50->screen;
+	struct pipe_screen *pscreen = &nscreen->base.base;
+	struct pipe_buffer *buf = nscreen->strm_vbuf[i];
+	struct pipe_vertex_buffer *vb = &nv50->vtxbuf[i];
+	uint8_t *src;
+	unsigned size = align(vb->buffer->size, 4096);
+
+	if (buf && buf->size < size)
+		pipe_buffer_reference(&nscreen->strm_vbuf[i], NULL);
+
+	if (!nscreen->strm_vbuf[i]) {
+		nscreen->strm_vbuf[i] = pipe_buffer_create(
+			pscreen, 0, PIPE_BUFFER_USAGE_VERTEX, size);
+		buf = nscreen->strm_vbuf[i];
+	}
+
+	src = pipe_buffer_map(pscreen, vb->buffer, PIPE_BUFFER_USAGE_CPU_READ);
+	if (!src)
+		return FALSE;
+	src += vb->buffer_offset;
+
+	size = (vb->max_index + 1) * vb->stride + 16; /* + 16 is for stride 0 */
+	if (vb->buffer_offset + size > vb->buffer->size)
+		size = vb->buffer->size - vb->buffer_offset;
+
+	pipe_buffer_write(pscreen, buf, vb->buffer_offset, size, src);
+	pipe_buffer_unmap(pscreen, vb->buffer);
+
+	vb->buffer = buf; /* don't pipe_reference, this is a private copy */
+	return TRUE;
+}
+
+static void
+nv50_upload_user_vbufs(struct nv50_context *nv50)
+{
+	unsigned i;
+
+	if (nv50->vbo_fifo)
+		nv50->dirty |= NV50_NEW_ARRAYS;
+	if (!(nv50->dirty & NV50_NEW_ARRAYS))
+		return;
+
+	for (i = 0; i < nv50->vtxbuf_nr; ++i) {
+		if (nv50->vtxbuf[i].buffer->usage & PIPE_BUFFER_USAGE_VERTEX)
+			continue;
+		nv50_upload_vtxbuf(nv50, i);
+	}
+}
+
+static void
+nv50_set_static_vtxattr(struct nv50_context *nv50, unsigned i, void *data)
+{
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct nouveau_channel *chan = tesla->channel;
+	float v[4];
+
+	util_format_read_4f(nv50->vtxelt[i].src_format,
+			    v, 0, data, 0, 0, 0, 1, 1);
+
+	switch (nv50->vtxelt[i].nr_components) {
+	case 4:
+		BEGIN_RING(chan, tesla, NV50TCL_VTX_ATTR_4F_X(i), 4);
+		OUT_RINGf (chan, v[0]);
+		OUT_RINGf (chan, v[1]);
+		OUT_RINGf (chan, v[2]);
+		OUT_RINGf (chan, v[3]);
+		break;
+	case 3:
+		BEGIN_RING(chan, tesla, NV50TCL_VTX_ATTR_3F_X(i), 3);
+		OUT_RINGf (chan, v[0]);
+		OUT_RINGf (chan, v[1]);
+		OUT_RINGf (chan, v[2]);
+		break;
+	case 2:
+		BEGIN_RING(chan, tesla, NV50TCL_VTX_ATTR_2F_X(i), 2);
+		OUT_RINGf (chan, v[0]);
+		OUT_RINGf (chan, v[1]);
+		break;
+	case 1:
+		BEGIN_RING(chan, tesla, NV50TCL_VTX_ATTR_1F(i), 1);
+		OUT_RINGf (chan, v[0]);
+		break;
+	default:
+		assert(0);
+		break;
+	}
+}
+
+static unsigned
+init_per_instance_arrays_immd(struct nv50_context *nv50,
+			      unsigned startInstance,
+			      unsigned pos[16], unsigned step[16])
+{
+	struct nouveau_bo *bo;
+	unsigned i, b, count = 0;
+
+	for (i = 0; i < nv50->vtxelt_nr; ++i) {
+		if (!nv50->vtxelt[i].instance_divisor)
+			continue;
+		++count;
+		b = nv50->vtxelt[i].vertex_buffer_index;
+
+		pos[i] = nv50->vtxelt[i].src_offset +
+			nv50->vtxbuf[b].buffer_offset +
+			startInstance * nv50->vtxbuf[b].stride;
+		step[i] = startInstance % nv50->vtxelt[i].instance_divisor;
+
+		bo = nouveau_bo(nv50->vtxbuf[b].buffer);
+		if (!bo->map)
+			nouveau_bo_map(bo, NOUVEAU_BO_RD);
+
+		nv50_set_static_vtxattr(nv50, i, (uint8_t *)bo->map + pos[i]);
+	}
+
+	return count;
+}
+
+static unsigned
+init_per_instance_arrays(struct nv50_context *nv50,
+			 unsigned startInstance,
+			 unsigned pos[16], unsigned step[16])
+{
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct nouveau_channel *chan = tesla->channel;
+	struct nouveau_bo *bo;
+	struct nouveau_stateobj *so;
+	unsigned i, b, count = 0;
+	const uint32_t rl = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD;
+
+	if (nv50->vbo_fifo)
+		return init_per_instance_arrays_immd(nv50, startInstance,
+						     pos, step);
+
+	so = so_new(nv50->vtxelt_nr, nv50->vtxelt_nr * 2, nv50->vtxelt_nr * 2);
+
+	for (i = 0; i < nv50->vtxelt_nr; ++i) {
+		if (!nv50->vtxelt[i].instance_divisor)
+			continue;
+		++count;
+		b = nv50->vtxelt[i].vertex_buffer_index;
+
+		pos[i] = nv50->vtxelt[i].src_offset +
+			nv50->vtxbuf[b].buffer_offset +
+			startInstance * nv50->vtxbuf[b].stride;
+
+		if (!startInstance) {
+			step[i] = 0;
+			continue;
+		}
+		step[i] = startInstance % nv50->vtxelt[i].instance_divisor;
+
+		bo = nouveau_bo(nv50->vtxbuf[b].buffer);
+
+		so_method(so, tesla, NV50TCL_VERTEX_ARRAY_START_HIGH(i), 2);
+		so_reloc (so, bo, pos[i], rl | NOUVEAU_BO_HIGH, 0, 0);
+		so_reloc (so, bo, pos[i], rl | NOUVEAU_BO_LOW, 0, 0);
+	}
+
+	if (count && startInstance) {
+		so_ref (so, &nv50->state.instbuf); /* for flush notify */
+		so_emit(chan, nv50->state.instbuf);
+	}
+	so_ref (NULL, &so);
+
+	return count;
+}
+
+static void
+step_per_instance_arrays_immd(struct nv50_context *nv50,
+			      unsigned pos[16], unsigned step[16])
+{
+	struct nouveau_bo *bo;
+	unsigned i, b;
+
+	for (i = 0; i < nv50->vtxelt_nr; ++i) {
+		if (!nv50->vtxelt[i].instance_divisor)
+			continue;
+		if (++step[i] != nv50->vtxelt[i].instance_divisor)
+			continue;
+		b = nv50->vtxelt[i].vertex_buffer_index;
+		bo = nouveau_bo(nv50->vtxbuf[b].buffer);
+
+		step[i] = 0;
+		pos[i] += nv50->vtxbuf[b].stride;
+
+		nv50_set_static_vtxattr(nv50, i, (uint8_t *)bo->map + pos[i]);
+	}
+}
+
+static void
+step_per_instance_arrays(struct nv50_context *nv50,
+			 unsigned pos[16], unsigned step[16])
+{
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct nouveau_channel *chan = tesla->channel;
+	struct nouveau_bo *bo;
+	struct nouveau_stateobj *so;
+	unsigned i, b;
+	const uint32_t rl = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD;
+
+	if (nv50->vbo_fifo) {
+		step_per_instance_arrays_immd(nv50, pos, step);
+		return;
+	}
+
+	so = so_new(nv50->vtxelt_nr, nv50->vtxelt_nr * 2, nv50->vtxelt_nr * 2);
+
+	for (i = 0; i < nv50->vtxelt_nr; ++i) {
+		if (!nv50->vtxelt[i].instance_divisor)
+			continue;
+		b = nv50->vtxelt[i].vertex_buffer_index;
+
+		if (++step[i] == nv50->vtxelt[i].instance_divisor) {
+			step[i] = 0;
+			pos[i] += nv50->vtxbuf[b].stride;
+		}
+
+		bo = nouveau_bo(nv50->vtxbuf[b].buffer);
+
+		so_method(so, tesla, NV50TCL_VERTEX_ARRAY_START_HIGH(i), 2);
+		so_reloc (so, bo, pos[i], rl | NOUVEAU_BO_HIGH, 0, 0);
+		so_reloc (so, bo, pos[i], rl | NOUVEAU_BO_LOW, 0, 0);
+	}
+
+	so_ref (so, &nv50->state.instbuf); /* for flush notify */
+	so_ref (NULL, &so);
+
+	so_emit(chan, nv50->state.instbuf);
+}
+
+static INLINE void
+nv50_unmap_vbufs(struct nv50_context *nv50)
+{
+        unsigned i;
+
+        for (i = 0; i < nv50->vtxbuf_nr; ++i)
+                if (nouveau_bo(nv50->vtxbuf[i].buffer)->map)
+                        nouveau_bo_unmap(nouveau_bo(nv50->vtxbuf[i].buffer));
+}
+
+void
+nv50_draw_arrays_instanced(struct pipe_context *pipe,
+			   unsigned mode, unsigned start, unsigned count,
+			   unsigned startInstance, unsigned instanceCount)
+{
+	struct nv50_context *nv50 = nv50_context(pipe);
+	struct nouveau_channel *chan = nv50->screen->tesla->channel;
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	unsigned i, nz_divisors;
+	unsigned step[16], pos[16];
+
+	if (!NV50_USING_LOATHED_EDGEFLAG(nv50))
+		nv50_upload_user_vbufs(nv50);
+
+	nv50_state_validate(nv50);
+
+	nz_divisors = init_per_instance_arrays(nv50, startInstance, pos, step);
+
+	BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 2);
+	OUT_RING  (chan, NV50_CB_AUX | (24 << 8));
+	OUT_RING  (chan, startInstance);
+
+	BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BEGIN, 1);
+	OUT_RING  (chan, nv50_prim(mode));
+
+	if (nv50->vbo_fifo)
+		nv50_push_arrays(nv50, start, count);
+	else {
+		BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BUFFER_FIRST, 2);
+		OUT_RING  (chan, start);
+		OUT_RING  (chan, count);
+	}
+	BEGIN_RING(chan, tesla, NV50TCL_VERTEX_END, 1);
+	OUT_RING  (chan, 0);
+
+	for (i = 1; i < instanceCount; i++) {
+		if (nz_divisors) /* any non-zero array divisors ? */
+			step_per_instance_arrays(nv50, pos, step);
+
+		BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BEGIN, 1);
+		OUT_RING  (chan, nv50_prim(mode) | (1 << 28));
+
+		if (nv50->vbo_fifo)
+			nv50_push_arrays(nv50, start, count);
+		else {
+			BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BUFFER_FIRST, 2);
+			OUT_RING  (chan, start);
+			OUT_RING  (chan, count);
+		}
+		BEGIN_RING(chan, tesla, NV50TCL_VERTEX_END, 1);
+		OUT_RING  (chan, 0);
+	}
+	nv50_unmap_vbufs(nv50);
+
+	so_ref(NULL, &nv50->state.instbuf);
+}
+
 void
 nv50_draw_arrays(struct pipe_context *pipe, unsigned mode, unsigned start,
 		 unsigned count)
@@ -182,6 +495,8 @@ nv50_draw_arrays(struct pipe_context *pipe, unsigned mode, unsigned start,
 	BEGIN_RING(chan, tesla, NV50TCL_VERTEX_END, 1);
 	OUT_RING  (chan, 0);
 
+	nv50_unmap_vbufs(nv50);
+
         /* XXX: not sure what to do if ret != TRUE: flush and retry?
          */
         assert(ret);
@@ -210,7 +525,7 @@ nv50_draw_elements_inline_u08(struct nv50_context *nv50, uint8_t *map,
 		unsigned nr = count > 2046 ? 2046 : count;
 		int i;
 
-		BEGIN_RING(chan, tesla, NV50TCL_VB_ELEMENT_U16 | 0x40000000, nr >> 1);
+		BEGIN_RING_NI(chan, tesla, NV50TCL_VB_ELEMENT_U16, nr >> 1);
 		for (i = 0; i < nr; i += 2)
 			OUT_RING  (chan, (map[i + 1] << 16) | map[i]);
 
@@ -243,7 +558,7 @@ nv50_draw_elements_inline_u16(struct nv50_context *nv50, uint16_t *map,
 		unsigned nr = count > 2046 ? 2046 : count;
 		int i;
 
-		BEGIN_RING(chan, tesla, NV50TCL_VB_ELEMENT_U16 | 0x40000000, nr >> 1);
+		BEGIN_RING_NI(chan, tesla, NV50TCL_VB_ELEMENT_U16, nr >> 1);
 		for (i = 0; i < nr; i += 2)
 			OUT_RING  (chan, (map[i + 1] << 16) | map[i]);
 
@@ -268,7 +583,7 @@ nv50_draw_elements_inline_u32(struct nv50_context *nv50, uint32_t *map,
 	while (count) {
 		unsigned nr = count > 2047 ? 2047 : count;
 
-		BEGIN_RING(chan, tesla, NV50TCL_VB_ELEMENT_U32 | 0x40000000, nr);
+		BEGIN_RING_NI(chan, tesla, NV50TCL_VB_ELEMENT_U32, nr);
 		OUT_RINGp (chan, map, nr);
 
 		count -= nr;
@@ -277,6 +592,77 @@ nv50_draw_elements_inline_u32(struct nv50_context *nv50, uint32_t *map,
 	return TRUE;
 }
 
+static INLINE void
+nv50_draw_elements_inline(struct nv50_context *nv50,
+			  void *map, unsigned indexSize,
+			  unsigned start, unsigned count)
+{
+	switch (indexSize) {
+	case 1:
+		nv50_draw_elements_inline_u08(nv50, map, start, count);
+		break;
+	case 2:
+		nv50_draw_elements_inline_u16(nv50, map, start, count);
+		break;
+	case 4:
+		nv50_draw_elements_inline_u32(nv50, map, start, count);
+		break;
+	}
+}
+
+void
+nv50_draw_elements_instanced(struct pipe_context *pipe,
+			     struct pipe_buffer *indexBuffer,
+			     unsigned indexSize,
+			     unsigned mode, unsigned start, unsigned count,
+			     unsigned startInstance, unsigned instanceCount)
+{
+	struct nv50_context *nv50 = nv50_context(pipe);
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct nouveau_channel *chan = tesla->channel;
+	struct pipe_screen *pscreen = pipe->screen;
+	void *map;
+	unsigned i, nz_divisors;
+	unsigned step[16], pos[16];
+
+	map = pipe_buffer_map(pscreen, indexBuffer, PIPE_BUFFER_USAGE_CPU_READ);
+
+	if (!NV50_USING_LOATHED_EDGEFLAG(nv50))
+		nv50_upload_user_vbufs(nv50);
+
+	nv50_state_validate(nv50);
+
+	nz_divisors = init_per_instance_arrays(nv50, startInstance, pos, step);
+
+	BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 2);
+	OUT_RING  (chan, NV50_CB_AUX | (24 << 8));
+	OUT_RING  (chan, startInstance);
+
+	BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BEGIN, 1);
+	OUT_RING  (chan, nv50_prim(mode));
+
+	nv50_draw_elements_inline(nv50, map, indexSize, start, count);
+
+	BEGIN_RING(chan, tesla, NV50TCL_VERTEX_END, 1);
+	OUT_RING  (chan, 0);
+
+	for (i = 1; i < instanceCount; ++i) {
+		if (nz_divisors) /* any non-zero array divisors ? */
+			step_per_instance_arrays(nv50, pos, step);
+
+		BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BEGIN, 1);
+		OUT_RING  (chan, nv50_prim(mode) | (1 << 28));
+
+		nv50_draw_elements_inline(nv50, map, indexSize, start, count);
+
+		BEGIN_RING(chan, tesla, NV50TCL_VERTEX_END, 1);
+		OUT_RING  (chan, 0);
+	}
+	nv50_unmap_vbufs(nv50);
+
+	so_ref(NULL, &nv50->state.instbuf);
+}
+
 void
 nv50_draw_elements(struct pipe_context *pipe,
 		   struct pipe_buffer *indexBuffer, unsigned indexSize,
@@ -287,7 +673,6 @@ nv50_draw_elements(struct pipe_context *pipe,
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
 	struct pipe_screen *pscreen = pipe->screen;
 	void *map;
-	boolean ret;
 	
 	map = pipe_buffer_map(pscreen, indexBuffer, PIPE_BUFFER_USAGE_CPU_READ);
 
@@ -300,29 +685,15 @@ nv50_draw_elements(struct pipe_context *pipe,
 
 	BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BEGIN, 1);
 	OUT_RING  (chan, nv50_prim(mode));
-	switch (indexSize) {
-	case 1:
-		ret = nv50_draw_elements_inline_u08(nv50, map, start, count);
-		break;
-	case 2:
-		ret = nv50_draw_elements_inline_u16(nv50, map, start, count);
-		break;
-	case 4:
-		ret = nv50_draw_elements_inline_u32(nv50, map, start, count);
-		break;
-	default:
-		assert(0);
-		ret = FALSE;
-		break;
-	}
+
+	nv50_draw_elements_inline(nv50, map, indexSize, start, count);
+
 	BEGIN_RING(chan, tesla, NV50TCL_VERTEX_END, 1);
 	OUT_RING  (chan, 0);
 
+	nv50_unmap_vbufs(nv50);
+
 	pipe_buffer_unmap(pscreen, indexBuffer);
-        
-        /* XXX: what to do if ret != TRUE?  Flush and retry?
-         */
-	assert(ret);
 }
 
 static INLINE boolean
@@ -335,23 +706,16 @@ nv50_vbo_static_attrib(struct nv50_context *nv50, unsigned attrib,
 	struct nouveau_stateobj *so;
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
 	struct nouveau_bo *bo = nouveau_bo(vb->buffer);
-	float *v;
+	float v[4];
 	int ret;
-	enum pipe_format pf = ve->src_format;
-	const struct util_format_description *desc;
-
-	desc = util_format_description(pf);
-	assert(desc);
-
-	if ((desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT) ||
-	    util_format_get_component_bits(pf, UTIL_FORMAT_COLORSPACE_RGB, 0) != 32)
-		return FALSE;
 
 	ret = nouveau_bo_map(bo, NOUVEAU_BO_RD);
 	if (ret)
 		return FALSE;
-	v = (float *)(bo->map + (vb->buffer_offset + ve->src_offset));
 
+	util_format_read_4f(ve->src_format, v, 0, (uint8_t *)bo->map +
+			    (vb->buffer_offset + ve->src_offset), 0,
+			    0, 0, 1, 1);
 	so = *pso;
 	if (!so)
 		*pso = so = so_new(nv50->vtxelt_nr, nv50->vtxelt_nr * 4, 0);
@@ -409,7 +773,7 @@ nv50_vbo_validate(struct nv50_context *nv50)
 		    !(nv50->vtxbuf[i].buffer->usage & PIPE_BUFFER_USAGE_VERTEX))
 			nv50->vbo_fifo = 0xffff;
 
-	if (nv50->vertprog->cfg.edgeflag_in < 16)
+	if (NV50_USING_LOATHED_EDGEFLAG(nv50))
 		nv50->vbo_fifo = 0xffff; /* vertprog can't set edgeflag */
 
 	n_ve = MAX2(nv50->vtxelt_nr, nv50->state.vtxelt_nr);
@@ -437,17 +801,20 @@ nv50_vbo_validate(struct nv50_context *nv50)
 			nv50->vbo_fifo &= ~(1 << i);
 			continue;
 		}
-		so_data(vtxfmt, hw | i);
 
 		if (nv50->vbo_fifo) {
+			so_data  (vtxfmt, hw |
+				  (ve->instance_divisor ? (1 << 4) : i));
 			so_method(vtxbuf, tesla,
 				  NV50TCL_VERTEX_ARRAY_FORMAT(i), 1);
 			so_data  (vtxbuf, 0);
 			continue;
 		}
+		so_data(vtxfmt, hw | i);
 
 		so_method(vtxbuf, tesla, NV50TCL_VERTEX_ARRAY_FORMAT(i), 3);
-		so_data  (vtxbuf, 0x20000000 | vb->stride);
+		so_data  (vtxbuf, 0x20000000 |
+			  (ve->instance_divisor ? 0 : vb->stride));
 		so_reloc (vtxbuf, bo, vb->buffer_offset +
 			  ve->src_offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART |
 			  NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
@@ -485,7 +852,7 @@ typedef void (*pfn_push)(struct nouveau_channel *, void *);
 struct nv50_vbo_emitctx
 {
 	pfn_push push[16];
-	void *map[16];
+	uint8_t *map[16];
 	unsigned stride[16];
 	unsigned nr_ve;
 	unsigned vtx_dwords;
@@ -523,19 +890,18 @@ nv50_map_vbufs(struct nv50_context *nv50)
 
 	for (i = 0; i < nv50->vtxbuf_nr; ++i) {
 		struct pipe_vertex_buffer *vb = &nv50->vtxbuf[i];
-		unsigned size, delta;
+		unsigned size = vb->stride * (vb->max_index + 1) + 16;
 
 		if (nouveau_bo(vb->buffer)->map)
 			continue;
 
-		size = vb->stride * (vb->max_index + 1);
-		delta = vb->buffer_offset;
-
+		size = vb->stride * (vb->max_index + 1) + 16;
+		size = MIN2(size, vb->buffer->size);
 		if (!size)
-			size = vb->buffer->size - vb->buffer_offset;
+			size = vb->buffer->size;
 
 		if (nouveau_bo_map_range(nouveau_bo(vb->buffer),
-					 delta, size, NOUVEAU_BO_RD))
+					 0, size, NOUVEAU_BO_RD))
 			break;
 	}
 
@@ -546,16 +912,6 @@ nv50_map_vbufs(struct nv50_context *nv50)
 	return FALSE;
 }
 
-static INLINE void
-nv50_unmap_vbufs(struct nv50_context *nv50)
-{
-        unsigned i;
-
-        for (i = 0; i < nv50->vtxbuf_nr; ++i)
-                if (nouveau_bo(nv50->vtxbuf[i].buffer)->map)
-                        nouveau_bo_unmap(nouveau_bo(nv50->vtxbuf[i].buffer));
-}
-
 static void
 emit_b32_1(struct nouveau_channel *chan, void *data)
 {
@@ -650,12 +1006,13 @@ emit_prepare(struct nv50_context *nv50, struct nv50_vbo_emitctx *emit,
 
 		ve = &nv50->vtxelt[i];
 		vb = &nv50->vtxbuf[ve->vertex_buffer_index];
-		if (!(nv50->vbo_fifo & (1 << i)))
+		if (!(nv50->vbo_fifo & (1 << i)) || ve->instance_divisor)
 			continue;
 		n = emit->nr_ve++;
 
 		emit->stride[n] = vb->stride;
-		emit->map[n] = nouveau_bo(vb->buffer)->map +
+		emit->map[n] = (uint8_t *)nouveau_bo(vb->buffer)->map +
+			vb->buffer_offset +
 			(start * vb->stride + ve->src_offset);
 
 		desc = util_format_description(ve->src_format);
@@ -745,13 +1102,12 @@ nv50_push_arrays(struct nv50_context *nv50, unsigned start, unsigned count)
 
 		set_edgeflag(chan, tesla, &emit, 0); /* nr will be 1 */
 
-		BEGIN_RING(chan, tesla, NV50TCL_VERTEX_DATA | 0x40000000, dw);
+		BEGIN_RING_NI(chan, tesla, NV50TCL_VERTEX_DATA, dw);
 		for (i = 0; i < nr; ++i)
 			emit_vtx_next(chan, &emit);
 
 		count -= nr;
 	}
-	nv50_unmap_vbufs(nv50);
 
 	return TRUE;
 }
@@ -772,13 +1128,12 @@ nv50_push_elements_u32(struct nv50_context *nv50, uint32_t *map, unsigned count)
 
 		set_edgeflag(chan, tesla, &emit, *map);
 
-		BEGIN_RING(chan, tesla, NV50TCL_VERTEX_DATA | 0x40000000, dw);
+		BEGIN_RING_NI(chan, tesla, NV50TCL_VERTEX_DATA, dw);
 		for (i = 0; i < nr; ++i)
 			emit_vtx(chan, &emit, *map++);
 
 		count -= nr;
 	}
-	nv50_unmap_vbufs(nv50);
 
 	return TRUE;
 }
@@ -799,13 +1154,12 @@ nv50_push_elements_u16(struct nv50_context *nv50, uint16_t *map, unsigned count)
 
 		set_edgeflag(chan, tesla, &emit, *map);
 
-		BEGIN_RING(chan, tesla, NV50TCL_VERTEX_DATA | 0x40000000, dw);
+		BEGIN_RING_NI(chan, tesla, NV50TCL_VERTEX_DATA, dw);
 		for (i = 0; i < nr; ++i)
 			emit_vtx(chan, &emit, *map++);
 
 		count -= nr;
 	}
-	nv50_unmap_vbufs(nv50);
 
 	return TRUE;
 }
@@ -826,13 +1180,12 @@ nv50_push_elements_u08(struct nv50_context *nv50, uint8_t *map, unsigned count)
 
 		set_edgeflag(chan, tesla, &emit, *map);
 
-		BEGIN_RING(chan, tesla, NV50TCL_VERTEX_DATA | 0x40000000, dw);
+		BEGIN_RING_NI(chan, tesla, NV50TCL_VERTEX_DATA, dw);
 		for (i = 0; i < nr; ++i)
 			emit_vtx(chan, &emit, *map++);
 
 		count -= nr;
 	}
-	nv50_unmap_vbufs(nv50);
 
 	return TRUE;
 }
diff --git a/src/gallium/drivers/r300/r300_chipset.c b/src/gallium/drivers/r300/r300_chipset.c
index 51fdb82ff3..92de297ef1 100644
--- a/src/gallium/drivers/r300/r300_chipset.c
+++ b/src/gallium/drivers/r300/r300_chipset.c
@@ -33,6 +33,7 @@ void r300_parse_chipset(struct r300_capabilities* caps)
     /* Reasonable defaults */
     caps->num_vert_fpus = 4;
     caps->has_tcl = debug_get_bool_option("RADEON_NO_TCL", FALSE) ? FALSE : TRUE;
+    caps->is_r400 = FALSE;
     caps->is_r500 = FALSE;
     caps->high_second_pipe = FALSE;
 
@@ -123,6 +124,7 @@ void r300_parse_chipset(struct r300_capabilities* caps)
         case 0x4A54:
             caps->family = CHIP_FAMILY_R420;
             caps->num_vert_fpus = 6;
+            caps->is_r400 = TRUE;
             break;
 
         case 0x5548:
@@ -136,6 +138,7 @@ void r300_parse_chipset(struct r300_capabilities* caps)
         case 0x5D57:
             caps->family = CHIP_FAMILY_R423;
             caps->num_vert_fpus = 6;
+            caps->is_r400 = TRUE;
             break;
 
         case 0x554C:
@@ -147,6 +150,7 @@ void r300_parse_chipset(struct r300_capabilities* caps)
         case 0x5D4A:
             caps->family = CHIP_FAMILY_R430;
             caps->num_vert_fpus = 6;
+            caps->is_r400 = TRUE;
             break;
 
         case 0x5D4C:
@@ -157,6 +161,7 @@ void r300_parse_chipset(struct r300_capabilities* caps)
         case 0x5D52:
             caps->family = CHIP_FAMILY_R480;
             caps->num_vert_fpus = 6;
+            caps->is_r400 = TRUE;
             break;
 
         case 0x4B48:
@@ -166,6 +171,7 @@ void r300_parse_chipset(struct r300_capabilities* caps)
         case 0x4B4C:
             caps->family = CHIP_FAMILY_R481;
             caps->num_vert_fpus = 6;
+            caps->is_r400 = TRUE;
             break;
 
         case 0x5E4C:
@@ -182,6 +188,7 @@ void r300_parse_chipset(struct r300_capabilities* caps)
         case 0x5E4D:
             caps->family = CHIP_FAMILY_RV410;
             caps->num_vert_fpus = 6;
+            caps->is_r400 = TRUE;
             break;
 
         case 0x5954:
@@ -212,6 +219,7 @@ void r300_parse_chipset(struct r300_capabilities* caps)
         case 0x791F:
             caps->family = CHIP_FAMILY_RS690;
             caps->has_tcl = FALSE;
+            caps->is_r400 = TRUE;
             break;
 
         case 0x793F:
@@ -219,6 +227,7 @@ void r300_parse_chipset(struct r300_capabilities* caps)
         case 0x7942:
             caps->family = CHIP_FAMILY_RS600;
             caps->has_tcl = FALSE;
+            caps->is_r400 = TRUE;
             break;
 
         case 0x796C:
@@ -227,6 +236,7 @@ void r300_parse_chipset(struct r300_capabilities* caps)
         case 0x796F:
             caps->family = CHIP_FAMILY_RS740;
             caps->has_tcl = FALSE;
+            caps->is_r400 = TRUE;
             break;
 
         case 0x7100:
diff --git a/src/gallium/drivers/r300/r300_chipset.h b/src/gallium/drivers/r300/r300_chipset.h
index 0633a8b8a7..2808486492 100644
--- a/src/gallium/drivers/r300/r300_chipset.h
+++ b/src/gallium/drivers/r300/r300_chipset.h
@@ -40,11 +40,18 @@ struct r300_capabilities {
     unsigned num_z_pipes;
     /* Whether or not TCL is physically present */
     boolean has_tcl;
+    /* Whether or not this is R400. The differences compared to their R3xx
+     * cousins are:
+     * - Extended fragment shader registers
+     * - Blend LTE/GTE thresholds */
+    boolean is_r400;
     /* Whether or not this is an RV515 or newer; R500s have many differences
      * that require extra consideration, compared to their R3xx cousins:
      * - Extra bit of width and height on texture sizes
      * - Blend color is split across two registers
-     * - Universal Shader (US) block used for fragment shaders */
+     * - Blend LTE/GTE thresholds
+     * - Universal Shader (US) block used for fragment shaders
+     * - FP16 blending and multisampling */
     boolean is_r500;
     /* Whether or not the second pixel pipe is accessed with the high bit */
     boolean high_second_pipe;
diff --git a/src/gallium/drivers/r300/r300_context.c b/src/gallium/drivers/r300/r300_context.c
index af95bbe789..94a9ab3ef3 100644
--- a/src/gallium/drivers/r300/r300_context.c
+++ b/src/gallium/drivers/r300/r300_context.c
@@ -87,7 +87,7 @@ r300_is_texture_referenced(struct pipe_context *pipe,
 {
     struct pipe_buffer* buf = 0;
 
-    r300_get_texture_buffer(texture, &buf, NULL);
+    r300_get_texture_buffer(pipe->screen, texture, &buf, NULL);
 
     return pipe->is_buffer_referenced(pipe, buf);
 }
@@ -110,23 +110,33 @@ static void r300_flush_cb(void *data)
     cs_context_copy->context.flush(&cs_context_copy->context, 0, NULL);
 }
 
-#define R300_INIT_ATOM(name) \
-    r300->name##_state.state = NULL; \
-    r300->name##_state.emit = r300_emit_##name##_state; \
-    r300->name##_state.dirty = FALSE; \
-    insert_at_tail(&r300->atom_list, &r300->name##_state);
+#define R300_INIT_ATOM(atomname, atomsize) \
+    r300->atomname##_state.name = #atomname; \
+    r300->atomname##_state.state = NULL; \
+    r300->atomname##_state.size = atomsize; \
+    r300->atomname##_state.emit = r300_emit_##atomname##_state; \
+    r300->atomname##_state.dirty = FALSE; \
+    insert_at_tail(&r300->atom_list, &r300->atomname##_state);
 
 static void r300_setup_atoms(struct r300_context* r300)
 {
+    /* Create the actual atom list.
+     *
+     * Each atom is examined and emitted in the order it appears here, which
+     * can affect performance and conformance if not handled with care.
+     *
+     * Some atoms never change size, others change every emit. This is just
+     * an upper bound on each atom, to keep the emission machinery from
+     * underallocating space. */
     make_empty_list(&r300->atom_list);
-    R300_INIT_ATOM(ztop);
-    R300_INIT_ATOM(blend);
-    R300_INIT_ATOM(blend_color);
-    R300_INIT_ATOM(clip);
-    R300_INIT_ATOM(dsa);
-    R300_INIT_ATOM(rs);
-    R300_INIT_ATOM(scissor);
-    R300_INIT_ATOM(viewport);
+    R300_INIT_ATOM(ztop, 2);
+    R300_INIT_ATOM(blend, 8);
+    R300_INIT_ATOM(blend_color, 3);
+    R300_INIT_ATOM(clip, 29);
+    R300_INIT_ATOM(dsa, 8);
+    R300_INIT_ATOM(rs, 22);
+    R300_INIT_ATOM(scissor, 3);
+    R300_INIT_ATOM(viewport, 9);
 }
 
 struct pipe_context* r300_create_context(struct pipe_screen* screen,
@@ -143,8 +153,6 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
     r300->context.winsys = (struct pipe_winsys*)radeon_winsys;
     r300->context.screen = screen;
 
-    r300_init_debug(r300);
-
     r300->context.destroy = r300_destroy_context;
 
     r300->context.clear = r300_clear;
@@ -182,7 +190,7 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
     r300->blend_color_state.state = CALLOC_STRUCT(r300_blend_color_state);
     r300->clip_state.state = CALLOC_STRUCT(pipe_clip_state);
     r300->rs_block = CALLOC_STRUCT(r300_rs_block);
-    r300->scissor_state.state = CALLOC_STRUCT(r300_scissor_state);
+    r300->scissor_state.state = CALLOC_STRUCT(pipe_scissor_state);
     r300->vertex_info = CALLOC_STRUCT(r300_vertex_info);
     r300->viewport_state.state = CALLOC_STRUCT(r300_viewport_state);
     r300->ztop_state.state = CALLOC_STRUCT(r300_ztop_state);
diff --git a/src/gallium/drivers/r300/r300_context.h b/src/gallium/drivers/r300/r300_context.h
index 5937f0e2cc..5e33dc042a 100644
--- a/src/gallium/drivers/r300/r300_context.h
+++ b/src/gallium/drivers/r300/r300_context.h
@@ -30,16 +30,28 @@
 #include "pipe/p_context.h"
 #include "pipe/p_inlines.h"
 
+#include "r300_screen.h"
+
 struct r300_context;
 
 struct r300_fragment_shader;
 struct r300_vertex_shader;
 
 struct r300_atom {
+    /* List pointers. */
     struct r300_atom *prev, *next;
+    /* Name, for debugging. */
+    const char* name;
+    /* Opaque state. */
     void* state;
+    /* Emit the state to the context. */
     void (*emit)(struct r300_context*, void*);
+    /* Upper bound on number of dwords to emit. */
+    unsigned size;
+    /* Whether this atom should be emitted. */
     boolean dirty;
+    /* Another dirty flag that is never automatically cleared. */
+    boolean always_dirty;
 };
 
 struct r300_blend_state {
@@ -75,10 +87,10 @@ struct r300_rs_state {
     uint32_t point_size;            /* R300_GA_POINT_SIZE: 0x421c */
     uint32_t point_minmax;          /* R300_GA_POINT_MINMAX: 0x4230 */
     uint32_t line_control;          /* R300_GA_LINE_CNTL: 0x4234 */
-    uint32_t depth_scale_front;  /* R300_SU_POLY_OFFSET_FRONT_SCALE: 0x42a4 */
-    uint32_t depth_offset_front;/* R300_SU_POLY_OFFSET_FRONT_OFFSET: 0x42a8 */
-    uint32_t depth_scale_back;    /* R300_SU_POLY_OFFSET_BACK_SCALE: 0x42ac */
-    uint32_t depth_offset_back;  /* R300_SU_POLY_OFFSET_BACK_OFFSET: 0x42b0 */
+    float depth_scale;            /* R300_SU_POLY_OFFSET_FRONT_SCALE: 0x42a4 */
+                                  /* R300_SU_POLY_OFFSET_BACK_SCALE: 0x42ac */
+    float depth_offset;           /* R300_SU_POLY_OFFSET_FRONT_OFFSET: 0x42a8 */
+                                  /* R300_SU_POLY_OFFSET_BACK_OFFSET: 0x42b0 */
     uint32_t polygon_offset_enable; /* R300_SU_POLY_OFFSET_ENABLE: 0x42b4 */
     uint32_t cull_mode;             /* R300_SU_CULL_MODE: 0x42b8 */
     uint32_t line_stipple_config;   /* R300_GA_LINE_STIPPLE_CONFIG: 0x4328 */
@@ -106,16 +118,6 @@ struct r300_sampler_state {
     unsigned min_lod, max_lod;
 };
 
-struct r300_scissor_regs {
-    uint32_t top_left;     /* R300_SC_SCISSORS_TL: 0x43e0 */
-    uint32_t bottom_right; /* R300_SC_SCISSORS_BR: 0x43e4 */
-};
-
-struct r300_scissor_state {
-    struct r300_scissor_regs framebuffer;
-    struct r300_scissor_regs scissor;
-};
-
 struct r300_texture_state {
     uint32_t format0; /* R300_TX_FORMAT0: 0x4480 */
     uint32_t format1; /* R300_TX_FORMAT1: 0x44c0 */
@@ -188,6 +190,12 @@ struct r300_query {
     struct r300_query* next;
 };
 
+enum r300_buffer_tiling {
+    R300_BUFFER_LINEAR = 0,
+    R300_BUFFER_TILED,
+    R300_BUFFER_SQUARETILED
+};
+
 struct r300_texture {
     /* Parent class */
     struct pipe_texture tex;
@@ -224,6 +232,9 @@ struct r300_texture {
 
     /* Registers carrying texture format data. */
     struct r300_texture_state state;
+
+    /* Buffer tiling */
+    enum r300_buffer_tiling microtile, macrotile;
 };
 
 struct r300_vertex_info {
@@ -315,9 +326,10 @@ struct r300_context {
     uint32_t dirty_hw;
     /* Whether the TCL engine should be in bypass mode. */
     boolean tcl_bypass;
-
-    /** Combination of DBG_xxx flags */
-    unsigned debug;
+    /* Whether polygon offset is enabled. */
+    boolean polygon_offset_enabled;
+    /* Z buffer bit depth. */
+    uint32_t zbuffer_bpp;
 };
 
 /* Convenience cast wrapper. */
@@ -331,35 +343,15 @@ struct draw_stage* r300_draw_stage(struct r300_context* r300);
 void r300_init_state_functions(struct r300_context* r300);
 void r300_init_surface_functions(struct r300_context* r300);
 
-/* Debug functionality. */
-
-/**
- * Debug flags to disable/enable certain groups of debugging outputs.
- *
- * \note These may be rather coarse, and the grouping may be impractical.
- * If you find, while debugging the driver, that a different grouping
- * of these flags would be beneficial, just feel free to change them
- * but make sure to update the documentation in r300_debug.c to reflect
- * those changes.
- */
-/*@{*/
-#define DBG_HELP    0x0000001
-#define DBG_FP      0x0000002
-#define DBG_VP      0x0000004
-#define DBG_CS      0x0000008
-#define DBG_DRAW    0x0000010
-#define DBG_TEX     0x0000020
-#define DBG_FALL    0x0000040
-/*@}*/
-
-static INLINE boolean DBG_ON(struct r300_context * ctx, unsigned flags)
+static INLINE boolean CTX_DBG_ON(struct r300_context * ctx, unsigned flags)
 {
-    return (ctx->debug & flags) ? TRUE : FALSE;
+    return SCREEN_DBG_ON(r300_screen(ctx->context.screen), flags);
 }
 
-static INLINE void DBG(struct r300_context * ctx, unsigned flags, const char * fmt, ...)
+static INLINE void CTX_DBG(struct r300_context * ctx, unsigned flags,
+                       const char * fmt, ...)
 {
-    if (DBG_ON(ctx, flags)) {
+    if (CTX_DBG_ON(ctx, flags)) {
         va_list va;
         va_start(va, fmt);
         debug_vprintf(fmt, va);
@@ -367,6 +359,8 @@ static INLINE void DBG(struct r300_context * ctx, unsigned flags, const char * f
     }
 }
 
-void r300_init_debug(struct r300_context * ctx);
+#define DBG_ON  CTX_DBG_ON
+#define DBG     CTX_DBG
 
 #endif /* R300_CONTEXT_H */
+
diff --git a/src/gallium/drivers/r300/r300_cs.h b/src/gallium/drivers/r300/r300_cs.h
index d142fee050..151f72b0fe 100644
--- a/src/gallium/drivers/r300/r300_cs.h
+++ b/src/gallium/drivers/r300/r300_cs.h
@@ -52,7 +52,7 @@
 #define CS_LOCALS(context) \
     struct r300_context* const cs_context_copy = (context); \
     struct radeon_winsys* cs_winsys = cs_context_copy->winsys; \
-    int cs_count = 0;
+    int cs_count = 0; (void) cs_count;
 
 #define CHECK_CS(size) \
     assert(cs_winsys->check_cs(cs_winsys, (size)))
diff --git a/src/gallium/drivers/r300/r300_debug.c b/src/gallium/drivers/r300/r300_debug.c
index 2a6ed54ac9..00d4f31c2b 100644
--- a/src/gallium/drivers/r300/r300_debug.c
+++ b/src/gallium/drivers/r300/r300_debug.c
@@ -46,7 +46,7 @@ static struct debug_option debug_options[] = {
     { 0, 0, 0 }
 };
 
-void r300_init_debug(struct r300_context * ctx)
+void r300_init_debug(struct r300_screen * screen)
 {
     const char * options = debug_get_option("RADEON_DEBUG", 0);
     boolean printhint = FALSE;
@@ -64,7 +64,7 @@ void r300_init_debug(struct r300_context * ctx)
 
             for(opt = debug_options; opt->name; ++opt) {
                 if (!strncmp(options, opt->name, length)) {
-                    ctx->debug |= opt->flag;
+                    screen->debug |= opt->flag;
                     break;
                 }
             }
@@ -77,11 +77,11 @@ void r300_init_debug(struct r300_context * ctx)
             options += length;
         }
 
-        if (!ctx->debug)
+        if (!screen->debug)
             printhint = TRUE;
     }
 
-    if (printhint || ctx->debug & DBG_HELP) {
+    if (printhint || screen->debug & DBG_HELP) {
         debug_printf("You can enable debug output by setting the RADEON_DEBUG environment variable\n"
                      "to a comma-separated list of debug options. Available options are:\n");
         for(opt = debug_options; opt->name; ++opt) {
diff --git a/src/gallium/drivers/r300/r300_emit.c b/src/gallium/drivers/r300/r300_emit.c
index 0e5533c790..2ea9fab015 100644
--- a/src/gallium/drivers/r300/r300_emit.c
+++ b/src/gallium/drivers/r300/r300_emit.c
@@ -41,6 +41,7 @@ void r300_emit_blend_state(struct r300_context* r300, void* state)
 {
     struct r300_blend_state* blend = (struct r300_blend_state*)state;
     CS_LOCALS(r300);
+
     BEGIN_CS(8);
     OUT_CS_REG(R300_RB3D_ROPCNTL, blend->rop);
     OUT_CS_REG_SEQ(R300_RB3D_CBLEND, 3);
@@ -419,8 +420,10 @@ void r300_emit_fb_state(struct r300_context* r300,
 
         OUT_CS_REG_SEQ(R300_RB3D_COLORPITCH0 + (4 * i), 1);
         OUT_CS_RELOC(tex->buffer, tex->pitch[surf->level] |
-                     r300_translate_colorformat(tex->tex.format), 0,
-                     RADEON_GEM_DOMAIN_VRAM, 0);
+                     r300_translate_colorformat(tex->tex.format) |
+                     R300_COLOR_TILE(tex->macrotile) |
+                     R300_COLOR_MICROTILE(tex->microtile),
+                     0, RADEON_GEM_DOMAIN_VRAM, 0);
 
         OUT_CS_REG(R300_US_OUT_FMT_0 + (4 * i),
             r300_translate_out_fmt(surf->format));
@@ -443,8 +446,10 @@ void r300_emit_fb_state(struct r300_context* r300,
         OUT_CS_REG(R300_ZB_FORMAT, r300_translate_zsformat(tex->tex.format));
 
         OUT_CS_REG_SEQ(R300_ZB_DEPTHPITCH, 1);
-        OUT_CS_RELOC(tex->buffer, tex->pitch[surf->level], 0,
-                     RADEON_GEM_DOMAIN_VRAM, 0);
+        OUT_CS_RELOC(tex->buffer, tex->pitch[surf->level] |
+                     R300_DEPTHMACROTILE(tex->macrotile) |
+                     R300_DEPTHMICROTILE(tex->microtile),
+                     0, RADEON_GEM_DOMAIN_VRAM, 0);
     }
 
     END_CS;
@@ -579,19 +584,37 @@ void r300_emit_query_end(struct r300_context* r300)
 void r300_emit_rs_state(struct r300_context* r300, void* state)
 {
     struct r300_rs_state* rs = (struct r300_rs_state*)state;
+    float scale, offset;
     CS_LOCALS(r300);
 
-    BEGIN_CS(22);
+    BEGIN_CS(18 + (rs->polygon_offset_enable ? 5 : 0));
     OUT_CS_REG(R300_VAP_CNTL_STATUS, rs->vap_control_status);
     OUT_CS_REG(R300_GA_POINT_SIZE, rs->point_size);
     OUT_CS_REG_SEQ(R300_GA_POINT_MINMAX, 2);
     OUT_CS(rs->point_minmax);
     OUT_CS(rs->line_control);
-    OUT_CS_REG_SEQ(R300_SU_POLY_OFFSET_FRONT_SCALE, 6);
-    OUT_CS(rs->depth_scale_front);
-    OUT_CS(rs->depth_offset_front);
-    OUT_CS(rs->depth_scale_back);
-    OUT_CS(rs->depth_offset_back);
+
+    if (rs->polygon_offset_enable) {
+        scale = rs->depth_scale * 12;
+        offset = rs->depth_offset;
+
+        switch (r300->zbuffer_bpp) {
+            case 16:
+                offset *= 4;
+                break;
+            case 24:
+                offset *= 2;
+                break;
+        }
+
+        OUT_CS_REG_SEQ(R300_SU_POLY_OFFSET_FRONT_SCALE, 4);
+        OUT_CS_32F(scale);
+        OUT_CS_32F(offset);
+        OUT_CS_32F(scale);
+        OUT_CS_32F(offset);
+    }
+
+    OUT_CS_REG_SEQ(R300_SU_POLY_OFFSET_ENABLE, 2);
     OUT_CS(rs->polygon_offset_enable);
     OUT_CS(rs->cull_mode);
     OUT_CS_REG(R300_GA_LINE_STIPPLE_CONFIG, rs->line_stipple_config);
@@ -641,27 +664,47 @@ void r300_emit_rs_block_state(struct r300_context* r300,
     END_CS;
 }
 
-static void r300_emit_scissor_regs(struct r300_context* r300,
-                                   struct r300_scissor_regs* scissor)
+void r300_emit_scissor_state(struct r300_context* r300, void* state)
 {
+    unsigned minx, miny, maxx, maxy;
+    uint32_t top_left, bottom_right;
+    struct r300_screen* r300screen = r300_screen(r300->context.screen);
+    struct pipe_scissor_state* scissor = (struct pipe_scissor_state*)state;
     CS_LOCALS(r300);
 
-    BEGIN_CS(3);
-    OUT_CS_REG_SEQ(R300_SC_SCISSORS_TL, 2);
-    OUT_CS(scissor->top_left);
-    OUT_CS(scissor->bottom_right);
-    END_CS;
-}
+    minx = miny = 0;
+    maxx = r300->framebuffer_state.width;
+    maxy = r300->framebuffer_state.height;
 
-void r300_emit_scissor_state(struct r300_context* r300, void* state)
-{
-    struct r300_scissor_state* scissor = (struct r300_scissor_state*)state;
-    /* XXX argfl! */
     if (((struct r300_rs_state*)r300->rs_state.state)->rs.scissor) {
-        r300_emit_scissor_regs(r300, &scissor->scissor);
+        minx = MAX2(minx, scissor->minx);
+        miny = MAX2(miny, scissor->miny);
+        maxx = MIN2(maxx, scissor->maxx);
+        maxy = MIN2(maxy, scissor->maxy);
+    }
+
+    if (r300screen->caps->is_r500) {
+        top_left =
+            (minx << R300_SCISSORS_X_SHIFT) |
+            (miny << R300_SCISSORS_Y_SHIFT);
+        bottom_right =
+            ((maxx - 1) << R300_SCISSORS_X_SHIFT) |
+            ((maxy - 1) << R300_SCISSORS_Y_SHIFT);
     } else {
-        r300_emit_scissor_regs(r300, &scissor->framebuffer);
+        /* Offset of 1440 in non-R500 chipsets. */
+        top_left =
+            ((minx + 1440) << R300_SCISSORS_X_SHIFT) |
+            ((miny + 1440) << R300_SCISSORS_Y_SHIFT);
+        bottom_right =
+            (((maxx - 1) + 1440) << R300_SCISSORS_X_SHIFT) |
+            (((maxy - 1) + 1440) << R300_SCISSORS_Y_SHIFT);
     }
+
+    BEGIN_CS(3);
+    OUT_CS_REG_SEQ(R300_SC_SCISSORS_TL, 2);
+    OUT_CS(top_left);
+    OUT_CS(bottom_right);
+    END_CS;
 }
 
 void r300_emit_texture(struct r300_context* r300,
@@ -680,12 +723,18 @@ void r300_emit_texture(struct r300_context* r300,
         filter0 |= R300_TX_WRAP_T(R300_TX_CLAMP_TO_EDGE);
     }
 
-    /* determine min/max levels */
-    /* the MAX_MIP level is the largest (finest) one */
-    max_level = MIN2(sampler->max_lod, tex->tex.last_level);
-    min_level = MIN2(sampler->min_lod, max_level);
-    format0 |= R300_TX_NUM_LEVELS(max_level);
-    filter0 |= R300_TX_MAX_MIP_LEVEL(min_level);
+    if (tex->is_npot) {
+        /* NPOT textures don't support mip filter, unfortunately.
+         * This prevents incorrect rendering. */
+        filter0 &= ~R300_TX_MIN_FILTER_MIP_MASK;
+    } else {
+        /* determine min/max levels */
+        /* the MAX_MIP level is the largest (finest) one */
+        max_level = MIN2(sampler->max_lod, tex->tex.last_level);
+        min_level = MIN2(sampler->min_lod, max_level);
+        format0 |= R300_TX_NUM_LEVELS(max_level);
+        filter0 |= R300_TX_MAX_MIP_LEVEL(min_level);
+    }
 
     BEGIN_CS(16);
     OUT_CS_REG(R300_TX_FILTER0_0 + (offset * 4), filter0 |
@@ -697,8 +746,10 @@ void r300_emit_texture(struct r300_context* r300,
     OUT_CS_REG(R300_TX_FORMAT1_0 + (offset * 4), tex->state.format1);
     OUT_CS_REG(R300_TX_FORMAT2_0 + (offset * 4), tex->state.format2);
     OUT_CS_REG_SEQ(R300_TX_OFFSET_0 + (offset * 4), 1);
-    OUT_CS_RELOC(tex->buffer, 0,
-            RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0, 0);
+    OUT_CS_RELOC(tex->buffer,
+                 R300_TXO_MACRO_TILE(tex->macrotile) |
+                 R300_TXO_MICRO_TILE(tex->microtile),
+                 RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0, 0);
     END_CS;
 }
 
@@ -764,32 +815,6 @@ void r300_emit_aos(struct r300_context* r300, unsigned offset)
     END_CS;
 }
 
-#if 0
-void r300_emit_draw_packet(struct r300_context* r300)
-{
-    CS_LOCALS(r300);
-
-    DBG(r300, DBG_DRAW, "r300: Preparing vertex buffer %p for render, "
-            "vertex size %d\n", r300->vbo,
-            r300->vertex_info->vinfo.size);
-    /* Set the pointer to our vertex buffer. The emitted values are this:
-     * PACKET3 [3D_LOAD_VBPNTR]
-     * COUNT   [1]
-     * FORMAT  [size | stride << 8]
-     * OFFSET  [offset into BO]
-     * VBPNTR  [relocated BO]
-     */
-    BEGIN_CS(7);
-    OUT_CS_PKT3(R300_PACKET3_3D_LOAD_VBPNTR, 3);
-    OUT_CS(1);
-    OUT_CS(r300->vertex_info->vinfo.size |
-            (r300->vertex_info->vinfo.size << 8));
-    OUT_CS(r300->vbo_offset);
-    OUT_CS_RELOC(r300->vbo, 0, RADEON_GEM_DOMAIN_GTT, 0, 0);
-    END_CS;
-}
-#endif
-
 void r300_emit_vertex_format_state(struct r300_context* r300)
 {
     int i;
@@ -992,14 +1017,23 @@ void r300_emit_dirty_state(struct r300_context* r300)
     struct r300_screen* r300screen = r300_screen(r300->context.screen);
     struct r300_texture* tex;
     struct r300_atom* atom;
-    int i, dirty_tex = 0;
+    unsigned i, dwords = 1024;
+    int dirty_tex = 0;
     boolean invalid = FALSE;
 
-    /* Check size of CS. */
-    /* Make sure we have at least 8*1024 spare dwords. */
+    /* Check the required number of dwords against the space remaining in the
+     * current CS object. If we need more, then flush. */
+
+    foreach(atom, &r300->atom_list) {
+        if (atom->dirty || atom->always_dirty) {
+            dwords += atom->size;
+        }
+    }
+
+    /* Make sure we have at least 2*1024 spare dwords. */
     /* XXX It would be nice to know the number of dwords we really need to
      * XXX emit. */
-    if (!r300->winsys->check_cs(r300->winsys, 8*1024)) {
+    if (!r300->winsys->check_cs(r300->winsys, dwords)) {
         r300->context.flush(&r300->context, 0, NULL);
     }
 
@@ -1039,10 +1073,12 @@ validate:
         }
     }
     /* ...occlusion query buffer... */
-    if (!r300->winsys->add_buffer(r300->winsys, r300->oqbo,
-                0, RADEON_GEM_DOMAIN_GTT)) {
-        r300->context.flush(&r300->context, 0, NULL);
-        goto validate;
+    if (r300->dirty_state & R300_NEW_QUERY) {
+        if (!r300->winsys->add_buffer(r300->winsys, r300->oqbo,
+                    0, RADEON_GEM_DOMAIN_GTT)) {
+            r300->context.flush(&r300->context, 0, NULL);
+            goto validate;
+        }
     }
     /* ...and vertex buffer. */
     if (r300->vbo) {
@@ -1071,7 +1107,7 @@ validate:
     }
 
     foreach(atom, &r300->atom_list) {
-        if (atom->dirty) {
+        if (atom->dirty || atom->always_dirty) {
             atom->emit(r300, atom->state);
             atom->dirty = FALSE;
         }
diff --git a/src/gallium/drivers/r300/r300_flush.c b/src/gallium/drivers/r300/r300_flush.c
index 14a08241fc..59819cb106 100644
--- a/src/gallium/drivers/r300/r300_flush.c
+++ b/src/gallium/drivers/r300/r300_flush.c
@@ -37,8 +37,10 @@ static void r300_flush(struct pipe_context* pipe,
 {
     struct r300_context *r300 = r300_context(pipe);
     struct r300_query *query;
+    struct r300_atom *atom;
 
     CS_LOCALS(r300);
+    (void) cs_count;
     /* We probably need to flush Draw, but we may have been called from
      * within Draw. This feels kludgy, but it might be the best thing.
      *
@@ -54,7 +56,15 @@ static void r300_flush(struct pipe_context* pipe,
         r300_emit_invariant_state(r300);
         r300->dirty_state = R300_NEW_KITCHEN_SINK;
         r300->dirty_hw = 0;
+
+        /* New kitchen sink, baby. */
+        foreach(atom, &r300->atom_list) {
+            if (atom->state) {
+                atom->dirty = TRUE;
+            }
+        }
     }
+
     /* reset flushed query */
     foreach(query, &r300->query_list) {
         query->flushed = TRUE;
diff --git a/src/gallium/drivers/r300/r300_reg.h b/src/gallium/drivers/r300/r300_reg.h
index 034bfc15cf..361813891f 100644
--- a/src/gallium/drivers/r300/r300_reg.h
+++ b/src/gallium/drivers/r300/r300_reg.h
@@ -1619,18 +1619,20 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define R300_TX_OFFSET_5                    0x4554
 #define R300_TX_OFFSET_6                    0x4558
 #define R300_TX_OFFSET_7                    0x455C
-	/* BEGIN: Guess from R200 */
+
 #       define R300_TXO_ENDIAN_NO_SWAP           (0 << 0)
 #       define R300_TXO_ENDIAN_BYTE_SWAP         (1 << 0)
 #       define R300_TXO_ENDIAN_WORD_SWAP         (2 << 0)
 #       define R300_TXO_ENDIAN_HALFDW_SWAP       (3 << 0)
-#       define R300_TXO_MACRO_TILE               (1 << 2)
+#       define R300_TXO_MACRO_TILE_LINEAR        (0 << 2)
+#       define R300_TXO_MACRO_TILE_TILED         (1 << 2)
+#       define R300_TXO_MACRO_TILE(x)            ((x) << 2)
 #       define R300_TXO_MICRO_TILE_LINEAR        (0 << 3)
-#       define R300_TXO_MICRO_TILE               (1 << 3)
-#       define R300_TXO_MICRO_TILE_SQUARE        (2 << 3)
+#       define R300_TXO_MICRO_TILE_TILED         (1 << 3)
+#       define R300_TXO_MICRO_TILE_TILED_SQUARE  (2 << 3)
+#       define R300_TXO_MICRO_TILE(x)            ((x) << 3)
 #       define R300_TXO_OFFSET_MASK              0xffffffe0
 #       define R300_TXO_OFFSET_SHIFT             5
-	/* END: Guess from R200 */
 
 /* 32 bit chroma key */
 #define R300_TX_CHROMA_KEY_0                      0x4580
@@ -2283,9 +2285,11 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #       define R300_COLORPITCH_MASK              0x00003FFE
 #       define R300_COLOR_TILE_DISABLE            (0 << 16)
 #       define R300_COLOR_TILE_ENABLE             (1 << 16)
+#       define R300_COLOR_TILE(x)                 ((x) << 16)
 #       define R300_COLOR_MICROTILE_DISABLE       (0 << 17)
 #       define R300_COLOR_MICROTILE_ENABLE        (1 << 17)
 #       define R300_COLOR_MICROTILE_ENABLE_SQUARE (2 << 17) /* Only available in 16-bit */
+#       define R300_COLOR_MICROTILE(x)            ((x) << 17)
 #       define R300_COLOR_ENDIAN_NO_SWAP          (0 << 19)
 #       define R300_COLOR_ENDIAN_WORD_SWAP        (1 << 19)
 #       define R300_COLOR_ENDIAN_DWORD_SWAP       (2 << 19)
@@ -2544,9 +2548,11 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #       define R300_DEPTHPITCH_MASK              0x00003FFC
 #       define R300_DEPTHMACROTILE_DISABLE      (0 << 16)
 #       define R300_DEPTHMACROTILE_ENABLE       (1 << 16)
+#       define R300_DEPTHMACROTILE(x)           ((x) << 16)
 #       define R300_DEPTHMICROTILE_LINEAR       (0 << 17)
 #       define R300_DEPTHMICROTILE_TILED        (1 << 17)
 #       define R300_DEPTHMICROTILE_TILED_SQUARE (2 << 17)
+#       define R300_DEPTHMICROTILE(x)           ((x) << 17)
 #       define R300_DEPTHENDIAN_NO_SWAP         (0 << 18)
 #       define R300_DEPTHENDIAN_WORD_SWAP       (1 << 18)
 #       define R300_DEPTHENDIAN_DWORD_SWAP      (2 << 18)
diff --git a/src/gallium/drivers/r300/r300_render.c b/src/gallium/drivers/r300/r300_render.c
index ee43421cdb..90de062bcd 100644
--- a/src/gallium/drivers/r300/r300_render.c
+++ b/src/gallium/drivers/r300/r300_render.c
@@ -114,6 +114,44 @@ static uint32_t r300_provoking_vertex_fixes(struct r300_context *r300,
     return color_control;
 }
 
+static void r300_emit_draw_immediate(struct r300_context *r300,
+                                     unsigned mode,
+                                     unsigned start,
+                                     unsigned count)
+{
+    struct pipe_buffer* vbo = r300->vertex_buffer[0].buffer;
+    unsigned vertex_size = r300->vertex_buffer[0].stride / sizeof(float);
+    unsigned i;
+    uint32_t* map;
+    CS_LOCALS(r300);
+
+    map = (uint32_t*)pipe_buffer_map_range(r300->context.screen, vbo,
+            start * vertex_size, count * vertex_size,
+            PIPE_BUFFER_USAGE_CPU_READ);
+
+    BEGIN_CS(10 + count * vertex_size);
+    OUT_CS_REG(R300_GA_COLOR_CONTROL,
+            r300_provoking_vertex_fixes(r300, mode));
+    OUT_CS_REG(R300_VAP_VTX_SIZE, vertex_size);
+    OUT_CS_REG(R300_VAP_VF_MIN_VTX_INDX, 0);
+    OUT_CS_REG(R300_VAP_VF_MAX_VTX_INDX, count - 1);
+    OUT_CS_PKT3(R300_PACKET3_3D_DRAW_IMMD_2, count * vertex_size);
+    OUT_CS(R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_EMBEDDED | (count << 16) |
+            r300_translate_primitive(mode));
+    //debug_printf("r300: Immd %d verts, %d attrs\n", count, vertex_size);
+    for (i = 0; i < count * vertex_size; i++) {
+        if (i % vertex_size == 0) {
+            //debug_printf("r300: -- vert --\n");
+        }
+        //debug_printf("r300: 0x%08x\n", *map);
+        OUT_CS(*map);
+        map++;
+    }
+    END_CS;
+
+    pipe_buffer_unmap(r300->context.screen, vbo);
+}
+
 static void r300_emit_draw_arrays(struct r300_context *r300,
                                   unsigned mode,
                                   unsigned count)
@@ -207,17 +245,49 @@ validate:
     return TRUE;
 }
 
+static void r300_shorten_ubyte_elts(struct r300_context* r300,
+                                    struct pipe_buffer** elts,
+                                    unsigned count)
+{
+    struct pipe_screen* screen = r300->context.screen;
+    struct pipe_buffer* new_elts;
+    unsigned char *in_map;
+    unsigned short *out_map;
+    unsigned i;
+
+    new_elts = screen->buffer_create(screen, 32,
+                                     PIPE_BUFFER_USAGE_INDEX |
+                                     PIPE_BUFFER_USAGE_CPU_WRITE |
+                                     PIPE_BUFFER_USAGE_GPU_READ,
+                                     2 * count);
+
+    in_map = pipe_buffer_map(screen, *elts, PIPE_BUFFER_USAGE_CPU_READ);
+    out_map = pipe_buffer_map(screen, new_elts, PIPE_BUFFER_USAGE_CPU_WRITE);
+
+    for (i = 0; i < count; i++) {
+        *out_map = (unsigned short)*in_map;
+        in_map++;
+        out_map++;
+    }
+
+    pipe_buffer_unmap(screen, *elts);
+    pipe_buffer_unmap(screen, new_elts);
+
+    *elts = new_elts;
+}
+
 /* This is the fast-path drawing & emission for HW TCL. */
 void r300_draw_range_elements(struct pipe_context* pipe,
-                                 struct pipe_buffer* indexBuffer,
-                                 unsigned indexSize,
-                                 unsigned minIndex,
-                                 unsigned maxIndex,
-                                 unsigned mode,
-                                 unsigned start,
-                                 unsigned count)
+                              struct pipe_buffer* indexBuffer,
+                              unsigned indexSize,
+                              unsigned minIndex,
+                              unsigned maxIndex,
+                              unsigned mode,
+                              unsigned start,
+                              unsigned count)
 {
     struct r300_context* r300 = r300_context(pipe);
+    struct pipe_buffer* orgIndexBuffer = indexBuffer;
 
     if (!u_trim_pipe_prim(mode, &count)) {
         return;
@@ -236,13 +306,18 @@ void r300_draw_range_elements(struct pipe_context* pipe,
         return;
     }
 
+    if (indexSize == 1) {
+        r300_shorten_ubyte_elts(r300, &indexBuffer, count);
+        indexSize = 2;
+    }
+
     if (!r300->winsys->add_buffer(r300->winsys, indexBuffer,
                                   RADEON_GEM_DOMAIN_GTT, 0)) {
-        return;
+        goto cleanup;
     }
 
     if (!r300->winsys->validate(r300->winsys)) {
-        return;
+        goto cleanup;
     }
 
     r300_emit_dirty_state(r300);
@@ -251,6 +326,11 @@ void r300_draw_range_elements(struct pipe_context* pipe,
 
     r300_emit_draw_elements(r300, indexBuffer, indexSize, minIndex, maxIndex,
                             mode, start, count);
+
+cleanup:
+    if (indexBuffer != orgIndexBuffer) {
+        pipe->screen->buffer_destroy(indexBuffer);
+    }
 }
 
 /* Simple helpers for context setup. Should probably be moved to util. */
@@ -264,7 +344,7 @@ void r300_draw_elements(struct pipe_context* pipe,
 }
 
 void r300_draw_arrays(struct pipe_context* pipe, unsigned mode,
-                         unsigned start, unsigned count)
+                      unsigned start, unsigned count)
 {
     struct r300_context* r300 = r300_context(pipe);
 
@@ -287,9 +367,12 @@ void r300_draw_arrays(struct pipe_context* pipe, unsigned mode,
 
     r300_emit_dirty_state(r300);
 
-    r300_emit_aos(r300, start);
-
-    r300_emit_draw_arrays(r300, mode, count);
+    if (FALSE && count <= 4 && r300->vertex_buffer_count == 1) {
+        r300_emit_draw_immediate(r300, mode, start, count);
+    } else {
+        r300_emit_aos(r300, start);
+        r300_emit_draw_arrays(r300, mode, count);
+    }
 }
 
 /****************************************************************************
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index 287664b1d2..67325c6b80 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -183,10 +183,20 @@ static float r300_get_paramf(struct pipe_screen* pscreen, int param)
     }
 }
 
-static boolean check_tex_format(enum pipe_format format, uint32_t usage,
-                                boolean is_r500)
+static boolean r300_is_format_supported(struct pipe_screen* screen,
+                                        enum pipe_format format,
+                                        enum pipe_texture_target target,
+                                        unsigned usage,
+                                        unsigned geom_flags)
 {
     uint32_t retval = 0;
+    boolean is_r500 = r300_screen(screen)->caps->is_r500;
+
+    if (target >= PIPE_MAX_TEXTURE_TYPES) {
+        debug_printf("r300: Implementation error: Received bogus texture "
+            "target %d in %s\n", target, __FUNCTION__);
+        return FALSE;
+    }
 
     switch (format) {
         /* Supported formats. */
@@ -247,28 +257,13 @@ static boolean check_tex_format(enum pipe_format format, uint32_t usage,
         case PIPE_FORMAT_Z32_UNORM:
         case PIPE_FORMAT_S8Z24_UNORM:
         case PIPE_FORMAT_X8Z24_UNORM:
-            debug_printf("r300: Note: Got unsupported format: %s in %s\n",
-                pf_name(format), __FUNCTION__);
+            SCREEN_DBG(r300_screen(screen), DBG_TEX,
+                       "r300: Note: Got unsupported format: %s in %s\n",
+                       pf_name(format), __FUNCTION__);
             return FALSE;
 
-        /* XXX These don't even exist
-        case PIPE_FORMAT_A32R32G32B32:
-        case PIPE_FORMAT_A16R16G16B16: */
-        /* XXX What the deuce is UV88? (r3xx accel page 14)
-            debug_printf("r300: Warning: Got unimplemented format: %s in %s\n",
-                pf_name(format), __FUNCTION__);
-            return FALSE; */
-
-        /* XXX Supported yet unimplemented r5xx formats: */
-        /* XXX Again, what is UV1010 this time? (r5xx accel page 148) */
-        /* XXX Even more that don't exist
-        case PIPE_FORMAT_A10R10G10B10_UNORM:
-        case PIPE_FORMAT_A2R10G10B10_UNORM:
-        case PIPE_FORMAT_I10_UNORM:
-            debug_printf(
-                "r300: Warning: Got unimplemented r500 format: %s in %s\n",
-                pf_name(format), __FUNCTION__);
-            return FALSE; */
+        /* XXX Add all remaining gallium-supported formats,
+         * see util/u_format.csv. */
 
         default:
             /* Unknown format... */
@@ -286,30 +281,6 @@ static boolean check_tex_format(enum pipe_format format, uint32_t usage,
     return (retval >= usage);
 }
 
-static boolean r300_is_format_supported(struct pipe_screen* pscreen,
-                                        enum pipe_format format,
-                                        enum pipe_texture_target target,
-                                        unsigned tex_usage,
-                                        unsigned geom_flags)
-{
-    switch (target) {
-        case PIPE_TEXTURE_1D:   /* handle 1D textures as 2D ones */
-        case PIPE_TEXTURE_2D:
-        case PIPE_TEXTURE_3D:
-        case PIPE_TEXTURE_CUBE:
-            return check_tex_format(format, tex_usage,
-                r300_screen(pscreen)->caps->is_r500);
-
-        default:
-            debug_printf("r300: Fatal: This is not a format target: %d\n",
-                target);
-            assert(0);
-            break;
-    }
-
-    return FALSE;
-}
-
 static struct pipe_transfer*
 r300_get_tex_transfer(struct pipe_screen *screen,
                       struct pipe_texture *texture,
@@ -319,6 +290,7 @@ r300_get_tex_transfer(struct pipe_screen *screen,
 {
     struct r300_texture *tex = (struct r300_texture *)texture;
     struct r300_transfer *trans;
+    struct r300_screen *rscreen = r300_screen(screen);
     unsigned offset;
 
     offset = r300_texture_get_offset(tex, level, zslice, face);  /* in bytes */
@@ -330,11 +302,8 @@ r300_get_tex_transfer(struct pipe_screen *screen,
         trans->transfer.y = y;
         trans->transfer.width = w;
         trans->transfer.height = h;
-        trans->transfer.stride = r300_texture_get_stride(tex, level);
+        trans->transfer.stride = r300_texture_get_stride(rscreen, tex, level);
         trans->transfer.usage = usage;
-
-        /* XXX not sure whether it's required to set these two,
-               the driver doesn't use them */
         trans->transfer.zslice = zslice;
         trans->transfer.face = face;
 
@@ -396,6 +365,7 @@ struct pipe_screen* r300_create_screen(struct radeon_winsys* radeon_winsys)
     caps->num_frag_pipes = radeon_winsys->gb_pipes;
     caps->num_z_pipes = radeon_winsys->z_pipes;
 
+    r300_init_debug(r300screen);
     r300_parse_chipset(caps);
 
     r300screen->caps = caps;
diff --git a/src/gallium/drivers/r300/r300_screen.h b/src/gallium/drivers/r300/r300_screen.h
index 2217988add..580fda3984 100644
--- a/src/gallium/drivers/r300/r300_screen.h
+++ b/src/gallium/drivers/r300/r300_screen.h
@@ -35,6 +35,9 @@ struct r300_screen {
 
     /* Chipset capabilities */
     struct r300_capabilities* caps;
+
+    /** Combination of DBG_xxx flags */
+    unsigned debug;
 };
 
 struct r300_transfer {
@@ -60,4 +63,44 @@ r300_transfer(struct pipe_transfer* transfer)
 /* Creates a new r300 screen. */
 struct pipe_screen* r300_create_screen(struct radeon_winsys* radeon_winsys);
 
+/* Debug functionality. */
+
+/**
+ * Debug flags to disable/enable certain groups of debugging outputs.
+ *
+ * \note These may be rather coarse, and the grouping may be impractical.
+ * If you find, while debugging the driver, that a different grouping
+ * of these flags would be beneficial, just feel free to change them
+ * but make sure to update the documentation in r300_debug.c to reflect
+ * those changes.
+ */
+/*@{*/
+#define DBG_HELP    0x0000001
+#define DBG_FP      0x0000002
+#define DBG_VP      0x0000004
+#define DBG_CS      0x0000008
+#define DBG_DRAW    0x0000010
+#define DBG_TEX     0x0000020
+#define DBG_FALL    0x0000040
+/*@}*/
+
+static INLINE boolean SCREEN_DBG_ON(struct r300_screen * screen, unsigned flags)
+{
+    return (screen->debug & flags) ? TRUE : FALSE;
+}
+
+static INLINE void SCREEN_DBG(struct r300_screen * screen, unsigned flags,
+                              const char * fmt, ...)
+{
+    if (SCREEN_DBG_ON(screen, flags)) {
+        va_list va;
+        va_start(va, fmt);
+        debug_vprintf(fmt, va);
+        va_end(va);
+    }
+}
+
+void r300_init_debug(struct r300_screen* ctx);
+
 #endif /* R300_SCREEN_H */
+
diff --git a/src/gallium/drivers/r300/r300_state.c b/src/gallium/drivers/r300/r300_state.c
index 78764ddc98..e2ec0bc5bd 100644
--- a/src/gallium/drivers/r300/r300_state.c
+++ b/src/gallium/drivers/r300/r300_state.c
@@ -340,6 +340,7 @@ static void r300_set_blend_color(struct pipe_context* pipe,
                                  const struct pipe_blend_color* color)
 {
     struct r300_context* r300 = r300_context(pipe);
+    struct r300_screen* r300screen = r300_screen(pipe->screen);
     struct r300_blend_color_state* state =
         (struct r300_blend_color_state*)r300->blend_color_state.state;
     union util_color uc;
@@ -355,6 +356,7 @@ static void r300_set_blend_color(struct pipe_context* pipe,
         float_to_fixed10(color->color[2]) |
         (float_to_fixed10(color->color[1]) << 16);
 
+    r300->blend_color_state.size = r300screen->caps->is_r500 ? 3 : 2;
     r300->blend_color_state.dirty = TRUE;
 }
 
@@ -365,11 +367,14 @@ static void r300_set_clip_state(struct pipe_context* pipe,
 
     if (r300_screen(pipe->screen)->caps->has_tcl) {
         memcpy(r300->clip_state.state, state, sizeof(struct pipe_clip_state));
-        r300->clip_state.dirty = TRUE;
+        r300->clip_state.size = 29;
     } else {
         draw_flush(r300->draw);
         draw_set_clip_state(r300->draw, state);
+        r300->clip_state.size = 2;
     }
+
+    r300->clip_state.dirty = TRUE;
 }
 
 /* Create a new depth, stencil, and alpha state based on the CSO dsa state.
@@ -462,8 +467,10 @@ static void r300_bind_dsa_state(struct pipe_context* pipe,
                                 void* state)
 {
     struct r300_context* r300 = r300_context(pipe);
+    struct r300_screen* r300screen = r300_screen(pipe->screen);
 
     r300->dsa_state.state = state;
+    r300->dsa_state.size = r300screen->caps->is_r500 ? 8 : 6;
     r300->dsa_state.dirty = TRUE;
 }
 
@@ -474,36 +481,12 @@ static void r300_delete_dsa_state(struct pipe_context* pipe,
     FREE(state);
 }
 
-static void r300_set_scissor_regs(const struct pipe_scissor_state* state,
-                                  struct r300_scissor_regs *scissor,
-                                  boolean is_r500)
-{
-    if (is_r500) {
-        scissor->top_left =
-            (state->minx << R300_SCISSORS_X_SHIFT) |
-            (state->miny << R300_SCISSORS_Y_SHIFT);
-        scissor->bottom_right =
-            ((state->maxx - 1) << R300_SCISSORS_X_SHIFT) |
-            ((state->maxy - 1) << R300_SCISSORS_Y_SHIFT);
-    } else {
-        /* Offset of 1440 in non-R500 chipsets. */
-        scissor->top_left =
-            ((state->minx + 1440) << R300_SCISSORS_X_SHIFT) |
-            ((state->miny + 1440) << R300_SCISSORS_Y_SHIFT);
-        scissor->bottom_right =
-            (((state->maxx - 1) + 1440) << R300_SCISSORS_X_SHIFT) |
-            (((state->maxy - 1) + 1440) << R300_SCISSORS_Y_SHIFT);
-    }
-}
-
 static void
     r300_set_framebuffer_state(struct pipe_context* pipe,
                                const struct pipe_framebuffer_state* state)
 {
     struct r300_context* r300 = r300_context(pipe);
-    struct r300_scissor_state* scissor =
-        (struct r300_scissor_state*)r300->scissor_state.state;
-    struct pipe_scissor_state pscissor;
+    uint32_t zbuffer_bpp = 0;
 
     if (r300->draw) {
         draw_flush(r300->draw);
@@ -511,19 +494,29 @@ static void
 
     r300->framebuffer_state = *state;
 
-    /* XXX Arg. This is silly. */
-    pscissor.minx = pscissor.miny = 0;
-    pscissor.maxx = state->width;
-    pscissor.maxy = state->height;
-    r300_set_scissor_regs(&pscissor, &scissor->framebuffer,
-                          r300_screen(r300->context.screen)->caps->is_r500);
-
     /* Don't rely on the order of states being set for the first time. */
     r300->dirty_state |= R300_NEW_FRAMEBUFFERS;
 
     r300->blend_state.dirty = TRUE;
     r300->dsa_state.dirty = TRUE;
     r300->scissor_state.dirty = TRUE;
+
+    /* Polyfon offset depends on the zbuffer bit depth. */
+    if (state->zsbuf && r300->polygon_offset_enabled) {
+        switch (util_format_get_blocksize(state->zsbuf->texture->format)) {
+            case 2:
+                zbuffer_bpp = 16;
+                break;
+            case 4:
+                zbuffer_bpp = 24;
+                break;
+        }
+
+        if (r300->zbuffer_bpp != zbuffer_bpp) {
+            r300->zbuffer_bpp = zbuffer_bpp;
+            r300->rs_state.dirty = TRUE;
+        }
+    }
 }
 
 /* Create fragment shader state. */
@@ -627,9 +620,6 @@ static void* r300_create_rs_state(struct pipe_context* pipe,
     rs->line_control = pack_float_16_6x(state->line_width) |
         R300_GA_LINE_CNTL_END_TYPE_COMP;
 
-    /* XXX I think there is something wrong with the polygon mode,
-     * XXX re-test when r300g is in a better shape */
-
     /* Enable polygon mode */
     if (state->fill_cw != PIPE_POLYGON_MODE_FILL ||
         state->fill_ccw != PIPE_POLYGON_MODE_FILL) {
@@ -682,10 +672,8 @@ static void* r300_create_rs_state(struct pipe_context* pipe,
     }
 
     if (rs->polygon_offset_enable) {
-        rs->depth_offset_front = rs->depth_offset_back =
-            fui(state->offset_units);
-        rs->depth_scale_front = rs->depth_scale_back =
-            fui(state->offset_scale);
+        rs->depth_offset = state->offset_units;
+        rs->depth_scale = state->offset_scale;
     }
 
     if (state->line_stipple_enable) {
@@ -717,7 +705,13 @@ static void r300_bind_rs_state(struct pipe_context* pipe, void* state)
         draw_set_rasterizer_state(r300->draw, &rs->rs);
     }
 
-    r300->tcl_bypass = rs->rs.bypass_vs_clip_and_viewport;
+    if (rs) {
+        r300->tcl_bypass = rs->rs.bypass_vs_clip_and_viewport;
+        r300->polygon_offset_enabled = rs->rs.offset_cw || rs->rs.offset_ccw;
+    } else {
+        r300->tcl_bypass = FALSE;
+        r300->polygon_offset_enabled = FALSE;
+    }
 
     r300->rs_state.state = rs;
     r300->rs_state.dirty = TRUE;
@@ -864,11 +858,9 @@ static void r300_set_scissor_state(struct pipe_context* pipe,
                                    const struct pipe_scissor_state* state)
 {
     struct r300_context* r300 = r300_context(pipe);
-    struct r300_scissor_state* scissor =
-        (struct r300_scissor_state*)r300->scissor_state.state;
 
-    r300_set_scissor_regs(state, &scissor->scissor,
-                          r300_screen(r300->context.screen)->caps->is_r500);
+    memcpy(r300->scissor_state.state, state,
+        sizeof(struct pipe_scissor_state));
 
     r300->scissor_state.dirty = TRUE;
 }
@@ -1015,22 +1007,22 @@ static void r300_delete_vs_state(struct pipe_context* pipe, void* shader)
 
 static void r300_set_constant_buffer(struct pipe_context *pipe,
                                      uint shader, uint index,
-                                     const struct pipe_constant_buffer *buf)
+                                     struct pipe_buffer *buf)
 {
     struct r300_context* r300 = r300_context(pipe);
     void *mapped;
 
-    if (buf == NULL || buf->buffer->size == 0 ||
-        (mapped = pipe_buffer_map(pipe->screen, buf->buffer, PIPE_BUFFER_USAGE_CPU_READ)) == NULL)
+    if (buf == NULL || buf->size == 0 ||
+        (mapped = pipe_buffer_map(pipe->screen, buf, PIPE_BUFFER_USAGE_CPU_READ)) == NULL)
     {
         r300->shader_constants[shader].count = 0;
         return;
     }
 
-    assert((buf->buffer->size % 4 * sizeof(float)) == 0);
-    memcpy(r300->shader_constants[shader].constants, mapped, buf->buffer->size);
-    r300->shader_constants[shader].count = buf->buffer->size / (4 * sizeof(float));
-    pipe_buffer_unmap(pipe->screen, buf->buffer);
+    assert((buf->size % 4 * sizeof(float)) == 0);
+    memcpy(r300->shader_constants[shader].constants, mapped, buf->size);
+    r300->shader_constants[shader].count = buf->size / (4 * sizeof(float));
+    pipe_buffer_unmap(pipe->screen, buf);
 
     if (shader == PIPE_SHADER_VERTEX)
         r300->dirty_state |= R300_NEW_VERTEX_SHADER_CONSTANTS;
diff --git a/src/gallium/drivers/r300/r300_state_derived.c b/src/gallium/drivers/r300/r300_state_derived.c
index 192846411b..99c2720897 100644
--- a/src/gallium/drivers/r300/r300_state_derived.c
+++ b/src/gallium/drivers/r300/r300_state_derived.c
@@ -350,7 +350,8 @@ static void r300_update_rs_block(struct r300_context* r300,
 
     /* Rasterize colors. */
     for (i = 0; i < ATTR_COLOR_COUNT; i++) {
-        if (vs_outputs->color[i] != ATTR_UNUSED || any_bcolor_used) {
+        if (vs_outputs->color[i] != ATTR_UNUSED || any_bcolor_used ||
+            vs_outputs->color[1] != ATTR_UNUSED) {
             /* Always rasterize if it's written by the VS,
              * otherwise it locks up. */
             rX00_rs_col(rs, col_count, i, FALSE);
diff --git a/src/gallium/drivers/r300/r300_state_inlines.h b/src/gallium/drivers/r300/r300_state_inlines.h
index 35be00e1b0..e2180b33b7 100644
--- a/src/gallium/drivers/r300/r300_state_inlines.h
+++ b/src/gallium/drivers/r300/r300_state_inlines.h
@@ -537,6 +537,7 @@ r300_translate_vertex_data_type(enum pipe_format format) {
 static INLINE uint16_t
 r300_translate_vertex_data_swizzle(enum pipe_format format) {
     const struct util_format_description *desc = util_format_description(format);
+    unsigned swizzle[4], i;
 
     assert(format);
 
@@ -547,11 +548,26 @@ r300_translate_vertex_data_swizzle(enum pipe_format format) {
         return 0;
     }
 
-    return ((desc->swizzle[0] << R300_SWIZZLE_SELECT_X_SHIFT) |
-        (desc->swizzle[1] << R300_SWIZZLE_SELECT_Y_SHIFT) |
-        (desc->swizzle[2] << R300_SWIZZLE_SELECT_Z_SHIFT) |
-        (desc->swizzle[3] << R300_SWIZZLE_SELECT_W_SHIFT) |
-        (0xf << R300_WRITE_ENA_SHIFT));
+    /* Swizzles for 8bits formats are in the reversed order, not sure why. */
+    if (desc->channel[0].size == 8) {
+        for (i = 0; i < 4; i++) {
+            if (desc->swizzle[i] <= 3) {
+                swizzle[i] = 3 - desc->swizzle[i];
+            } else {
+                swizzle[i] = desc->swizzle[i];
+            }
+        }
+    } else {
+        for (i = 0; i < 4; i++) {
+            swizzle[i] = desc->swizzle[i];
+        }
+    }
+
+    return ((swizzle[0] << R300_SWIZZLE_SELECT_X_SHIFT) |
+            (swizzle[1] << R300_SWIZZLE_SELECT_Y_SHIFT) |
+            (swizzle[2] << R300_SWIZZLE_SELECT_Z_SHIFT) |
+            (swizzle[3] << R300_SWIZZLE_SELECT_W_SHIFT) |
+            (0xf << R300_WRITE_ENA_SHIFT));
 }
 
 #endif /* R300_STATE_INLINES_H */
diff --git a/src/gallium/drivers/r300/r300_state_invariant.c b/src/gallium/drivers/r300/r300_state_invariant.c
index f25f3ca217..47d7e60a40 100644
--- a/src/gallium/drivers/r300/r300_state_invariant.c
+++ b/src/gallium/drivers/r300/r300_state_invariant.c
@@ -79,7 +79,8 @@ void r300_emit_invariant_state(struct r300_context* r300)
     END_CS;
 
     /* XXX unsorted stuff from surface_fill */
-    BEGIN_CS(44 + (caps->has_tcl ? 7 : 0) + (caps->is_r500 ? 4 : 0));
+    BEGIN_CS(44 + (caps->has_tcl ? 7 : 0) +
+             (caps->family >= CHIP_FAMILY_RV350 ? 4 : 0));
 
     if (caps->has_tcl) {
         /*Flushing PVS is required before the VAP_GB registers can be changed*/
@@ -115,10 +116,12 @@ void r300_emit_invariant_state(struct r300_context* r300)
     OUT_CS_REG(R300_SC_HYPERZ, 0x0000001C);
     OUT_CS_REG(R300_SC_EDGERULE, 0x2DA49525);
     OUT_CS_REG(R300_RB3D_AARESOLVE_CTL, 0x00000000);
-    if (caps->is_r500) {
+
+    if (caps->family >= CHIP_FAMILY_RV350) {
         OUT_CS_REG(R500_RB3D_DISCARD_SRC_PIXEL_LTE_THRESHOLD, 0x01010101);
         OUT_CS_REG(R500_RB3D_DISCARD_SRC_PIXEL_GTE_THRESHOLD, 0xFEFEFEFE);
     }
+
     OUT_CS_REG(R300_ZB_BW_CNTL, 0x00000000);
     OUT_CS_REG(R300_ZB_DEPTHCLEARVALUE, 0x00000000);
     OUT_CS_REG(R300_ZB_HIZ_OFFSET, 0x00000000);
diff --git a/src/gallium/drivers/r300/r300_texture.c b/src/gallium/drivers/r300/r300_texture.c
index 9a96206a4d..1f73f74c26 100644
--- a/src/gallium/drivers/r300/r300_texture.c
+++ b/src/gallium/drivers/r300/r300_texture.c
@@ -30,10 +30,23 @@
 #include "r300_texture.h"
 #include "r300_screen.h"
 
-static void r300_setup_texture_state(struct r300_texture* tex, boolean is_r500)
+#define TILE_WIDTH 0
+#define TILE_HEIGHT 1
+
+static const unsigned microblock_table[5][3][2] = {
+    /*linear  tiled   square-tiled */
+    {{32, 1}, {8, 4}, {0, 0}}, /*   8 bits per pixel */
+    {{16, 1}, {8, 2}, {4, 4}}, /*  16 bits per pixel */
+    {{ 8, 1}, {4, 2}, {0, 0}}, /*  32 bits per pixel */
+    {{ 4, 1}, {0, 0}, {2, 2}}, /*  64 bits per pixel */
+    {{ 2, 1}, {0, 0}, {0, 0}}  /* 128 bits per pixel */
+};
+
+static void r300_setup_texture_state(struct r300_screen* screen, struct r300_texture* tex)
 {
     struct r300_texture_state* state = &tex->state;
     struct pipe_texture *pt = &tex->tex;
+    boolean is_r500 = screen->caps->is_r500;
 
     state->format0 = R300_TX_WIDTH((pt->width0 - 1) & 0x7ff) |
                      R300_TX_HEIGHT((pt->height0 - 1) & 0x7ff);
@@ -67,8 +80,8 @@ static void r300_setup_texture_state(struct r300_texture* tex, boolean is_r500)
     }
     assert(is_r500 || (pt->width0 <= 2048 && pt->height0 <= 2048));
 
-    debug_printf("r300: Set texture state (%dx%d, %d levels)\n",
-		 pt->width0, pt->height0, pt->last_level);
+    SCREEN_DBG(screen, DBG_TEX, "r300: Set texture state (%dx%d, %d levels)\n",
+               pt->width0, pt->height0, pt->last_level);
 }
 
 unsigned r300_texture_get_offset(struct r300_texture* tex, unsigned level,
@@ -92,33 +105,78 @@ unsigned r300_texture_get_offset(struct r300_texture* tex, unsigned level,
 }
 
 /**
+ * Return the width (dim==TILE_WIDTH) or height (dim==TILE_HEIGHT) of one tile
+ * of the given texture.
+ */
+static unsigned r300_texture_get_tile_size(struct r300_texture* tex, int dim)
+{
+    unsigned pixsize, tile_size;
+
+    pixsize = util_format_get_blocksize(tex->tex.format);
+    tile_size = microblock_table[util_logbase2(pixsize)][tex->microtile][dim] *
+                (tex->macrotile == R300_BUFFER_TILED ? 8 : 1);
+
+    assert(tile_size);
+    return tile_size;
+}
+
+/**
  * Return the stride, in bytes, of the texture images of the given texture
  * at the given level.
  */
-unsigned r300_texture_get_stride(struct r300_texture* tex, unsigned level)
+unsigned r300_texture_get_stride(struct r300_screen* screen,
+                                 struct r300_texture* tex, unsigned level)
 {
+    unsigned tile_width, width;
+
     if (tex->stride_override)
         return tex->stride_override;
 
+    /* Check the level. */
     if (level > tex->tex.last_level) {
-        debug_printf("%s: level (%u) > last_level (%u)\n", __FUNCTION__,
-            level, tex->tex.last_level);
+        SCREEN_DBG(screen, DBG_TEX, "%s: level (%u) > last_level (%u)\n",
+                   __FUNCTION__, level, tex->tex.last_level);
         return 0;
     }
 
-    return align(util_format_get_stride(tex->tex.format, u_minify(tex->tex.width0, level)), 32);
+    width = u_minify(tex->tex.width0, level);
+
+    if (!util_format_is_compressed(tex->tex.format)) {
+        tile_width = r300_texture_get_tile_size(tex, TILE_WIDTH);
+        width = align(width, tile_width);
+        return util_format_get_stride(tex->tex.format, width);
+    } else {
+        return align(util_format_get_stride(tex->tex.format, width), 32);
+    }
+}
+
+static unsigned r300_texture_get_nblocksy(struct r300_texture* tex,
+                                          unsigned level)
+{
+    unsigned height, tile_height;
+
+    height = u_minify(tex->tex.height0, level);
+
+    if (!util_format_is_compressed(tex->tex.format)) {
+        tile_height = r300_texture_get_tile_size(tex, TILE_HEIGHT);
+        height = align(height, tile_height);
+    }
+
+    return util_format_get_nblocksy(tex->tex.format, height);
 }
 
-static void r300_setup_miptree(struct r300_texture* tex)
+static void r300_setup_miptree(struct r300_screen* screen,
+                               struct r300_texture* tex)
 {
     struct pipe_texture* base = &tex->tex;
-    int stride, size, layer_size;
-    int i;
+    unsigned stride, size, layer_size, nblocksy, i;
 
-    for (i = 0; i <= base->last_level; i++) {
-        unsigned nblocksy = util_format_get_nblocksy(base->format, u_minify(base->height0, i));
+    SCREEN_DBG(screen, DBG_TEX, "r300: Making miptree for texture, format %s\n",
+               pf_name(base->format));
 
-        stride = r300_texture_get_stride(tex, i);
+    for (i = 0; i <= base->last_level; i++) {
+        stride = r300_texture_get_stride(screen, tex, i);
+        nblocksy = r300_texture_get_nblocksy(tex, i);
         layer_size = stride * nblocksy;
 
         if (base->target == PIPE_TEXTURE_CUBE)
@@ -131,10 +189,10 @@ static void r300_setup_miptree(struct r300_texture* tex)
         tex->layer_size[i] = layer_size;
         tex->pitch[i] = stride / util_format_get_blocksize(base->format);
 
-        debug_printf("r300: Texture miptree: Level %d "
-                "(%dx%dx%d px, pitch %d bytes)\n",
+        SCREEN_DBG(screen, DBG_TEX, "r300: Texture miptree: Level %d "
+                "(%dx%dx%d px, pitch %d bytes) %d bytes total\n",
                 i, u_minify(base->width0, i), u_minify(base->height0, i),
-                u_minify(base->depth0, i), stride);
+                u_minify(base->depth0, i), stride, tex->size);
     }
 }
 
@@ -150,6 +208,7 @@ static struct pipe_texture*
                         const struct pipe_texture* template)
 {
     struct r300_texture* tex = CALLOC_STRUCT(r300_texture);
+    struct r300_screen* rscreen = r300_screen(screen);
 
     if (!tex) {
         return NULL;
@@ -160,10 +219,10 @@ static struct pipe_texture*
     tex->tex.screen = screen;
 
     r300_setup_flags(tex);
-    r300_setup_miptree(tex);
-    r300_setup_texture_state(tex, r300_screen(screen)->caps->is_r500);
+    r300_setup_miptree(rscreen, tex);
+    r300_setup_texture_state(rscreen, tex);
 
-    tex->buffer = screen->buffer_create(screen, 1024,
+    tex->buffer = screen->buffer_create(screen, 2048,
                                         PIPE_BUFFER_USAGE_PIXEL,
                                         tex->size);
 
@@ -227,6 +286,7 @@ static struct pipe_texture*
                          struct pipe_buffer* buffer)
 {
     struct r300_texture* tex;
+    struct r300_screen* rscreen = r300_screen(screen);
 
     /* Support only 2D textures without mipmaps */
     if (base->target != PIPE_TEXTURE_2D ||
@@ -248,7 +308,7 @@ static struct pipe_texture*
     tex->pitch[0] = *stride / util_format_get_blocksize(base->format);
 
     r300_setup_flags(tex);
-    r300_setup_texture_state(tex, r300_screen(screen)->caps->is_r500);
+    r300_setup_texture_state(rscreen, tex);
 
     pipe_buffer_reference(&tex->buffer, buffer);
 
@@ -315,7 +375,8 @@ void r300_init_screen_texture_functions(struct pipe_screen* screen)
     screen->video_surface_destroy= r300_video_surface_destroy;
 }
 
-boolean r300_get_texture_buffer(struct pipe_texture* texture,
+boolean r300_get_texture_buffer(struct pipe_screen* screen,
+                                struct pipe_texture* texture,
                                 struct pipe_buffer** buffer,
                                 unsigned* stride)
 {
@@ -327,7 +388,7 @@ boolean r300_get_texture_buffer(struct pipe_texture* texture,
     pipe_buffer_reference(buffer, tex->buffer);
 
     if (stride) {
-        *stride = r300_texture_get_stride(tex, 0);
+        *stride = r300_texture_get_stride(r300_screen(screen), tex, 0);
     }
 
     return TRUE;
diff --git a/src/gallium/drivers/r300/r300_texture.h b/src/gallium/drivers/r300/r300_texture.h
index 55ceb1a513..1be1e6843c 100644
--- a/src/gallium/drivers/r300/r300_texture.h
+++ b/src/gallium/drivers/r300/r300_texture.h
@@ -31,7 +31,8 @@ struct r300_texture;
 
 void r300_init_screen_texture_functions(struct pipe_screen* screen);
 
-unsigned r300_texture_get_stride(struct r300_texture* tex, unsigned level);
+unsigned r300_texture_get_stride(struct r300_screen* screen,
+                                 struct r300_texture* tex, unsigned level);
 
 unsigned r300_texture_get_offset(struct r300_texture* tex, unsigned level,
                                  unsigned zslice, unsigned face);
@@ -115,7 +116,8 @@ r300_video_surface(struct pipe_video_surface *pvs)
 
 #ifndef R300_WINSYS_H
 
-boolean r300_get_texture_buffer(struct pipe_texture* texture,
+boolean r300_get_texture_buffer(struct pipe_screen* screen,
+                                struct pipe_texture* texture,
                                 struct pipe_buffer** buffer,
                                 unsigned* stride);
 
diff --git a/src/gallium/drivers/r300/r300_vs.c b/src/gallium/drivers/r300/r300_vs.c
index 68aef70872..9fbb830047 100644
--- a/src/gallium/drivers/r300/r300_vs.c
+++ b/src/gallium/drivers/r300/r300_vs.c
@@ -124,7 +124,8 @@ static void r300_shader_vap_output_fmt(struct r300_vertex_shader* vs)
 
     /* Colors. */
     for (i = 0; i < ATTR_COLOR_COUNT; i++) {
-        if (vs_outputs->color[i] != ATTR_UNUSED || any_bcolor_used) {
+        if (vs_outputs->color[i] != ATTR_UNUSED || any_bcolor_used ||
+            vs_outputs->color[1] != ATTR_UNUSED) {
             hwfmt[1] |= R300_INPUT_CNTL_COLOR;
             hwfmt[2] |= R300_VAP_OUTPUT_VTX_FMT_0__COLOR_0_PRESENT << i;
         }
@@ -182,7 +183,8 @@ static void r300_stream_locations_notcl(
 
     /* Colors. */
     for (i = 0; i < ATTR_COLOR_COUNT; i++) {
-        if (vs_outputs->color[i] != ATTR_UNUSED || any_bcolor_used) {
+        if (vs_outputs->color[i] != ATTR_UNUSED || any_bcolor_used ||
+            vs_outputs->color[1] != ATTR_UNUSED) {
             stream_loc[tabi++] = 2 + i;
         }
     }
@@ -259,7 +261,8 @@ static void set_vertex_inputs_outputs(struct r300_vertex_program_compiler * c)
     for (i = 0; i < ATTR_COLOR_COUNT; i++) {
         if (outputs->color[i] != ATTR_UNUSED) {
             c->code->outputs[outputs->color[i]] = reg++;
-        } else if (any_bcolor_used) {
+        } else if (any_bcolor_used ||
+                   outputs->color[1] != ATTR_UNUSED) {
             reg++;
         }
     }
diff --git a/src/gallium/drivers/r300/r300_winsys.h b/src/gallium/drivers/r300/r300_winsys.h
index 1ae6de70fe..bdb8b54bab 100644
--- a/src/gallium/drivers/r300/r300_winsys.h
+++ b/src/gallium/drivers/r300/r300_winsys.h
@@ -40,7 +40,8 @@ extern "C" {
 struct pipe_context* r300_create_context(struct pipe_screen* screen,
                                          struct radeon_winsys* radeon_winsys);
 
-boolean r300_get_texture_buffer(struct pipe_texture* texture,
+boolean r300_get_texture_buffer(struct pipe_screen* screen,
+                                struct pipe_texture* texture,
                                 struct pipe_buffer** buffer,
                                 unsigned* stride);
 
diff --git a/src/gallium/drivers/softpipe/Makefile b/src/gallium/drivers/softpipe/Makefile
index bcb887a0b2..e4ac49fa85 100644
--- a/src/gallium/drivers/softpipe/Makefile
+++ b/src/gallium/drivers/softpipe/Makefile
@@ -32,6 +32,7 @@ C_SOURCES = \
 	sp_tex_tile_cache.c \
 	sp_tile_cache.c \
 	sp_surface.c \
-	sp_video_context.c
+	sp_video_context.c \
+	sp_winsys.c
 
 include ../../Makefile.template
diff --git a/src/gallium/drivers/softpipe/SConscript b/src/gallium/drivers/softpipe/SConscript
index aac9edf44e..3042e556c6 100644
--- a/src/gallium/drivers/softpipe/SConscript
+++ b/src/gallium/drivers/softpipe/SConscript
@@ -34,6 +34,7 @@ softpipe = env.ConvenienceLibrary(
 		'sp_texture.c',
 		'sp_tile_cache.c',
 		'sp_video_context.c',
+		'sp_winsys.c'
 	])
 
 Export('softpipe')
diff --git a/src/gallium/drivers/softpipe/sp_context.c b/src/gallium/drivers/softpipe/sp_context.c
index f3ac6760db..8e01793940 100644
--- a/src/gallium/drivers/softpipe/sp_context.c
+++ b/src/gallium/drivers/softpipe/sp_context.c
@@ -113,8 +113,8 @@ softpipe_destroy( struct pipe_context *pipe )
    }
 
    for (i = 0; i < Elements(softpipe->constants); i++) {
-      if (softpipe->constants[i].buffer) {
-         pipe_buffer_reference(&softpipe->constants[i].buffer, NULL);
+      if (softpipe->constants[i]) {
+         pipe_buffer_reference(&softpipe->constants[i], NULL);
       }
    }
 
@@ -256,6 +256,8 @@ softpipe_create( struct pipe_screen *screen )
    softpipe->pipe.draw_arrays = softpipe_draw_arrays;
    softpipe->pipe.draw_elements = softpipe_draw_elements;
    softpipe->pipe.draw_range_elements = softpipe_draw_range_elements;
+   softpipe->pipe.draw_arrays_instanced = softpipe_draw_arrays_instanced;
+   softpipe->pipe.draw_elements_instanced = softpipe_draw_elements_instanced;
 
    softpipe->pipe.clear = softpipe_clear;
    softpipe->pipe.flush = softpipe_flush;
diff --git a/src/gallium/drivers/softpipe/sp_context.h b/src/gallium/drivers/softpipe/sp_context.h
index 73fa744f9d..da673c57ad 100644
--- a/src/gallium/drivers/softpipe/sp_context.h
+++ b/src/gallium/drivers/softpipe/sp_context.h
@@ -63,7 +63,7 @@ struct softpipe_context {
    /** Other rendering state */
    struct pipe_blend_color blend_color;
    struct pipe_clip_state clip;
-   struct pipe_constant_buffer constants[PIPE_SHADER_TYPES];
+   struct pipe_buffer *constants[PIPE_SHADER_TYPES];
    struct pipe_framebuffer_state framebuffer;
    struct pipe_poly_stipple poly_stipple;
    struct pipe_scissor_state scissor;
diff --git a/src/gallium/drivers/softpipe/sp_draw_arrays.c b/src/gallium/drivers/softpipe/sp_draw_arrays.c
index 03d35fb3cb..03b58d2fb7 100644
--- a/src/gallium/drivers/softpipe/sp_draw_arrays.c
+++ b/src/gallium/drivers/softpipe/sp_draw_arrays.c
@@ -52,18 +52,18 @@ softpipe_map_constant_buffers(struct softpipe_context *sp)
    uint i, vssize, gssize;
 
    for (i = 0; i < PIPE_SHADER_TYPES; i++) {
-      if (sp->constants[i].buffer && sp->constants[i].buffer->size)
-         sp->mapped_constants[i] = ws->buffer_map(ws, sp->constants[i].buffer,
+      if (sp->constants[i] && sp->constants[i]->size)
+         sp->mapped_constants[i] = ws->buffer_map(ws, sp->constants[i],
                                                   PIPE_BUFFER_USAGE_CPU_READ);
    }
 
-   if (sp->constants[PIPE_SHADER_VERTEX].buffer)
-      vssize = sp->constants[PIPE_SHADER_VERTEX].buffer->size;
+   if (sp->constants[PIPE_SHADER_VERTEX])
+      vssize = sp->constants[PIPE_SHADER_VERTEX]->size;
    else
       vssize = 0;
 
-   if (sp->constants[PIPE_SHADER_GEOMETRY].buffer)
-      gssize = sp->constants[PIPE_SHADER_GEOMETRY].buffer->size;
+   if (sp->constants[PIPE_SHADER_GEOMETRY])
+      gssize = sp->constants[PIPE_SHADER_GEOMETRY]->size;
    else
       gssize = 0;
 
@@ -91,26 +91,48 @@ softpipe_unmap_constant_buffers(struct softpipe_context *sp)
    draw_set_mapped_constant_buffer(sp->draw, PIPE_SHADER_GEOMETRY, NULL, 0);
 
    for (i = 0; i < PIPE_SHADER_TYPES; i++) {
-      if (sp->constants[i].buffer && sp->constants[i].buffer->size)
-         ws->buffer_unmap(ws, sp->constants[i].buffer);
+      if (sp->constants[i] && sp->constants[i]->size)
+         ws->buffer_unmap(ws, sp->constants[i]);
       sp->mapped_constants[i] = NULL;
    }
 }
 
 
+/**
+ * Draw vertex arrays, with optional indexing.
+ * Basically, map the vertex buffers (and drawing surfaces), then hand off
+ * the drawing to the 'draw' module.
+ */
+static void
+softpipe_draw_range_elements_instanced(struct pipe_context *pipe,
+                                       struct pipe_buffer *indexBuffer,
+                                       unsigned indexSize,
+                                       unsigned minIndex,
+                                       unsigned maxIndex,
+                                       unsigned mode,
+                                       unsigned start,
+                                       unsigned count,
+                                       unsigned startInstance,
+                                       unsigned instanceCount);
+
+
 void
 softpipe_draw_arrays(struct pipe_context *pipe, unsigned mode,
                      unsigned start, unsigned count)
 {
-   softpipe_draw_elements(pipe, NULL, 0, mode, start, count);
+   softpipe_draw_range_elements_instanced(pipe,
+                                          NULL,
+                                          0,
+                                          0,
+                                          0xffffffff,
+                                          mode,
+                                          start,
+                                          count,
+                                          0,
+                                          1);
 }
 
 
-/**
- * Draw vertex arrays, with optional indexing.
- * Basically, map the vertex buffers (and drawing surfaces), then hand off
- * the drawing to the 'draw' module.
- */
 void
 softpipe_draw_range_elements(struct pipe_context *pipe,
                              struct pipe_buffer *indexBuffer,
@@ -119,6 +141,91 @@ softpipe_draw_range_elements(struct pipe_context *pipe,
                              unsigned max_index,
                              unsigned mode, unsigned start, unsigned count)
 {
+   softpipe_draw_range_elements_instanced(pipe,
+                                          indexBuffer,
+                                          indexSize,
+                                          min_index,
+                                          max_index,
+                                          mode,
+                                          start,
+                                          count,
+                                          0,
+                                          1);
+}
+
+
+void
+softpipe_draw_elements(struct pipe_context *pipe,
+                       struct pipe_buffer *indexBuffer,
+                       unsigned indexSize,
+                       unsigned mode, unsigned start, unsigned count)
+{
+   softpipe_draw_range_elements_instanced(pipe,
+                                          indexBuffer,
+                                          indexSize,
+                                          0,
+                                          0xffffffff,
+                                          mode,
+                                          start,
+                                          count,
+                                          0,
+                                          1);
+}
+
+void
+softpipe_draw_arrays_instanced(struct pipe_context *pipe,
+                               unsigned mode,
+                               unsigned start,
+                               unsigned count,
+                               unsigned startInstance,
+                               unsigned instanceCount)
+{
+   softpipe_draw_range_elements_instanced(pipe,
+                                          NULL,
+                                          0,
+                                          0,
+                                          0xffffffff,
+                                          mode,
+                                          start,
+                                          count,
+                                          startInstance,
+                                          instanceCount);
+}
+
+void
+softpipe_draw_elements_instanced(struct pipe_context *pipe,
+                                 struct pipe_buffer *indexBuffer,
+                                 unsigned indexSize,
+                                 unsigned mode,
+                                 unsigned start,
+                                 unsigned count,
+                                 unsigned startInstance,
+                                 unsigned instanceCount)
+{
+   softpipe_draw_range_elements_instanced(pipe,
+                                          indexBuffer,
+                                          indexSize,
+                                          0,
+                                          0xffffffff,
+                                          mode,
+                                          start,
+                                          count,
+                                          startInstance,
+                                          instanceCount);
+}
+
+static void
+softpipe_draw_range_elements_instanced(struct pipe_context *pipe,
+                                       struct pipe_buffer *indexBuffer,
+                                       unsigned indexSize,
+                                       unsigned minIndex,
+                                       unsigned maxIndex,
+                                       unsigned mode,
+                                       unsigned start,
+                                       unsigned count,
+                                       unsigned startInstance,
+                                       unsigned instanceCount)
+{
    struct softpipe_context *sp = softpipe_context(pipe);
    struct draw_context *draw = sp->draw;
    unsigned i;
@@ -128,45 +235,48 @@ softpipe_draw_range_elements(struct pipe_context *pipe,
 
    sp->reduced_api_prim = u_reduced_prim(mode);
 
-   if (sp->dirty)
-      softpipe_update_derived( sp );
+   if (sp->dirty) {
+      softpipe_update_derived(sp);
+   }
 
    softpipe_map_transfers(sp);
    softpipe_map_constant_buffers(sp);
 
-   /*
-    * Map vertex buffers
-    */
+   /* Map vertex buffers */
    for (i = 0; i < sp->num_vertex_buffers; i++) {
-      void *buf
-         = pipe_buffer_map(pipe->screen,
-                                    sp->vertex_buffer[i].buffer,
-                                    PIPE_BUFFER_USAGE_CPU_READ);
+      void *buf;
+
+      buf = pipe_buffer_map(pipe->screen,
+                            sp->vertex_buffer[i].buffer,
+                            PIPE_BUFFER_USAGE_CPU_READ);
       draw_set_mapped_vertex_buffer(draw, i, buf);
    }
 
    /* Map index buffer, if present */
    if (indexBuffer) {
-      void *mapped_indexes
-         = pipe_buffer_map(pipe->screen, indexBuffer,
-                                    PIPE_BUFFER_USAGE_CPU_READ);
-      draw_set_mapped_element_buffer_range(draw, indexSize,
-                                           min_index,
-                                           max_index,
+      void *mapped_indexes;
+
+      mapped_indexes = pipe_buffer_map(pipe->screen,
+                                       indexBuffer,
+                                       PIPE_BUFFER_USAGE_CPU_READ);
+      draw_set_mapped_element_buffer_range(draw,
+                                           indexSize,
+                                           minIndex,
+                                           maxIndex,
                                            mapped_indexes);
-   }
-   else {
+   } else {
       /* no index/element buffer */
-      draw_set_mapped_element_buffer_range(draw, 0, start,
-                                           start + count - 1, NULL);
+      draw_set_mapped_element_buffer_range(draw,
+                                           0,
+                                           start,
+                                           start + count - 1,
+                                           NULL);
    }
 
    /* draw! */
-   draw_arrays(draw, mode, start, count);
+   draw_arrays_instanced(draw, mode, start, count, startInstance, instanceCount);
 
-   /*
-    * unmap vertex/index buffers - will cause draw module to flush
-    */
+   /* unmap vertex/index buffers - will cause draw module to flush */
    for (i = 0; i < sp->num_vertex_buffers; i++) {
       draw_set_mapped_vertex_buffer(draw, i, NULL);
       pipe_buffer_unmap(pipe->screen, sp->vertex_buffer[i].buffer);
@@ -176,22 +286,8 @@ softpipe_draw_range_elements(struct pipe_context *pipe,
       pipe_buffer_unmap(pipe->screen, indexBuffer);
    }
 
-
    /* Note: leave drawing surfaces mapped */
    softpipe_unmap_constant_buffers(sp);
 
    sp->dirty_render_cache = TRUE;
 }
-
-
-void
-softpipe_draw_elements(struct pipe_context *pipe,
-                       struct pipe_buffer *indexBuffer,
-                       unsigned indexSize,
-                       unsigned mode, unsigned start, unsigned count)
-{
-   softpipe_draw_range_elements( pipe, indexBuffer,
-                                 indexSize,
-                                 0, 0xffffffff,
-                                 mode, start, count );
-}
diff --git a/src/gallium/drivers/softpipe/sp_prim_vbuf.c b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
index 7f573aef3c..5812d1eefe 100644
--- a/src/gallium/drivers/softpipe/sp_prim_vbuf.c
+++ b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
@@ -526,6 +526,7 @@ static void
 sp_vbuf_destroy(struct vbuf_render *vbr)
 {
    struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr);
+   align_free(cvbr->vertex_buffer);
    sp_setup_destroy_context(cvbr->setup);
    FREE(cvbr);
 }
@@ -541,7 +542,6 @@ sp_create_vbuf_backend(struct softpipe_context *sp)
 
    assert(sp->draw);
 
-
    cvbr->base.max_indices = SP_MAX_VBUF_INDEXES;
    cvbr->base.max_vertex_buffer_bytes = SP_MAX_VBUF_SIZE;
 
diff --git a/src/gallium/drivers/softpipe/sp_state.h b/src/gallium/drivers/softpipe/sp_state.h
index 9b18dac67b..7f244c4fd4 100644
--- a/src/gallium/drivers/softpipe/sp_state.h
+++ b/src/gallium/drivers/softpipe/sp_state.h
@@ -139,7 +139,7 @@ void softpipe_set_clip_state( struct pipe_context *,
 
 void softpipe_set_constant_buffer(struct pipe_context *,
                                   uint shader, uint index,
-                                  const struct pipe_constant_buffer *buf);
+                                  struct pipe_buffer *buf);
 
 void *softpipe_create_fs_state(struct pipe_context *,
                                const struct pipe_shader_state *);
@@ -200,6 +200,24 @@ softpipe_draw_range_elements(struct pipe_context *pipe,
                              unsigned mode, unsigned start, unsigned count);
 
 void
+softpipe_draw_arrays_instanced(struct pipe_context *pipe,
+                               unsigned mode,
+                               unsigned start,
+                               unsigned count,
+                               unsigned startInstance,
+                               unsigned instanceCount);
+
+void
+softpipe_draw_elements_instanced(struct pipe_context *pipe,
+                                 struct pipe_buffer *indexBuffer,
+                                 unsigned indexSize,
+                                 unsigned mode,
+                                 unsigned start,
+                                 unsigned count,
+                                 unsigned startInstance,
+                                 unsigned instanceCount);
+
+void
 softpipe_map_transfers(struct softpipe_context *sp);
 
 void
diff --git a/src/gallium/drivers/softpipe/sp_state_fs.c b/src/gallium/drivers/softpipe/sp_state_fs.c
index aa12bb215a..b7ed4441b4 100644
--- a/src/gallium/drivers/softpipe/sp_state_fs.c
+++ b/src/gallium/drivers/softpipe/sp_state_fs.c
@@ -159,7 +159,7 @@ softpipe_delete_vs_state(struct pipe_context *pipe, void *vs)
 void
 softpipe_set_constant_buffer(struct pipe_context *pipe,
                              uint shader, uint index,
-                             const struct pipe_constant_buffer *buf)
+                             struct pipe_buffer *buf)
 {
    struct softpipe_context *softpipe = softpipe_context(pipe);
 
@@ -169,8 +169,7 @@ softpipe_set_constant_buffer(struct pipe_context *pipe,
    draw_flush(softpipe->draw);
 
    /* note: reference counting */
-   pipe_buffer_reference(&softpipe->constants[shader].buffer,
-			 buf ? buf->buffer : NULL);
+   pipe_buffer_reference(&softpipe->constants[shader], buf);
 
    softpipe->dirty |= SP_NEW_CONSTANTS;
 }
diff --git a/src/gallium/drivers/softpipe/sp_texture.c b/src/gallium/drivers/softpipe/sp_texture.c
index a9436a3394..fae72c81aa 100644
--- a/src/gallium/drivers/softpipe/sp_texture.c
+++ b/src/gallium/drivers/softpipe/sp_texture.c
@@ -57,13 +57,8 @@ softpipe_texture_layout(struct pipe_screen *screen,
    unsigned width = pt->width0;
    unsigned height = pt->height0;
    unsigned depth = pt->depth0;
-
    unsigned buffer_size = 0;
 
-   pt->width0 = width;
-   pt->height0 = height;
-   pt->depth0 = depth;
-
    for (level = 0; level <= pt->last_level; level++) {
       spt->stride[level] = util_format_get_stride(pt->format, width);
 
diff --git a/src/gallium/drivers/softpipe/sp_winsys.c b/src/gallium/drivers/softpipe/sp_winsys.c
new file mode 100644
index 0000000000..8169071dc9
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_winsys.c
@@ -0,0 +1,245 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Bismarck, ND., USA
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * 
+ **************************************************************************/
+
+/**
+ * @file
+ * Malloc softpipe winsys. Uses malloc for all memory allocations.
+ * 
+ * @author Keith Whitwell
+ * @author Brian Paul
+ * @author Jose Fonseca
+ */
+
+
+#include "pipe/internal/p_winsys_screen.h"/* port to just p_screen */
+#include "pipe/p_format.h"
+#include "pipe/p_context.h"
+#include "pipe/p_inlines.h"
+#include "util/u_format.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "softpipe/sp_winsys.h"
+
+
+struct st_softpipe_buffer
+{
+   struct pipe_buffer base;
+   boolean userBuffer;  /** Is this a user-space buffer? */
+   void *data;
+   void *mapped;
+};
+
+
+/** Cast wrapper */
+static INLINE struct st_softpipe_buffer *
+st_softpipe_buffer( struct pipe_buffer *buf )
+{
+   return (struct st_softpipe_buffer *)buf;
+}
+
+
+static void *
+st_softpipe_buffer_map(struct pipe_winsys *winsys, 
+                       struct pipe_buffer *buf,
+                       unsigned flags)
+{
+   struct st_softpipe_buffer *st_softpipe_buf = st_softpipe_buffer(buf);
+   st_softpipe_buf->mapped = st_softpipe_buf->data;
+   return st_softpipe_buf->mapped;
+}
+
+
+static void
+st_softpipe_buffer_unmap(struct pipe_winsys *winsys, 
+                         struct pipe_buffer *buf)
+{
+   struct st_softpipe_buffer *st_softpipe_buf = st_softpipe_buffer(buf);
+   st_softpipe_buf->mapped = NULL;
+}
+
+
+static void
+st_softpipe_buffer_destroy(struct pipe_buffer *buf)
+{
+   struct st_softpipe_buffer *oldBuf = st_softpipe_buffer(buf);
+
+   if (oldBuf->data) {
+      if (!oldBuf->userBuffer)
+         align_free(oldBuf->data);
+
+      oldBuf->data = NULL;
+   }
+
+   FREE(oldBuf);
+}
+
+
+static void
+st_softpipe_flush_frontbuffer(struct pipe_winsys *winsys,
+                              struct pipe_surface *surf,
+                              void *context_private)
+{
+}
+
+
+
+static const char *
+st_softpipe_get_name(struct pipe_winsys *winsys)
+{
+   return "softpipe";
+}
+
+
+static struct pipe_buffer *
+st_softpipe_buffer_create(struct pipe_winsys *winsys, 
+                          unsigned alignment, 
+                          unsigned usage,
+                          unsigned size)
+{
+   struct st_softpipe_buffer *buffer = CALLOC_STRUCT(st_softpipe_buffer);
+
+   pipe_reference_init(&buffer->base.reference, 1);
+   buffer->base.alignment = alignment;
+   buffer->base.usage = usage;
+   buffer->base.size = size;
+
+   buffer->data = align_malloc(size, alignment);
+
+   return &buffer->base;
+}
+
+
+/**
+ * Create buffer which wraps user-space data.
+ */
+static struct pipe_buffer *
+st_softpipe_user_buffer_create(struct pipe_winsys *winsys, 
+                               void *ptr, 
+                               unsigned bytes)
+{
+   struct st_softpipe_buffer *buffer;
+   
+   buffer = CALLOC_STRUCT(st_softpipe_buffer);
+   if(!buffer)
+      return NULL;
+   
+   pipe_reference_init(&buffer->base.reference, 1);
+   buffer->base.size = bytes;
+   buffer->userBuffer = TRUE;
+   buffer->data = ptr;
+
+   return &buffer->base;
+}
+
+
+static struct pipe_buffer *
+st_softpipe_surface_buffer_create(struct pipe_winsys *winsys,
+                                  unsigned width, unsigned height,
+                                  enum pipe_format format,
+                                  unsigned usage,
+                                  unsigned tex_usage,
+                                  unsigned *stride)
+{
+   const unsigned alignment = 64;
+   unsigned nblocksy;
+
+   nblocksy = util_format_get_nblocksy(format, height);
+   *stride = align(util_format_get_stride(format, width), alignment);
+
+   return winsys->buffer_create(winsys, alignment,
+                                usage,
+                                *stride * nblocksy);
+}
+
+
+static void
+st_softpipe_fence_reference(struct pipe_winsys *winsys, 
+                            struct pipe_fence_handle **ptr,
+                            struct pipe_fence_handle *fence)
+{
+}
+
+
+static int
+st_softpipe_fence_signalled(struct pipe_winsys *winsys, 
+                            struct pipe_fence_handle *fence,
+                            unsigned flag)
+{
+   return 0;
+}
+
+
+static int
+st_softpipe_fence_finish(struct pipe_winsys *winsys, 
+                         struct pipe_fence_handle *fence,
+                         unsigned flag)
+{
+   return 0;
+}
+
+
+static void
+st_softpipe_destroy(struct pipe_winsys *winsys)
+{
+   FREE(winsys);
+}
+
+
+struct pipe_screen *
+softpipe_create_screen_malloc(void)
+{
+   static struct pipe_winsys *winsys;
+   struct pipe_screen *screen;
+
+   winsys = CALLOC_STRUCT(pipe_winsys);
+   if(!winsys)
+      return NULL;
+
+   winsys->destroy = st_softpipe_destroy;
+   
+   winsys->buffer_create = st_softpipe_buffer_create;
+   winsys->user_buffer_create = st_softpipe_user_buffer_create;
+   winsys->buffer_map = st_softpipe_buffer_map;
+   winsys->buffer_unmap = st_softpipe_buffer_unmap;
+   winsys->buffer_destroy = st_softpipe_buffer_destroy;
+
+   winsys->surface_buffer_create = st_softpipe_surface_buffer_create;
+
+   winsys->fence_reference = st_softpipe_fence_reference;
+   winsys->fence_signalled = st_softpipe_fence_signalled;
+   winsys->fence_finish = st_softpipe_fence_finish;
+
+   winsys->flush_frontbuffer = st_softpipe_flush_frontbuffer;
+   winsys->get_name = st_softpipe_get_name;
+
+   screen = softpipe_create_screen(winsys);
+   if(!screen)
+      st_softpipe_destroy(winsys);
+
+   return screen;
+}
diff --git a/src/gallium/drivers/softpipe/sp_winsys.h b/src/gallium/drivers/softpipe/sp_winsys.h
index f203ded29e..3042e01a05 100644
--- a/src/gallium/drivers/softpipe/sp_winsys.h
+++ b/src/gallium/drivers/softpipe/sp_winsys.h
@@ -49,10 +49,17 @@ struct pipe_buffer;
 
 struct pipe_context *softpipe_create( struct pipe_screen * );
 
+/**
+ * Create a softpipe screen that uses the
+ * given winsys for allocating buffers.
+ */
+struct pipe_screen *softpipe_create_screen( struct pipe_winsys * );
 
-struct pipe_screen *
-softpipe_create_screen(struct pipe_winsys *);
-
+/**
+ * Create a softpipe screen that uses
+ * regular malloc to create all its buffers.
+ */
+struct pipe_screen *softpipe_create_screen_malloc(void);
 
 boolean
 softpipe_get_texture_buffer( struct pipe_texture *texture,
diff --git a/src/gallium/drivers/svga/svga_pipe_constants.c b/src/gallium/drivers/svga/svga_pipe_constants.c
index 10e7a12189..ca2c7c49d7 100644
--- a/src/gallium/drivers/svga/svga_pipe_constants.c
+++ b/src/gallium/drivers/svga/svga_pipe_constants.c
@@ -49,7 +49,7 @@ struct svga_constbuf
 
 static void svga_set_constant_buffer(struct pipe_context *pipe,
                                      uint shader, uint index,
-                                     const struct pipe_constant_buffer *buf)
+                                     struct pipe_buffer *buf)
 {
    struct svga_context *svga = svga_context(pipe);
 
@@ -57,7 +57,7 @@ static void svga_set_constant_buffer(struct pipe_context *pipe,
    assert(index == 0);
 
    pipe_buffer_reference( &svga->curr.cb[shader],
-                          buf->buffer );
+                          buf );
 
    if (shader == PIPE_SHADER_FRAGMENT)
       svga->dirty |= SVGA_NEW_FS_CONST_BUFFER;
diff --git a/src/gallium/drivers/svga/svga_state_fs.c b/src/gallium/drivers/svga/svga_state_fs.c
index d29f3762d2..ec2886348b 100644
--- a/src/gallium/drivers/svga/svga_state_fs.c
+++ b/src/gallium/drivers/svga/svga_state_fs.c
@@ -81,8 +81,10 @@ static enum pipe_error compile_fs( struct svga_context *svga,
    }
 
    result->id = util_bitmask_add(svga->fs_bm);
-   if(result->id == UTIL_BITMASK_INVALID_INDEX)
+   if(result->id == UTIL_BITMASK_INVALID_INDEX) {
+      ret = PIPE_ERROR_OUT_OF_MEMORY;
       goto fail;
+   }
 
    ret = SVGA3D_DefineShader(svga->swc, 
                              result->id,
diff --git a/src/gallium/drivers/svga/svga_state_vs.c b/src/gallium/drivers/svga/svga_state_vs.c
index ae1e77e7d4..e7e6c08432 100644
--- a/src/gallium/drivers/svga/svga_state_vs.c
+++ b/src/gallium/drivers/svga/svga_state_vs.c
@@ -71,7 +71,7 @@ static enum pipe_error compile_vs( struct svga_context *svga,
                                    struct svga_shader_result **out_result )
 {
    struct svga_shader_result *result;
-   enum pipe_error ret = PIPE_OK;
+   enum pipe_error ret = PIPE_ERROR;
 
    result = svga_translate_vertex_program( vs, key );
    if (result == NULL) {
@@ -80,8 +80,10 @@ static enum pipe_error compile_vs( struct svga_context *svga,
    }
 
    result->id = util_bitmask_add(svga->vs_bm);
-   if(result->id == UTIL_BITMASK_INVALID_INDEX)
+   if(result->id == UTIL_BITMASK_INVALID_INDEX) {
+      ret = PIPE_ERROR_OUT_OF_MEMORY;
       goto fail;
+   }
 
    ret = SVGA3D_DefineShader(svga->swc, 
                              result->id,
@@ -200,10 +202,12 @@ static int update_zero_stride( struct svga_context *svga,
 
          key.output_stride = 4 * sizeof(float);
          key.nr_elements = 1;
+         key.element[0].type = TRANSLATE_ELEMENT_NORMAL;
          key.element[0].input_format = vel->src_format;
          key.element[0].output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
          key.element[0].input_buffer = vel->vertex_buffer_index;
          key.element[0].input_offset = vel->src_offset;
+         key.element[0].instance_divisor = vel->instance_divisor;
          key.element[0].output_offset = const_idx * 4 * sizeof(float);
 
          translate_key_sanitize(&key);
@@ -222,7 +226,7 @@ static int update_zero_stride( struct svga_context *svga,
          translate->set_buffer(translate, vel->vertex_buffer_index,
                                mapped_buffer,
                                vbuffer->stride);
-         translate->run(translate, 0, 1,
+         translate->run(translate, 0, 1, 0,
                         svga->curr.zero_stride_constants);
 
          pipe_buffer_unmap(svga->pipe.screen,
diff --git a/src/gallium/drivers/trace/tr_context.c b/src/gallium/drivers/trace/tr_context.c
index 075e4f9a0b..5a9f0fc690 100644
--- a/src/gallium/drivers/trace/tr_context.c
+++ b/src/gallium/drivers/trace/tr_context.c
@@ -812,13 +812,13 @@ trace_context_set_clip_state(struct pipe_context *_pipe,
 static INLINE void
 trace_context_set_constant_buffer(struct pipe_context *_pipe,
                                   uint shader, uint index,
-                                  const struct pipe_constant_buffer *buffer)
+                                  struct pipe_buffer *buffer)
 {
    struct trace_context *tr_ctx = trace_context(_pipe);
    struct pipe_context *pipe = tr_ctx->pipe;
 
    if (buffer)
-      trace_screen_user_buffer_update(_pipe->screen, buffer->buffer);
+      trace_screen_user_buffer_update(_pipe->screen, buffer);
 
    trace_dump_call_begin("pipe_context", "set_constant_buffer");
 
@@ -827,10 +827,11 @@ trace_context_set_constant_buffer(struct pipe_context *_pipe,
    trace_dump_arg(uint, index);
    trace_dump_arg(constant_buffer, buffer);
 
+   /* XXX hmm? */
    if (buffer) {
-      struct pipe_constant_buffer _buffer;
-      _buffer.buffer = trace_buffer_unwrap(tr_ctx, buffer->buffer);
-      pipe->set_constant_buffer(pipe, shader, index, &_buffer);
+      struct pipe_buffer *_buffer;
+      _buffer = trace_buffer_unwrap(tr_ctx, buffer);
+      pipe->set_constant_buffer(pipe, shader, index, _buffer);
    } else {
       pipe->set_constant_buffer(pipe, shader, index, buffer);
    }
diff --git a/src/gallium/drivers/trace/tr_dump_state.c b/src/gallium/drivers/trace/tr_dump_state.c
index 86237e03bc..32f61f8c94 100644
--- a/src/gallium/drivers/trace/tr_dump_state.c
+++ b/src/gallium/drivers/trace/tr_dump_state.c
@@ -229,7 +229,7 @@ void trace_dump_clip_state(const struct pipe_clip_state *state)
 }
 
 
-void trace_dump_constant_buffer(const struct pipe_constant_buffer *state)
+void trace_dump_constant_buffer(const struct pipe_buffer *state)
 {
    if (!trace_dumping_enabled_locked())
       return;
@@ -241,7 +241,7 @@ void trace_dump_constant_buffer(const struct pipe_constant_buffer *state)
 
    trace_dump_struct_begin("pipe_constant_buffer");
 
-   trace_dump_member(buffer_ptr, state, buffer);
+   trace_dump_reference(&state->reference);
 
    trace_dump_struct_end();
 }
diff --git a/src/gallium/drivers/trace/tr_dump_state.h b/src/gallium/drivers/trace/tr_dump_state.h
index 07ad6fbb20..c7860fd6e1 100644
--- a/src/gallium/drivers/trace/tr_dump_state.h
+++ b/src/gallium/drivers/trace/tr_dump_state.h
@@ -47,7 +47,7 @@ void trace_dump_scissor_state(const struct pipe_scissor_state *state);
 
 void trace_dump_clip_state(const struct pipe_clip_state *state);
 
-void trace_dump_constant_buffer(const struct pipe_constant_buffer *state);
+void trace_dump_constant_buffer(const struct pipe_buffer *state);
 
 void trace_dump_token(const struct tgsi_token *token);
 
diff --git a/src/gallium/include/pipe/p_compiler.h b/src/gallium/include/pipe/p_compiler.h
index 26a940593f..272d0308cc 100644
--- a/src/gallium/include/pipe/p_compiler.h
+++ b/src/gallium/include/pipe/p_compiler.h
@@ -104,7 +104,8 @@ typedef unsigned char boolean;
 
 /* Function visibility */
 #ifndef PUBLIC
-#  if defined(__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__) >= 303
+#  if (defined(__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__) >= 303) \
+	|| (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))
 #    define PUBLIC __attribute__((visibility("default")))
 #  else
 #    define PUBLIC
@@ -139,22 +140,40 @@ typedef unsigned char boolean;
 
 
 
-#if defined(__GNUC__)
-#define ALIGN16_DECL(TYPE, NAME, SIZE)  TYPE NAME##___aligned[SIZE] __attribute__(( aligned( 16 ) ))
-#define ALIGN16_ASSIGN(NAME) NAME##___aligned
-#define ALIGN16_ATTRIB  __attribute__(( aligned( 16 ) ))
-#define ALIGN8_ATTRIB  __attribute__(( aligned( 8 ) ))
+/* Macros for data alignment. */
+#if defined(__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))
+
+/* See http://gcc.gnu.org/onlinedocs/gcc-4.4.2/gcc/Type-Attributes.html */
+#define PIPE_ALIGN_TYPE(_alignment, _type) _type __attribute__((aligned(_alignment)))
+
+/* See http://gcc.gnu.org/onlinedocs/gcc-4.4.2/gcc/Variable-Attributes.html */
+#define PIPE_ALIGN_VAR(_alignment) __attribute__((aligned(_alignment)))
+
 #if (__GNUC__ > 4 || (__GNUC__ == 4 &&__GNUC_MINOR__>1)) && !defined(PIPE_ARCH_X86_64)
-#define ALIGN_STACK __attribute__((force_align_arg_pointer))
+#define PIPE_ALIGN_STACK __attribute__((force_align_arg_pointer))
 #else
-#define ALIGN_STACK
+#define PIPE_ALIGN_STACK
 #endif
+
+#elif defined(_MSC_VER)
+
+/* See http://msdn.microsoft.com/en-us/library/83ythb65.aspx */
+#define PIPE_ALIGN_TYPE(_alignment, _type) __declspec(align(_alignment)) _type
+#define PIPE_ALIGN_VAR(_alignment) __declspec(align(_alignment))
+
+#define PIPE_ALIGN_STACK
+
+#elif defined(SWIG)
+
+#define PIPE_ALIGN_TYPE(_alignment, _type) _type
+#define PIPE_ALIGN_VAR(_alignment)
+
+#define PIPE_ALIGN_STACK
+
 #else
-#define ALIGN16_DECL(TYPE, NAME, SIZE)  TYPE NAME##___unaligned[SIZE + 1]
-#define ALIGN16_ASSIGN(NAME) align16(NAME##___unaligned)
-#define ALIGN16_ATTRIB
-#define ALIGN8_ATTRIB
-#define ALIGN_STACK
+
+#error "Unsupported compiler"
+
 #endif
 
 
diff --git a/src/gallium/include/pipe/p_context.h b/src/gallium/include/pipe/p_context.h
index d2f8085b42..0b8f6da2f4 100644
--- a/src/gallium/include/pipe/p_context.h
+++ b/src/gallium/include/pipe/p_context.h
@@ -69,6 +69,22 @@ struct pipe_context {
                           unsigned indexSize,
                           unsigned mode, unsigned start, unsigned count);
 
+   void (*draw_arrays_instanced)(struct pipe_context *pipe,
+                                 unsigned mode,
+                                 unsigned start,
+                                 unsigned count,
+                                 unsigned startInstance,
+                                 unsigned instanceCount);
+
+   void (*draw_elements_instanced)(struct pipe_context *pipe,
+                                   struct pipe_buffer *indexBuffer,
+                                   unsigned indexSize,
+                                   unsigned mode,
+                                   unsigned start,
+                                   unsigned count,
+                                   unsigned startInstance,
+                                   unsigned instanceCount);
+
    /* XXX: this is (probably) a temporary entrypoint, as the range
     * information should be available from the vertex_buffer state.
     * Using this to quickly evaluate a specialized path in the draw
@@ -170,7 +186,7 @@ struct pipe_context {
 
    void (*set_constant_buffer)( struct pipe_context *,
                                 uint shader, uint index,
-                                const struct pipe_constant_buffer *buf );
+                                struct pipe_buffer *buf );
 
    void (*set_framebuffer_state)( struct pipe_context *,
                                   const struct pipe_framebuffer_state * );
diff --git a/src/gallium/include/pipe/p_shader_tokens.h b/src/gallium/include/pipe/p_shader_tokens.h
index 550e2abc32..b489b04466 100644
--- a/src/gallium/include/pipe/p_shader_tokens.h
+++ b/src/gallium/include/pipe/p_shader_tokens.h
@@ -121,17 +121,18 @@ struct tgsi_declaration_range
    unsigned Last    : 16; /**< UINT */
 };
 
-#define TGSI_SEMANTIC_POSITION  0
-#define TGSI_SEMANTIC_COLOR     1
-#define TGSI_SEMANTIC_BCOLOR    2 /**< back-face color */
-#define TGSI_SEMANTIC_FOG       3
-#define TGSI_SEMANTIC_PSIZE     4
-#define TGSI_SEMANTIC_GENERIC   5
-#define TGSI_SEMANTIC_NORMAL    6
-#define TGSI_SEMANTIC_FACE      7
-#define TGSI_SEMANTIC_EDGEFLAG  8
-#define TGSI_SEMANTIC_PRIMID    9
-#define TGSI_SEMANTIC_COUNT    10 /**< number of semantic values */
+#define TGSI_SEMANTIC_POSITION   0
+#define TGSI_SEMANTIC_COLOR      1
+#define TGSI_SEMANTIC_BCOLOR     2  /**< back-face color */
+#define TGSI_SEMANTIC_FOG        3
+#define TGSI_SEMANTIC_PSIZE      4
+#define TGSI_SEMANTIC_GENERIC    5
+#define TGSI_SEMANTIC_NORMAL     6
+#define TGSI_SEMANTIC_FACE       7
+#define TGSI_SEMANTIC_EDGEFLAG   8
+#define TGSI_SEMANTIC_PRIMID     9
+#define TGSI_SEMANTIC_INSTANCEID 10
+#define TGSI_SEMANTIC_COUNT      11 /**< number of semantic values */
 
 struct tgsi_declaration_semantic
 {
diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h
index 60e96b98de..fdd29ed449 100644
--- a/src/gallium/include/pipe/p_state.h
+++ b/src/gallium/include/pipe/p_state.h
@@ -66,10 +66,6 @@ extern "C" {
 #define PIPE_MAX_TEXTURE_LEVELS   16
 
 
-/* fwd decls */
-struct pipe_surface;
-
-
 /**
  * The driver will certainly subclass this to include actual memory
  * management information.
@@ -178,15 +174,6 @@ struct pipe_clip_state
 };
 
 
-/**
- * Constants for vertex/fragment shaders
- */
-struct pipe_constant_buffer
-{
-   struct pipe_buffer *buffer;
-};
-
-
 struct pipe_shader_state
 {
    const struct tgsi_token *tokens;
@@ -376,6 +363,11 @@ struct pipe_vertex_element
    /** Offset of this attribute, in bytes, from the start of the vertex */
    unsigned src_offset;
 
+   /** Instance data rate divisor. 0 means this is per-vertex data,
+    *  n means per-instance data used for n consecutive instances (n > 0).
+    */
+   unsigned instance_divisor;
+
    /** Which vertex_buffer (as given to pipe->set_vertex_buffer()) does
     * this attribute live in?
     */
diff --git a/src/gallium/state_trackers/dri/dri_drawable.c b/src/gallium/state_trackers/dri/dri_drawable.c
index f131e77ac5..0fdfa96b35 100644
--- a/src/gallium/state_trackers/dri/dri_drawable.c
+++ b/src/gallium/state_trackers/dri/dri_drawable.c
@@ -180,6 +180,7 @@ dri_get_buffers(__DRIdrawable * dPriv)
 
       switch (buffers[i].attachment) {
       case __DRI_BUFFER_FRONT_LEFT:
+	 continue;
       case __DRI_BUFFER_FAKE_FRONT_LEFT:
 	 index = ST_SURFACE_FRONT_LEFT;
 	 format = drawable->color_format;
@@ -372,6 +373,7 @@ dri_create_buffer(__DRIscreen * sPriv,
    /* TODO incase of double buffer visual, delay fake creation */
    i = 0;
    drawable->attachments[i++] = __DRI_BUFFER_FRONT_LEFT;
+   drawable->attachments[i++] = __DRI_BUFFER_FAKE_FRONT_LEFT;
 
    if (visual->doubleBufferMode)
       drawable->attachments[i++] = __DRI_BUFFER_BACK_LEFT;
diff --git a/src/gallium/state_trackers/dri/dri_screen.c b/src/gallium/state_trackers/dri/dri_screen.c
index 793db087ee..d8c054313b 100644
--- a/src/gallium/state_trackers/dri/dri_screen.c
+++ b/src/gallium/state_trackers/dri/dri_screen.c
@@ -83,7 +83,7 @@ dri_fill_in_modes(struct dri_screen *screen,
    unsigned num_modes;
    uint8_t depth_bits_array[5];
    uint8_t stencil_bits_array[5];
-   uint8_t msaa_samples_array[1];
+   uint8_t msaa_samples_array[2];
    unsigned depth_buffer_factor;
    unsigned back_buffer_factor;
    unsigned msaa_samples_factor;
@@ -147,8 +147,9 @@ dri_fill_in_modes(struct dri_screen *screen,
    }
 
    msaa_samples_array[0] = 0;
+   msaa_samples_array[1] = 4;
    back_buffer_factor = 3;
-   msaa_samples_factor = 1;
+   msaa_samples_factor = 2;
 
    num_modes =
       depth_buffer_factor * back_buffer_factor * msaa_samples_factor * 4;
@@ -158,7 +159,7 @@ dri_fill_in_modes(struct dri_screen *screen,
 				 depth_bits_array, stencil_bits_array,
 				 depth_buffer_factor, back_buffer_modes,
 				 back_buffer_factor,
-				 msaa_samples_array, 1);
+				 msaa_samples_array, msaa_samples_factor);
    } else {
       __DRIconfig **configs_a8r8g8b8 = NULL;
       __DRIconfig **configs_x8r8g8b8 = NULL;
@@ -170,7 +171,8 @@ dri_fill_in_modes(struct dri_screen *screen,
 					     depth_buffer_factor,
 					     back_buffer_modes,
 					     back_buffer_factor,
-					     msaa_samples_array, 1);
+					     msaa_samples_array,
+                                             msaa_samples_factor);
       if (pf_x8r8g8b8)
 	 configs_x8r8g8b8 = driCreateConfigs(GL_BGR, GL_UNSIGNED_INT_8_8_8_8_REV,
 					     depth_bits_array,
@@ -178,7 +180,8 @@ dri_fill_in_modes(struct dri_screen *screen,
 					     depth_buffer_factor,
 					     back_buffer_modes,
 					     back_buffer_factor,
-					     msaa_samples_array, 1);
+					     msaa_samples_array,
+                                             msaa_samples_factor);
 
       if (configs_a8r8g8b8 && configs_x8r8g8b8)
 	 configs = driConcatConfigs(configs_x8r8g8b8, configs_a8r8g8b8);
diff --git a/src/gallium/state_trackers/egl_g3d/common/egl_g3d.c b/src/gallium/state_trackers/egl_g3d/common/egl_g3d.c
index 8b2e3b5f85..042e9518c2 100644
--- a/src/gallium/state_trackers/egl_g3d/common/egl_g3d.c
+++ b/src/gallium/state_trackers/egl_g3d/common/egl_g3d.c
@@ -24,7 +24,9 @@
 
 #include <assert.h>
 #include <string.h>
+#include "pipe/p_screen.h"
 #include "util/u_memory.h"
+#include "util/u_rect.h"
 #include "egldriver.h"
 #include "eglcurrent.h"
 #include "eglconfigutil.h"
@@ -43,8 +45,13 @@ egl_g3d_validate_context(_EGLDisplay *dpy, _EGLContext *ctx)
    struct egl_g3d_display *gdpy = egl_g3d_display(dpy);
    struct pipe_screen *screen = gdpy->native->screen;
    struct egl_g3d_context *gctx = egl_g3d_context(ctx);
-   EGLint num_surfaces;
-   EGLint s, i;
+   const uint st_att_map[NUM_NATIVE_ATTACHMENTS] = {
+      ST_SURFACE_FRONT_LEFT,
+      ST_SURFACE_BACK_LEFT,
+      ST_SURFACE_FRONT_RIGHT,
+      ST_SURFACE_BACK_RIGHT,
+   };
+   EGLint num_surfaces, s;
 
    /* validate draw and/or read buffers */
    num_surfaces = (gctx->base.ReadSurface == gctx->base.DrawSurface) ? 1 : 2;
@@ -52,6 +59,7 @@ egl_g3d_validate_context(_EGLDisplay *dpy, _EGLContext *ctx)
       struct pipe_texture *textures[NUM_NATIVE_ATTACHMENTS];
       struct egl_g3d_surface *gsurf;
       struct egl_g3d_buffer *gbuf;
+      EGLint att;
 
       if (s == 0) {
          gsurf = egl_g3d_surface(gctx->base.DrawSurface);
@@ -63,35 +71,33 @@ egl_g3d_validate_context(_EGLDisplay *dpy, _EGLContext *ctx)
       }
 
       if (!gctx->force_validate) {
-         EGLint cur_w, cur_h;
-
-         cur_w = gsurf->base.Width;
-         cur_h = gsurf->base.Height;
-         gsurf->native->validate(gsurf->native,
-               gbuf->native_atts, gbuf->num_atts,
-               NULL,
-               &gsurf->base.Width, &gsurf->base.Height);
-         /* validate only when the geometry changed */
-         if (gsurf->base.Width == cur_w && gsurf->base.Height == cur_h)
+         unsigned int seq_num;
+
+         gsurf->native->validate(gsurf->native, gbuf->attachment_mask,
+               &seq_num, NULL, NULL, NULL);
+         /* skip validation */
+         if (gsurf->sequence_number == seq_num)
             continue;
       }
 
-      gsurf->native->validate(gsurf->native,
-            gbuf->native_atts, gbuf->num_atts,
-            (struct pipe_texture **) textures,
+      pipe_surface_reference(&gsurf->render_surface, NULL);
+      memset(textures, 0, sizeof(textures));
+
+      gsurf->native->validate(gsurf->native, gbuf->attachment_mask,
+            &gsurf->sequence_number, textures,
             &gsurf->base.Width, &gsurf->base.Height);
-      for (i = 0; i < gbuf->num_atts; i++) {
-         struct pipe_texture *pt = textures[i];
+      for (att = 0; att < NUM_NATIVE_ATTACHMENTS; att++) {
+         struct pipe_texture *pt = textures[att];
          struct pipe_surface *ps;
 
-         if (pt) {
+         if (native_attachment_mask_test(gbuf->attachment_mask, att) && pt) {
             ps = screen->get_tex_surface(screen, pt, 0, 0, 0,
                   PIPE_BUFFER_USAGE_GPU_READ |
                   PIPE_BUFFER_USAGE_GPU_WRITE);
             gctx->stapi->st_set_framebuffer_surface(gbuf->st_fb,
-                  gbuf->st_atts[i], ps);
+                  st_att_map[att], ps);
 
-            if (gbuf->native_atts[i] == gsurf->render_att)
+            if (gsurf->render_att == att)
                pipe_surface_reference(&gsurf->render_surface, ps);
 
             pipe_surface_reference(&ps, NULL);
@@ -130,13 +136,7 @@ static void
 egl_g3d_route_context(_EGLDisplay *dpy, _EGLContext *ctx)
 {
    struct egl_g3d_context *gctx = egl_g3d_context(ctx);
-   const uint st_att_map[NUM_NATIVE_ATTACHMENTS] = {
-      ST_SURFACE_FRONT_LEFT,
-      ST_SURFACE_BACK_LEFT,
-      ST_SURFACE_FRONT_RIGHT,
-      ST_SURFACE_BACK_RIGHT,
-   };
-   EGLint s, i;
+   EGLint s;
 
    /* route draw and read buffers' attachments */
    for (s = 0; s < 2; s++) {
@@ -152,11 +152,7 @@ egl_g3d_route_context(_EGLDisplay *dpy, _EGLContext *ctx)
          gbuf = &gctx->read;
       }
 
-      gbuf->native_atts[0] = gsurf->render_att;
-      gbuf->num_atts = 1;
-
-      for (i = 0; i < gbuf->num_atts; i++)
-         gbuf->st_atts[i] = st_att_map[gbuf->native_atts[i]];
+      gbuf->attachment_mask = (1 << gsurf->render_att);
 
       /* FIXME OpenGL defaults to draw the front or back buffer when the
        * context is single-buffered or double-buffered respectively.  In EGL,
@@ -198,19 +194,19 @@ egl_g3d_realloc_context(_EGLDisplay *dpy, _EGLContext *ctx)
       if (!gdraw || priv != (void *) &gdraw->base) {
          gctx->stapi->st_unreference_framebuffer(gctx->draw.st_fb);
          gctx->draw.st_fb = NULL;
-         gctx->draw.num_atts = 0;
+         gctx->draw.attachment_mask = 0x0;
       }
 
       if (is_equal) {
          gctx->read.st_fb = NULL;
-         gctx->draw.num_atts = 0;
+         gctx->draw.attachment_mask = 0x0;
       }
       else {
          priv = gctx->stapi->st_framebuffer_private(gctx->read.st_fb);
          if (!gread || priv != (void *) &gread->base) {
             gctx->stapi->st_unreference_framebuffer(gctx->read.st_fb);
             gctx->read.st_fb = NULL;
-            gctx->draw.num_atts = 0;
+            gctx->draw.attachment_mask = 0x0;
          }
       }
    }
@@ -459,16 +455,30 @@ egl_g3d_add_configs(_EGLDriver *drv, _EGLDisplay *dpy, EGLint id)
  * Flush the front buffer of the context's draw surface.
  */
 static void
-egl_g3d_flush_frontbuffer(void *dummy, struct pipe_surface *surf,
-                          void *context_private)
+egl_g3d_flush_frontbuffer(struct pipe_screen *screen,
+                          struct pipe_surface *surf, void *context_private)
 {
    struct egl_g3d_context *gctx = egl_g3d_context(context_private);
    struct egl_g3d_surface *gsurf = egl_g3d_surface(gctx->base.DrawSurface);
 
-   if (gsurf) {
+   if (gsurf)
       gsurf->native->flush_frontbuffer(gsurf->native);
-      egl_g3d_validate_context(gctx->base.Display, &gctx->base);
-   }
+}
+
+/**
+ * Re-validate the context.
+ */
+static void
+egl_g3d_update_buffer(struct pipe_screen *screen, void *context_private)
+{
+   struct egl_g3d_context *gctx = egl_g3d_context(context_private);
+
+   /**
+    * It is likely that the surface has changed when this function is called.
+    * Set force_validate to skip an unnecessary check.
+    */
+   gctx->force_validate = EGL_TRUE;
+   egl_g3d_validate_context(gctx->base.Display, &gctx->base);
 }
 
 static EGLBoolean
@@ -512,13 +522,15 @@ egl_g3d_initialize(_EGLDriver *drv, _EGLDisplay *dpy,
    }
    dpy->DriverData = gdpy;
 
-   gdpy->native =
-      native_create_display(dpy->NativeDisplay, egl_g3d_flush_frontbuffer);
+   gdpy->native = native_create_display(dpy->NativeDisplay);
    if (!gdpy->native) {
       _eglError(EGL_NOT_INITIALIZED, "eglInitialize(no usable display)");
       goto fail;
    }
 
+   gdpy->native->screen->flush_frontbuffer = egl_g3d_flush_frontbuffer;
+   gdpy->native->screen->update_buffer = egl_g3d_update_buffer;
+
    dpy->ClientAPIsMask = gdrv->api_mask;
 
    if (egl_g3d_add_configs(drv, dpy, 1) == 1) {
@@ -550,7 +562,7 @@ egl_g3d_create_context(_EGLDriver *drv, _EGLDisplay *dpy, _EGLConfig *conf,
                        _EGLContext *share, const EGLint *attribs)
 {
    struct egl_g3d_display *gdpy = egl_g3d_display(dpy);
-   struct egl_g3d_context *xshare = egl_g3d_context(share);
+   struct egl_g3d_context *gshare = egl_g3d_context(share);
    struct egl_g3d_config *gconf = egl_g3d_config(conf);
    struct egl_g3d_context *gctx;
    const __GLcontextModes *mode;
@@ -575,8 +587,18 @@ egl_g3d_create_context(_EGLDriver *drv, _EGLDisplay *dpy, _EGLConfig *conf,
    mode = &gconf->native->mode;
    gctx->pipe =
       gdpy->native->create_context(gdpy->native, (void *) &gctx->base);
+   if (!gctx->pipe) {
+      free(gctx);
+      return NULL;
+   }
+
    gctx->st_ctx = gctx->stapi->st_create_context(gctx->pipe, mode,
-                        (xshare) ? xshare->st_ctx : NULL);
+                        (gshare) ? gshare->st_ctx : NULL);
+   if (!gctx->st_ctx) {
+      gctx->pipe->destroy(gctx->pipe);
+      free(gctx);
+      return NULL;
+   }
 
    return &gctx->base;
 }
@@ -599,6 +621,16 @@ egl_g3d_destroy_context(_EGLDriver *drv, _EGLDisplay *dpy, _EGLContext *ctx)
    return EGL_TRUE;
 }
 
+static EGLBoolean
+init_surface_geometry(_EGLSurface *surf)
+{
+   struct egl_g3d_surface *gsurf = egl_g3d_surface(surf);
+
+   return gsurf->native->validate(gsurf->native, 0x0,
+         &gsurf->sequence_number, NULL,
+         &gsurf->base.Width, &gsurf->base.Height);
+}
+
 static _EGLSurface *
 egl_g3d_create_window_surface(_EGLDriver *drv, _EGLDisplay *dpy,
                               _EGLConfig *conf, EGLNativeWindowType win,
@@ -626,8 +658,7 @@ egl_g3d_create_window_surface(_EGLDriver *drv, _EGLDisplay *dpy,
       return NULL;
    }
 
-   if (!gsurf->native->validate(gsurf->native, NULL, 0, NULL,
-            &gsurf->base.Width, &gsurf->base.Height)) {
+   if (!init_surface_geometry(&gsurf->base)) {
       gsurf->native->destroy(gsurf->native);
       free(gsurf);
       return NULL;
@@ -667,8 +698,7 @@ egl_g3d_create_pixmap_surface(_EGLDriver *drv, _EGLDisplay *dpy,
       return NULL;
    }
 
-   if (!gsurf->native->validate(gsurf->native, NULL, 0, NULL,
-            &gsurf->base.Width, &gsurf->base.Height)) {
+   if (!init_surface_geometry(&gsurf->base)) {
       gsurf->native->destroy(gsurf->native);
       free(gsurf);
       return NULL;
@@ -706,6 +736,12 @@ egl_g3d_create_pbuffer_surface(_EGLDriver *drv, _EGLDisplay *dpy,
       return NULL;
    }
 
+   if (!init_surface_geometry(&gsurf->base)) {
+      gsurf->native->destroy(gsurf->native);
+      free(gsurf);
+      return NULL;
+   }
+
    gsurf->render_att = (!gconf->native->mode.doubleBufferMode) ?
       NATIVE_ATTACHMENT_FRONT_LEFT : NATIVE_ATTACHMENT_BACK_LEFT;
 
@@ -817,14 +853,110 @@ egl_g3d_swap_buffers(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *surf)
       struct egl_g3d_config *gconf = egl_g3d_config(gsurf->base.Config);
 
       /* force validation if the swap method is not copy */
-      if (gconf->native->mode.swapMethod != GLX_SWAP_COPY_OML)
+      if (gconf->native->mode.swapMethod != GLX_SWAP_COPY_OML) {
          gctx->force_validate = EGL_TRUE;
-      egl_g3d_validate_context(dpy, &gctx->base);
+         egl_g3d_validate_context(dpy, &gctx->base);
+      }
    }
 
    return EGL_TRUE;
 }
 
+/**
+ * Find a config that supports the pixmap.
+ */
+static _EGLConfig *
+find_pixmap_config(_EGLDisplay *dpy, EGLNativePixmapType pix)
+{
+   struct egl_g3d_display *gdpy = egl_g3d_display(dpy);
+   struct egl_g3d_config *gconf;
+   EGLint i;
+
+   for (i = 0; i < dpy->NumConfigs; i++) {
+      gconf = egl_g3d_config(dpy->Configs[i]);
+      if (gdpy->native->is_pixmap_supported(gdpy->native, pix, gconf->native))
+         break;
+   }
+
+   return (i < dpy->NumConfigs) ? &gconf->base : NULL;
+}
+
+/**
+ * Get the pipe surface of the given attachment of the native surface.
+ */
+static struct pipe_surface *
+get_pipe_surface(struct native_display *ndpy, struct native_surface *nsurf,
+                 enum native_attachment natt)
+{
+   struct pipe_texture *textures[NUM_NATIVE_ATTACHMENTS];
+   struct pipe_surface *psurf;
+
+   textures[natt] = NULL;
+   nsurf->validate(nsurf, 1 << natt, NULL, textures, NULL, NULL);
+   if (!textures[natt])
+      return NULL;
+
+   psurf = ndpy->screen->get_tex_surface(ndpy->screen, textures[natt],
+         0, 0, 0, PIPE_BUFFER_USAGE_CPU_WRITE);
+   pipe_texture_reference(&textures[natt], NULL);
+
+   return psurf;
+}
+
+static EGLBoolean
+egl_g3d_copy_buffers(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *surf,
+                     NativePixmapType target)
+{
+   struct egl_g3d_display *gdpy = egl_g3d_display(dpy);
+   struct egl_g3d_surface *gsurf = egl_g3d_surface(surf);
+   _EGLContext *ctx = _eglGetCurrentContext();
+   struct egl_g3d_config *gconf;
+   struct native_surface *nsurf;
+   struct pipe_screen *screen = gdpy->native->screen;
+   struct pipe_surface *psurf;
+
+   if (!gsurf->render_surface)
+      return EGL_TRUE;
+
+   gconf = egl_g3d_config(find_pixmap_config(dpy, target));
+   if (!gconf)
+      return _eglError(EGL_BAD_NATIVE_PIXMAP, "eglCopyBuffers");
+
+   nsurf = gdpy->native->create_pixmap_surface(gdpy->native,
+         target, gconf->native);
+   if (!nsurf)
+      return _eglError(EGL_BAD_NATIVE_PIXMAP, "eglCopyBuffers");
+
+   /* flush if the surface is current */
+   if (ctx && ctx->DrawSurface == &gsurf->base) {
+      struct egl_g3d_context *gctx = egl_g3d_context(ctx);
+      gctx->stapi->st_flush(gctx->st_ctx,
+            PIPE_FLUSH_RENDER_CACHE | PIPE_FLUSH_FRAME, NULL);
+   }
+
+   psurf = get_pipe_surface(gdpy->native, nsurf, NATIVE_ATTACHMENT_FRONT_LEFT);
+   if (psurf) {
+      struct pipe_context pipe;
+
+      /**
+       * XXX This is hacky.  If we might allow the EGLDisplay to create a pipe
+       * context of its own and use the blitter context for this.
+       */
+      memset(&pipe, 0, sizeof(pipe));
+      pipe.screen = screen;
+
+      util_surface_copy(&pipe, FALSE, psurf, 0, 0,
+            gsurf->render_surface, 0, 0, psurf->width, psurf->height);
+
+      pipe_surface_reference(&psurf, NULL);
+      nsurf->flush_frontbuffer(nsurf);
+   }
+
+   nsurf->destroy(nsurf);
+
+   return EGL_TRUE;
+}
+
 static EGLBoolean
 egl_g3d_wait_client(_EGLDriver *drv, _EGLDisplay *dpy, _EGLContext *ctx)
 {
@@ -849,15 +981,14 @@ egl_g3d_wait_native(_EGLDriver *drv, _EGLDisplay *dpy, EGLint engine)
 }
 
 static _EGLProc
-egl_g3d_get_proc_address(const char *procname)
+egl_g3d_get_proc_address(_EGLDriver *drv, const char *procname)
 {
-   /* FIXME how come _EGLDriver is not passed? */
-   const struct egl_g3d_st *stapi;
+   struct egl_g3d_driver *gdrv = egl_g3d_driver(drv);
    _EGLProc proc;
    EGLint i;
 
    for (i = 0; i < NUM_EGL_G3D_STS; i++) {
-      stapi = egl_g3d_get_st(i);
+      const struct egl_g3d_st *stapi = gdrv->stapis[i];
       if (stapi) {
          proc = (_EGLProc) stapi->st_get_proc_address(procname);
          if (proc)
@@ -1079,6 +1210,7 @@ _eglMain(const char *args)
    gdrv->base.API.DestroySurface = egl_g3d_destroy_surface;
    gdrv->base.API.MakeCurrent = egl_g3d_make_current;
    gdrv->base.API.SwapBuffers = egl_g3d_swap_buffers;
+   gdrv->base.API.CopyBuffers = egl_g3d_copy_buffers;
    gdrv->base.API.WaitClient = egl_g3d_wait_client;
    gdrv->base.API.WaitNative = egl_g3d_wait_native;
    gdrv->base.API.GetProcAddress = egl_g3d_get_proc_address;
diff --git a/src/gallium/state_trackers/egl_g3d/common/egl_g3d.h b/src/gallium/state_trackers/egl_g3d/common/egl_g3d.h
index 33894b614f..1da8af495b 100644
--- a/src/gallium/state_trackers/egl_g3d/common/egl_g3d.h
+++ b/src/gallium/state_trackers/egl_g3d/common/egl_g3d.h
@@ -52,9 +52,7 @@ struct egl_g3d_display {
 
 struct egl_g3d_buffer {
    struct st_framebuffer *st_fb;
-   EGLint num_atts;
-   enum native_attachment native_atts[NUM_NATIVE_ATTACHMENTS];
-   uint st_atts[NUM_NATIVE_ATTACHMENTS];
+   uint attachment_mask;
 };
 
 struct egl_g3d_context {
@@ -73,6 +71,7 @@ struct egl_g3d_surface {
    struct native_surface *native;
    enum native_attachment render_att;
    struct pipe_surface *render_surface;
+   unsigned int sequence_number;
 };
 
 struct egl_g3d_config {
diff --git a/src/gallium/state_trackers/egl_g3d/common/native.h b/src/gallium/state_trackers/egl_g3d/common/native.h
index 88d87c6f3a..f374f2e4a6 100644
--- a/src/gallium/state_trackers/egl_g3d/common/native.h
+++ b/src/gallium/state_trackers/egl_g3d/common/native.h
@@ -64,13 +64,19 @@ struct native_surface {
    boolean (*flush_frontbuffer)(struct native_surface *nsurf);
 
    /**
-    * Validate the buffers of the surface.  Those not listed in the attachments
-    * will be destroyed.  The returned textures are owned by the caller.
+    * Validate the buffers of the surface.  textures, if not NULL, points to an
+    * array of size NUM_NATIVE_ATTACHMENTS and the returned textures are owned
+    * by the caller.  A sequence number is also returned.  The caller can use
+    * it to check if anything has changed since the last call. Any of the
+    * pointers may be NULL and it indicates the caller has no interest in those
+    * values.
+    *
+    * If this function is called multiple times with different attachment
+    * masks, those not listed in the latest call might be destroyed.  This
+    * behavior might change in the future.
     */
-   boolean (*validate)(struct native_surface *nsurf,
-                       const enum native_attachment *natts,
-                       unsigned num_natts,
-                       struct pipe_texture **textures,
+   boolean (*validate)(struct native_surface *nsurf, uint attachment_mask,
+                       unsigned int *seq_num, struct pipe_texture **textures,
                        int *width, int *height);
 
    /**
@@ -80,6 +86,7 @@ struct native_surface {
 };
 
 struct native_config {
+   /* __GLcontextModes should go away some day */
    __GLcontextModes mode;
    enum pipe_format color_format;
    enum pipe_format depth_format;
@@ -107,7 +114,14 @@ struct native_display_modeset;
  * the native display server.
  */
 struct native_display {
+   /**
+    * The pipe screen of the native display.
+    *
+    * Note that the "flush_frontbuffer" and "update_buffer" callbacks will be
+    * overridden.
+    */
    struct pipe_screen *screen;
+
    void (*destroy)(struct native_display *ndpy);
 
    /**
@@ -122,6 +136,17 @@ struct native_display {
                                                int *num_configs);
 
    /**
+    * Test if a pixmap is supported by the given config.  Required unless no
+    * config has GLX_PIXMAP_BIT set.
+    *
+    * This function is usually called to find a config that supports a given
+    * pixmap.  Thus, it is usually called with the same pixmap in a row.
+    */
+   boolean (*is_pixmap_supported)(struct native_display *ndpy,
+                                  EGLNativePixmapType pix,
+                                  const struct native_config *nconf);
+
+   /**
     * Create a pipe context.
     */
    struct pipe_context *(*create_context)(struct native_display *ndpy,
@@ -197,15 +222,19 @@ struct native_display_modeset {
                       const struct native_mode *nmode);
 };
 
-typedef void (*native_flush_frontbuffer)(void *dummy,
-                                         struct pipe_surface *surf,
-                                         void *context_private);
+/**
+ * Test whether an attachment is set in the mask.
+ */
+static INLINE boolean
+native_attachment_mask_test(uint mask, enum native_attachment att)
+{
+   return !!(mask & (1 << att));
+}
 
 const char *
 native_get_name(void);
 
 struct native_display *
-native_create_display(EGLNativeDisplayType dpy,
-                      native_flush_frontbuffer flush_frontbuffer);
+native_create_display(EGLNativeDisplayType dpy);
 
 #endif /* _NATIVE_H_ */
diff --git a/src/gallium/state_trackers/egl_g3d/kms/native_kms.c b/src/gallium/state_trackers/egl_g3d/kms/native_kms.c
index 0e0babdb14..dc66436630 100644
--- a/src/gallium/state_trackers/egl_g3d/kms/native_kms.c
+++ b/src/gallium/state_trackers/egl_g3d/kms/native_kms.c
@@ -33,22 +33,17 @@
 #include "native_kms.h"
 
 static boolean
-kms_surface_validate(struct native_surface *nsurf,
-                     const enum native_attachment *natts,
-                     unsigned num_natts,
-                     struct pipe_texture **textures,
+kms_surface_validate(struct native_surface *nsurf, uint attachment_mask,
+                     unsigned int *seq_num, struct pipe_texture **textures,
                      int *width, int *height)
 {
    struct kms_surface *ksurf = kms_surface(nsurf);
    struct kms_display *kdpy = ksurf->kdpy;
    struct pipe_screen *screen = kdpy->base.screen;
    struct pipe_texture templ, *ptex;
-   int i;
-
-   if (num_natts) {
-      if (textures)
-         memset(textures, 0, sizeof(*textures) * num_natts);
+   int att;
 
+   if (attachment_mask) {
       memset(&templ, 0, sizeof(templ));
       templ.target = PIPE_TEXTURE_2D;
       templ.last_level = 0;
@@ -62,19 +57,25 @@ kms_surface_validate(struct native_surface *nsurf,
    }
 
    /* create textures */
-   for (i = 0; i < num_natts; i++) {
-      enum native_attachment natt = natts[i];
+   for (att = 0; att < NUM_NATIVE_ATTACHMENTS; att++) {
+      /* delay the allocation */
+      if (!native_attachment_mask_test(attachment_mask, att))
+         continue;
 
-      ptex = ksurf->textures[natt];
+      ptex = ksurf->textures[att];
       if (!ptex) {
          ptex = screen->texture_create(screen, &templ);
-         ksurf->textures[natt] = ptex;
+         ksurf->textures[att] = ptex;
       }
 
-      if (textures)
-         pipe_texture_reference(&textures[i], ptex);
+      if (textures) {
+         textures[att] = NULL;
+         pipe_texture_reference(&textures[att], ptex);
+      }
    }
 
+   if (seq_num)
+      *seq_num = ksurf->sequence_number;
    if (width)
       *width = ksurf->width;
    if (height)
@@ -111,7 +112,7 @@ kms_surface_init_framebuffers(struct native_surface *nsurf, boolean need_back)
 
       if (!fb->texture) {
          /* make sure the texture has been allocated */
-         kms_surface_validate(&ksurf->base, &natt, 1, NULL, NULL, NULL);
+         kms_surface_validate(&ksurf->base, 1 << natt, NULL, NULL, NULL, NULL);
          if (!ksurf->textures[natt])
             return FALSE;
 
@@ -196,6 +197,9 @@ kms_surface_swap_buffers(struct native_surface *nsurf)
       ksurf->textures[NATIVE_ATTACHMENT_BACK_LEFT];
    ksurf->textures[NATIVE_ATTACHMENT_BACK_LEFT] = tmp_texture;
 
+   /* the front/back textures are swapped */
+   ksurf->sequence_number++;
+
    return TRUE;
 }
 
@@ -764,8 +768,7 @@ static struct native_display_modeset kms_display_modeset = {
 };
 
 static struct native_display *
-kms_create_display(EGLNativeDisplayType dpy, struct drm_api *api,
-                   native_flush_frontbuffer flush_frontbuffer)
+kms_create_display(EGLNativeDisplayType dpy, struct drm_api *api)
 {
    struct kms_display *kdpy;
 
@@ -807,10 +810,6 @@ kms_create_display(EGLNativeDisplayType dpy, struct drm_api *api,
       return NULL;
    }
 
-   kdpy->base.screen->flush_frontbuffer =
-      (void (*)(struct pipe_screen *, struct pipe_surface *, void *))
-      flush_frontbuffer;
-
    kdpy->base.destroy = kms_display_destroy;
    kdpy->base.get_configs = kms_display_get_configs;
    kdpy->base.create_context = kms_display_create_context;
@@ -821,13 +820,6 @@ kms_create_display(EGLNativeDisplayType dpy, struct drm_api *api,
    return &kdpy->base;
 }
 
-static void
-dummy_flush_frontbuffer(void *dummy, struct pipe_surface *surf,
-                        void *context_private)
-{
-   _eglLog(_EGL_WARNING, "flush_frontbuffer is not supplied");
-}
-
 /* the api is destroyed with the native display */
 static struct drm_api *drm_api;
 
@@ -848,19 +840,15 @@ native_get_name(void)
 }
 
 struct native_display *
-native_create_display(EGLNativeDisplayType dpy,
-                      native_flush_frontbuffer flush_frontbuffer)
+native_create_display(EGLNativeDisplayType dpy)
 {
    struct native_display *ndpy = NULL;
 
    if (!drm_api)
       drm_api = drm_api_create();
 
-   if (!flush_frontbuffer)
-      flush_frontbuffer = dummy_flush_frontbuffer;
-
    if (drm_api)
-      ndpy = kms_create_display(dpy, drm_api, flush_frontbuffer);
+      ndpy = kms_create_display(dpy, drm_api);
 
    return ndpy;
 }
diff --git a/src/gallium/state_trackers/egl_g3d/kms/native_kms.h b/src/gallium/state_trackers/egl_g3d/kms/native_kms.h
index 3f869b25ac..095186e3cf 100644
--- a/src/gallium/state_trackers/egl_g3d/kms/native_kms.h
+++ b/src/gallium/state_trackers/egl_g3d/kms/native_kms.h
@@ -81,6 +81,7 @@ struct kms_surface {
    int width, height;
 
    struct pipe_texture *textures[NUM_NATIVE_ATTACHMENTS];
+   unsigned int sequence_number;
    struct kms_framebuffer front_fb, back_fb;
 
    boolean is_shown;
diff --git a/src/gallium/state_trackers/egl_g3d/x11/native_dri2.c b/src/gallium/state_trackers/egl_g3d/x11/native_dri2.c
index 0dda786bbd..07f82d878c 100644
--- a/src/gallium/state_trackers/egl_g3d/x11/native_dri2.c
+++ b/src/gallium/state_trackers/egl_g3d/x11/native_dri2.c
@@ -64,6 +64,7 @@ struct dri2_surface {
    struct pipe_texture *pbuffer_textures[NUM_NATIVE_ATTACHMENTS];
    boolean have_back, have_fake;
    int width, height;
+   unsigned int sequence_number;
 };
 
 struct dri2_config {
@@ -133,21 +134,18 @@ dri2_surface_swap_buffers(struct native_surface *nsurf)
 }
 
 static boolean
-dri2_surface_validate(struct native_surface *nsurf,
-                             const enum native_attachment *natts,
-                             unsigned num_natts,
-                             struct pipe_texture **textures,
-                             int *width, int *height)
+dri2_surface_validate(struct native_surface *nsurf, uint attachment_mask,
+                      unsigned int *seq_num, struct pipe_texture **textures,
+                      int *width, int *height)
 {
    struct dri2_surface *dri2surf = dri2_surface(nsurf);
    struct dri2_display *dri2dpy = dri2surf->dri2dpy;
    unsigned int dri2atts[NUM_NATIVE_ATTACHMENTS];
-   EGLint texture_indices[NUM_NATIVE_ATTACHMENTS];
    struct pipe_texture templ;
    struct x11_drawable_buffer *xbufs;
-   int num_ins, num_outs, i;
+   int num_ins, num_outs, att, i;
 
-   if (num_natts) {
+   if (attachment_mask) {
       memset(&templ, 0, sizeof(templ));
       templ.target = PIPE_TEXTURE_2D;
       templ.last_level = 0;
@@ -158,26 +156,31 @@ dri2_surface_validate(struct native_surface *nsurf,
       templ.tex_usage = PIPE_TEXTURE_USAGE_RENDER_TARGET;
 
       if (textures)
-         memset(textures, 0, sizeof(*textures) * num_natts);
+         memset(textures, 0, sizeof(*textures) * NUM_NATIVE_ATTACHMENTS);
    }
 
    /* create textures for pbuffer */
    if (dri2surf->type == DRI2_SURFACE_TYPE_PBUFFER) {
       struct pipe_screen *screen = dri2dpy->base.screen;
 
-      for (i = 0; i < num_natts; i++) {
-         enum native_attachment natt = natts[i];
-         struct pipe_texture *ptex = dri2surf->pbuffer_textures[natt];
+      for (att = 0; att < NUM_NATIVE_ATTACHMENTS; att++) {
+         struct pipe_texture *ptex = dri2surf->pbuffer_textures[att];
+
+         /* delay the allocation */
+         if (!native_attachment_mask_test(attachment_mask, att))
+            continue;
 
          if (!ptex) {
             ptex = screen->texture_create(screen, &templ);
-            dri2surf->pbuffer_textures[natt] = ptex;
+            dri2surf->pbuffer_textures[att] = ptex;
          }
 
          if (textures)
-            pipe_texture_reference(&textures[i], ptex);
+            pipe_texture_reference(&textures[att], ptex);
       }
 
+      if (seq_num)
+         *seq_num = dri2surf->sequence_number;
       if (width)
          *width = dri2surf->width;
       if (height)
@@ -186,48 +189,56 @@ dri2_surface_validate(struct native_surface *nsurf,
       return TRUE;
    }
 
-   for (i = 0; i < NUM_NATIVE_ATTACHMENTS; i++)
-      texture_indices[i] = -1;
-
    /* prepare the attachments */
-   num_ins = num_natts;
-   for (i = 0; i < num_natts; i++) {
-      unsigned int dri2att;
+   num_ins = 0;
+   for (att = 0; att < NUM_NATIVE_ATTACHMENTS; att++) {
+      if (native_attachment_mask_test(attachment_mask, att)) {
+         unsigned int dri2att;
+
+         switch (att) {
+         case NATIVE_ATTACHMENT_FRONT_LEFT:
+            dri2att = DRI2BufferFrontLeft;
+            break;
+         case NATIVE_ATTACHMENT_BACK_LEFT:
+            dri2att = DRI2BufferBackLeft;
+            break;
+         case NATIVE_ATTACHMENT_FRONT_RIGHT:
+            dri2att = DRI2BufferFrontRight;
+            break;
+         case NATIVE_ATTACHMENT_BACK_RIGHT:
+            dri2att = DRI2BufferBackRight;
+            break;
+         default:
+            assert(0);
+            dri2att = 0;
+            break;
+         }
 
-      switch (natts[i]) {
-      case NATIVE_ATTACHMENT_FRONT_LEFT:
-         dri2att = DRI2BufferFrontLeft;
-         break;
-      case NATIVE_ATTACHMENT_BACK_LEFT:
-         dri2att = DRI2BufferBackLeft;
-         break;
-      case NATIVE_ATTACHMENT_FRONT_RIGHT:
-         dri2att = DRI2BufferFrontRight;
-         break;
-      case NATIVE_ATTACHMENT_BACK_RIGHT:
-         dri2att = DRI2BufferBackRight;
-         break;
-      default:
-         assert(0);
-         dri2att = 0;
-         break;
+         dri2atts[num_ins] = dri2att;
+         num_ins++;
       }
-      dri2atts[i] = dri2att;
-      texture_indices[natts[i]] = i;
    }
 
    dri2surf->have_back = FALSE;
    dri2surf->have_fake = FALSE;
 
+   /* remember old geometry */
+   templ.width0 = dri2surf->width;
+   templ.height0 = dri2surf->height;
+
    xbufs = x11_drawable_get_buffers(dri2dpy->xscr, dri2surf->drawable,
                                     &dri2surf->width, &dri2surf->height,
                                     dri2atts, FALSE, num_ins, &num_outs);
    if (!xbufs)
       return FALSE;
 
-   /* update width and height */
-   templ.width0 = dri2surf->width;
-   templ.height0 = dri2surf->height;
+   if (templ.width0 != dri2surf->width || templ.height0 != dri2surf->height) {
+      /* are there cases where the buffers change and the geometry doesn't? */
+      dri2surf->sequence_number++;
+
+      templ.width0 = dri2surf->width;
+      templ.height0 = dri2surf->height;
+   }
 
    for (i = 0; i < num_outs; i++) {
       struct x11_drawable_buffer *xbuf = &xbufs[i];
@@ -254,13 +265,13 @@ dri2_surface_validate(struct native_surface *nsurf,
          break;
       }
 
-      if (!desc || texture_indices[natt] < 0 ||
-          (textures && textures[texture_indices[natt]])) {
+      if (!desc || !native_attachment_mask_test(attachment_mask, natt) ||
+          (textures && textures[natt])) {
          if (!desc)
             _eglLog(_EGL_WARNING, "unknown buffer %d", xbuf->attachment);
-         else if (texture_indices[natt] < 0)
+         else if (!native_attachment_mask_test(attachment_mask, natt))
             _eglLog(_EGL_WARNING, "unexpected buffer %d", xbuf->attachment);
-         else if (textures && textures[texture_indices[natt]])
+         else
             _eglLog(_EGL_WARNING, "both real and fake front buffers are listed");
          continue;
       }
@@ -272,13 +283,15 @@ dri2_surface_validate(struct native_surface *nsurf,
                   desc, xbuf->pitch, xbuf->name);
          if (ptex) {
             /* the caller owns the textures */
-            textures[texture_indices[natt]] = ptex;
+            textures[natt] = ptex;
          }
       }
    }
 
    free(xbufs);
 
+   if (seq_num)
+      *seq_num = dri2surf->sequence_number;
    if (width)
       *width = dri2surf->width;
    if (height)
@@ -575,6 +588,21 @@ dri2_display_get_configs(struct native_display *ndpy, int *num_configs)
    return configs;
 }
 
+static boolean
+dri2_display_is_pixmap_supported(struct native_display *ndpy,
+                                 EGLNativePixmapType pix,
+                                 const struct native_config *nconf)
+{
+   struct dri2_display *dri2dpy = dri2_display(ndpy);
+   uint depth, nconf_depth;
+
+   depth = x11_drawable_get_depth(dri2dpy->xscr, (Drawable) pix);
+   nconf_depth = util_format_get_blocksizebits(nconf->color_format);
+
+   /* simple depth match for now */
+   return (depth == nconf_depth || (depth == 24 && depth + 8 == nconf_depth));
+}
+
 static void
 dri2_display_destroy(struct native_display *ndpy)
 {
@@ -627,18 +655,8 @@ dri2_display_init_screen(struct native_display *ndpy)
    return TRUE;
 }
 
-static void
-dri2_display_flush_frontbuffer(void *dummy, struct pipe_surface *surf,
-                               void *context_private)
-{
-   /* TODO get native surface from context private, and remove the callback */
-   _eglLog(_EGL_WARNING, "flush_frontbuffer is not supplied");
-}
-
 struct native_display *
-x11_create_dri2_display(EGLNativeDisplayType dpy,
-                        struct drm_api *api,
-                        native_flush_frontbuffer flush_frontbuffer)
+x11_create_dri2_display(EGLNativeDisplayType dpy, struct drm_api *api)
 {
    struct dri2_display *dri2dpy;
 
@@ -675,15 +693,9 @@ x11_create_dri2_display(EGLNativeDisplayType dpy,
       return NULL;
    }
 
-   if (!flush_frontbuffer)
-      flush_frontbuffer = dri2_display_flush_frontbuffer;
-
-   dri2dpy->base.screen->flush_frontbuffer =
-      (void (*)(struct pipe_screen *, struct pipe_surface *, void *))
-      flush_frontbuffer;
-
    dri2dpy->base.destroy = dri2_display_destroy;
    dri2dpy->base.get_configs = dri2_display_get_configs;
+   dri2dpy->base.is_pixmap_supported = dri2_display_is_pixmap_supported;
    dri2dpy->base.create_context = dri2_display_create_context;
    dri2dpy->base.create_window_surface = dri2_display_create_window_surface;
    dri2dpy->base.create_pixmap_surface = dri2_display_create_pixmap_surface;
diff --git a/src/gallium/state_trackers/egl_g3d/x11/native_x11.c b/src/gallium/state_trackers/egl_g3d/x11/native_x11.c
index a4f36e9dec..583ce3d329 100644
--- a/src/gallium/state_trackers/egl_g3d/x11/native_x11.c
+++ b/src/gallium/state_trackers/egl_g3d/x11/native_x11.c
@@ -48,8 +48,7 @@ native_get_name(void)
 }
 
 struct native_display *
-native_create_display(EGLNativeDisplayType dpy,
-                      native_flush_frontbuffer flush_frontbuffer)
+native_create_display(EGLNativeDisplayType dpy)
 {
    struct native_display *ndpy = NULL;
    boolean force_sw;
@@ -59,14 +58,14 @@ native_create_display(EGLNativeDisplayType dpy,
 
    force_sw = debug_get_bool_option("EGL_SOFTWARE", FALSE);
    if (api && !force_sw) {
-      ndpy = x11_create_dri2_display(dpy, api, flush_frontbuffer);
+      ndpy = x11_create_dri2_display(dpy, api);
    }
 
    if (!ndpy) {
       EGLint level = (force_sw) ? _EGL_INFO : _EGL_WARNING;
 
       _eglLog(level, "use software fallback");
-      ndpy = x11_create_ximage_display(dpy, TRUE, flush_frontbuffer);
+      ndpy = x11_create_ximage_display(dpy, TRUE);
    }
 
    return ndpy;
diff --git a/src/gallium/state_trackers/egl_g3d/x11/native_x11.h b/src/gallium/state_trackers/egl_g3d/x11/native_x11.h
index 9217eb6252..622ddac5df 100644
--- a/src/gallium/state_trackers/egl_g3d/x11/native_x11.h
+++ b/src/gallium/state_trackers/egl_g3d/x11/native_x11.h
@@ -29,13 +29,9 @@
 #include "common/native.h"
 
 struct native_display *
-x11_create_ximage_display(EGLNativeDisplayType dpy,
-                          boolean use_xshm,
-                          native_flush_frontbuffer flush_frontbuffer);
+x11_create_ximage_display(EGLNativeDisplayType dpy, boolean use_xshm);
 
 struct native_display *
-x11_create_dri2_display(EGLNativeDisplayType dpy,
-                        struct drm_api *api,
-                        native_flush_frontbuffer flush_frontbuffer);
+x11_create_dri2_display(EGLNativeDisplayType dpy, struct drm_api *api);
 
 #endif /* _NATIVE_X11_H_ */
diff --git a/src/gallium/state_trackers/egl_g3d/x11/native_ximage.c b/src/gallium/state_trackers/egl_g3d/x11/native_ximage.c
index e02faa9b7b..d76107c47f 100644
--- a/src/gallium/state_trackers/egl_g3d/x11/native_ximage.c
+++ b/src/gallium/state_trackers/egl_g3d/x11/native_ximage.c
@@ -83,6 +83,7 @@ struct ximage_surface {
    GC gc;
 
    struct ximage_buffer buffers[NUM_NATIVE_ATTACHMENTS];
+   unsigned int sequence_number;
 };
 
 struct ximage_config {
@@ -260,6 +261,9 @@ ximage_surface_swap_buffers(struct native_surface *nsurf)
    *xfront = *xback;
    *xback = xtmp;
 
+   /* the front/back textures are swapped */
+   xsurf->sequence_number++;
+
    return ximage_surface_draw_buffer(nsurf, NATIVE_ATTACHMENT_FRONT_LEFT);
 }
 
@@ -285,33 +289,29 @@ ximage_surface_update_geometry(struct native_surface *nsurf)
 }
 
 static boolean
-ximage_surface_validate(struct native_surface *nsurf,
-                               const enum native_attachment *natts,
-                               unsigned num_natts,
-                               struct pipe_texture **textures,
-                               int *width, int *height)
+ximage_surface_validate(struct native_surface *nsurf, uint attachment_mask,
+                        unsigned int *seq_num, struct pipe_texture **textures,
+                        int *width, int *height)
 {
    struct ximage_surface *xsurf = ximage_surface(nsurf);
-   boolean error = FALSE;
-   unsigned i;
+   boolean new_buffers = FALSE;
+   int att;
 
    ximage_surface_update_geometry(&xsurf->base);
 
-   if (textures)
-      memset(textures, 0, sizeof(*textures) * num_natts);
-
-   for (i = 0; i < num_natts; i++) {
-      enum native_attachment natt = natts[i];
-      struct ximage_buffer *xbuf = &xsurf->buffers[natt];
+   for (att = 0; att < NUM_NATIVE_ATTACHMENTS; att++) {
+      struct ximage_buffer *xbuf = &xsurf->buffers[att];
 
-      if (!xbuf)
+      /* delay the allocation */
+      if (!native_attachment_mask_test(attachment_mask, att))
          continue;
 
       /* reallocate the texture */
       if (!xbuf->texture ||
           xsurf->width != xbuf->texture->width0 ||
           xsurf->height != xbuf->texture->height0) {
-         if (ximage_surface_alloc_buffer(&xsurf->base, natt)) {
+         new_buffers = TRUE;
+         if (ximage_surface_alloc_buffer(&xsurf->base, att)) {
             /* update ximage */
             if (xbuf->ximage) {
                xbuf->ximage->width = xbuf->transfer->width;
@@ -321,27 +321,24 @@ ximage_surface_validate(struct native_surface *nsurf,
          }
       }
 
-      /* allocation failed */
-      if (!xbuf->texture) {
-         unsigned j;
-         for (j = 0; j < i; j++)
-            pipe_texture_reference(&textures[j], NULL);
-         for (j = i; j < num_natts; j++)
-            textures[j] = NULL;
-         error = TRUE;
-         break;
+      if (textures) {
+         textures[att] = NULL;
+         pipe_texture_reference(&textures[att], xbuf->texture);
       }
-
-      if (textures)
-         pipe_texture_reference(&textures[i], xbuf->texture);
    }
 
+   /* increase the sequence number so that caller knows */
+   if (new_buffers)
+      xsurf->sequence_number++;
+
+   if (seq_num)
+      *seq_num = xsurf->sequence_number;
    if (width)
       *width = xsurf->width;
    if (height)
       *height = xsurf->height;
 
-   return !error;
+   return TRUE;
 }
 
 static void
@@ -603,6 +600,34 @@ ximage_display_get_configs(struct native_display *ndpy, int *num_configs)
    return configs;
 }
 
+static boolean
+ximage_display_is_pixmap_supported(struct native_display *ndpy,
+                                   EGLNativePixmapType pix,
+                                   const struct native_config *nconf)
+{
+   struct ximage_display *xdpy = ximage_display(ndpy);
+   enum pipe_format fmt;
+   uint depth;
+
+   depth = x11_drawable_get_depth(xdpy->xscr, (Drawable) pix);
+   switch (depth) {
+   case 32:
+      fmt = PIPE_FORMAT_A8R8G8B8_UNORM;
+      break;
+   case 24:
+      fmt = PIPE_FORMAT_X8R8G8B8_UNORM;
+      break;
+   case 16:
+      fmt = PIPE_FORMAT_R5G6B5_UNORM;
+      break;
+   default:
+      fmt = PIPE_FORMAT_NONE;
+      break;
+   }
+
+   return (fmt == nconf->color_format);
+}
+
 static void
 ximage_display_destroy(struct native_display *ndpy)
 {
@@ -620,18 +645,8 @@ ximage_display_destroy(struct native_display *ndpy)
    free(xdpy);
 }
 
-static void
-ximage_display_flush_frontbuffer(void *dummy, struct pipe_surface *surf,
-                                 void *context_private)
-{
-   /* TODO get native surface from context private, and remove the callback */
-   _eglLog(_EGL_WARNING, "flush_frontbuffer is not supplied");
-}
-
 struct native_display *
-x11_create_ximage_display(EGLNativeDisplayType dpy,
-                          boolean use_xshm,
-                          native_flush_frontbuffer flush_frontbuffer)
+x11_create_ximage_display(EGLNativeDisplayType dpy, boolean use_xshm)
 {
    struct ximage_display *xdpy;
 
@@ -660,17 +675,12 @@ x11_create_ximage_display(EGLNativeDisplayType dpy,
       (use_xshm && x11_screen_support(xdpy->xscr, X11_SCREEN_EXTENSION_XSHM));
 
    xdpy->winsys = create_sw_winsys();
-   if (!flush_frontbuffer)
-      flush_frontbuffer = ximage_display_flush_frontbuffer;
-   xdpy->winsys->flush_frontbuffer =
-      (void (*)(struct pipe_winsys *, struct pipe_surface *, void *))
-      flush_frontbuffer;
-
    xdpy->base.screen = softpipe_create_screen(xdpy->winsys);
 
    xdpy->base.destroy = ximage_display_destroy;
 
    xdpy->base.get_configs = ximage_display_get_configs;
+   xdpy->base.is_pixmap_supported = ximage_display_is_pixmap_supported;
    xdpy->base.create_context = ximage_display_create_context;
    xdpy->base.create_window_surface = ximage_display_create_window_surface;
    xdpy->base.create_pixmap_surface = ximage_display_create_pixmap_surface;
diff --git a/src/gallium/state_trackers/egl_g3d/x11/x11_screen.c b/src/gallium/state_trackers/egl_g3d/x11/x11_screen.c
index 1e98943242..4d68a88d2e 100644
--- a/src/gallium/state_trackers/egl_g3d/x11/x11_screen.c
+++ b/src/gallium/state_trackers/egl_g3d/x11/x11_screen.c
@@ -30,6 +30,7 @@
 #include <X11/extensions/XShm.h>
 #include "util/u_memory.h"
 #include "util/u_math.h"
+#include "util/u_format.h"
 #include "xf86drm.h"
 #include "egllog.h"
 
@@ -50,6 +51,10 @@ struct x11_screen {
 
    XVisualInfo *visuals;
    int num_visuals;
+
+   /* cached values for x11_drawable_get_depth */
+   Drawable last_drawable;
+   unsigned int last_depth;
 };
 
 
@@ -351,6 +356,37 @@ x11_drawable_get_buffers(struct x11_screen *xscr, Drawable drawable,
 }
 
 /**
+ * Return the depth of a drawable.
+ *
+ * Unlike other drawable functions, the drawable needs not be a DRI2 drawable.
+ */
+uint
+x11_drawable_get_depth(struct x11_screen *xscr, Drawable drawable)
+{
+   unsigned int depth;
+
+   if (drawable != xscr->last_drawable) {
+      Window root;
+      int x, y;
+      unsigned int w, h, border;
+      Status ok;
+
+      ok = XGetGeometry(xscr->dpy, drawable, &root,
+            &x, &y, &w, &h, &border, &depth);
+      if (!ok)
+         depth = 0;
+
+      xscr->last_drawable = drawable;
+      xscr->last_depth = depth;
+   }
+   else {
+      depth = xscr->last_depth;
+   }
+
+   return depth;
+}
+
+/**
  * Create a mode list of the given size.
  */
 __GLcontextModes *
diff --git a/src/gallium/state_trackers/egl_g3d/x11/x11_screen.h b/src/gallium/state_trackers/egl_g3d/x11/x11_screen.h
index 86e8e0501a..bf48218905 100644
--- a/src/gallium/state_trackers/egl_g3d/x11/x11_screen.h
+++ b/src/gallium/state_trackers/egl_g3d/x11/x11_screen.h
@@ -96,4 +96,7 @@ x11_drawable_get_buffers(struct x11_screen *xscr, Drawable drawable,
                          int *width, int *height, unsigned int *attachments,
                          boolean with_format, int num_ins, int *num_outs);
 
+uint
+x11_drawable_get_depth(struct x11_screen *xscr, Drawable drawable);
+
 #endif /* _X11_SCREEN_H_ */
diff --git a/src/gallium/state_trackers/glx/xlib/glx_getproc.c b/src/gallium/state_trackers/glx/xlib/glx_getproc.c
index 84d47b12ed..bd4a85caa0 100644
--- a/src/gallium/state_trackers/glx/xlib/glx_getproc.c
+++ b/src/gallium/state_trackers/glx/xlib/glx_getproc.c
@@ -193,7 +193,7 @@ _glxapi_get_proc_address(const char *funcName)
 }
 
 
-__GLXextFuncPtr
+PUBLIC __GLXextFuncPtr
 glXGetProcAddressARB(const GLubyte *procName)
 {
    __GLXextFuncPtr f;
diff --git a/src/gallium/state_trackers/python/gallium.i b/src/gallium/state_trackers/python/gallium.i
index 96b13c2258..6879722845 100644
--- a/src/gallium/state_trackers/python/gallium.i
+++ b/src/gallium/state_trackers/python/gallium.i
@@ -76,7 +76,6 @@
 %rename(BlendColor) pipe_blend_color;
 %rename(Blend) pipe_blend_state;
 %rename(Clip) pipe_clip_state;
-%rename(ConstantBuffer) pipe_constant_buffer;
 %rename(Depth) pipe_depth_state;
 %rename(Stencil) pipe_stencil_state;
 %rename(Alpha) pipe_alpha_state;
diff --git a/src/gallium/state_trackers/python/p_context.i b/src/gallium/state_trackers/python/p_context.i
index 84ce1a41e6..ce893dad45 100644
--- a/src/gallium/state_trackers/python/p_context.i
+++ b/src/gallium/state_trackers/python/p_context.i
@@ -142,10 +142,7 @@ struct st_context {
    void set_constant_buffer(unsigned shader, unsigned index,
                             struct pipe_buffer *buffer ) 
    {
-      struct pipe_constant_buffer state;
-      memset(&state, 0, sizeof(state));
-      state.buffer = buffer;
-      $self->pipe->set_constant_buffer($self->pipe, shader, index, &state);
+      $self->pipe->set_constant_buffer($self->pipe, shader, index, buffer);
    }
 
    void set_framebuffer(const struct pipe_framebuffer_state *state ) 
diff --git a/src/gallium/state_trackers/python/retrace/interpreter.py b/src/gallium/state_trackers/python/retrace/interpreter.py
index a68709f5cf..b61d47d645 100755
--- a/src/gallium/state_trackers/python/retrace/interpreter.py
+++ b/src/gallium/state_trackers/python/retrace/interpreter.py
@@ -94,7 +94,7 @@ struct_factories = {
     "pipe_blend_color": gallium.BlendColor,
     "pipe_blend_state": gallium.Blend,
     #"pipe_clip_state": gallium.Clip,
-    #"pipe_constant_buffer": gallium.ConstantBuffer,
+    #"pipe_buffer": gallium.Buffer,
     "pipe_depth_state": gallium.Depth,
     "pipe_stencil_state": gallium.Stencil,
     "pipe_alpha_state": gallium.Alpha,
@@ -462,10 +462,10 @@ class Context(Object):
         sys.stdout.flush()
 
     def set_constant_buffer(self, shader, index, buffer):
-        if buffer is not None and buffer.buffer is not None:
-            self.real.set_constant_buffer(shader, index, buffer.buffer)
+        if buffer is not None:
+            self.real.set_constant_buffer(shader, index, buffer)
 
-            self.dump_constant_buffer(buffer.buffer)
+            self.dump_constant_buffer(buffer)
 
     def set_framebuffer_state(self, state):
         _state = gallium.Framebuffer()
diff --git a/src/gallium/state_trackers/python/st_softpipe_winsys.c b/src/gallium/state_trackers/python/st_softpipe_winsys.c
index a3294e877a..dfe3e465f7 100644
--- a/src/gallium/state_trackers/python/st_softpipe_winsys.c
+++ b/src/gallium/state_trackers/python/st_softpipe_winsys.c
@@ -35,225 +35,10 @@
  * @author Jose Fonseca
  */
 
-
-#include "pipe/internal/p_winsys_screen.h"/* port to just p_screen */
-#include "pipe/p_format.h"
-#include "pipe/p_context.h"
-#include "pipe/p_inlines.h"
-#include "util/u_format.h"
-#include "util/u_math.h"
-#include "util/u_memory.h"
 #include "softpipe/sp_winsys.h"
 #include "st_winsys.h"
 
-
-struct st_softpipe_buffer
-{
-   struct pipe_buffer base;
-   boolean userBuffer;  /** Is this a user-space buffer? */
-   void *data;
-   void *mapped;
-};
-
-
-/** Cast wrapper */
-static INLINE struct st_softpipe_buffer *
-st_softpipe_buffer( struct pipe_buffer *buf )
-{
-   return (struct st_softpipe_buffer *)buf;
-}
-
-
-static void *
-st_softpipe_buffer_map(struct pipe_winsys *winsys, 
-                       struct pipe_buffer *buf,
-                       unsigned flags)
-{
-   struct st_softpipe_buffer *st_softpipe_buf = st_softpipe_buffer(buf);
-   st_softpipe_buf->mapped = st_softpipe_buf->data;
-   return st_softpipe_buf->mapped;
-}
-
-
-static void
-st_softpipe_buffer_unmap(struct pipe_winsys *winsys, 
-                         struct pipe_buffer *buf)
-{
-   struct st_softpipe_buffer *st_softpipe_buf = st_softpipe_buffer(buf);
-   st_softpipe_buf->mapped = NULL;
-}
-
-
-static void
-st_softpipe_buffer_destroy(struct pipe_buffer *buf)
-{
-   struct st_softpipe_buffer *oldBuf = st_softpipe_buffer(buf);
-
-   if (oldBuf->data) {
-      if (!oldBuf->userBuffer)
-         align_free(oldBuf->data);
-
-      oldBuf->data = NULL;
-   }
-
-   FREE(oldBuf);
-}
-
-
-static void
-st_softpipe_flush_frontbuffer(struct pipe_winsys *winsys,
-                              struct pipe_surface *surf,
-                              void *context_private)
-{
-}
-
-
-
-static const char *
-st_softpipe_get_name(struct pipe_winsys *winsys)
-{
-   return "softpipe";
-}
-
-
-static struct pipe_buffer *
-st_softpipe_buffer_create(struct pipe_winsys *winsys, 
-                          unsigned alignment, 
-                          unsigned usage,
-                          unsigned size)
-{
-   struct st_softpipe_buffer *buffer = CALLOC_STRUCT(st_softpipe_buffer);
-
-   pipe_reference_init(&buffer->base.reference, 1);
-   buffer->base.alignment = alignment;
-   buffer->base.usage = usage;
-   buffer->base.size = size;
-
-   buffer->data = align_malloc(size, alignment);
-
-   return &buffer->base;
-}
-
-
-/**
- * Create buffer which wraps user-space data.
- */
-static struct pipe_buffer *
-st_softpipe_user_buffer_create(struct pipe_winsys *winsys, 
-                               void *ptr, 
-                               unsigned bytes)
-{
-   struct st_softpipe_buffer *buffer;
-   
-   buffer = CALLOC_STRUCT(st_softpipe_buffer);
-   if(!buffer)
-      return NULL;
-   
-   pipe_reference_init(&buffer->base.reference, 1);
-   buffer->base.size = bytes;
-   buffer->userBuffer = TRUE;
-   buffer->data = ptr;
-
-   return &buffer->base;
-}
-
-
-static struct pipe_buffer *
-st_softpipe_surface_buffer_create(struct pipe_winsys *winsys,
-                                  unsigned width, unsigned height,
-                                  enum pipe_format format,
-                                  unsigned usage,
-                                  unsigned tex_usage,
-                                  unsigned *stride)
-{
-   const unsigned alignment = 64;
-   unsigned nblocksy;
-
-   nblocksy = util_format_get_nblocksy(format, height);
-   *stride = align(util_format_get_stride(format, width), alignment);
-
-   return winsys->buffer_create(winsys, alignment,
-                                usage,
-                                *stride * nblocksy);
-}
-
-
-static void
-st_softpipe_fence_reference(struct pipe_winsys *winsys, 
-                            struct pipe_fence_handle **ptr,
-                            struct pipe_fence_handle *fence)
-{
-}
-
-
-static int
-st_softpipe_fence_signalled(struct pipe_winsys *winsys, 
-                            struct pipe_fence_handle *fence,
-                            unsigned flag)
-{
-   return 0;
-}
-
-
-static int
-st_softpipe_fence_finish(struct pipe_winsys *winsys, 
-                         struct pipe_fence_handle *fence,
-                         unsigned flag)
-{
-   return 0;
-}
-
-
-static void
-st_softpipe_destroy(struct pipe_winsys *winsys)
-{
-   FREE(winsys);
-}
-
-
-static struct pipe_screen *
-st_softpipe_screen_create(void)
-{
-   static struct pipe_winsys *winsys;
-   struct pipe_screen *screen;
-
-   winsys = CALLOC_STRUCT(pipe_winsys);
-   if(!winsys)
-      return NULL;
-
-   winsys->destroy = st_softpipe_destroy;
-   
-   winsys->buffer_create = st_softpipe_buffer_create;
-   winsys->user_buffer_create = st_softpipe_user_buffer_create;
-   winsys->buffer_map = st_softpipe_buffer_map;
-   winsys->buffer_unmap = st_softpipe_buffer_unmap;
-   winsys->buffer_destroy = st_softpipe_buffer_destroy;
-
-   winsys->surface_buffer_create = st_softpipe_surface_buffer_create;
-
-   winsys->fence_reference = st_softpipe_fence_reference;
-   winsys->fence_signalled = st_softpipe_fence_signalled;
-   winsys->fence_finish = st_softpipe_fence_finish;
-
-   winsys->flush_frontbuffer = st_softpipe_flush_frontbuffer;
-   winsys->get_name = st_softpipe_get_name;
-
-   screen = softpipe_create_screen(winsys);
-   if(!screen)
-      st_softpipe_destroy(winsys);
-
-   return screen;
-}
-
-
-static struct pipe_context *
-st_softpipe_context_create(struct pipe_screen *screen)
-{
-   return softpipe_create(screen);
-}
-
-
 const struct st_winsys st_softpipe_winsys = {
-   &st_softpipe_screen_create,
-   &st_softpipe_context_create,
+   &softpipe_create_screen_malloc,
+   &softpipe_create,
 };
diff --git a/src/gallium/state_trackers/python/tests/regress/fragment-shader/.gitignore b/src/gallium/state_trackers/python/tests/regress/fragment-shader/.gitignore
new file mode 100644
index 0000000000..e33609d251
--- /dev/null
+++ b/src/gallium/state_trackers/python/tests/regress/fragment-shader/.gitignore
@@ -0,0 +1 @@
+*.png
diff --git a/src/gallium/state_trackers/python/tests/regress/vertex-shader/.gitignore b/src/gallium/state_trackers/python/tests/regress/vertex-shader/.gitignore
new file mode 100644
index 0000000000..e33609d251
--- /dev/null
+++ b/src/gallium/state_trackers/python/tests/regress/vertex-shader/.gitignore
@@ -0,0 +1 @@
+*.png
diff --git a/src/gallium/state_trackers/vega/api_filters.c b/src/gallium/state_trackers/vega/api_filters.c
index 2f984fb7b9..8f69ee0109 100644
--- a/src/gallium/state_trackers/vega/api_filters.c
+++ b/src/gallium/state_trackers/vega/api_filters.c
@@ -147,22 +147,22 @@ static void setup_constant_buffer(struct vg_context *ctx, const void *buffer,
                                   VGint param_bytes)
 {
    struct pipe_context *pipe = ctx->pipe;
-   struct pipe_constant_buffer *cbuf = &ctx->filter.buffer;
+   struct pipe_buffer **cbuf = &ctx->filter.buffer;
 
    /* We always need to get a new buffer, to keep the drivers simple and
     * avoid gratuitous rendering synchronization. */
-   pipe_buffer_reference(&cbuf->buffer, NULL);
+   pipe_buffer_reference(cbuf, NULL);
 
-   cbuf->buffer = pipe_buffer_create(pipe->screen, 16,
-                                     PIPE_BUFFER_USAGE_CONSTANT,
-                                     param_bytes);
+   *cbuf = pipe_buffer_create(pipe->screen, 16,
+                              PIPE_BUFFER_USAGE_CONSTANT,
+                              param_bytes);
 
-   if (cbuf->buffer) {
-      st_no_flush_pipe_buffer_write(ctx, cbuf->buffer,
+   if (*cbuf) {
+      st_no_flush_pipe_buffer_write(ctx, *cbuf,
                                     0, param_bytes, buffer);
    }
 
-   ctx->pipe->set_constant_buffer(ctx->pipe, PIPE_SHADER_FRAGMENT, 0, cbuf);
+   ctx->pipe->set_constant_buffer(ctx->pipe, PIPE_SHADER_FRAGMENT, 0, *cbuf);
 }
 
 static void setup_samplers(struct vg_context *ctx, struct filter_info *info)
diff --git a/src/gallium/state_trackers/vega/api_masks.c b/src/gallium/state_trackers/vega/api_masks.c
index 4f9f3dae17..97cbe69205 100644
--- a/src/gallium/state_trackers/vega/api_masks.c
+++ b/src/gallium/state_trackers/vega/api_masks.c
@@ -32,7 +32,6 @@
 #include "vg_context.h"
 #include "pipe/p_context.h"
 #include "pipe/p_inlines.h"
-#include "pipe/internal/p_winsys_screen.h" /* for winsys->update_buffer */
 
 #include "util/u_pack_color.h"
 #include "util/u_draw_quad.h"
@@ -116,8 +115,8 @@ clear_with_quad(struct vg_context *st, float x0, float y0,
      x1, y1);
    */
 
-   if (st->pipe->winsys && st->pipe->winsys->update_buffer)
-      st->pipe->winsys->update_buffer( st->pipe->winsys,
+   if (st->pipe->screen && st->pipe->screen->update_buffer)
+      st->pipe->screen->update_buffer( st->pipe->screen,
                                        st->pipe->priv );
 
    cso_save_blend(st->cso_context);
diff --git a/src/gallium/state_trackers/vega/mask.c b/src/gallium/state_trackers/vega/mask.c
index 42300bb6d5..3e260e7073 100644
--- a/src/gallium/state_trackers/vega/mask.c
+++ b/src/gallium/state_trackers/vega/mask.c
@@ -217,7 +217,7 @@ static void setup_mask_framebuffer(struct pipe_surface *surf,
 static void setup_mask_operation(VGMaskOperation operation)
 {
    struct vg_context *ctx = vg_current_context();
-   struct pipe_constant_buffer *cbuf = &ctx->mask.cbuf;
+   struct pipe_buffer **cbuf = &ctx->mask.cbuf;
    const VGint param_bytes = 4 * sizeof(VGfloat);
    const VGfloat ones[4] = {1.f, 1.f, 1.f, 1.f};
    void *shader = 0;
@@ -225,17 +225,17 @@ static void setup_mask_operation(VGMaskOperation operation)
    /* We always need to get a new buffer, to keep the drivers simple and
     * avoid gratuitous rendering synchronization.
     */
-   pipe_buffer_reference(&cbuf->buffer, NULL);
+   pipe_buffer_reference(cbuf, NULL);
 
-   cbuf->buffer = pipe_buffer_create(ctx->pipe->screen, 1,
-                                     PIPE_BUFFER_USAGE_CONSTANT,
-                                     param_bytes);
-   if (cbuf->buffer) {
-      st_no_flush_pipe_buffer_write(ctx, cbuf->buffer,
+   *cbuf = pipe_buffer_create(ctx->pipe->screen, 1,
+                              PIPE_BUFFER_USAGE_CONSTANT,
+                              param_bytes);
+   if (*cbuf) {
+      st_no_flush_pipe_buffer_write(ctx, *cbuf,
                                     0, param_bytes, ones);
    }
 
-   ctx->pipe->set_constant_buffer(ctx->pipe, PIPE_SHADER_FRAGMENT, 0, cbuf);
+   ctx->pipe->set_constant_buffer(ctx->pipe, PIPE_SHADER_FRAGMENT, 0, *cbuf);
    switch (operation) {
    case VG_UNION_MASK: {
       if (!ctx->mask.union_fs) {
@@ -320,22 +320,22 @@ static void setup_mask_samplers(struct pipe_texture *umask)
 static void setup_mask_fill(const VGfloat color[4])
 {
    struct vg_context *ctx = vg_current_context();
-   struct pipe_constant_buffer *cbuf = &ctx->mask.cbuf;
+   struct pipe_buffer **cbuf = &ctx->mask.cbuf;
    const VGint param_bytes = 4 * sizeof(VGfloat);
 
    /* We always need to get a new buffer, to keep the drivers simple and
     * avoid gratuitous rendering synchronization.
     */
-   pipe_buffer_reference(&cbuf->buffer, NULL);
+   pipe_buffer_reference(cbuf, NULL);
 
-   cbuf->buffer = pipe_buffer_create(ctx->pipe->screen, 1,
-                                     PIPE_BUFFER_USAGE_CONSTANT,
-                                     param_bytes);
-   if (cbuf->buffer) {
-      st_no_flush_pipe_buffer_write(ctx, cbuf->buffer, 0, param_bytes, color);
+   *cbuf = pipe_buffer_create(ctx->pipe->screen, 1,
+                              PIPE_BUFFER_USAGE_CONSTANT,
+                              param_bytes);
+   if (*cbuf) {
+      st_no_flush_pipe_buffer_write(ctx, *cbuf, 0, param_bytes, color);
    }
 
-   ctx->pipe->set_constant_buffer(ctx->pipe, PIPE_SHADER_FRAGMENT, 0, cbuf);
+   ctx->pipe->set_constant_buffer(ctx->pipe, PIPE_SHADER_FRAGMENT, 0, *cbuf);
    cso_set_fragment_shader_handle(ctx->cso_context,
                                   shaders_cache_fill(ctx->sc,
                                                      VEGA_SOLID_FILL_SHADER));
diff --git a/src/gallium/state_trackers/vega/paint.c b/src/gallium/state_trackers/vega/paint.c
index cc73771d35..d8f6299b2d 100644
--- a/src/gallium/state_trackers/vega/paint.c
+++ b/src/gallium/state_trackers/vega/paint.c
@@ -77,7 +77,8 @@ struct vg_paint {
       struct pipe_sampler_state sampler;
    } pattern;
 
-   struct pipe_constant_buffer cbuf;
+   /* XXX next 3 all unneded? */
+   struct pipe_buffer *cbuf;
    struct pipe_shader_state fs_state;
    void *fs;
 };
diff --git a/src/gallium/state_trackers/vega/polygon.c b/src/gallium/state_trackers/vega/polygon.c
index b6d282d803..d385ee567f 100644
--- a/src/gallium/state_trackers/vega/polygon.c
+++ b/src/gallium/state_trackers/vega/polygon.c
@@ -293,6 +293,7 @@ static void draw_polygon(struct vg_context *ctx,
 
    /* tell pipe about the vertex attributes */
    velement.src_offset = 0;
+   velement.instance_divisor = 0;
    velement.vertex_buffer_index = 0;
    velement.src_format = PIPE_FORMAT_R32G32_FLOAT;
    velement.nr_components = COMPONENTS;
diff --git a/src/gallium/state_trackers/vega/shader.c b/src/gallium/state_trackers/vega/shader.c
index d9074a377b..bd5ae79e55 100644
--- a/src/gallium/state_trackers/vega/shader.c
+++ b/src/gallium/state_trackers/vega/shader.c
@@ -51,7 +51,7 @@ struct shader {
    VGImageMode image_mode;
 
    float constants[MAX_CONSTANTS];
-   struct pipe_constant_buffer cbuf;
+   struct pipe_buffer *cbuf;
    struct pipe_shader_state fs_state;
    void *fs;
 };
@@ -96,25 +96,25 @@ static void setup_constant_buffer(struct shader *shader)
 {
    struct vg_context *ctx = shader->context;
    struct pipe_context *pipe = shader->context->pipe;
-   struct pipe_constant_buffer *cbuf = &shader->cbuf;
+   struct pipe_buffer **cbuf = &shader->cbuf;
    VGint param_bytes = paint_constant_buffer_size(shader->paint);
    float temp_buf[MAX_CONSTANTS];
 
    assert(param_bytes <= sizeof(temp_buf));
    paint_fill_constant_buffer(shader->paint, temp_buf);
 
-   if (cbuf->buffer == NULL ||
+   if (*cbuf == NULL ||
        memcmp(temp_buf, shader->constants, param_bytes) != 0)
    {
-      pipe_buffer_reference(&cbuf->buffer, NULL);
+      pipe_buffer_reference(cbuf, NULL);
 
       memcpy(shader->constants, temp_buf, param_bytes);
-      cbuf->buffer = pipe_user_buffer_create(pipe->screen,
-                                             &shader->constants,
-                                             sizeof(shader->constants));
+      *cbuf = pipe_user_buffer_create(pipe->screen,
+                                      &shader->constants,
+                                      sizeof(shader->constants));
    }
 
-   ctx->pipe->set_constant_buffer(ctx->pipe, PIPE_SHADER_FRAGMENT, 0, cbuf);
+   ctx->pipe->set_constant_buffer(ctx->pipe, PIPE_SHADER_FRAGMENT, 0, *cbuf);
 }
 
 static VGint blend_bind_samplers(struct vg_context *ctx,
diff --git a/src/gallium/state_trackers/vega/vg_context.c b/src/gallium/state_trackers/vega/vg_context.c
index 00d23f5c22..c16ac036e3 100644
--- a/src/gallium/state_trackers/vega/vg_context.c
+++ b/src/gallium/state_trackers/vega/vg_context.c
@@ -122,8 +122,8 @@ struct vg_context * vg_create_context(struct pipe_context *pipe,
 
 void vg_destroy_context(struct vg_context *ctx)
 {
-   struct pipe_constant_buffer *cbuf = &ctx->mask.cbuf;
-   struct pipe_constant_buffer *vsbuf = &ctx->vs_const_buffer;
+   struct pipe_buffer **cbuf = &ctx->mask.cbuf;
+   struct pipe_buffer **vsbuf = &ctx->vs_const_buffer;
 
    util_destroy_blit(ctx->blit);
    renderer_destroy(ctx->renderer);
@@ -131,11 +131,11 @@ void vg_destroy_context(struct vg_context *ctx)
    shader_destroy(ctx->shader);
    paint_destroy(ctx->default_paint);
 
-   if (cbuf && cbuf->buffer)
-      pipe_buffer_reference(&cbuf->buffer, NULL);
+   if (*cbuf)
+      pipe_buffer_reference(cbuf, NULL);
 
-   if (vsbuf && vsbuf->buffer)
-      pipe_buffer_reference(&vsbuf->buffer, NULL);
+   if (*vsbuf)
+      pipe_buffer_reference(vsbuf, NULL);
 
    if (ctx->clear.fs) {
       cso_delete_fragment_shader(ctx->cso_context, ctx->clear.fs);
@@ -371,20 +371,20 @@ void vg_validate_state(struct vg_context *ctx)
          2.f/fb->width, 2.f/fb->height, 1, 1,
          -1, -1, 0, 0
       };
-      struct pipe_constant_buffer *cbuf = &ctx->vs_const_buffer;
+      struct pipe_buffer **cbuf = &ctx->vs_const_buffer;
 
       vg_set_viewport(ctx, VEGA_Y0_BOTTOM);
 
-      pipe_buffer_reference(&cbuf->buffer, NULL);
-      cbuf->buffer = pipe_buffer_create(ctx->pipe->screen, 16,
+      pipe_buffer_reference(cbuf, NULL);
+      *cbuf = pipe_buffer_create(ctx->pipe->screen, 16,
                                         PIPE_BUFFER_USAGE_CONSTANT,
                                         param_bytes);
 
-      if (cbuf->buffer) {
-         st_no_flush_pipe_buffer_write(ctx, cbuf->buffer,
+      if (*cbuf) {
+         st_no_flush_pipe_buffer_write(ctx, *cbuf,
                                        0, param_bytes, vs_consts);
       }
-      ctx->pipe->set_constant_buffer(ctx->pipe, PIPE_SHADER_VERTEX, 0, cbuf);
+      ctx->pipe->set_constant_buffer(ctx->pipe, PIPE_SHADER_VERTEX, 0, *cbuf);
    }
    if ((ctx->state.dirty & VS_DIRTY)) {
       cso_set_vertex_shader_handle(ctx->cso_context,
diff --git a/src/gallium/state_trackers/vega/vg_context.h b/src/gallium/state_trackers/vega/vg_context.h
index ccc8889c8c..bc88c8d139 100644
--- a/src/gallium/state_trackers/vega/vg_context.h
+++ b/src/gallium/state_trackers/vega/vg_context.h
@@ -50,7 +50,7 @@ struct st_renderbuffer {
 };
 
 struct st_framebuffer {
-   VGint init_width, init_height;
+   VGint width, height;
    struct st_renderbuffer *strb;
    struct st_renderbuffer *dsrb;
 
@@ -113,7 +113,7 @@ struct vg_context
    } clear;
 
    struct {
-      struct pipe_constant_buffer cbuf;
+      struct pipe_buffer *cbuf;
       struct pipe_sampler_state sampler;
 
       struct vg_shader *union_fs;
@@ -135,7 +135,7 @@ struct vg_context
 
    struct pipe_sampler_state blend_sampler;
    struct {
-      struct pipe_constant_buffer buffer;
+      struct pipe_buffer *buffer;
       void *color_matrix_fs;
    } filter;
    struct vg_paint *default_paint;
@@ -145,7 +145,7 @@ struct vg_context
    struct vg_shader *plain_vs;
    struct vg_shader *clear_vs;
    struct vg_shader *texture_vs;
-   struct pipe_constant_buffer vs_const_buffer;
+   struct pipe_buffer *vs_const_buffer;
 };
 
 struct vg_object {
diff --git a/src/gallium/state_trackers/vega/vg_tracker.c b/src/gallium/state_trackers/vega/vg_tracker.c
index ff80aab03a..617c174eb6 100644
--- a/src/gallium/state_trackers/vega/vg_tracker.c
+++ b/src/gallium/state_trackers/vega/vg_tracker.c
@@ -193,8 +193,8 @@ struct st_framebuffer * st_create_framebuffer(const void *visual,
       */
       stfb->alpha_mask = 0;
 
-      stfb->init_width = width;
-      stfb->init_height = height;
+      stfb->width = width;
+      stfb->height = height;
       stfb->privateData = privateData;
    }
 
@@ -282,11 +282,14 @@ void st_resize_framebuffer(struct st_framebuffer *stfb,
 
    /* If this is a noop, exit early and don't do the clear, etc below.
     */
-   if (strb->width == width &&
-       strb->height == height &&
+   if (stfb->width == width &&
+       stfb->height == height &&
        state->zsbuf)
       return;
 
+   stfb->width = width;
+   stfb->height = height;
+
    if (strb->width != width || strb->height != height)
       st_renderbuffer_alloc_storage(ctx, strb,
                                  width, height);
diff --git a/src/gallium/state_trackers/xorg/xorg_renderer.c b/src/gallium/state_trackers/xorg/xorg_renderer.c
index d80f341e6c..8f729b565b 100644
--- a/src/gallium/state_trackers/xorg/xorg_renderer.c
+++ b/src/gallium/state_trackers/xorg/xorg_renderer.c
@@ -379,14 +379,14 @@ struct xorg_renderer * renderer_create(struct pipe_context *pipe)
 
 void renderer_destroy(struct xorg_renderer *r)
 {
-   struct pipe_constant_buffer *vsbuf = &r->vs_const_buffer;
-   struct pipe_constant_buffer *fsbuf = &r->fs_const_buffer;
+   struct pipe_buffer **vsbuf = &r->vs_const_buffer;
+   struct pipe_buffer **fsbuf = &r->fs_const_buffer;
 
-   if (vsbuf && vsbuf->buffer)
-      pipe_buffer_reference(&vsbuf->buffer, NULL);
+   if (*vsbuf)
+      pipe_buffer_reference(vsbuf, NULL);
 
-   if (fsbuf && fsbuf->buffer)
-      pipe_buffer_reference(&fsbuf->buffer, NULL);
+   if (*fsbuf)
+      pipe_buffer_reference(fsbuf, NULL);
 
    if (r->shaders) {
       xorg_shaders_destroy(r->shaders);
@@ -409,20 +409,20 @@ void renderer_set_constants(struct xorg_renderer *r,
                             const float *params,
                             int param_bytes)
 {
-   struct pipe_constant_buffer *cbuf =
+   struct pipe_buffer **cbuf =
       (shader_type == PIPE_SHADER_VERTEX) ? &r->vs_const_buffer :
       &r->fs_const_buffer;
 
-   pipe_buffer_reference(&cbuf->buffer, NULL);
-   cbuf->buffer = pipe_buffer_create(r->pipe->screen, 16,
-                                     PIPE_BUFFER_USAGE_CONSTANT,
-                                     param_bytes);
+   pipe_buffer_reference(cbuf, NULL);
+   *cbuf = pipe_buffer_create(r->pipe->screen, 16,
+                              PIPE_BUFFER_USAGE_CONSTANT,
+                              param_bytes);
 
-   if (cbuf->buffer) {
-      pipe_buffer_write(r->pipe->screen, cbuf->buffer,
+   if (*cbuf) {
+      pipe_buffer_write(r->pipe->screen, *cbuf,
                         0, param_bytes, params);
    }
-   r->pipe->set_constant_buffer(r->pipe, shader_type, 0, cbuf);
+   r->pipe->set_constant_buffer(r->pipe, shader_type, 0, *cbuf);
 }
 
 
diff --git a/src/gallium/state_trackers/xorg/xorg_renderer.h b/src/gallium/state_trackers/xorg/xorg_renderer.h
index 5272cde2b3..af6aa0567d 100644
--- a/src/gallium/state_trackers/xorg/xorg_renderer.h
+++ b/src/gallium/state_trackers/xorg/xorg_renderer.h
@@ -23,8 +23,8 @@ struct xorg_renderer {
 
    int fb_width;
    int fb_height;
-   struct pipe_constant_buffer vs_const_buffer;
-   struct pipe_constant_buffer fs_const_buffer;
+   struct pipe_buffer *vs_const_buffer;
+   struct pipe_buffer *fs_const_buffer;
 
    float buffer[BUF_SIZE];
    int buffer_size;
diff --git a/src/gallium/state_trackers/xorg/xorg_xv.c b/src/gallium/state_trackers/xorg/xorg_xv.c
index 6b5a41a372..5bf0e94b62 100644
--- a/src/gallium/state_trackers/xorg/xorg_xv.c
+++ b/src/gallium/state_trackers/xorg/xorg_xv.c
@@ -486,8 +486,11 @@ display_video(ScrnInfoPtr pScrn, struct xorg_xv_port_priv *pPriv, int id,
    int dxo, dyo;
    Bool hdtv;
    int x, y, w, h;
-   struct exa_pixmap_priv *dst = exaGetPixmapDriverPrivate(pPixmap);
-   struct pipe_surface *dst_surf = xorg_gpu_surface(pPriv->r->pipe->screen, dst);
+   struct exa_pixmap_priv *dst;
+   struct pipe_surface *dst_surf = NULL;
+
+   exaMoveInPixmap(pPixmap);
+   dst = exaGetPixmapDriverPrivate(pPixmap);
 
    if (dst && !dst->tex) {
 	xorg_exa_set_shared_usage(pPixmap);
@@ -497,6 +500,7 @@ display_video(ScrnInfoPtr pScrn, struct xorg_xv_port_priv *pPriv, int id,
    if (!dst || !dst->tex)
       XORG_FALLBACK("Xv destination %s", !dst ? "!dst" : "!dst->tex");
 
+   dst_surf = xorg_gpu_surface(pPriv->r->pipe->screen, dst);
    hdtv = ((src_w >= RES_720P_X) && (src_h >= RES_720P_Y));
 
    REGION_TRANSLATE(pScrn->pScreen, dstRegion, -pPixmap->screen_x,
@@ -516,7 +520,6 @@ display_video(ScrnInfoPtr pScrn, struct xorg_xv_port_priv *pPriv, int id,
    bind_samplers(pPriv);
    setup_fs_video_constants(pPriv->r, hdtv);
 
-   exaMoveInPixmap(pPixmap);
    DamageDamageRegion(&pPixmap->drawable, dstRegion);
 
    while (nbox--) {
diff --git a/src/gallium/winsys/drm/i965/xorg/Makefile b/src/gallium/winsys/drm/i965/xorg/Makefile
index d91d0006ef..c25726b0bb 100644
--- a/src/gallium/winsys/drm/i965/xorg/Makefile
+++ b/src/gallium/winsys/drm/i965/xorg/Makefile
@@ -1,19 +1,25 @@
-TARGET     = modesetting_drv.so
-CFILES     = $(wildcard ./*.c)
-OBJECTS    = $(patsubst ./%.c,./%.o,$(CFILES))
 TOP        = ../../../../../..
 
-include $(TOP)/configs/current
 
-INCLUDES = \
-	$(shell pkg-config --cflags-only-I pixman-1 xorg-server libdrm xproto) \
-	-I../gem \
-	-I$(TOP)/src/gallium/include \
-	-I$(TOP)/src/gallium/drivers \
-	-I$(TOP)/src/gallium/auxiliary \
-	-I$(TOP)/src/mesa \
-	-I$(TOP)/include \
-	-I$(TOP)/src/egl/main
+GALLIUMDIR = $(TOP)/src/gallium
+
+TARGET     = i965g_drv.so
+
+CFILES     = $(wildcard ./*.c)
+
+include ${TOP}/configs/current
+
+OBJECTS    = $(patsubst ./%.c,./%.o,$(CFILES))
+
+CFLAGS = -DHAVE_CONFIG_H \
+         -g -Wall -Wimplicit-function-declaration -fPIC \
+         $(shell pkg-config --cflags pixman-1 xorg-server libdrm xproto) \
+         -I${GALLIUMDIR}/include \
+         -I${GALLIUMDIR}/drivers \
+         -I${GALLIUMDIR}/auxiliary \
+         -I${TOP}/src/mesa \
+         -I$(TOP)/include \
+         -I$(TOP)/src/egl/main
 
 LIBS = \
 	$(TOP)/src/gallium/state_trackers/xorg/libxorgtracker.a \
@@ -23,20 +29,21 @@ LIBS = \
 	$(TOP)/src/gallium/drivers/softpipe/libsoftpipe.a \
 	$(GALLIUM_AUXILIARIES)
 
-DRIVER_DEFINES = \
-	-DHAVE_CONFIG_H
-
-
+TARGET_STAGING = $(TOP)/$(LIB_DIR)/gallium/$(TARGET)
 #############################################
 
+all default: $(TARGET) $(TARGET_STAGING)
 
-
-all default: $(TARGET)
-
-$(TARGET): $(OBJECTS) Makefile $(TOP)/src/gallium/state_trackers/xorg/libxorgtracker.a $(LIBS)
+$(TARGET): $(OBJECTS) Makefile $(GALLIUMDIR)/state_trackers/xorg/libxorgtracker.a $(LIBS)
 	$(TOP)/bin/mklib -noprefix -o $@ \
 	$(OBJECTS) $(LIBS) $(shell pkg-config --libs libdrm) -ldrm_intel
 
+$(TOP)/$(LIB_DIR)/gallium:
+	mkdir -p $@
+
+$(TARGET_STAGING): $(TARGET) $(TOP)/$(LIB_DIR)/gallium
+	$(INSTALL) $(TARGET) $(TOP)/$(LIB_DIR)/gallium
+
 clean:
 	rm -rf $(OBJECTS) $(TARGET)
 
@@ -44,14 +51,4 @@ install:
 	$(INSTALL) -d $(DESTDIR)/$(XORG_DRIVER_INSTALL_DIR)
 	$(MINSTALL) -m 755 $(TARGET) $(DESTDIR)/$(XORG_DRIVER_INSTALL_DIR)
 
-
-##############################################
-
-
-.c.o:
-	$(CC) -c $(CFLAGS) $(INCLUDES) $(DRIVER_DEFINES) $< -o $@
-
-
-##############################################
-
 .PHONY	= all clean install
diff --git a/src/gallium/winsys/drm/radeon/core/radeon_buffer.c b/src/gallium/winsys/drm/radeon/core/radeon_buffer.c
index 385fa857b5..b020ff38fa 100644
--- a/src/gallium/winsys/drm/radeon/core/radeon_buffer.c
+++ b/src/gallium/winsys/drm/radeon/core/radeon_buffer.c
@@ -58,6 +58,7 @@ static struct pipe_buffer *radeon_buffer_create(struct pipe_winsys *ws,
 {
     struct radeon_winsys *radeon_ws = (struct radeon_winsys *)ws;
     struct radeon_pipe_buffer *radeon_buffer;
+    struct pb_desc desc;
     uint32_t domain;
 
     radeon_buffer = CALLOC_STRUCT(radeon_pipe_buffer);
@@ -70,6 +71,14 @@ static struct pipe_buffer *radeon_buffer_create(struct pipe_winsys *ws,
     radeon_buffer->base.usage = usage;
     radeon_buffer->base.size = size;
 
+    if (usage == PIPE_BUFFER_USAGE_CONSTANT && is_r3xx(radeon_ws->pci_id)) {
+        /* Don't bother allocating a BO, as it'll never get to the card. */
+        desc.alignment = alignment;
+        desc.usage = usage;
+        radeon_buffer->pb = pb_malloc_buffer_create(size, &desc);
+        return &radeon_buffer->base;
+    }
+
     domain = 0;
 
     if (usage & PIPE_BUFFER_USAGE_PIXEL) {
@@ -133,8 +142,16 @@ static void radeon_buffer_del(struct pipe_buffer *buffer)
     struct radeon_pipe_buffer *radeon_buffer =
         (struct radeon_pipe_buffer*)buffer;
 
-    radeon_bo_unref(radeon_buffer->bo);
-    free(radeon_buffer);
+    if (radeon_buffer->pb) {
+        pipe_reference_init(&radeon_buffer->pb->base.reference, 0);
+        pb_destroy(radeon_buffer->pb);
+    }
+
+    if (radeon_buffer->bo) {
+        radeon_bo_unref(radeon_buffer->bo);
+    }
+
+    FREE(radeon_buffer);
 }
 
 static void *radeon_buffer_map(struct pipe_winsys *ws,
@@ -146,6 +163,10 @@ static void *radeon_buffer_map(struct pipe_winsys *ws,
         (struct radeon_pipe_buffer*)buffer;
     int write = 0;
 
+    if (radeon_buffer->pb) {
+        return pb_map(radeon_buffer->pb, flags);
+    }
+
     if (flags & PIPE_BUFFER_USAGE_DONTBLOCK) {
         uint32_t domain;
 
@@ -174,7 +195,11 @@ static void radeon_buffer_unmap(struct pipe_winsys *ws,
     struct radeon_pipe_buffer *radeon_buffer =
         (struct radeon_pipe_buffer*)buffer;
 
-    radeon_bo_unmap(radeon_buffer->bo);
+    if (radeon_buffer->pb) {
+        pb_unmap(radeon_buffer->pb);
+    } else {
+        radeon_bo_unmap(radeon_buffer->bo);
+    }
 }
 
 static void radeon_fence_reference(struct pipe_winsys *ws,
diff --git a/src/gallium/winsys/drm/radeon/core/radeon_buffer.h b/src/gallium/winsys/drm/radeon/core/radeon_buffer.h
index d7f17564a9..de71cb2f42 100644
--- a/src/gallium/winsys/drm/radeon/core/radeon_buffer.h
+++ b/src/gallium/winsys/drm/radeon/core/radeon_buffer.h
@@ -36,7 +36,7 @@
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
 
-//#include "state_tracker/st_public.h"
+#include "pipebuffer/pb_buffer.h"
 
 #include "util/u_memory.h"
 
@@ -49,7 +49,10 @@
 
 struct radeon_pipe_buffer {
     struct pipe_buffer  base;
+    /* Pointer to GPU-backed BO. */
     struct radeon_bo    *bo;
+    /* Pointer to fallback PB buffer. */
+    struct pb_buffer    *pb;
     boolean flinked;
     uint32_t flink;
 };
diff --git a/src/gallium/winsys/drm/radeon/core/radeon_drm.c b/src/gallium/winsys/drm/radeon/core/radeon_drm.c
index 851c223697..9552f0ad6a 100644
--- a/src/gallium/winsys/drm/radeon/core/radeon_drm.c
+++ b/src/gallium/winsys/drm/radeon/core/radeon_drm.c
@@ -40,12 +40,16 @@ static void do_ioctls(int fd, struct radeon_winsys* winsys)
     struct drm_radeon_info info = {0};
     int target = 0;
     int retval;
+    drmVersionPtr version;
 
     info.value = (unsigned long)&target;
 
     /* We do things in a specific order here.
      *
-     * First, the PCI ID. This is essential and should return usable numbers
+     * DRM version first. We need to be sure we're running on a KMS chipset.
+     * This is also for some features.
+     *
+     * Then, the PCI ID. This is essential and should return usable numbers
      * for all Radeons. If this fails, we probably got handed an FD for some
      * non-Radeon card.
      *
@@ -55,8 +59,18 @@ static void do_ioctls(int fd, struct radeon_winsys* winsys)
      *
      * The GEM info is actually bogus on the kernel side, as well as our side
      * (see radeon_gem_info_ioctl in radeon_gem.c) but that's alright because
-     * we don't actually use the info for anything yet.
-     * XXX update the above when we can safely use vram_size instead of vram_visible */
+     * we don't actually use the info for anything yet. */
+
+    version = drmGetVersion(fd);
+    if (version->version_major != 2) {
+        fprintf(stderr, "%s: DRM version is %d.%d.%d but this driver is "
+                "only compatible with 2.x.x\n", __FUNCTION__,
+                version->version_major, version->version_minor,
+                version->version_patchlevel);
+        drmFreeVersion(version);
+        exit(1);
+    }
+
     info.request = RADEON_INFO_DEVICE_ID;
     retval = drmCommandWriteRead(fd, DRM_RADEON_INFO, &info, sizeof(info));
     if (retval) {
@@ -92,16 +106,18 @@ static void do_ioctls(int fd, struct radeon_winsys* winsys)
         exit(1);
     }
     winsys->gart_size = gem_info.gart_size;
-    /* XXX */
-    winsys->vram_size = gem_info.vram_visible;
-}
-
-/* Guess at whether this chipset should use r300g.
- *
- * I believe that this check is valid, but I haven't been exhaustive. */
-static boolean is_r3xx(int pciid)
-{
-    return (pciid > 0x3150) && (pciid < 0x796f);
+    winsys->vram_size = gem_info.vram_size;
+
+    debug_printf("radeon: Successfully grabbed chipset info from kernel!\n"
+                 "radeon: DRM version: %d.%d.%d ID: 0x%04x GB: %d Z: %d\n"
+                 "radeon: GART size: %d MB VRAM size: %d MB\n",
+                 version->version_major, version->version_minor,
+                 version->version_patchlevel, winsys->pci_id,
+                 winsys->gb_pipes, winsys->z_pipes,
+                 winsys->gart_size / 1024 / 1024,
+                 winsys->vram_size / 1024 / 1024);
+
+    drmFreeVersion(version);
 }
 
 /* Create a pipe_screen. */
@@ -136,12 +152,13 @@ struct pipe_context* radeon_create_context(struct drm_api* api,
 }
 
 boolean radeon_buffer_from_texture(struct drm_api* api,
+                                   struct pipe_screen* screen,
                                    struct pipe_texture* texture,
                                    struct pipe_buffer** buffer,
                                    unsigned* stride)
 {
     /* XXX fix this */
-    return r300_get_texture_buffer(texture, buffer, stride);
+    return r300_get_texture_buffer(screen, texture, buffer, stride);
 }
 
 /* Create a buffer from a handle. */
@@ -208,7 +225,7 @@ static boolean radeon_shared_handle_from_texture(struct drm_api *api,
     struct radeon_pipe_buffer* radeon_buffer;
     struct pipe_buffer *buffer = NULL;
 
-    if (!radeon_buffer_from_texture(api, texture, &buffer, stride)) {
+    if (!radeon_buffer_from_texture(api, screen, texture, &buffer, stride)) {
         return FALSE;
     }
 
@@ -240,7 +257,7 @@ static boolean radeon_local_handle_from_texture(struct drm_api *api,
                                                 unsigned *handle)
 {
     struct pipe_buffer *buffer = NULL;
-    if (!radeon_buffer_from_texture(api, texture, &buffer, stride)) {
+    if (!radeon_buffer_from_texture(api, screen, texture, &buffer, stride)) {
         return FALSE;
     }
 
diff --git a/src/gallium/winsys/drm/radeon/core/radeon_drm.h b/src/gallium/winsys/drm/radeon/core/radeon_drm.h
index bf0e78138d..ddd7983824 100644
--- a/src/gallium/winsys/drm/radeon/core/radeon_drm.h
+++ b/src/gallium/winsys/drm/radeon/core/radeon_drm.h
@@ -56,6 +56,7 @@ struct pipe_context* radeon_create_context(struct drm_api* api,
                                            struct pipe_screen* screen);
 
 boolean radeon_buffer_from_texture(struct drm_api* api,
+                                   struct pipe_screen* screen,
                                    struct pipe_texture* texture,
                                    struct pipe_buffer** buffer,
                                    unsigned* stride);
@@ -76,4 +77,13 @@ boolean radeon_global_handle_from_buffer(struct drm_api* api,
                                          unsigned* handle);
 
 void radeon_destroy_drm_api(struct drm_api* api);
+
+/* Guess at whether this chipset should use r300g.
+ *
+ * I believe that this check is valid, but I haven't been exhaustive. */
+static boolean is_r3xx(int pciid)
+{
+    return (pciid > 0x3150) && (pciid < 0x796f);
+}
+
 #endif
diff --git a/src/gallium/winsys/drm/radeon/dri/Makefile b/src/gallium/winsys/drm/radeon/dri/Makefile
index a9889444de..eaa3418032 100644
--- a/src/gallium/winsys/drm/radeon/dri/Makefile
+++ b/src/gallium/winsys/drm/radeon/dri/Makefile
@@ -2,7 +2,7 @@
 TOP = ../../../../../..
 include $(TOP)/configs/current
 
-LIBNAME = radeon_dri.so
+LIBNAME = radeong_dri.so
 
 MINIGLX_SOURCES =
 
diff --git a/src/gallium/winsys/egl_xlib/egl_xlib.c b/src/gallium/winsys/egl_xlib/egl_xlib.c
index 420dccc92c..1d9bac3871 100644
--- a/src/gallium/winsys/egl_xlib/egl_xlib.c
+++ b/src/gallium/winsys/egl_xlib/egl_xlib.c
@@ -274,7 +274,7 @@ xlib_eglTerminate(_EGLDriver *drv, _EGLDisplay *dpy)
 
 
 static _EGLProc
-xlib_eglGetProcAddress(const char *procname)
+xlib_eglGetProcAddress(_EGLDriver *drv, const char *procname)
 {
    return (_EGLProc) st_get_proc_address(procname);
 }
diff --git a/src/glsl/cl/sl_cl_parse.c b/src/glsl/cl/sl_cl_parse.c
index e9b3707ac1..e256ab8213 100644
--- a/src/glsl/cl/sl_cl_parse.c
+++ b/src/glsl/cl/sl_cl_parse.c
@@ -345,7 +345,7 @@ struct parse_state {
 };
 
 
-static __inline unsigned int
+static unsigned int
 _emit(struct parse_context *ctx,
       unsigned int *out,
       unsigned char b)
diff --git a/src/glu/sgi/libnurbs/interface/bezierPatchMesh.h b/src/glu/sgi/libnurbs/interface/bezierPatchMesh.h
index 449329665c..ba6868a306 100644
--- a/src/glu/sgi/libnurbs/interface/bezierPatchMesh.h
+++ b/src/glu/sgi/libnurbs/interface/bezierPatchMesh.h
@@ -33,6 +33,7 @@
 #ifndef _BEZIERPATCHMESH_H
 #define _BEZIERPATCHMESH_H
 
+#include <GL/gl.h>
 #include "bezierPatch.h"
 
 typedef struct bezierPatchMesh{
diff --git a/src/glu/sgi/libnurbs/interface/glsurfeval.h b/src/glu/sgi/libnurbs/interface/glsurfeval.h
index 1567c6b098..621e59391a 100644
--- a/src/glu/sgi/libnurbs/interface/glsurfeval.h
+++ b/src/glu/sgi/libnurbs/interface/glsurfeval.h
@@ -83,7 +83,7 @@ typedef struct surfEvalMachine{
 
 class StoredVertex {
 public:
-    		StoredVertex() { type = 0; }
+    		StoredVertex() { type = 0; coord[0] = 0; coord[1] = 0; point[0] = 0; point[1] = 0; }
 		~StoredVertex(void) {}
     void	saveEvalCoord(REAL x, REAL y) 
 		    {coord[0] = x; coord[1] = y; type = TYPECOORD; }
diff --git a/src/glu/sgi/libnurbs/internals/mesher.cc b/src/glu/sgi/libnurbs/internals/mesher.cc
index 9cc436adbf..b2d83f4128 100644
--- a/src/glu/sgi/libnurbs/internals/mesher.cc
+++ b/src/glu/sgi/libnurbs/internals/mesher.cc
@@ -58,6 +58,9 @@ Mesher::Mesher( Backend& b )
 {
     stacksize = 0;
     vdata = 0;
+    last[0] = 0;
+    last[1] = 0;
+    itop = 0;
     lastedge = 0; //needed to prevent purify UMR 
 }
 
diff --git a/src/glu/sgi/libnurbs/internals/reader.cc b/src/glu/sgi/libnurbs/internals/reader.cc
index 6135eef60e..c59240d26a 100644
--- a/src/glu/sgi/libnurbs/internals/reader.cc
+++ b/src/glu/sgi/libnurbs/internals/reader.cc
@@ -64,6 +64,7 @@ O_pwlcurve::O_pwlcurve( long _type, long count, INREAL *array, long byte_stride,
     owner = 0;
     pts = trimpts;
     npts = (int) count;
+    save = 0;
     int i;
 
     /* copy user data into internal trimming data structures */
@@ -115,6 +116,7 @@ O_pwlcurve::O_pwlcurve( long _type, long count, INREAL *array, long byte_stride,
     owner = 0;
     pts = trimpts;
     npts = (int) count;
+    save = 0;
 
     /* copy user data into internal trimming data structures */
     switch( _type ) {
diff --git a/src/glu/sgi/libnurbs/internals/renderhints.cc b/src/glu/sgi/libnurbs/internals/renderhints.cc
index a3aa62d42c..7025f74f5b 100644
--- a/src/glu/sgi/libnurbs/internals/renderhints.cc
+++ b/src/glu/sgi/libnurbs/internals/renderhints.cc
@@ -54,6 +54,10 @@ Renderhints::Renderhints()
     errorchecking 	= N_MSG;
     subdivisions 	= 6.0;
     tmp1 		= 0.0;
+    displaydomain 	= 0;
+    maxsubdivisions 	= (int) subdivisions;
+    wiretris	 	= 0;
+    wirequads	 	= 0;
 }
 
 void
diff --git a/src/glu/sgi/libnurbs/internals/simplemath.h b/src/glu/sgi/libnurbs/internals/simplemath.h
index 0a060c57ea..d00062dc70 100644
--- a/src/glu/sgi/libnurbs/internals/simplemath.h
+++ b/src/glu/sgi/libnurbs/internals/simplemath.h
@@ -38,6 +38,8 @@
 
 /* simple inline routines */
 
+#include "types.h"
+
 inline int 
 max( int x, int y ) { return ( x < y ) ? y : x; }
 
diff --git a/src/glu/sgi/libnurbs/internals/slicer.cc b/src/glu/sgi/libnurbs/internals/slicer.cc
index 27d2a650d1..1b18d73c17 100644
--- a/src/glu/sgi/libnurbs/internals/slicer.cc
+++ b/src/glu/sgi/libnurbs/internals/slicer.cc
@@ -1181,6 +1181,10 @@ void Slicer::slice(Arc_ptr loop)
 Slicer::Slicer( Backend &b ) 
 	: CoveAndTiler( b ), Mesher( b ), backend( b )
 {
+    oneOverDu = 0;
+    du = 0;
+    dv = 0;
+    isolines = 0;
     ulinear = 0;
     vlinear = 0;
 }
diff --git a/src/glx/x11/dri2_glx.c b/src/glx/x11/dri2_glx.c
index 83149062f3..7b0c52b50d 100644
--- a/src/glx/x11/dri2_glx.c
+++ b/src/glx/x11/dri2_glx.c
@@ -205,14 +205,14 @@ static int
 dri2DrawableGetMSC(__GLXscreenConfigs *psc, __GLXDRIdrawable *pdraw,
 		   int64_t *ust, int64_t *msc, int64_t *sbc)
 {
-   return DRI2GetMSC(psc->dpy, pdraw->drawable, ust, msc, sbc);
+   return DRI2GetMSC(psc->dpy, pdraw->xDrawable, ust, msc, sbc);
 }
 
 static int
 dri2WaitForMSC(__GLXDRIdrawable *pdraw, int64_t target_msc, int64_t divisor,
 	       int64_t remainder, int64_t *ust, int64_t *msc, int64_t *sbc)
 {
-   return DRI2WaitMSC(pdraw->psc->dpy, pdraw->drawable, target_msc, divisor,
+   return DRI2WaitMSC(pdraw->psc->dpy, pdraw->xDrawable, target_msc, divisor,
 		      remainder, ust, msc, sbc);
 }
 
@@ -220,7 +220,7 @@ static int
 dri2WaitForSBC(__GLXDRIdrawable *pdraw, int64_t target_sbc, int64_t *ust,
 	       int64_t *msc, int64_t *sbc)
 {
-   return DRI2WaitSBC(pdraw->psc->dpy, pdraw->drawable, target_sbc, ust, msc,
+   return DRI2WaitSBC(pdraw->psc->dpy, pdraw->xDrawable, target_sbc, ust, msc,
 		      sbc);
 }
 
@@ -381,7 +381,7 @@ dri2SwapBuffers(__GLXDRIdrawable *pdraw, int64_t target_msc, int64_t divisor,
        return 0;
     }
 
-    DRI2SwapBuffers(pdraw->psc->dpy, pdraw->drawable, target_msc, divisor,
+    DRI2SwapBuffers(pdraw->psc->dpy, pdraw->xDrawable, target_msc, divisor,
 		    remainder, &ret);
 
 #if __DRI2_FLUSH_VERSION >= 2
@@ -575,11 +575,19 @@ dri2CreateScreen(__GLXscreenConfigs * psc, int screen,
    psp->swapBuffers = dri2SwapBuffers;
    psp->waitGL = dri2WaitGL;
    psp->waitX = dri2WaitX;
-   psp->getDrawableMSC = dri2DrawableGetMSC;
-   psp->waitForMSC = dri2WaitForMSC;
-   psp->waitForSBC = dri2WaitForSBC;
-   psp->setSwapInterval = dri2SetSwapInterval;
-   psp->getSwapInterval = dri2GetSwapInterval;
+   if (pdp->driMinor >= 2) {
+      psp->getDrawableMSC = dri2DrawableGetMSC;
+      psp->waitForMSC = dri2WaitForMSC;
+      psp->waitForSBC = dri2WaitForSBC;
+      psp->setSwapInterval = dri2SetSwapInterval;
+      psp->getSwapInterval = dri2GetSwapInterval;
+   } else {
+      psp->getDrawableMSC = NULL;
+      psp->waitForMSC = NULL;
+      psp->waitForSBC = NULL;
+      psp->setSwapInterval = NULL;
+      psp->getSwapInterval = NULL;
+   }
 
    /* DRI2 suports SubBuffer through DRI2CopyRegion, so it's always
     * available.*/
diff --git a/src/glx/x11/indirect.c b/src/glx/x11/indirect.c
index ea90ce4463..262637a947 100644
--- a/src/glx/x11/indirect.c
+++ b/src/glx/x11/indirect.c
@@ -47,7 +47,7 @@
 #  else
 #    define FASTCALL
 #  endif
-#  if defined(__GNUC__)
+#  if defined(__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))
 #    define NOINLINE __attribute__((noinline))
 #  else
 #    define NOINLINE
diff --git a/src/glx/x11/indirect.h b/src/glx/x11/indirect.h
index 19a8c0d134..9e73b33818 100644
--- a/src/glx/x11/indirect.h
+++ b/src/glx/x11/indirect.h
@@ -37,7 +37,7 @@
  * \author Ian Romanick <idr@us.ibm.com>
  */
 
-#  if (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3)) && defined(__ELF__)
+#  if (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined(__ELF__)
 #    define HIDDEN  __attribute__((visibility("hidden")))
 #  else
 #    define HIDDEN
@@ -47,7 +47,7 @@
 #  else
 #    define FASTCALL
 #  endif
-#  if defined(__GNUC__)
+#  if defined(__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))
 #    define NOINLINE __attribute__((noinline))
 #  else
 #    define NOINLINE
diff --git a/src/glx/x11/indirect_size.c b/src/glx/x11/indirect_size.c
index cdaf02ffe6..f8541b5758 100644
--- a/src/glx/x11/indirect_size.c
+++ b/src/glx/x11/indirect_size.c
@@ -29,7 +29,7 @@
 #include <GL/gl.h>
 #include "indirect_size.h"
 
-#  if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 96)
+#  if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 96) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))
 #    define PURE __attribute__((pure))
 #  else
 #    define PURE
@@ -41,7 +41,7 @@
 #    define FASTCALL
 #  endif
 
-#  if (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3)) && defined(__ELF__)
+#  if (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined(__ELF__)
 #    define INTERNAL  __attribute__((visibility("internal")))
 #  else
 #    define INTERNAL
diff --git a/src/glx/x11/indirect_size.h b/src/glx/x11/indirect_size.h
index 9ba0bd6907..af0919f964 100644
--- a/src/glx/x11/indirect_size.h
+++ b/src/glx/x11/indirect_size.h
@@ -36,7 +36,7 @@
  * \author Ian Romanick <idr@us.ibm.com>
  */
 
-#  if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 96)
+#  if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 96) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))
 #    define PURE __attribute__((pure))
 #  else
 #    define PURE
@@ -48,7 +48,7 @@
 #    define FASTCALL
 #  endif
 
-#  if (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3)) && defined(__ELF__)
+#  if (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined(__ELF__)
 #    define INTERNAL  __attribute__((visibility("internal")))
 #  else
 #    define INTERNAL
diff --git a/src/mesa/drivers/dri/i965/brw_cc.c b/src/mesa/drivers/dri/i965/brw_cc.c
index bac1c3a49c..016f27a6a3 100644
--- a/src/mesa/drivers/dri/i965/brw_cc.c
+++ b/src/mesa/drivers/dri/i965/brw_cc.c
@@ -295,8 +295,7 @@ cc_unit_create_from_key(struct brw_context *brw, struct brw_cc_unit_key *key)
    bo = brw_upload_cache(&brw->cache, BRW_CC_UNIT,
 			 key, sizeof(*key),
 			 &brw->cc.vp_bo, 1,
-			 &cc, sizeof(cc),
-			 NULL, NULL);
+			 &cc, sizeof(cc));
 
    /* Emit CC viewport relocation */
    dri_bo_emit_reloc(bo,
diff --git a/src/mesa/drivers/dri/i965/brw_clip.c b/src/mesa/drivers/dri/i965/brw_clip.c
index af1d975de9..d3275c7a89 100644
--- a/src/mesa/drivers/dri/i965/brw_clip.c
+++ b/src/mesa/drivers/dri/i965/brw_clip.c
@@ -130,13 +130,14 @@ static void compile_clip_prog( struct brw_context *brw,
    /* Upload
     */
    dri_bo_unreference(brw->clip.prog_bo);
-   brw->clip.prog_bo = brw_upload_cache( &brw->cache,
-					 BRW_CLIP_PROG,
-					 &c.key, sizeof(c.key),
-					 NULL, 0,
-					 program, program_size,
-					 &c.prog_data,
-					 &brw->clip.prog_data );
+   brw->clip.prog_bo = brw_upload_cache_with_auxdata(&brw->cache,
+						     BRW_CLIP_PROG,
+						     &c.key, sizeof(c.key),
+						     NULL, 0,
+						     program, program_size,
+						     &c.prog_data,
+						     sizeof(c.prog_data),
+						     &brw->clip.prog_data);
 }
 
 /* Calculate interpolants for triangle and line rasterization.
diff --git a/src/mesa/drivers/dri/i965/brw_clip_state.c b/src/mesa/drivers/dri/i965/brw_clip_state.c
index c8f24a94e4..22df7722b6 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_state.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_state.c
@@ -143,8 +143,7 @@ clip_unit_create_from_key(struct brw_context *brw,
    bo = brw_upload_cache(&brw->cache, BRW_CLIP_UNIT,
 			 key, sizeof(*key),
 			 &brw->clip.prog_bo, 1,
-			 &clip, sizeof(clip),
-			 NULL, NULL);
+			 &clip, sizeof(clip));
 
    /* Emit clip program relocation */
    assert(brw->clip.prog_bo);
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 0dd3087143..79818b92b7 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -131,7 +131,6 @@ struct brw_context;
 #define BRW_NEW_WM_INPUT_DIMENSIONS     0x100
 #define BRW_NEW_PSP                     0x800
 #define BRW_NEW_WM_SURFACES		0x1000
-#define BRW_NEW_FENCE                   0x2000
 #define BRW_NEW_INDICES			0x4000
 #define BRW_NEW_VERTICES		0x8000
 /**
@@ -332,7 +331,6 @@ struct brw_cache {
    struct brw_cache_item **items;
    GLuint size, n_items;
 
-   GLuint aux_size[BRW_MAX_CACHE];
    char *name[BRW_MAX_CACHE];
 
    /* Record of the last BOs chosen for each cache_id.  Used to set
@@ -583,6 +581,7 @@ struct brw_context
 
    struct {
       struct brw_vs_prog_data *prog_data;
+      int8_t *constant_map; /* variable array following prog_data */
 
       dri_bo *prog_bo;
       dri_bo *state_bo;
diff --git a/src/mesa/drivers/dri/i965/brw_curbe.c b/src/mesa/drivers/dri/i965/brw_curbe.c
index 190310afbb..22e3e732f4 100644
--- a/src/mesa/drivers/dri/i965/brw_curbe.c
+++ b/src/mesa/drivers/dri/i965/brw_curbe.c
@@ -256,13 +256,24 @@ static void prepare_constant_buffer(struct brw_context *brw)
        */
       _mesa_load_state_parameters(ctx, vp->program.Base.Parameters); 
 
-      /* XXX just use a memcpy here */
-      for (i = 0; i < nr; i++) {
-         const GLfloat *value = vp->program.Base.Parameters->ParameterValues[i];
-	 buf[offset + i * 4 + 0] = value[0];
-	 buf[offset + i * 4 + 1] = value[1];
-	 buf[offset + i * 4 + 2] = value[2];
-	 buf[offset + i * 4 + 3] = value[3];
+      if (vp->use_const_buffer) {
+	 /* Load the subset of push constants that will get used when
+	  * we also have a pull constant buffer.
+	  */
+	 for (i = 0; i < vp->program.Base.Parameters->NumParameters; i++) {
+	    if (brw->vs.constant_map[i] != -1) {
+	       assert(brw->vs.constant_map[i] <= nr);
+	       memcpy(buf + offset + brw->vs.constant_map[i] * 4,
+		      vp->program.Base.Parameters->ParameterValues[i],
+		      4 * sizeof(float));
+	    }
+	 }
+      } else {
+	 for (i = 0; i < nr; i++) {
+	    memcpy(buf + offset + i * 4,
+		   vp->program.Base.Parameters->ParameterValues[i],
+		   4 * sizeof(float));
+	 }
       }
    }
 
diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
index 1bc3eccf49..7261b316c1 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@@ -125,12 +125,13 @@ static void compile_gs_prog( struct brw_context *brw,
    /* Upload
     */
    dri_bo_unreference(brw->gs.prog_bo);
-   brw->gs.prog_bo = brw_upload_cache( &brw->cache, BRW_GS_PROG,
-				       &c.key, sizeof(c.key),
-				       NULL, 0,
-				       program, program_size,
-				       &c.prog_data,
-				       &brw->gs.prog_data );
+   brw->gs.prog_bo = brw_upload_cache_with_auxdata(&brw->cache, BRW_GS_PROG,
+						   &c.key, sizeof(c.key),
+						   NULL, 0,
+						   program, program_size,
+						   &c.prog_data,
+						   sizeof(c.prog_data),
+						   &brw->gs.prog_data);
 }
 
 static const GLenum gs_prim[GL_POLYGON+1] = {  
diff --git a/src/mesa/drivers/dri/i965/brw_gs_state.c b/src/mesa/drivers/dri/i965/brw_gs_state.c
index 1af5790a67..7d5a944bf7 100644
--- a/src/mesa/drivers/dri/i965/brw_gs_state.c
+++ b/src/mesa/drivers/dri/i965/brw_gs_state.c
@@ -108,8 +108,7 @@ gs_unit_create_from_key(struct brw_context *brw, struct brw_gs_unit_key *key)
    bo = brw_upload_cache(&brw->cache, BRW_GS_UNIT,
 			 key, sizeof(*key),
 			 &brw->gs.prog_bo, 1,
-			 &gs, sizeof(gs),
-			 NULL, NULL);
+			 &gs, sizeof(gs));
 
    if (key->prog_active) {
       /* Emit GS program relocation */
diff --git a/src/mesa/drivers/dri/i965/brw_sf.c b/src/mesa/drivers/dri/i965/brw_sf.c
index 968890f7fb..8e6839b812 100644
--- a/src/mesa/drivers/dri/i965/brw_sf.c
+++ b/src/mesa/drivers/dri/i965/brw_sf.c
@@ -117,12 +117,13 @@ static void compile_sf_prog( struct brw_context *brw,
    /* Upload
     */
    dri_bo_unreference(brw->sf.prog_bo);
-   brw->sf.prog_bo = brw_upload_cache( &brw->cache, BRW_SF_PROG,
-				       &c.key, sizeof(c.key),
-				       NULL, 0,
-				       program, program_size,
-				       &c.prog_data,
-				       &brw->sf.prog_data );
+   brw->sf.prog_bo = brw_upload_cache_with_auxdata(&brw->cache, BRW_SF_PROG,
+						   &c.key, sizeof(c.key),
+						   NULL, 0,
+						   program, program_size,
+						   &c.prog_data,
+						   sizeof(c.prog_data),
+						   &brw->sf.prog_data);
 }
 
 /* Calculate interpolants for triangle and line rasterization.
diff --git a/src/mesa/drivers/dri/i965/brw_sf_state.c b/src/mesa/drivers/dri/i965/brw_sf_state.c
index 09223b7cfb..b9b42cd6d5 100644
--- a/src/mesa/drivers/dri/i965/brw_sf_state.c
+++ b/src/mesa/drivers/dri/i965/brw_sf_state.c
@@ -309,8 +309,7 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
    bo = brw_upload_cache(&brw->cache, BRW_SF_UNIT,
 			 key, sizeof(*key),
 			 reloc_bufs, 2,
-			 &sf, sizeof(sf),
-			 NULL, NULL);
+			 &sf, sizeof(sf));
 
    /* STATE_PREFETCH command description describes this state as being
     * something loaded through the GPE (L2 ISC), so it's INSTRUCTION domain.
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index 9c9d145c4b..536fe8b249 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -124,16 +124,26 @@ dri_bo *brw_cache_data(struct brw_cache *cache,
 		       dri_bo **reloc_bufs,
 		       GLuint nr_reloc_bufs);
 
-dri_bo *brw_upload_cache( struct brw_cache *cache,
-			  enum brw_cache_id cache_id,
-			  const void *key,
-			  GLuint key_sz,
-			  dri_bo **reloc_bufs,
-			  GLuint nr_reloc_bufs,
-			  const void *data,
-			  GLuint data_sz,
-			  const void *aux,
-			  void *aux_return );
+drm_intel_bo *brw_upload_cache(struct brw_cache *cache,
+			       enum brw_cache_id cache_id,
+			       const void *key,
+			       GLuint key_sz,
+			       dri_bo **reloc_bufs,
+			       GLuint nr_reloc_bufs,
+			       const void *data,
+			       GLuint data_sz);
+
+drm_intel_bo *brw_upload_cache_with_auxdata(struct brw_cache *cache,
+					    enum brw_cache_id cache_id,
+					    const void *key,
+					    GLuint key_sz,
+					    dri_bo **reloc_bufs,
+					    GLuint nr_reloc_bufs,
+					    const void *data,
+					    GLuint data_sz,
+					    const void *aux,
+					    GLuint aux_sz,
+					    void *aux_return);
 
 dri_bo *brw_search_cache( struct brw_cache *cache,
 			  enum brw_cache_id cache_id,
diff --git a/src/mesa/drivers/dri/i965/brw_state_cache.c b/src/mesa/drivers/dri/i965/brw_state_cache.c
index e4c9ba7d87..5fc47b0420 100644
--- a/src/mesa/drivers/dri/i965/brw_state_cache.c
+++ b/src/mesa/drivers/dri/i965/brw_state_cache.c
@@ -71,25 +71,23 @@
 
 
 static GLuint
-hash_key(const void *key, GLuint key_size,
-         dri_bo **reloc_bufs, GLuint nr_reloc_bufs)
+hash_key(struct brw_cache_item *item)
 {
-   GLuint *ikey = (GLuint *)key;
-   GLuint hash = 0, i;
+   GLuint *ikey = (GLuint *)item->key;
+   GLuint hash = item->cache_id, i;
 
-   assert(key_size % 4 == 0);
+   assert(item->key_size % 4 == 0);
 
    /* I'm sure this can be improved on:
     */
-   for (i = 0; i < key_size/4; i++) {
+   for (i = 0; i < item->key_size/4; i++) {
       hash ^= ikey[i];
       hash = (hash << 5) | (hash >> 27);
    }
 
    /* Include the BO pointers as key data as well */
-   ikey = (GLuint *)reloc_bufs;
-   key_size = nr_reloc_bufs * sizeof(dri_bo *);
-   for (i = 0; i < key_size/4; i++) {
+   ikey = (GLuint *)item->reloc_bufs;
+   for (i = 0; i < item->nr_reloc_bufs * sizeof(drm_intel_bo *) / 4; i++) {
       hash ^= ikey[i];
       hash = (hash << 5) | (hash >> 27);
    }
@@ -114,11 +112,22 @@ update_cache_last(struct brw_cache *cache, enum brw_cache_id cache_id,
    cache->brw->state.dirty.cache |= 1 << cache_id;
 }
 
+static int
+brw_cache_item_equals(const struct brw_cache_item *a,
+		      const struct brw_cache_item *b)
+{
+   return a->cache_id == b->cache_id &&
+      a->hash == b->hash &&
+      a->key_size == b->key_size &&
+      (memcmp(a->key, b->key, a->key_size) == 0) &&
+      a->nr_reloc_bufs == b->nr_reloc_bufs &&
+      (memcmp(a->reloc_bufs, b->reloc_bufs,
+	      a->nr_reloc_bufs * sizeof(dri_bo *)) == 0);
+}
 
 static struct brw_cache_item *
-search_cache(struct brw_cache *cache, enum brw_cache_id cache_id,
-	     GLuint hash, const void *key, GLuint key_size,
-	     dri_bo **reloc_bufs, GLuint nr_reloc_bufs)
+search_cache(struct brw_cache *cache, GLuint hash,
+	     struct brw_cache_item *lookup)
 {
    struct brw_cache_item *c;
 
@@ -133,13 +142,7 @@ search_cache(struct brw_cache *cache, enum brw_cache_id cache_id,
 #endif
 
    for (c = cache->items[hash % cache->size]; c; c = c->next) {
-      if (c->cache_id == cache_id &&
-	  c->hash == hash &&
-	  c->key_size == key_size &&
-	  memcmp(c->key, key, key_size) == 0 &&
-	  c->nr_reloc_bufs == nr_reloc_bufs &&
-	  memcmp(c->reloc_bufs, reloc_bufs,
-		 nr_reloc_bufs * sizeof(dri_bo *)) == 0)
+      if (brw_cache_item_equals(lookup, c))
 	 return c;
    }
 
@@ -182,10 +185,18 @@ brw_search_cache(struct brw_cache *cache,
                  void *aux_return)
 {
    struct brw_cache_item *item;
-   GLuint hash = hash_key(key, key_size, reloc_bufs, nr_reloc_bufs);
+   struct brw_cache_item lookup;
+   GLuint hash;
 
-   item = search_cache(cache, cache_id, hash, key, key_size,
-		       reloc_bufs, nr_reloc_bufs);
+   lookup.cache_id = cache_id;
+   lookup.key = key;
+   lookup.key_size = key_size;
+   lookup.reloc_bufs = reloc_bufs;
+   lookup.nr_reloc_bufs = nr_reloc_bufs;
+   hash = hash_key(&lookup);
+   lookup.hash = hash;
+
+   item = search_cache(cache, hash, &lookup);
 
    if (item == NULL)
       return NULL;
@@ -200,26 +211,34 @@ brw_search_cache(struct brw_cache *cache,
 }
 
 
-dri_bo *
-brw_upload_cache( struct brw_cache *cache,
-		  enum brw_cache_id cache_id,
-		  const void *key,
-		  GLuint key_size,
-		  dri_bo **reloc_bufs,
-		  GLuint nr_reloc_bufs,
-		  const void *data,
-		  GLuint data_size,
-		  const void *aux,
-		  void *aux_return )
+drm_intel_bo *
+brw_upload_cache_with_auxdata(struct brw_cache *cache,
+			      enum brw_cache_id cache_id,
+			      const void *key,
+			      GLuint key_size,
+			      dri_bo **reloc_bufs,
+			      GLuint nr_reloc_bufs,
+			      const void *data,
+			      GLuint data_size,
+			      const void *aux,
+			      GLuint aux_size,
+			      void *aux_return)
 {
    struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item);
-   GLuint hash = hash_key(key, key_size, reloc_bufs, nr_reloc_bufs);
+   GLuint hash;
    GLuint relocs_size = nr_reloc_bufs * sizeof(dri_bo *);
-   GLuint aux_size = cache->aux_size[cache_id];
    void *tmp;
    dri_bo *bo;
    int i;
 
+   item->cache_id = cache_id;
+   item->key = key;
+   item->key_size = key_size;
+   item->reloc_bufs = reloc_bufs;
+   item->nr_reloc_bufs = nr_reloc_bufs;
+   hash = hash_key(item);
+   item->hash = hash;
+
    /* Create the buffer object to contain the data */
    bo = dri_bo_alloc(cache->brw->intel.bufmgr,
 		     cache->name[cache_id], data_size, 1 << 6);
@@ -229,19 +248,15 @@ brw_upload_cache( struct brw_cache *cache,
    tmp = _mesa_malloc(key_size + aux_size + relocs_size);
 
    memcpy(tmp, key, key_size);
-   memcpy(tmp + key_size, aux, cache->aux_size[cache_id]);
+   memcpy(tmp + key_size, aux, aux_size);
    memcpy(tmp + key_size + aux_size, reloc_bufs, relocs_size);
    for (i = 0; i < nr_reloc_bufs; i++) {
       if (reloc_bufs[i] != NULL)
 	 dri_bo_reference(reloc_bufs[i]);
    }
 
-   item->cache_id = cache_id;
    item->key = tmp;
-   item->hash = hash;
-   item->key_size = key_size;
    item->reloc_bufs = tmp + key_size + aux_size;
-   item->nr_reloc_bufs = nr_reloc_bufs;
 
    item->bo = bo;
    dri_bo_reference(bo);
@@ -255,7 +270,6 @@ brw_upload_cache( struct brw_cache *cache,
    cache->n_items++;
 
    if (aux_return) {
-      assert(cache->aux_size[cache_id]);
       *(void **)aux_return = (void *)((char *)item->key + item->key_size);
    }
 
@@ -272,6 +286,23 @@ brw_upload_cache( struct brw_cache *cache,
    return bo;
 }
 
+drm_intel_bo *
+brw_upload_cache(struct brw_cache *cache,
+		 enum brw_cache_id cache_id,
+		 const void *key,
+		 GLuint key_size,
+		 dri_bo **reloc_bufs,
+		 GLuint nr_reloc_bufs,
+		 const void *data,
+		 GLuint data_size)
+{
+   return brw_upload_cache_with_auxdata(cache, cache_id,
+					key, key_size,
+					reloc_bufs, nr_reloc_bufs,
+					data, data_size,
+					NULL, 0,
+					NULL);
+}
 
 /**
  * Wrapper around brw_cache_data_sz using the cache_id's canonical key size.
@@ -292,11 +323,18 @@ brw_cache_data(struct brw_cache *cache,
 	       GLuint nr_reloc_bufs)
 {
    dri_bo *bo;
-   struct brw_cache_item *item;
-   GLuint hash = hash_key(data, data_size, reloc_bufs, nr_reloc_bufs);
-
-   item = search_cache(cache, cache_id, hash, data, data_size,
-		       reloc_bufs, nr_reloc_bufs);
+   struct brw_cache_item *item, lookup;
+   GLuint hash;
+
+   lookup.cache_id = cache_id;
+   lookup.key = data;
+   lookup.key_size = data_size;
+   lookup.reloc_bufs = reloc_bufs;
+   lookup.nr_reloc_bufs = nr_reloc_bufs;
+   hash = hash_key(&lookup);
+   lookup.hash = hash;
+
+   item = search_cache(cache, hash, &lookup);
    if (item) {
       update_cache_last(cache, cache_id, item->bo);
       dri_bo_reference(item->bo);
@@ -306,8 +344,7 @@ brw_cache_data(struct brw_cache *cache,
    bo = brw_upload_cache(cache, cache_id,
 			 data, data_size,
 			 reloc_bufs, nr_reloc_bufs,
-			 data, data_size,
-			 NULL, NULL);
+			 data, data_size);
 
    return bo;
 }
@@ -321,11 +358,9 @@ enum pool_type {
 static void
 brw_init_cache_id(struct brw_cache *cache,
                   const char *name,
-                  enum brw_cache_id id,
-                  GLuint aux_size)
+                  enum brw_cache_id id)
 {
    cache->name[id] = strdup(name);
-   cache->aux_size[id] = aux_size;
 }
 
 
@@ -341,80 +376,28 @@ brw_init_non_surface_cache(struct brw_context *brw)
    cache->items = (struct brw_cache_item **)
       _mesa_calloc(cache->size * sizeof(struct brw_cache_item));
 
-   brw_init_cache_id(cache,
-		     "CC_VP",
-		     BRW_CC_VP,
-		     0);
-
-   brw_init_cache_id(cache,
-		     "CC_UNIT",
-		     BRW_CC_UNIT,
-		     0);
-
-   brw_init_cache_id(cache,
-		     "WM_PROG",
-		     BRW_WM_PROG,
-		     sizeof(struct brw_wm_prog_data));
-
-   brw_init_cache_id(cache,
-		     "SAMPLER_DEFAULT_COLOR",
-		     BRW_SAMPLER_DEFAULT_COLOR,
-		     0);
-
-   brw_init_cache_id(cache,
-		     "SAMPLER",
-		     BRW_SAMPLER,
-		     0);
-
-   brw_init_cache_id(cache,
-		     "WM_UNIT",
-		     BRW_WM_UNIT,
-		     0);
-
-   brw_init_cache_id(cache,
-		     "SF_PROG",
-		     BRW_SF_PROG,
-		     sizeof(struct brw_sf_prog_data));
-
-   brw_init_cache_id(cache,
-		     "SF_VP",
-		     BRW_SF_VP,
-		     0);
-
-   brw_init_cache_id(cache,
-		     "SF_UNIT",
-		     BRW_SF_UNIT,
-		     0);
-
-   brw_init_cache_id(cache,
-		     "VS_UNIT",
-		     BRW_VS_UNIT,
-		     0);
-
-   brw_init_cache_id(cache,
-		     "VS_PROG",
-		     BRW_VS_PROG,
-		     sizeof(struct brw_vs_prog_data));
-
-   brw_init_cache_id(cache,
-		     "CLIP_UNIT",
-		     BRW_CLIP_UNIT,
-		     0);
-
-   brw_init_cache_id(cache,
-		     "CLIP_PROG",
-		     BRW_CLIP_PROG,
-		     sizeof(struct brw_clip_prog_data));
-
-   brw_init_cache_id(cache,
-		     "GS_UNIT",
-		     BRW_GS_UNIT,
-		     0);
-
-   brw_init_cache_id(cache,
-		     "GS_PROG",
-		     BRW_GS_PROG,
-		     sizeof(struct brw_gs_prog_data));
+   brw_init_cache_id(cache, "CC_VP", BRW_CC_VP);
+   brw_init_cache_id(cache, "CC_UNIT", BRW_CC_UNIT);
+   brw_init_cache_id(cache, "WM_PROG", BRW_WM_PROG);
+   brw_init_cache_id(cache, "SAMPLER_DEFAULT_COLOR", BRW_SAMPLER_DEFAULT_COLOR);
+   brw_init_cache_id(cache, "SAMPLER", BRW_SAMPLER);
+   brw_init_cache_id(cache, "WM_UNIT", BRW_WM_UNIT);
+   brw_init_cache_id(cache, "SF_PROG", BRW_SF_PROG);
+   brw_init_cache_id(cache, "SF_VP", BRW_SF_VP);
+
+   brw_init_cache_id(cache, "SF_UNIT", BRW_SF_UNIT);
+
+   brw_init_cache_id(cache, "VS_UNIT", BRW_VS_UNIT);
+
+   brw_init_cache_id(cache, "VS_PROG", BRW_VS_PROG);
+
+   brw_init_cache_id(cache, "CLIP_UNIT", BRW_CLIP_UNIT);
+
+   brw_init_cache_id(cache, "CLIP_PROG", BRW_CLIP_PROG);
+
+   brw_init_cache_id(cache, "GS_UNIT", BRW_GS_UNIT);
+
+   brw_init_cache_id(cache, "GS_PROG", BRW_GS_PROG);
 }
 
 
@@ -430,15 +413,8 @@ brw_init_surface_cache(struct brw_context *brw)
    cache->items = (struct brw_cache_item **)
       _mesa_calloc(cache->size * sizeof(struct brw_cache_item));
 
-   brw_init_cache_id(cache,
-		     "SS_SURFACE",
-		     BRW_SS_SURFACE,
-		     0);
-
-   brw_init_cache_id(cache,
-		     "SS_SURF_BIND",
-		     BRW_SS_SURF_BIND,
-		     0);
+   brw_init_cache_id(cache, "SS_SURFACE", BRW_SS_SURFACE);
+   brw_init_cache_id(cache, "SS_SURF_BIND", BRW_SS_SURF_BIND);
 }
 
 
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index af8dfb4c15..0ecbef1ef9 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -36,13 +36,7 @@
 #include "intel_batchbuffer.h"
 #include "intel_buffers.h"
 
-/* This is used to initialize brw->state.atoms[].  We could use this
- * list directly except for a single atom, brw_constant_buffer, which
- * has a .dirty value which changes according to the parameters of the
- * current fragment and vertex programs, and so cannot be a static
- * value.
- */
-const struct brw_tracked_state *atoms[] =
+static const struct brw_tracked_state *atoms[] =
 {
    &brw_check_fallback,
 
@@ -208,7 +202,6 @@ static struct dirty_bit_map brw_bits[] = {
    DEFINE_BIT(BRW_NEW_CONTEXT),
    DEFINE_BIT(BRW_NEW_WM_INPUT_DIMENSIONS),
    DEFINE_BIT(BRW_NEW_PSP),
-   DEFINE_BIT(BRW_NEW_FENCE),
    DEFINE_BIT(BRW_NEW_INDICES),
    DEFINE_BIT(BRW_NEW_INDEX_BUFFER),
    DEFINE_BIT(BRW_NEW_VERTICES),
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
index fd055e225e..44b085e214 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -35,6 +35,7 @@
 #include "brw_util.h"
 #include "brw_state.h"
 #include "shader/prog_print.h"
+#include "shader/prog_parameter.h"
 
 
 
@@ -42,9 +43,11 @@ static void do_vs_prog( struct brw_context *brw,
 			struct brw_vertex_program *vp,
 			struct brw_vs_prog_key *key )
 {
+   GLcontext *ctx = &brw->intel.ctx;
    GLuint program_size;
    const GLuint *program;
    struct brw_vs_compile c;
+   int aux_size;
 
    memset(&c, 0, sizeof(c));
    memcpy(&c.key, key, sizeof(*key));
@@ -73,13 +76,27 @@ static void do_vs_prog( struct brw_context *brw,
     */
    program = brw_get_program(&c.func, &program_size);
 
+   /* We upload from &c.prog_data including the constant_map assuming
+    * they're packed together.  It would be nice to have a
+    * compile-time assert macro here.
+    */
+   assert(c.constant_map == (int8_t *)&c.prog_data +
+	  sizeof(c.prog_data));
+   assert(ctx->Const.VertexProgram.MaxNativeParameters ==
+	  ARRAY_SIZE(c.constant_map));
+
+   aux_size = sizeof(c.prog_data);
+   if (c.vp->use_const_buffer)
+      aux_size += c.vp->program.Base.Parameters->NumParameters;
+
    dri_bo_unreference(brw->vs.prog_bo);
-   brw->vs.prog_bo = brw_upload_cache( &brw->cache, BRW_VS_PROG,
-				       &c.key, sizeof(c.key),
-				       NULL, 0,
-				       program, program_size,
-				       &c.prog_data,
-				       &brw->vs.prog_data );
+   brw->vs.prog_bo = brw_upload_cache_with_auxdata(&brw->cache, BRW_VS_PROG,
+						   &c.key, sizeof(c.key),
+						   NULL, 0,
+						   program, program_size,
+						   &c.prog_data,
+						   aux_size,
+						   &brw->vs.prog_data);
 }
 
 
@@ -109,6 +126,8 @@ static void brw_upload_vs_prog(struct brw_context *brw)
 				      &brw->vs.prog_data);
    if (brw->vs.prog_bo == NULL)
       do_vs_prog(brw, vp, &key);
+   brw->vs.constant_map = ((int8_t *)brw->vs.prog_data +
+			   sizeof(*brw->vs.prog_data));
 }
 
 
diff --git a/src/mesa/drivers/dri/i965/brw_vs.h b/src/mesa/drivers/dri/i965/brw_vs.h
index 4a591365c9..95e0501b1e 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.h
+++ b/src/mesa/drivers/dri/i965/brw_vs.h
@@ -51,6 +51,7 @@ struct brw_vs_compile {
    struct brw_compile func;
    struct brw_vs_prog_key key;
    struct brw_vs_prog_data prog_data;
+   int8_t constant_map[1024];
 
    struct brw_vertex_program *vp;
 
@@ -81,6 +82,8 @@ struct brw_vs_compile {
       GLint index;
       struct brw_reg reg;
    } current_const[3];
+
+   GLboolean needs_stack;
 };
 
 void brw_vs_emit( struct brw_vs_compile *c );
diff --git a/src/mesa/drivers/dri/i965/brw_vs_emit.c b/src/mesa/drivers/dri/i965/brw_vs_emit.c
index 1b84dd505f..52cc04fee8 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_emit.c
@@ -104,9 +104,47 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
    /* Vertex program parameters from curbe:
     */
    if (c->vp->use_const_buffer) {
-      /* get constants from a real constant buffer */
-      c->prog_data.curb_read_length = 0;
-      c->prog_data.nr_params = 4; /* XXX 0 causes a bug elsewhere... */
+      int max_constant = BRW_MAX_GRF - 20 - c->vp->program.Base.NumTemporaries;
+      int constant = 0;
+
+      /* We've got more constants than we can load with the push
+       * mechanism.  This is often correlated with reladdr loads where
+       * we should probably be using a pull mechanism anyway to avoid
+       * excessive reading.  However, the pull mechanism is slow in
+       * general.  So, we try to allocate as many non-reladdr-loaded
+       * constants through the push buffer as we can before giving up.
+       */
+      memset(c->constant_map, -1, c->vp->program.Base.Parameters->NumParameters);
+      for (i = 0;
+	   i < c->vp->program.Base.NumInstructions && constant < max_constant;
+	   i++) {
+	 struct prog_instruction *inst = &c->vp->program.Base.Instructions[i];
+	 int arg;
+
+	 for (arg = 0; arg < 3 && constant < max_constant; arg++) {
+	    if ((inst->SrcReg[arg].File != PROGRAM_STATE_VAR &&
+		 inst->SrcReg[arg].File != PROGRAM_CONSTANT &&
+		 inst->SrcReg[arg].File != PROGRAM_UNIFORM &&
+		 inst->SrcReg[arg].File != PROGRAM_ENV_PARAM &&
+		 inst->SrcReg[arg].File != PROGRAM_LOCAL_PARAM) ||
+		inst->SrcReg[arg].RelAddr)
+	       continue;
+
+	    if (c->constant_map[inst->SrcReg[arg].Index] == -1) {
+	       c->constant_map[inst->SrcReg[arg].Index] = constant++;
+	    }
+	 }
+      }
+
+      for (i = 0; i < constant; i++) {
+         c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2,
+							      (i%2) * 4),
+						 0, 4, 1);
+      }
+      reg += (constant + 1) / 2;
+      c->prog_data.curb_read_length = reg - 1;
+      /* XXX 0 causes a bug elsewhere... */
+      c->prog_data.nr_params = MAX2(constant * 4, 4);
    }
    else {
       /* use a section of the GRF for constants */
@@ -214,8 +252,10 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
       }
    }
 
-   c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
-   reg += 2;
+   if (c->needs_stack) {
+      c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
+      reg += 2;
+   }
 
    /* Some opcodes need an internal temporary:
     */
@@ -762,15 +802,14 @@ get_constant(struct brw_vs_compile *c,
 {
    const struct prog_src_register *src = &inst->SrcReg[argIndex];
    struct brw_compile *p = &c->func;
-   struct brw_reg const_reg;
-   struct brw_reg const2_reg;
-   const GLboolean relAddr = src->RelAddr;
+   struct brw_reg const_reg = c->current_const[argIndex].reg;
 
    assert(argIndex < 3);
 
-   if (c->current_const[argIndex].index != src->Index || relAddr) {
+   if (c->current_const[argIndex].index != src->Index) {
       struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];
 
+      /* Keep track of the last constant loaded in this slot, for reuse. */
       c->current_const[argIndex].index = src->Index;
 
 #if 0
@@ -779,48 +818,74 @@ get_constant(struct brw_vs_compile *c,
 #endif
       /* need to fetch the constant now */
       brw_dp_READ_4_vs(p,
-                       c->current_const[argIndex].reg,/* writeback dest */
+                       const_reg,                     /* writeback dest */
                        0,                             /* oword */
-                       relAddr,                       /* relative indexing? */
+                       0,                             /* relative indexing? */
                        addrReg,                       /* address register */
                        16 * src->Index,               /* byte offset */
                        SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
                        );
-
-      if (relAddr) {
-         /* second read */
-         const2_reg = get_tmp(c);
-
-         /* use upper half of address reg for second read */
-         addrReg = stride(addrReg, 0, 4, 0);
-         addrReg.subnr = 16;
-
-         brw_dp_READ_4_vs(p,
-                          const2_reg,              /* writeback dest */
-                          1,                       /* oword */
-                          relAddr,                 /* relative indexing? */
-                          addrReg,                 /* address register */
-                          16 * src->Index,         /* byte offset */
-                          SURF_INDEX_VERT_CONST_BUFFER
-                          );
-      }
    }
 
-   const_reg = c->current_const[argIndex].reg;
+   /* replicate lower four floats into upper half (to get XYZWXYZW) */
+   const_reg = stride(const_reg, 0, 4, 0);
+   const_reg.subnr = 0;
 
-   if (relAddr) {
-      /* merge the two Owords into the constant register */
-      /* const_reg[7..4] = const2_reg[7..4] */
-      brw_MOV(p,
-              suboffset(stride(const_reg, 0, 4, 1), 4),
-              suboffset(stride(const2_reg, 0, 4, 1), 4));
-      release_tmp(c, const2_reg);
-   }
-   else {
-      /* replicate lower four floats into upper half (to get XYZWXYZW) */
-      const_reg = stride(const_reg, 0, 4, 0);
-      const_reg.subnr = 0;
-   }
+   return const_reg;
+}
+
+static struct brw_reg
+get_reladdr_constant(struct brw_vs_compile *c,
+		     const struct prog_instruction *inst,
+		     GLuint argIndex)
+{
+   const struct prog_src_register *src = &inst->SrcReg[argIndex];
+   struct brw_compile *p = &c->func;
+   struct brw_reg const_reg = c->current_const[argIndex].reg;
+   struct brw_reg const2_reg;
+   struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];
+
+   assert(argIndex < 3);
+
+   /* Can't reuse a reladdr constant load. */
+   c->current_const[argIndex].index = -1;
+
+ #if 0
+   printf("  fetch const[a0.x+%d] for arg %d into reg %d\n",
+	  src->Index, argIndex, c->current_const[argIndex].reg.nr);
+#endif
+
+   /* fetch the first vec4 */
+   brw_dp_READ_4_vs(p,
+		    const_reg,                     /* writeback dest */
+		    0,                             /* oword */
+		    1,                             /* relative indexing? */
+		    addrReg,                       /* address register */
+		    16 * src->Index,               /* byte offset */
+		    SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
+		    );
+   /* second vec4 */
+   const2_reg = get_tmp(c);
+
+   /* use upper half of address reg for second read */
+   addrReg = stride(addrReg, 0, 4, 0);
+   addrReg.subnr = 16;
+
+   brw_dp_READ_4_vs(p,
+		    const2_reg,              /* writeback dest */
+		    1,                       /* oword */
+		    1,                       /* relative indexing? */
+		    addrReg,                 /* address register */
+		    16 * src->Index,         /* byte offset */
+		    SURF_INDEX_VERT_CONST_BUFFER
+		    );
+
+   /* merge the two Owords into the constant register */
+   /* const_reg[7..4] = const2_reg[7..4] */
+   brw_MOV(p,
+	   suboffset(stride(const_reg, 0, 4, 1), 4),
+	   suboffset(stride(const2_reg, 0, 4, 1), 4));
+   release_tmp(c, const2_reg);
 
    return const_reg;
 }
@@ -928,7 +993,13 @@ get_src_reg( struct brw_vs_compile *c,
    case PROGRAM_ENV_PARAM:
    case PROGRAM_LOCAL_PARAM:
       if (c->vp->use_const_buffer) {
-         return get_constant(c, inst, argIndex);
+	 if (!relAddr && c->constant_map[index] != -1) {
+	    assert(c->regs[PROGRAM_STATE_VAR][c->constant_map[index]].nr != 0);
+	    return c->regs[PROGRAM_STATE_VAR][c->constant_map[index]];
+	 } else if (relAddr)
+	    return get_reladdr_constant(c, inst, argIndex);
+	 else
+	    return get_constant(c, inst, argIndex);
       }
       else if (relAddr) {
          return deref(c, c->regs[PROGRAM_STATE_VAR][0], index);
@@ -1380,12 +1451,14 @@ void brw_vs_emit(struct brw_vs_compile *c )
 
    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
    brw_set_access_mode(p, BRW_ALIGN_16);
-   
-   /* Message registers can't be read, so copy the output into GRF register
-      if they are used in source registers */
+
    for (insn = 0; insn < nr_insns; insn++) {
        GLuint i;
        struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
+
+       /* Message registers can't be read, so copy the output into GRF
+	* register if they are used in source registers
+	*/
        for (i = 0; i < 3; i++) {
 	   struct prog_src_register *src = &inst->SrcReg[i];
 	   GLuint index = src->Index;
@@ -1393,12 +1466,23 @@ void brw_vs_emit(struct brw_vs_compile *c )
 	   if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
 	       c->output_regs[index].used_in_src = GL_TRUE;
        }
+
+       switch (inst->Opcode) {
+       case OPCODE_CAL:
+       case OPCODE_RET:
+	  c->needs_stack = GL_TRUE;
+	  break;
+       default:
+	  break;
+       }
    }
 
    /* Static register allocation
     */
    brw_vs_alloc_regs(c);
-   brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
+
+   if (c->needs_stack)
+      brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
 
    for (insn = 0; insn < nr_insns; insn++) {
 
diff --git a/src/mesa/drivers/dri/i965/brw_vs_state.c b/src/mesa/drivers/dri/i965/brw_vs_state.c
index 345ffa7ee1..fd9f2fee42 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_state.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_state.c
@@ -164,8 +164,7 @@ vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key)
    bo = brw_upload_cache(&brw->cache, BRW_VS_UNIT,
 			 key, sizeof(*key),
 			 &brw->vs.prog_bo, 1,
-			 &vs, sizeof(vs),
-			 NULL, NULL);
+			 &vs, sizeof(vs));
 
    /* Emit VS program relocation */
    dri_bo_emit_reloc(bo,
diff --git a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
index 3bc9840a97..3f6e16fcb0 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
@@ -168,8 +168,7 @@ brw_vs_get_binding_table(struct brw_context *brw)
       bind_bo = brw_upload_cache( &brw->surface_cache, BRW_SS_SURF_BIND,
 				  NULL, 0,
 				  brw->vs.surf_bo, BRW_VS_MAX_SURF,
-				  data, data_size,
-				  NULL, NULL);
+				  data, data_size);
 
       /* Emit binding table relocations to surface state */
       for (i = 0; i < BRW_VS_MAX_SURF; i++) {
diff --git a/src/mesa/drivers/dri/i965/brw_vtbl.c b/src/mesa/drivers/dri/i965/brw_vtbl.c
index 72749b3859..bb7a293812 100644
--- a/src/mesa/drivers/dri/i965/brw_vtbl.c
+++ b/src/mesa/drivers/dri/i965/brw_vtbl.c
@@ -172,12 +172,6 @@ static void brw_new_batch( struct intel_context *intel )
    }
 }
 
-
-static void brw_note_fence( struct intel_context *intel, GLuint fence )
-{
-   brw_context(&intel->ctx)->state.dirty.brw |= BRW_NEW_FENCE;
-}
-
 static void brw_invalidate_state( struct intel_context *intel, GLuint new_state )
 {
    /* nothing */
@@ -193,7 +187,6 @@ void brwInitVtbl( struct brw_context *brw )
    brw->intel.vtbl.update_texture_state = 0;
 
    brw->intel.vtbl.invalidate_state = brw_invalidate_state;
-   brw->intel.vtbl.note_fence = brw_note_fence;
    brw->intel.vtbl.new_batch = brw_new_batch;
    brw->intel.vtbl.finish_batch = brw_finish_batch;
    brw->intel.vtbl.destroy = brw_destroy_context;
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index 6895f64410..fb24379c90 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -199,12 +199,13 @@ static void do_wm_prog( struct brw_context *brw,
    program = brw_get_program(&c->func, &program_size);
 
    dri_bo_unreference(brw->wm.prog_bo);
-   brw->wm.prog_bo = brw_upload_cache( &brw->cache, BRW_WM_PROG,
-				       &c->key, sizeof(c->key),
-				       NULL, 0,
-				       program, program_size,
-				       &c->prog_data,
-				       &brw->wm.prog_data );
+   brw->wm.prog_bo = brw_upload_cache_with_auxdata(&brw->cache, BRW_WM_PROG,
+						   &c->key, sizeof(c->key),
+						   NULL, 0,
+						   program, program_size,
+						   &c->prog_data,
+						   sizeof(c->prog_data),
+						   &brw->wm.prog_data);
 }
 
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c b/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
index ad267a4e6a..87387b1e2d 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
@@ -326,8 +326,7 @@ static void upload_wm_samplers( struct brw_context *brw )
       brw->wm.sampler_bo = brw_upload_cache(&brw->cache, BRW_SAMPLER,
 					    &key, sizeof(key),
 					    brw->wm.sdc_bo, key.sampler_count,
-					    &sampler, sizeof(sampler),
-					    NULL, NULL);
+					    &sampler, sizeof(sampler));
 
       /* Emit SDC relocations */
       for (i = 0; i < BRW_MAX_TEX_UNIT; i++) {
diff --git a/src/mesa/drivers/dri/i965/brw_wm_state.c b/src/mesa/drivers/dri/i965/brw_wm_state.c
index d3373ea79e..a7f80db554 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_state.c
@@ -210,8 +210,7 @@ wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key,
    bo = brw_upload_cache(&brw->cache, BRW_WM_UNIT,
 			 key, sizeof(*key),
 			 reloc_bufs, 3,
-			 &wm, sizeof(wm),
-			 NULL, NULL);
+			 &wm, sizeof(wm));
 
    /* Emit WM program relocation */
    dri_bo_emit_reloc(bo,
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index f26cfabb7d..357c8c90de 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -256,8 +256,7 @@ brw_create_texture_surface( struct brw_context *brw,
    bo = brw_upload_cache(&brw->surface_cache, BRW_SS_SURFACE,
 			 key, sizeof(*key),
 			 &key->bo, key->bo ? 1 : 0,
-			 &surf, sizeof(surf),
-			 NULL, NULL);
+			 &surf, sizeof(surf));
 
    if (key->bo) {
       /* Emit relocation to surface contents */
@@ -351,8 +350,7 @@ brw_create_constant_surface( struct brw_context *brw,
    bo = brw_upload_cache(&brw->surface_cache, BRW_SS_SURFACE,
 			 key, sizeof(*key),
 			 &key->bo, key->bo ? 1 : 0,
-			 &surf, sizeof(surf),
-			 NULL, NULL);
+			 &surf, sizeof(surf));
 
    if (key->bo) {
       /* Emit relocation to surface contents.  Section 5.1.1 of the gen4
@@ -653,8 +651,7 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
                                                BRW_SS_SURFACE,
                                                &key, sizeof(key),
 					       &region_bo, 1,
-					       &surf, sizeof(surf),
-					       NULL, NULL);
+					       &surf, sizeof(surf));
       if (region_bo != NULL) {
 	 /* We might sample from it, and we might render to it, so flag
 	  * them both.  We might be able to figure out from other state
@@ -701,8 +698,7 @@ brw_wm_get_binding_table(struct brw_context *brw)
       bind_bo = brw_upload_cache( &brw->surface_cache, BRW_SS_SURF_BIND,
 				  NULL, 0,
 				  brw->wm.surf_bo, brw->wm.nr_surfaces,
-				  data, data_size,
-				  NULL, NULL);
+				  data, data_size);
 
       /* Emit binding table relocations to surface state */
       for (i = 0; i < BRW_WM_MAX_SURF; i++) {
diff --git a/src/mesa/drivers/dri/intel/intel_context.c b/src/mesa/drivers/dri/intel/intel_context.c
index 3f6634c65a..d52fe2eef2 100644
--- a/src/mesa/drivers/dri/intel/intel_context.c
+++ b/src/mesa/drivers/dri/intel/intel_context.c
@@ -506,27 +506,7 @@ intelFlush(GLcontext * ctx)
 static void
 intel_glFlush(GLcontext *ctx)
 {
-   struct intel_context *intel = intel_context(ctx);
-
    intel_flush(ctx, GL_TRUE);
-
-   /* We're using glFlush as an indicator that a frame is done, which is
-    * what DRI2 does before calling SwapBuffers (and means we should catch
-    * people doing front-buffer rendering, as well)..
-    *
-    * Wait for the swapbuffers before the one we just emitted, so we don't
-    * get too many swaps outstanding for apps that are GPU-heavy but not
-    * CPU-heavy.
-    *
-    * Unfortunately, we don't have a handle to the batch containing the swap,
-    * and getting our hands on that doesn't seem worth it, so we just us the
-    * first batch we emitted after the last swap.
-    */
-   if (intel->first_post_swapbuffers_batch != NULL) {
-      drm_intel_bo_wait_rendering(intel->first_post_swapbuffers_batch);
-      drm_intel_bo_unreference(intel->first_post_swapbuffers_batch);
-      intel->first_post_swapbuffers_batch = NULL;
-   }
 }
 
 void
diff --git a/src/mesa/drivers/dri/intel/intel_context.h b/src/mesa/drivers/dri/intel/intel_context.h
index 07207bfbec..6ba281cc14 100644
--- a/src/mesa/drivers/dri/intel/intel_context.h
+++ b/src/mesa/drivers/dri/intel/intel_context.h
@@ -107,7 +107,6 @@ struct intel_context
       void (*finish_batch) (struct intel_context * intel);
       void (*new_batch) (struct intel_context * intel);
       void (*emit_invarient_state) (struct intel_context * intel);
-      void (*note_fence) (struct intel_context *intel, GLuint fence);
       void (*update_texture_state) (struct intel_context * intel);
 
       void (*render_start) (struct intel_context * intel);
diff --git a/src/mesa/drivers/dri/intel/intel_screen.c b/src/mesa/drivers/dri/intel/intel_screen.c
index e240957197..6c2cb3b57e 100644
--- a/src/mesa/drivers/dri/intel/intel_screen.c
+++ b/src/mesa/drivers/dri/intel/intel_screen.c
@@ -128,8 +128,29 @@ intelDRI2Flush(__DRIdrawable *drawable)
 static void
 intelDRI2FlushInvalidate(__DRIdrawable *drawable)
 {
+   struct intel_context *intel = drawable->driContextPriv->driverPrivate;
+
    intelDRI2Flush(drawable);
    drawable->validBuffers = GL_FALSE;
+
+   /* We're using FlushInvalidate as an indicator that a frame is
+    * done.  It's only called immediately after SwapBuffers, so it
+    * won't affect front-buffer rendering or applications explicitly
+    * managing swap regions using MESA_copy_buffer.
+    *
+    * Wait for the swapbuffers before the one we just emitted, so we don't
+    * get too many swaps outstanding for apps that are GPU-heavy but not
+    * CPU-heavy.
+    *
+    * Unfortunately, we don't have a handle to the batch containing the swap,
+    * and getting our hands on that doesn't seem worth it, so we just use the
+    * first batch we emitted after the last swap.
+    */
+   if (intel->first_post_swapbuffers_batch != NULL) {
+      drm_intel_bo_wait_rendering(intel->first_post_swapbuffers_batch);
+      drm_intel_bo_unreference(intel->first_post_swapbuffers_batch);
+      intel->first_post_swapbuffers_batch = NULL;
+   }
 }
 
 static const struct __DRI2flushExtensionRec intelFlushExtension = {
diff --git a/src/mesa/drivers/dri/r200/Makefile b/src/mesa/drivers/dri/r200/Makefile
index 8212dc1203..ca33faff87 100644
--- a/src/mesa/drivers/dri/r200/Makefile
+++ b/src/mesa/drivers/dri/r200/Makefile
@@ -29,8 +29,8 @@ RADEON_COMMON_SOURCES = \
 	radeon_mipmap_tree.c \
 	radeon_queryobj.c \
 	radeon_span.c \
-	radeon_texture.c
-
+	radeon_texture.c \
+	radeon_tex_copy.c
 
 DRIVER_SOURCES = r200_context.c \
 		 r200_ioctl.c \
@@ -46,6 +46,7 @@ DRIVER_SOURCES = r200_context.c \
 		 r200_sanity.c \
 		 r200_fragshader.c \
 		 r200_vertprog.c \
+		 r200_blit.c \
 		 radeon_screen.c \
 		 $(EGL_SOURCES) \
 		 $(RADEON_COMMON_SOURCES) \
diff --git a/src/mesa/drivers/dri/r200/r200_blit.c b/src/mesa/drivers/dri/r200/r200_blit.c
new file mode 100644
index 0000000000..f899f7efdc
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/r200_blit.c
@@ -0,0 +1,403 @@
+/*
+ * Copyright (C) 2009 Maciej Cencora <m.cencora@gmail.com>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "radeon_common.h"
+#include "r200_context.h"
+#include "r200_blit.h"
+
+static inline uint32_t cmdpacket0(struct radeon_screen *rscrn,
+                                  int reg, int count)
+{
+    if (count)
+	    return CP_PACKET0(reg, count - 1);
+    return CP_PACKET2;
+}
+
+/* common formats supported as both textures and render targets */
+static unsigned is_blit_supported(gl_format mesa_format)
+{
+    /* XXX others?  BE/LE? */
+    switch (mesa_format) {
+    case MESA_FORMAT_ARGB8888:
+    case MESA_FORMAT_XRGB8888:
+    case MESA_FORMAT_RGB565:
+    case MESA_FORMAT_ARGB4444:
+    case MESA_FORMAT_ARGB1555:
+    case MESA_FORMAT_A8:
+	    break;
+    default:
+	    return 0;
+    }
+
+    /* ??? */
+    if (_mesa_get_format_bits(mesa_format, GL_DEPTH_BITS) > 0)
+	    return 0;
+
+    return 1;
+}
+
+static inline void emit_vtx_state(struct r200_context *r200)
+{
+    BATCH_LOCALS(&r200->radeon);
+
+    BEGIN_BATCH(14);
+    if (r200->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
+	    OUT_BATCH_REGVAL(R200_SE_VAP_CNTL_STATUS, 0);
+    } else {
+	    OUT_BATCH_REGVAL(R200_SE_VAP_CNTL_STATUS, RADEON_TCL_BYPASS);
+    }
+    OUT_BATCH_REGVAL(R200_SE_VAP_CNTL, (R200_VAP_FORCE_W_TO_ONE |
+					(9 << R200_VAP_VF_MAX_VTX_NUM__SHIFT)));
+    OUT_BATCH_REGVAL(R200_SE_VTX_STATE_CNTL, 0);
+    OUT_BATCH_REGVAL(R200_SE_VTE_CNTL, 0);
+    OUT_BATCH_REGVAL(R200_SE_VTX_FMT_0, R200_VTX_XY);
+    OUT_BATCH_REGVAL(R200_SE_VTX_FMT_1, (2 << R200_VTX_TEX0_COMP_CNT_SHIFT));
+    OUT_BATCH_REGVAL(RADEON_SE_CNTL, (RADEON_DIFFUSE_SHADE_GOURAUD |
+				      RADEON_BFACE_SOLID |
+				      RADEON_FFACE_SOLID |
+				      RADEON_VTX_PIX_CENTER_OGL |
+				      RADEON_ROUND_MODE_ROUND |
+				      RADEON_ROUND_PREC_4TH_PIX));
+    END_BATCH();
+}
+
+static void inline emit_tx_setup(struct r200_context *r200,
+				 gl_format mesa_format,
+				 struct radeon_bo *bo,
+				 intptr_t offset,
+				 unsigned width,
+				 unsigned height,
+				 unsigned pitch)
+{
+    uint32_t txformat = R200_TXFORMAT_NON_POWER2;
+    BATCH_LOCALS(&r200->radeon);
+
+    assert(width <= 2047);
+    assert(height <= 2047);
+    assert(offset % 32 == 0);
+
+    /* XXX others?  BE/LE? */
+    switch (mesa_format) {
+    case MESA_FORMAT_ARGB8888:
+	    txformat |= R200_TXFORMAT_ARGB8888 | R200_TXFORMAT_ALPHA_IN_MAP;
+	    break;
+    case MESA_FORMAT_XRGB8888:
+	    txformat |= R200_TXFORMAT_ARGB8888;
+	    break;
+    case MESA_FORMAT_RGB565:
+	    txformat |= R200_TXFORMAT_RGB565;
+	    break;
+    case MESA_FORMAT_ARGB4444:
+	    txformat |= R200_TXFORMAT_ARGB4444 | R200_TXFORMAT_ALPHA_IN_MAP;
+	    break;
+    case MESA_FORMAT_ARGB1555:
+	    txformat |= R200_TXFORMAT_ARGB1555 | R200_TXFORMAT_ALPHA_IN_MAP;
+	    break;
+    case MESA_FORMAT_A8:
+	    txformat |= R200_TXFORMAT_I8 | R200_TXFORMAT_ALPHA_IN_MAP;
+	    break;
+    default:
+	    break;
+    }
+
+    BEGIN_BATCH(28);
+    OUT_BATCH_REGVAL(RADEON_PP_CNTL, RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE);
+    OUT_BATCH_REGVAL(R200_PP_CNTL_X, 0);
+    OUT_BATCH_REGVAL(R200_PP_TXMULTI_CTL_0, 0);
+    OUT_BATCH_REGVAL(R200_PP_TXCBLEND_0, (R200_TXC_ARG_A_ZERO |
+					  R200_TXC_ARG_B_ZERO |
+					  R200_TXC_ARG_C_R0_COLOR |
+					  R200_TXC_OP_MADD));
+    OUT_BATCH_REGVAL(R200_PP_TXCBLEND2_0, R200_TXC_CLAMP_0_1 | R200_TXC_OUTPUT_REG_R0);
+    OUT_BATCH_REGVAL(R200_PP_TXABLEND_0, (R200_TXA_ARG_A_ZERO |
+					  R200_TXA_ARG_B_ZERO |
+					  R200_TXA_ARG_C_R0_ALPHA |
+					  R200_TXA_OP_MADD));
+    OUT_BATCH_REGVAL(R200_PP_TXABLEND2_0, R200_TXA_CLAMP_0_1 | R200_TXA_OUTPUT_REG_R0);
+    OUT_BATCH_REGVAL(R200_PP_TXFILTER_0, (R200_CLAMP_S_CLAMP_LAST |
+					  R200_CLAMP_T_CLAMP_LAST |
+					  R200_MAG_FILTER_NEAREST |
+					  R200_MIN_FILTER_NEAREST));
+    OUT_BATCH_REGVAL(R200_PP_TXFORMAT_0, txformat);
+    OUT_BATCH_REGVAL(R200_PP_TXFORMAT_X_0, 0);
+    OUT_BATCH_REGVAL(R200_PP_TXSIZE_0, ((width - 1) |
+					((height - 1) << RADEON_TEX_VSIZE_SHIFT)));
+    OUT_BATCH_REGVAL(R200_PP_TXPITCH_0, pitch * _mesa_get_format_bytes(mesa_format) - 32);
+
+    OUT_BATCH_REGSEQ(R200_PP_TXOFFSET_0, 1);
+    OUT_BATCH_RELOC(0, bo, 0, RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
+
+    END_BATCH();
+}
+
+static inline void emit_cb_setup(struct r200_context *r200,
+				 struct radeon_bo *bo,
+				 intptr_t offset,
+				 gl_format mesa_format,
+				 unsigned pitch,
+				 unsigned width,
+				 unsigned height)
+{
+    uint32_t dst_pitch = pitch;
+    uint32_t dst_format = 0;
+    BATCH_LOCALS(&r200->radeon);
+
+    /* XXX others?  BE/LE? */
+    switch (mesa_format) {
+    case MESA_FORMAT_ARGB8888:
+    case MESA_FORMAT_XRGB8888:
+	    dst_format = RADEON_COLOR_FORMAT_ARGB8888;
+	    break;
+    case MESA_FORMAT_RGB565:
+	    dst_format = RADEON_COLOR_FORMAT_RGB565;
+	    break;
+    case MESA_FORMAT_ARGB4444:
+	    dst_format = RADEON_COLOR_FORMAT_ARGB4444;
+	    break;
+    case MESA_FORMAT_ARGB1555:
+	    dst_format = RADEON_COLOR_FORMAT_ARGB1555;
+	    break;
+    case MESA_FORMAT_A8:
+	    dst_format = RADEON_COLOR_FORMAT_RGB8;
+	    break;
+    default:
+	    break;
+    }
+
+    BEGIN_BATCH_NO_AUTOSTATE(22);
+    OUT_BATCH_REGVAL(R200_RE_AUX_SCISSOR_CNTL, 0);
+    OUT_BATCH_REGVAL(R200_RE_CNTL, 0);
+    OUT_BATCH_REGVAL(RADEON_RE_TOP_LEFT, 0);
+    OUT_BATCH_REGVAL(RADEON_RE_WIDTH_HEIGHT, ((width << RADEON_RE_WIDTH_SHIFT) |
+					      (height << RADEON_RE_HEIGHT_SHIFT)));
+    OUT_BATCH_REGVAL(RADEON_RB3D_PLANEMASK, 0xffffffff);
+    OUT_BATCH_REGVAL(RADEON_RB3D_BLENDCNTL, RADEON_SRC_BLEND_GL_ONE | RADEON_DST_BLEND_GL_ZERO);
+    OUT_BATCH_REGVAL(RADEON_RB3D_CNTL, dst_format);
+
+    OUT_BATCH_REGSEQ(RADEON_RB3D_COLOROFFSET, 1);
+    OUT_BATCH_RELOC(0, bo, 0, 0, RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0);
+    OUT_BATCH_REGSEQ(RADEON_RB3D_COLORPITCH, 1);
+    OUT_BATCH_RELOC(dst_pitch, bo, dst_pitch, 0, RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0);
+
+    END_BATCH();
+}
+
+static GLboolean validate_buffers(struct r200_context *r200,
+                                  struct radeon_bo *src_bo,
+                                  struct radeon_bo *dst_bo)
+{
+    int ret;
+    radeon_cs_space_add_persistent_bo(r200->radeon.cmdbuf.cs,
+                                      src_bo, RADEON_GEM_DOMAIN_VRAM, 0);
+
+    radeon_cs_space_add_persistent_bo(r200->radeon.cmdbuf.cs,
+                                      dst_bo, 0, RADEON_GEM_DOMAIN_VRAM);
+
+    ret = radeon_cs_space_check_with_bo(r200->radeon.cmdbuf.cs,
+                                        first_elem(&r200->radeon.dma.reserved)->bo,
+                                        RADEON_GEM_DOMAIN_GTT, 0);
+    if (ret)
+        return GL_FALSE;
+
+    return GL_TRUE;
+}
+
+/**
+ * Calculate texcoords for given image region.
+ * Output values are [minx, maxx, miny, maxy]
+ */
+static inline void calc_tex_coords(float img_width, float img_height,
+				   float x, float y,
+				   float reg_width, float reg_height,
+				   unsigned flip_y, float *buf)
+{
+    buf[0] = x / img_width;
+    buf[1] = buf[0] + reg_width / img_width;
+    buf[2] = y / img_height;
+    buf[3] = buf[2] + reg_height / img_height;
+    if (flip_y)
+    {
+        buf[2] = 1.0 - buf[1];
+        buf[3] = 1.0 - buf[3];
+    }
+}
+
+static inline void emit_draw_packet(struct r200_context *r200,
+				    unsigned src_width, unsigned src_height,
+				    unsigned src_x_offset, unsigned src_y_offset,
+				    unsigned dst_x_offset, unsigned dst_y_offset,
+				    unsigned reg_width, unsigned reg_height,
+				    unsigned flip_y)
+{
+    float texcoords[4];
+    float verts[12];
+    BATCH_LOCALS(&r200->radeon);
+
+    calc_tex_coords(src_width, src_height,
+                    src_x_offset, src_y_offset,
+                    reg_width, reg_height,
+                    flip_y, texcoords);
+
+    verts[0] = dst_x_offset;
+    verts[1] = dst_y_offset + reg_height;
+    verts[2] = texcoords[0];
+    verts[3] = texcoords[3];
+
+    verts[4] = dst_x_offset + reg_width;
+    verts[5] = dst_y_offset + reg_height;
+    verts[6] = texcoords[1];
+    verts[7] = texcoords[3];
+
+    verts[8] = dst_x_offset + reg_width;
+    verts[9] = dst_y_offset;
+    verts[10] = texcoords[1];
+    verts[11] = texcoords[2];
+
+    BEGIN_BATCH(14);
+    OUT_BATCH(R200_CP_CMD_3D_DRAW_IMMD_2 | (12 << 16));
+    OUT_BATCH(RADEON_CP_VC_CNTL_PRIM_WALK_RING |
+	      RADEON_CP_VC_CNTL_PRIM_TYPE_RECT_LIST |
+              (3 << 16));
+    OUT_BATCH_TABLE(verts, 12);
+    END_BATCH();
+}
+
+/**
+ * Copy a region of [@a width x @a height] pixels from source buffer
+ * to destination buffer.
+ * @param[in] r200 r200 context
+ * @param[in] src_bo source radeon buffer object
+ * @param[in] src_offset offset of the source image in the @a src_bo
+ * @param[in] src_mesaformat source image format
+ * @param[in] src_pitch aligned source image width
+ * @param[in] src_width source image width
+ * @param[in] src_height source image height
+ * @param[in] src_x_offset x offset in the source image
+ * @param[in] src_y_offset y offset in the source image
+ * @param[in] dst_bo destination radeon buffer object
+ * @param[in] dst_offset offset of the destination image in the @a dst_bo
+ * @param[in] dst_mesaformat destination image format
+ * @param[in] dst_pitch aligned destination image width
+ * @param[in] dst_width destination image width
+ * @param[in] dst_height destination image height
+ * @param[in] dst_x_offset x offset in the destination image
+ * @param[in] dst_y_offset y offset in the destination image
+ * @param[in] width region width
+ * @param[in] height region height
+ * @param[in] flip_y set if y coords of the source image need to be flipped
+ */
+unsigned r200_blit(GLcontext *ctx,
+                   struct radeon_bo *src_bo,
+                   intptr_t src_offset,
+                   gl_format src_mesaformat,
+                   unsigned src_pitch,
+                   unsigned src_width,
+                   unsigned src_height,
+                   unsigned src_x_offset,
+                   unsigned src_y_offset,
+                   struct radeon_bo *dst_bo,
+                   intptr_t dst_offset,
+                   gl_format dst_mesaformat,
+                   unsigned dst_pitch,
+                   unsigned dst_width,
+                   unsigned dst_height,
+                   unsigned dst_x_offset,
+                   unsigned dst_y_offset,
+                   unsigned reg_width,
+                   unsigned reg_height,
+                   unsigned flip_y)
+{
+    struct r200_context *r200 = R200_CONTEXT(ctx);
+
+    if (!is_blit_supported(dst_mesaformat))
+        return GL_FALSE;
+
+    /* Make sure that colorbuffer has even width - hw limitation */
+    if (dst_pitch % 2 > 0)
+        ++dst_pitch;
+
+    /* Rendering to small buffer doesn't work.
+     * Looks like a hw limitation.
+     */
+    if (dst_pitch < 32)
+        return GL_FALSE;
+
+    /* Need to clamp the region size to make sure
+     * we don't read outside of the source buffer
+     * or write outside of the destination buffer.
+     */
+    if (reg_width + src_x_offset > src_width)
+        reg_width = src_width - src_x_offset;
+    if (reg_height + src_y_offset > src_height)
+        reg_height = src_height - src_y_offset;
+    if (reg_width + dst_x_offset > dst_width)
+        reg_width = dst_width - dst_x_offset;
+    if (reg_height + dst_y_offset > dst_height)
+        reg_height = dst_height - dst_y_offset;
+
+    if (src_bo == dst_bo) {
+        return GL_FALSE;
+    }
+
+    if (0) {
+        fprintf(stderr, "src: size [%d x %d], pitch %d, "
+                "offset [%d x %d], format %s, bo %p\n",
+                src_width, src_height, src_pitch,
+                src_x_offset, src_y_offset,
+                _mesa_get_format_name(src_mesaformat),
+                src_bo);
+        fprintf(stderr, "dst: pitch %d, offset[%d x %d], format %s, bo %p\n",
+                dst_pitch, dst_x_offset, dst_y_offset,
+                _mesa_get_format_name(dst_mesaformat), dst_bo);
+        fprintf(stderr, "region: %d x %d\n", reg_width, reg_height);
+    }
+
+    /* Flush is needed to make sure that source buffer has correct data */
+    radeonFlush(r200->radeon.glCtx);
+
+    rcommonEnsureCmdBufSpace(&r200->radeon, 78, __FUNCTION__);
+
+    if (!validate_buffers(r200, src_bo, dst_bo))
+        return GL_FALSE;
+
+    /* 14 */
+    emit_vtx_state(r200);
+    /* 28 */
+    emit_tx_setup(r200, src_mesaformat, src_bo, src_offset, src_width, src_height, src_pitch);
+    /* 22 */
+    emit_cb_setup(r200, dst_bo, dst_offset, dst_mesaformat, dst_pitch, dst_width, dst_height);
+    /* 14 */
+    emit_draw_packet(r200, src_width, src_height,
+                     src_x_offset, src_y_offset,
+                     dst_x_offset, dst_y_offset,
+                     reg_width, reg_height,
+                     flip_y);
+
+    radeonFlush(ctx);
+
+    return GL_TRUE;
+}
diff --git a/src/mesa/drivers/dri/r200/r200_blit.h b/src/mesa/drivers/dri/r200/r200_blit.h
new file mode 100644
index 0000000000..38487266ae
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/r200_blit.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (C) 2009 Maciej Cencora <m.cencora@gmail.com>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef R200_BLIT_H
+#define R200_BLIT_H
+
+void r200_blit_init(struct r200_context *r200);
+
+unsigned r200_blit(GLcontext *ctx,
+                   struct radeon_bo *src_bo,
+                   intptr_t src_offset,
+                   gl_format src_mesaformat,
+                   unsigned src_pitch,
+                   unsigned src_width,
+                   unsigned src_height,
+                   unsigned src_x_offset,
+                   unsigned src_y_offset,
+                   struct radeon_bo *dst_bo,
+                   intptr_t dst_offset,
+                   gl_format dst_mesaformat,
+                   unsigned dst_pitch,
+                   unsigned dst_width,
+                   unsigned dst_height,
+                   unsigned dst_x_offset,
+                   unsigned dst_y_offset,
+                   unsigned width,
+                   unsigned height,
+                   unsigned flip_y);
+
+#endif // R200_BLIT_H
diff --git a/src/mesa/drivers/dri/r200/r200_context.c b/src/mesa/drivers/dri/r200/r200_context.c
index f34e319222..3d6d0f5ec0 100644
--- a/src/mesa/drivers/dri/r200/r200_context.c
+++ b/src/mesa/drivers/dri/r200/r200_context.c
@@ -61,6 +61,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r200_maos.h"
 #include "r200_vertprog.h"
 #include "radeon_queryobj.h"
+#include "r200_blit.h"
 
 #include "radeon_span.h"
 
@@ -268,6 +269,7 @@ static void r200_init_vtbl(radeonContextPtr radeon)
    radeon->vtbl.fallback = r200Fallback;
    radeon->vtbl.update_scissor = r200_vtbl_update_scissor;
    radeon->vtbl.emit_query_finish = r200_emit_query_finish;
+   radeon->vtbl.blit = r200_blit;
 }
 
 
@@ -294,6 +296,7 @@ GLboolean r200CreateContext( const __GLcontextModes *glVisual,
    if ( !rmesa )
       return GL_FALSE;
 
+   rmesa->radeon.radeonScreen = screen;
    r200_init_vtbl(&rmesa->radeon);
    /* init exp fog table data */
    r200InitStaticFogData();
@@ -326,10 +329,14 @@ GLboolean r200CreateContext( const __GLcontextModes *glVisual,
    r200InitDriverFuncs(&functions);
    r200InitIoctlFuncs(&functions);
    r200InitStateFuncs(&functions);
-   r200InitTextureFuncs(&functions);
+   r200InitTextureFuncs(&rmesa->radeon, &functions);
    r200InitShaderFuncs(&functions);
    radeonInitQueryObjFunctions(&functions);
 
+   if (rmesa->radeon.radeonScreen->kernel_mm) {
+	   r200_init_texcopy_functions(&functions);
+   }
+
    if (!radeonInitContext(&rmesa->radeon, &functions,
 			  glVisual, driContextPriv,
 			  sharedContextPrivate)) {
diff --git a/src/mesa/drivers/dri/r200/r200_context.h b/src/mesa/drivers/dri/r200/r200_context.h
index 17e4d8962e..a9dce310ae 100644
--- a/src/mesa/drivers/dri/r200/r200_context.h
+++ b/src/mesa/drivers/dri/r200/r200_context.h
@@ -645,6 +645,8 @@ extern GLboolean r200MakeCurrent( __DRIcontext *driContextPriv,
 				  __DRIdrawable *driReadPriv );
 extern GLboolean r200UnbindContext( __DRIcontext *driContextPriv );
 
+extern void r200_init_texcopy_functions(struct dd_function_table *table);
+
 /* ================================================================
  * Debugging:
  */
diff --git a/src/mesa/drivers/dri/r200/r200_tex.c b/src/mesa/drivers/dri/r200/r200_tex.c
index 5b87ba6ccd..0916df6476 100644
--- a/src/mesa/drivers/dri/r200/r200_tex.c
+++ b/src/mesa/drivers/dri/r200/r200_tex.c
@@ -477,7 +477,7 @@ static struct gl_texture_object *r200NewTextureObject(GLcontext * ctx,
 
 
 
-void r200InitTextureFuncs( struct dd_function_table *functions )
+void r200InitTextureFuncs( radeonContextPtr radeon, struct dd_function_table *functions )
 {
    /* Note: we only plug in the functions we implement in the driver
     * since _mesa_init_driver_functions() was already called.
@@ -511,6 +511,11 @@ void r200InitTextureFuncs( struct dd_function_table *functions )
    functions->CompressedTexImage2D	= radeonCompressedTexImage2D;
    functions->CompressedTexSubImage2D	= radeonCompressedTexSubImage2D;
 
+   if (radeon->radeonScreen->kernel_mm) {
+      functions->CopyTexImage2D = radeonCopyTexImage2D;
+      functions->CopyTexSubImage2D = radeonCopyTexSubImage2D;
+   }
+
    functions->GenerateMipmap = radeonGenerateMipmap;
 
    functions->NewTextureImage = radeonNewTextureImage;
diff --git a/src/mesa/drivers/dri/r200/r200_tex.h b/src/mesa/drivers/dri/r200/r200_tex.h
index e122de6e5e..1a1e7038df 100644
--- a/src/mesa/drivers/dri/r200/r200_tex.h
+++ b/src/mesa/drivers/dri/r200/r200_tex.h
@@ -48,7 +48,7 @@ extern int r200UploadTexImages( r200ContextPtr rmesa, radeonTexObjPtr t, GLuint
 
 extern void r200DestroyTexObj( r200ContextPtr rmesa, radeonTexObjPtr t );
 
-extern void r200InitTextureFuncs( struct dd_function_table *functions );
+extern void r200InitTextureFuncs( radeonContextPtr radeon, struct dd_function_table *functions );
 
 extern void r200UpdateFragmentShader( GLcontext *ctx );
 
diff --git a/src/mesa/drivers/dri/r200/radeon_tex_copy.c b/src/mesa/drivers/dri/r200/radeon_tex_copy.c
new file mode 120000
index 0000000000..dfa5ba34e6
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/radeon_tex_copy.c
@@ -0,0 +1 @@
+../radeon/radeon_tex_copy.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/Makefile b/src/mesa/drivers/dri/r300/Makefile
index be005bd164..0d0fbcc408 100644
--- a/src/mesa/drivers/dri/r300/Makefile
+++ b/src/mesa/drivers/dri/r300/Makefile
@@ -39,7 +39,8 @@ RADEON_COMMON_SOURCES = \
 	radeon_mipmap_tree.c \
 	radeon_span.c \
 	radeon_queryobj.c \
-	radeon_texture.c
+	radeon_texture.c \
+	radeon_tex_copy.c
 
 DRIVER_SOURCES = \
 		 radeon_screen.c \
@@ -50,7 +51,6 @@ DRIVER_SOURCES = \
 		 r300_state.c \
 		 r300_render.c \
 		 r300_tex.c \
-		 r300_texcopy.c \
 		 r300_texstate.c \
 		 r300_vertprog.c \
 		 r300_fragprog_common.c \
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_compiler.h b/src/mesa/drivers/dri/r300/compiler/radeon_compiler.h
index 731adc1af2..f27f858652 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_compiler.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_compiler.h
@@ -23,6 +23,8 @@
 #ifndef RADEON_COMPILER_H
 #define RADEON_COMPILER_H
 
+#include "../../../../main/compiler.h"
+
 #include "memory_pool.h"
 #include "radeon_code.h"
 #include "radeon_program.h"
diff --git a/src/mesa/drivers/dri/r300/r300_blit.c b/src/mesa/drivers/dri/r300/r300_blit.c
index 2eec27e900..e24c7955d4 100644
--- a/src/mesa/drivers/dri/r300/r300_blit.c
+++ b/src/mesa/drivers/dri/r300/r300_blit.c
@@ -150,8 +150,8 @@ static void r300_emit_tx_setup(struct r300_context *r300,
                      (R300_TX_CLAMP_TO_EDGE  << R300_TX_WRAP_T_SHIFT) |
                      (R300_TX_CLAMP_TO_EDGE  << R300_TX_WRAP_R_SHIFT) |
                      R300_TX_MIN_FILTER_MIP_NONE |
-                     R300_TX_MIN_FILTER_LINEAR |
-                     R300_TX_MAG_FILTER_LINEAR |
+                     R300_TX_MIN_FILTER_NEAREST |
+                     R300_TX_MAG_FILTER_NEAREST |
                      (0 << 28));
     OUT_BATCH_REGVAL(R300_TX_FILTER1_0, 0);
     OUT_BATCH_REGVAL(R300_TX_SIZE_0,
@@ -403,9 +403,8 @@ static void calc_tex_coords(float img_width, float img_height,
     buf[3] = buf[2] + reg_height / img_height;
     if (flip_y)
     {
-        float tmp = buf[2];
-        buf[2] = 1.0 - buf[3];
-        buf[3] = 1.0 - tmp;
+        buf[2] = 1.0 - buf[2];
+        buf[3] = 1.0 - buf[3];
     }
 }
 
@@ -424,13 +423,13 @@ static void emit_draw_packet(struct r300_context *r300,
                     flip_y, texcoords);
 
     float verts[] = { dst_x_offset, dst_y_offset,
-                      texcoords[0], texcoords[3],
-                      dst_x_offset, dst_y_offset + reg_height,
                       texcoords[0], texcoords[2],
+                      dst_x_offset, dst_y_offset + reg_height,
+                      texcoords[0], texcoords[3],
                       dst_x_offset + reg_width, dst_y_offset + reg_height,
-                      texcoords[1], texcoords[2],
+                      texcoords[1], texcoords[3],
                       dst_x_offset + reg_width, dst_y_offset,
-                      texcoords[1], texcoords[3] };
+                      texcoords[1], texcoords[2] };
 
     BATCH_LOCALS(&r300->radeon);
 
@@ -495,6 +494,27 @@ static void emit_cb_setup(struct r300_context *r300,
     END_BATCH();
 }
 
+static unsigned is_blit_supported(gl_format dst_format)
+{
+    switch (dst_format) {
+        case MESA_FORMAT_RGB565:
+        case MESA_FORMAT_ARGB1555:
+        case MESA_FORMAT_RGBA8888:
+        case MESA_FORMAT_RGBA8888_REV:
+        case MESA_FORMAT_ARGB8888:
+        case MESA_FORMAT_ARGB8888_REV:
+        case MESA_FORMAT_XRGB8888:
+            break;
+        default:
+            return 0;
+    }
+
+    if (_mesa_get_format_bits(dst_format, GL_DEPTH_BITS) > 0)
+        return 0;
+
+    return 1;
+}
+
 /**
  * Copy a region of [@a width x @a height] pixels from source buffer
  * to destination buffer.
@@ -519,29 +539,31 @@ static void emit_cb_setup(struct r300_context *r300,
  * @param[in] height region height
  * @param[in] flip_y set if y coords of the source image need to be flipped
  */
-GLboolean r300_blit(struct r300_context *r300,
-                    struct radeon_bo *src_bo,
-                    intptr_t src_offset,
-                    gl_format src_mesaformat,
-                    unsigned src_pitch,
-                    unsigned src_width,
-                    unsigned src_height,
-                    unsigned src_x_offset,
-                    unsigned src_y_offset,
-                    struct radeon_bo *dst_bo,
-                    intptr_t dst_offset,
-                    gl_format dst_mesaformat,
-                    unsigned dst_pitch,
-                    unsigned dst_width,
-                    unsigned dst_height,
-                    unsigned dst_x_offset,
-                    unsigned dst_y_offset,
-                    unsigned reg_width,
-                    unsigned reg_height,
-                    unsigned flip_y)
+unsigned r300_blit(GLcontext *ctx,
+                   struct radeon_bo *src_bo,
+                   intptr_t src_offset,
+                   gl_format src_mesaformat,
+                   unsigned src_pitch,
+                   unsigned src_width,
+                   unsigned src_height,
+                   unsigned src_x_offset,
+                   unsigned src_y_offset,
+                   struct radeon_bo *dst_bo,
+                   intptr_t dst_offset,
+                   gl_format dst_mesaformat,
+                   unsigned dst_pitch,
+                   unsigned dst_width,
+                   unsigned dst_height,
+                   unsigned dst_x_offset,
+                   unsigned dst_y_offset,
+                   unsigned reg_width,
+                   unsigned reg_height,
+                   unsigned flip_y)
 {
-    if (_mesa_get_format_bits(src_mesaformat, GL_DEPTH_BITS) > 0)
-        return GL_FALSE;
+    r300ContextPtr r300 = R300_CONTEXT(ctx);
+
+    if (!is_blit_supported(dst_mesaformat))
+        return 0;
 
     /* Make sure that colorbuffer has even width - hw limitation */
     if (dst_pitch % 2 > 0)
@@ -551,7 +573,7 @@ GLboolean r300_blit(struct r300_context *r300,
      * Looks like a hw limitation.
      */
     if (dst_pitch < 32)
-        return GL_FALSE;
+        return 0;
 
     /* Need to clamp the region size to make sure
      * we don't read outside of the source buffer
@@ -567,6 +589,10 @@ GLboolean r300_blit(struct r300_context *r300,
         reg_height = dst_height - dst_y_offset;
 
     if (src_bo == dst_bo) {
+        return 0;
+    }
+
+    if (src_offset % 32 || dst_offset % 32) {
         return GL_FALSE;
     }
 
@@ -587,7 +613,7 @@ GLboolean r300_blit(struct r300_context *r300,
     radeonFlush(r300->radeon.glCtx);
 
     if (!validate_buffers(r300, src_bo, dst_bo))
-        return GL_FALSE;
+        return 0;
 
     rcommonEnsureCmdBufSpace(&r300->radeon, 200, __FUNCTION__);
 
@@ -618,5 +644,5 @@ GLboolean r300_blit(struct r300_context *r300,
 
     radeonFlush(r300->radeon.glCtx);
 
-    return GL_TRUE;
-}
-\ No newline at end of file
+    return 1;
+}
diff --git a/src/mesa/drivers/dri/r300/r300_blit.h b/src/mesa/drivers/dri/r300/r300_blit.h
index dc21e88098..735acaddd7 100644
--- a/src/mesa/drivers/dri/r300/r300_blit.h
+++ b/src/mesa/drivers/dri/r300/r300_blit.h
@@ -30,25 +30,25 @@
 
 void r300_blit_init(struct r300_context *r300);
 
-GLboolean r300_blit(struct r300_context *r300,
-                    struct radeon_bo *src_bo,
-                    intptr_t src_offset,
-                    gl_format src_mesaformat,
-                    unsigned src_pitch,
-                    unsigned src_width,
-                    unsigned src_height,
-                    unsigned src_x_offset,
-                    unsigned src_y_offset,
-                    struct radeon_bo *dst_bo,
-                    intptr_t dst_offset,
-                    gl_format dst_mesaformat,
-                    unsigned dst_pitch,
-                    unsigned dst_width,
-                    unsigned dst_height,
-                    unsigned dst_x_offset,
-                    unsigned dst_y_offset,
-                    unsigned width,
-                    unsigned height,
-                    unsigned flip_y);
+unsigned r300_blit(GLcontext *ctx,
+                   struct radeon_bo *src_bo,
+                   intptr_t src_offset,
+                   gl_format src_mesaformat,
+                   unsigned src_pitch,
+                   unsigned src_width,
+                   unsigned src_height,
+                   unsigned src_x_offset,
+                   unsigned src_y_offset,
+                   struct radeon_bo *dst_bo,
+                   intptr_t dst_offset,
+                   gl_format dst_mesaformat,
+                   unsigned dst_pitch,
+                   unsigned dst_width,
+                   unsigned dst_height,
+                   unsigned dst_x_offset,
+                   unsigned dst_y_offset,
+                   unsigned reg_width,
+                   unsigned reg_height,
+                   unsigned flip_y);
 
 #endif // R300_BLIT_H
 \ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/r300_context.c b/src/mesa/drivers/dri/r300/r300_context.c
index 1f6ccf6ddc..bb0e6db313 100644
--- a/src/mesa/drivers/dri/r300/r300_context.c
+++ b/src/mesa/drivers/dri/r300/r300_context.c
@@ -93,8 +93,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #include "main/remap_helper.h"
 
-void r300_init_texcopy_functions(struct dd_function_table *table);
-
 static const struct dri_extension card_extensions[] = {
   /* *INDENT-OFF* */
   {"GL_ARB_depth_texture",		NULL},
@@ -326,6 +324,8 @@ static void r300_init_vtbl(radeonContextPtr radeon)
 			radeon->vtbl.emit_query_finish = rv530_emit_query_finish_single_z;
 	} else
 		radeon->vtbl.emit_query_finish = r300_emit_query_finish;
+
+    radeon->vtbl.blit = r300_blit;
 }
 
 static void r300InitConstValues(GLcontext *ctx, radeonScreenPtr screen)
@@ -488,15 +488,11 @@ GLboolean r300CreateContext(const __GLcontextModes * glVisual,
 	_mesa_init_driver_functions(&functions);
 	r300InitIoctlFuncs(&functions);
 	r300InitStateFuncs(&functions);
-	r300InitTextureFuncs(&functions);
+	r300InitTextureFuncs(&r300->radeon, &functions);
 	r300InitShaderFuncs(&functions);
 	radeonInitQueryObjFunctions(&functions);
 	radeonInitBufferObjectFuncs(&functions);
 
-	if (r300->radeon.radeonScreen->kernel_mm) {
-		r300_init_texcopy_functions(&functions);
-	}
-
 	if (!radeonInitContext(&r300->radeon, &functions,
 			       glVisual, driContextPriv,
 			       sharedContextPrivate)) {
diff --git a/src/mesa/drivers/dri/r300/r300_context.h b/src/mesa/drivers/dri/r300/r300_context.h
index 546cd8ddde..78ab43a99f 100644
--- a/src/mesa/drivers/dri/r300/r300_context.h
+++ b/src/mesa/drivers/dri/r300/r300_context.h
@@ -554,8 +554,6 @@ extern void r300InitShaderFunctions(r300ContextPtr r300);
 
 extern void r300InitDraw(GLcontext *ctx);
 
-extern void r300_init_texcopy_functions(struct dd_function_table *table);
-
 #define r300PackFloat32 radeonPackFloat32
 #define r300PackFloat24 radeonPackFloat24
 
diff --git a/src/mesa/drivers/dri/r300/r300_tex.c b/src/mesa/drivers/dri/r300/r300_tex.c
index 963f648cb1..eb5d2d5004 100644
--- a/src/mesa/drivers/dri/r300/r300_tex.c
+++ b/src/mesa/drivers/dri/r300/r300_tex.c
@@ -312,7 +312,7 @@ static struct gl_texture_object *r300NewTextureObject(GLcontext * ctx,
 	return &t->base;
 }
 
-void r300InitTextureFuncs(struct dd_function_table *functions)
+void r300InitTextureFuncs(radeonContextPtr radeon, struct dd_function_table *functions)
 {
 	/* Note: we only plug in the functions we implement in the driver
 	 * since _mesa_init_driver_functions() was already called.
@@ -340,6 +340,11 @@ void r300InitTextureFuncs(struct dd_function_table *functions)
 	functions->CompressedTexImage2D = radeonCompressedTexImage2D;
 	functions->CompressedTexSubImage2D = radeonCompressedTexSubImage2D;
 
+	if (radeon->radeonScreen->kernel_mm) {
+		functions->CopyTexImage2D = radeonCopyTexImage2D;
+		functions->CopyTexSubImage2D = radeonCopyTexSubImage2D;
+	}
+
 	functions->GenerateMipmap = radeonGenerateMipmap;
 
 	driInitTextureFormats();
diff --git a/src/mesa/drivers/dri/r300/r300_tex.h b/src/mesa/drivers/dri/r300/r300_tex.h
index 6ede0fe25c..9694e703b8 100644
--- a/src/mesa/drivers/dri/r300/r300_tex.h
+++ b/src/mesa/drivers/dri/r300/r300_tex.h
@@ -49,7 +49,7 @@ extern void r300SetTexOffset(__DRIcontext *pDRICtx, GLint texname,
 
 extern GLboolean r300ValidateBuffers(GLcontext * ctx);
 
-extern void r300InitTextureFuncs(struct dd_function_table *functions);
+extern void r300InitTextureFuncs(radeonContextPtr radeon, struct dd_function_table *functions);
 
 int32_t r300TranslateTexFormat(gl_format mesaFormat);
 
diff --git a/src/mesa/drivers/dri/r300/radeon_tex_copy.c b/src/mesa/drivers/dri/r300/radeon_tex_copy.c
new file mode 120000
index 0000000000..dfa5ba34e6
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_tex_copy.c
@@ -0,0 +1 @@
+../radeon/radeon_tex_copy.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r600/Makefile b/src/mesa/drivers/dri/r600/Makefile
index 26f47b7268..e55d0babd8 100644
--- a/src/mesa/drivers/dri/r600/Makefile
+++ b/src/mesa/drivers/dri/r600/Makefile
@@ -39,7 +39,8 @@ RADEON_COMMON_SOURCES = \
 	radeon_mipmap_tree.c \
 	radeon_span.c \
 	radeon_texture.c \
-	radeon_queryobj.c
+	radeon_queryobj.c \
+	radeon_tex_copy.c
 
 DRIVER_SOURCES = \
 		 radeon_screen.c \
@@ -59,6 +60,7 @@ DRIVER_SOURCES = \
 		 r700_render.c   \
 		 r600_tex.c      \
 		 r600_texstate.c      \
+		 r600_blit.c     \
 		 r700_debug.c    \
 		 $(RADEON_COMMON_SOURCES) \
 		 $(EGL_SOURCES) \
diff --git a/src/mesa/drivers/dri/r600/r600_blit.c b/src/mesa/drivers/dri/r600/r600_blit.c
new file mode 100644
index 0000000000..d7cd59ade6
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r600_blit.c
@@ -0,0 +1,1660 @@
+/*
+ * Copyright (C) 2009 Advanced Micro Devices, Inc.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "radeon_common.h"
+#include "r600_context.h"
+
+#include "r600_blit.h"
+#include "r600_blit_shaders.h"
+#include "r600_cmdbuf.h"
+
+/* common formats supported as both textures and render targets */
+static unsigned is_blit_supported(gl_format mesa_format)
+{
+    switch (mesa_format) {
+    case MESA_FORMAT_RGBA8888:
+    case MESA_FORMAT_SIGNED_RGBA8888:
+    case MESA_FORMAT_RGBA8888_REV:
+    case MESA_FORMAT_SIGNED_RGBA8888_REV:
+    case MESA_FORMAT_ARGB8888:
+    case MESA_FORMAT_XRGB8888:
+    case MESA_FORMAT_ARGB8888_REV:
+    case MESA_FORMAT_XRGB8888_REV:
+    case MESA_FORMAT_RGB565:
+    case MESA_FORMAT_RGB565_REV:
+    case MESA_FORMAT_ARGB4444:
+    case MESA_FORMAT_ARGB4444_REV:
+    case MESA_FORMAT_ARGB1555:
+    case MESA_FORMAT_ARGB1555_REV:
+    case MESA_FORMAT_AL88:
+    case MESA_FORMAT_AL88_REV:
+    case MESA_FORMAT_RGB332:
+    case MESA_FORMAT_A8:
+    case MESA_FORMAT_I8:
+    case MESA_FORMAT_CI8:
+    case MESA_FORMAT_L8:
+    case MESA_FORMAT_RGBA_FLOAT32:
+    case MESA_FORMAT_RGBA_FLOAT16:
+    case MESA_FORMAT_ALPHA_FLOAT32:
+    case MESA_FORMAT_ALPHA_FLOAT16:
+    case MESA_FORMAT_LUMINANCE_FLOAT32:
+    case MESA_FORMAT_LUMINANCE_FLOAT16:
+    case MESA_FORMAT_LUMINANCE_ALPHA_FLOAT32:
+    case MESA_FORMAT_LUMINANCE_ALPHA_FLOAT16:
+    case MESA_FORMAT_INTENSITY_FLOAT32: /* X, X, X, X */
+    case MESA_FORMAT_INTENSITY_FLOAT16: /* X, X, X, X */
+    case MESA_FORMAT_X8_Z24:
+    case MESA_FORMAT_S8_Z24:
+    case MESA_FORMAT_Z24_S8:
+    case MESA_FORMAT_Z16:
+    case MESA_FORMAT_Z32:
+    case MESA_FORMAT_SRGBA8:
+    case MESA_FORMAT_SLA8:
+    case MESA_FORMAT_SL8:
+	    break;
+    default:
+	    return 0;
+    }
+
+    /* ??? */
+    /* not sure blit to depth works or not yet */
+    if (_mesa_get_format_bits(mesa_format, GL_DEPTH_BITS) > 0)
+	    return 0;
+
+    return 1;
+}
+
+static inline void
+set_render_target(context_t *context, struct radeon_bo *bo, gl_format mesa_format,
+                  int nPitchInPixel, int w, int h, intptr_t dst_offset)
+{
+    uint32_t cb_color0_base, cb_color0_size = 0, cb_color0_info = 0, cb_color0_view = 0;
+    int id = 0;
+    uint32_t comp_swap, format;
+    BATCH_LOCALS(&context->radeon);
+
+    cb_color0_base = dst_offset / 256;
+
+    SETfield(cb_color0_size, (nPitchInPixel / 8) - 1,
+             PITCH_TILE_MAX_shift, PITCH_TILE_MAX_mask);
+    SETfield(cb_color0_size, ((nPitchInPixel * h) / 64) - 1,
+             SLICE_TILE_MAX_shift, SLICE_TILE_MAX_mask);
+
+    SETfield(cb_color0_info, ENDIAN_NONE, ENDIAN_shift, ENDIAN_mask);
+    SETfield(cb_color0_info, ARRAY_LINEAR_GENERAL,
+             CB_COLOR0_INFO__ARRAY_MODE_shift, CB_COLOR0_INFO__ARRAY_MODE_mask);
+
+    SETbit(cb_color0_info, BLEND_BYPASS_bit);
+
+    switch(mesa_format) {
+    case MESA_FORMAT_RGBA8888:
+            format = COLOR_8_8_8_8;
+            comp_swap = SWAP_STD_REV;
+	    SETbit(cb_color0_info, SOURCE_FORMAT_bit);
+	    SETfield(cb_color0_info, NUMBER_UNORM, NUMBER_TYPE_shift, NUMBER_TYPE_mask);
+            break;
+    case MESA_FORMAT_SIGNED_RGBA8888:
+            format = COLOR_8_8_8_8;
+            comp_swap = SWAP_STD_REV;
+	    SETbit(cb_color0_info, SOURCE_FORMAT_bit);
+	    SETfield(cb_color0_info, NUMBER_SNORM, NUMBER_TYPE_shift, NUMBER_TYPE_mask);
+            break;
+    case MESA_FORMAT_RGBA8888_REV:
+            format = COLOR_8_8_8_8;
+            comp_swap = SWAP_STD;
+	    SETbit(cb_color0_info, SOURCE_FORMAT_bit);
+	    SETfield(cb_color0_info, NUMBER_UNORM, NUMBER_TYPE_shift, NUMBER_TYPE_mask);
+            break;
+    case MESA_FORMAT_SIGNED_RGBA8888_REV:
+            format = COLOR_8_8_8_8;
+            comp_swap = SWAP_STD;
+	    SETbit(cb_color0_info, SOURCE_FORMAT_bit);
+	    SETfield(cb_color0_info, NUMBER_SNORM, NUMBER_TYPE_shift, NUMBER_TYPE_mask);
+            break;
+    case MESA_FORMAT_ARGB8888:
+    case MESA_FORMAT_XRGB8888:
+            format = COLOR_8_8_8_8;
+            comp_swap = SWAP_ALT;
+	    SETbit(cb_color0_info, SOURCE_FORMAT_bit);
+	    SETfield(cb_color0_info, NUMBER_UNORM, NUMBER_TYPE_shift, NUMBER_TYPE_mask);
+            break;
+    case MESA_FORMAT_ARGB8888_REV:
+    case MESA_FORMAT_XRGB8888_REV:
+            format = COLOR_8_8_8_8;
+            comp_swap = SWAP_ALT_REV;
+	    SETbit(cb_color0_info, SOURCE_FORMAT_bit);
+	    SETfield(cb_color0_info, NUMBER_UNORM, NUMBER_TYPE_shift, NUMBER_TYPE_mask);
+            break;
+    case MESA_FORMAT_RGB565:
+            format = COLOR_5_6_5;
+            comp_swap = SWAP_STD_REV;
+	    SETbit(cb_color0_info, SOURCE_FORMAT_bit);
+	    SETfield(cb_color0_info, NUMBER_UNORM, NUMBER_TYPE_shift, NUMBER_TYPE_mask);
+            break;
+    case MESA_FORMAT_RGB565_REV:
+            format = COLOR_5_6_5;
+            comp_swap = SWAP_STD;
+	    SETbit(cb_color0_info, SOURCE_FORMAT_bit);
+	    SETfield(cb_color0_info, NUMBER_UNORM, NUMBER_TYPE_shift, NUMBER_TYPE_mask);
+            break;
+    case MESA_FORMAT_ARGB4444:
+            format = COLOR_4_4_4_4;
+            comp_swap = SWAP_ALT;
+	    SETbit(cb_color0_info, SOURCE_FORMAT_bit);
+	    SETfield(cb_color0_info, NUMBER_UNORM, NUMBER_TYPE_shift, NUMBER_TYPE_mask);
+            break;
+    case MESA_FORMAT_ARGB4444_REV:
+            format = COLOR_4_4_4_4;
+            comp_swap = SWAP_ALT_REV;
+	    SETbit(cb_color0_info, SOURCE_FORMAT_bit);
+	    SETfield(cb_color0_info, NUMBER_UNORM, NUMBER_TYPE_shift, NUMBER_TYPE_mask);
+            break;
+    case MESA_FORMAT_ARGB1555:
+            format = COLOR_1_5_5_5;
+            comp_swap = SWAP_ALT;
+	    SETbit(cb_color0_info, SOURCE_FORMAT_bit);
+	    SETfield(cb_color0_info, NUMBER_UNORM, NUMBER_TYPE_shift, NUMBER_TYPE_mask);
+            break;
+    case MESA_FORMAT_ARGB1555_REV:
+            format = COLOR_1_5_5_5;
+            comp_swap = SWAP_ALT_REV;
+	    SETbit(cb_color0_info, SOURCE_FORMAT_bit);
+	    SETfield(cb_color0_info, NUMBER_UNORM, NUMBER_TYPE_shift, NUMBER_TYPE_mask);
+            break;
+    case MESA_FORMAT_AL88:
+            format = COLOR_8_8;
+            comp_swap = SWAP_STD;
+	    SETbit(cb_color0_info, SOURCE_FORMAT_bit);
+	    SETfield(cb_color0_info, NUMBER_UNORM, NUMBER_TYPE_shift, NUMBER_TYPE_mask);
+            break;
+    case MESA_FORMAT_AL88_REV:
+            format = COLOR_8_8;
+            comp_swap = SWAP_STD_REV;
+	    SETbit(cb_color0_info, SOURCE_FORMAT_bit);
+	    SETfield(cb_color0_info, NUMBER_UNORM, NUMBER_TYPE_shift, NUMBER_TYPE_mask);
+            break;
+    case MESA_FORMAT_RGB332:
+            format = COLOR_3_3_2;
+            comp_swap = SWAP_STD_REV;
+	    SETbit(cb_color0_info, SOURCE_FORMAT_bit);
+	    SETfield(cb_color0_info, NUMBER_UNORM, NUMBER_TYPE_shift, NUMBER_TYPE_mask);
+            break;
+    case MESA_FORMAT_A8:
+            format = COLOR_8;
+            comp_swap = SWAP_ALT_REV;
+	    SETbit(cb_color0_info, SOURCE_FORMAT_bit);
+	    SETfield(cb_color0_info, NUMBER_UNORM, NUMBER_TYPE_shift, NUMBER_TYPE_mask);
+            break;
+    case MESA_FORMAT_I8:
+    case MESA_FORMAT_CI8:
+            format = COLOR_8;
+            comp_swap = SWAP_STD;
+	    SETbit(cb_color0_info, SOURCE_FORMAT_bit);
+	    SETfield(cb_color0_info, NUMBER_UNORM, NUMBER_TYPE_shift, NUMBER_TYPE_mask);
+            break;
+    case MESA_FORMAT_L8:
+            format = COLOR_8;
+            comp_swap = SWAP_ALT;
+	    SETbit(cb_color0_info, SOURCE_FORMAT_bit);
+	    SETfield(cb_color0_info, NUMBER_UNORM, NUMBER_TYPE_shift, NUMBER_TYPE_mask);
+            break;
+    case MESA_FORMAT_RGBA_FLOAT32:
+            format = COLOR_32_32_32_32_FLOAT;
+            comp_swap = SWAP_STD_REV;
+	    SETbit(cb_color0_info, BLEND_FLOAT32_bit);
+	    CLEARbit(cb_color0_info, SOURCE_FORMAT_bit);
+	    SETfield(cb_color0_info, NUMBER_FLOAT, NUMBER_TYPE_shift, NUMBER_TYPE_mask);
+            break;
+    case MESA_FORMAT_RGBA_FLOAT16:
+            format = COLOR_16_16_16_16_FLOAT;
+            comp_swap = SWAP_STD_REV;
+	    CLEARbit(cb_color0_info, SOURCE_FORMAT_bit);
+	    SETfield(cb_color0_info, NUMBER_FLOAT, NUMBER_TYPE_shift, NUMBER_TYPE_mask);
+            break;
+    case MESA_FORMAT_ALPHA_FLOAT32:
+            format = COLOR_32_FLOAT;
+            comp_swap = SWAP_ALT_REV;
+	    SETbit(cb_color0_info, BLEND_FLOAT32_bit);
+	    CLEARbit(cb_color0_info, SOURCE_FORMAT_bit);
+	    SETfield(cb_color0_info, NUMBER_FLOAT, NUMBER_TYPE_shift, NUMBER_TYPE_mask);
+            break;
+    case MESA_FORMAT_ALPHA_FLOAT16:
+            format = COLOR_16_FLOAT;
+            comp_swap = SWAP_ALT_REV;
+	    CLEARbit(cb_color0_info, SOURCE_FORMAT_bit);
+	    SETfield(cb_color0_info, NUMBER_FLOAT, NUMBER_TYPE_shift, NUMBER_TYPE_mask);
+            break;
+    case MESA_FORMAT_LUMINANCE_FLOAT32:
+            format = COLOR_32_FLOAT;
+            comp_swap = SWAP_ALT;
+	    SETbit(cb_color0_info, BLEND_FLOAT32_bit);
+	    CLEARbit(cb_color0_info, SOURCE_FORMAT_bit);
+	    SETfield(cb_color0_info, NUMBER_FLOAT, NUMBER_TYPE_shift, NUMBER_TYPE_mask);
+            break;
+    case MESA_FORMAT_LUMINANCE_FLOAT16:
+            format = COLOR_16_FLOAT;
+            comp_swap = SWAP_ALT;
+	    CLEARbit(cb_color0_info, SOURCE_FORMAT_bit);
+	    SETfield(cb_color0_info, NUMBER_FLOAT, NUMBER_TYPE_shift, NUMBER_TYPE_mask);
+            break;
+    case MESA_FORMAT_LUMINANCE_ALPHA_FLOAT32:
+            format = COLOR_32_32_FLOAT;
+            comp_swap = SWAP_ALT_REV;
+	    SETbit(cb_color0_info, BLEND_FLOAT32_bit);
+	    CLEARbit(cb_color0_info, SOURCE_FORMAT_bit);
+	    SETfield(cb_color0_info, NUMBER_FLOAT, NUMBER_TYPE_shift, NUMBER_TYPE_mask);
+            break;
+    case MESA_FORMAT_LUMINANCE_ALPHA_FLOAT16:
+            format = COLOR_16_16_FLOAT;
+            comp_swap = SWAP_ALT_REV;
+	    CLEARbit(cb_color0_info, SOURCE_FORMAT_bit);
+	    SETfield(cb_color0_info, NUMBER_FLOAT, NUMBER_TYPE_shift, NUMBER_TYPE_mask);
+            break;
+    case MESA_FORMAT_INTENSITY_FLOAT32: /* X, X, X, X */
+            format = COLOR_32_FLOAT;
+            comp_swap = SWAP_STD;
+	    SETbit(cb_color0_info, BLEND_FLOAT32_bit);
+	    CLEARbit(cb_color0_info, SOURCE_FORMAT_bit);
+	    SETfield(cb_color0_info, NUMBER_FLOAT, NUMBER_TYPE_shift, NUMBER_TYPE_mask);
+            break;
+    case MESA_FORMAT_INTENSITY_FLOAT16: /* X, X, X, X */
+            format = COLOR_16_FLOAT;
+            comp_swap = SWAP_STD;
+	    CLEARbit(cb_color0_info, SOURCE_FORMAT_bit);
+	    SETfield(cb_color0_info, NUMBER_FLOAT, NUMBER_TYPE_shift, NUMBER_TYPE_mask);
+            break;
+    case MESA_FORMAT_X8_Z24:
+    case MESA_FORMAT_S8_Z24:
+            format = COLOR_8_24;
+            comp_swap = SWAP_STD;
+	    SETfield(cb_color0_info, ARRAY_1D_TILED_THIN1,
+		     CB_COLOR0_INFO__ARRAY_MODE_shift, CB_COLOR0_INFO__ARRAY_MODE_mask);
+	    CLEARbit(cb_color0_info, SOURCE_FORMAT_bit);
+	    SETfield(cb_color0_info, NUMBER_UNORM, NUMBER_TYPE_shift, NUMBER_TYPE_mask);
+            break;
+    case MESA_FORMAT_Z24_S8:
+            format = COLOR_24_8;
+            comp_swap = SWAP_STD;
+	    SETfield(cb_color0_info, ARRAY_1D_TILED_THIN1,
+		     CB_COLOR0_INFO__ARRAY_MODE_shift, CB_COLOR0_INFO__ARRAY_MODE_mask);
+	    CLEARbit(cb_color0_info, SOURCE_FORMAT_bit);
+	    SETfield(cb_color0_info, NUMBER_UNORM, NUMBER_TYPE_shift, NUMBER_TYPE_mask);
+            break;
+    case MESA_FORMAT_Z16:
+            format = COLOR_16;
+            comp_swap = SWAP_STD;
+	    SETfield(cb_color0_info, ARRAY_1D_TILED_THIN1,
+		     CB_COLOR0_INFO__ARRAY_MODE_shift, CB_COLOR0_INFO__ARRAY_MODE_mask);
+	    CLEARbit(cb_color0_info, SOURCE_FORMAT_bit);
+	    SETfield(cb_color0_info, NUMBER_UNORM, NUMBER_TYPE_shift, NUMBER_TYPE_mask);
+            break;
+    case MESA_FORMAT_Z32:
+            format = COLOR_32;
+            comp_swap = SWAP_STD;
+	    SETfield(cb_color0_info, ARRAY_1D_TILED_THIN1,
+		     CB_COLOR0_INFO__ARRAY_MODE_shift, CB_COLOR0_INFO__ARRAY_MODE_mask);
+	    CLEARbit(cb_color0_info, SOURCE_FORMAT_bit);
+	    SETfield(cb_color0_info, NUMBER_UNORM, NUMBER_TYPE_shift, NUMBER_TYPE_mask);
+            break;
+    case MESA_FORMAT_SRGBA8:
+            format = COLOR_8_8_8_8;
+            comp_swap = SWAP_STD_REV;
+	    SETbit(cb_color0_info, SOURCE_FORMAT_bit);
+	    SETfield(cb_color0_info, NUMBER_SRGB, NUMBER_TYPE_shift, NUMBER_TYPE_mask);
+            break;
+    case MESA_FORMAT_SLA8:
+            format = COLOR_8_8;
+            comp_swap = SWAP_ALT_REV;
+	    SETbit(cb_color0_info, SOURCE_FORMAT_bit);
+	    SETfield(cb_color0_info, NUMBER_SRGB, NUMBER_TYPE_shift, NUMBER_TYPE_mask);
+            break;
+    case MESA_FORMAT_SL8:
+            format = COLOR_8;
+            comp_swap = SWAP_ALT_REV;
+	    SETbit(cb_color0_info, SOURCE_FORMAT_bit);
+	    SETfield(cb_color0_info, NUMBER_SRGB, NUMBER_TYPE_shift, NUMBER_TYPE_mask);
+            break;
+    default:
+            fprintf(stderr,"Invalid format for copy %s\n",_mesa_get_format_name(mesa_format));
+            assert("Invalid format for US output\n");
+            return;
+    }
+
+    SETfield(cb_color0_info, format, CB_COLOR0_INFO__FORMAT_shift,
+             CB_COLOR0_INFO__FORMAT_mask);
+    SETfield(cb_color0_info, comp_swap, COMP_SWAP_shift, COMP_SWAP_mask);
+
+    BEGIN_BATCH_NO_AUTOSTATE(3 + 2);
+    R600_OUT_BATCH_REGSEQ(CB_COLOR0_BASE + (4 * id), 1);
+    R600_OUT_BATCH(cb_color0_base);
+    R600_OUT_BATCH_RELOC(0,
+			 bo,
+			 0,
+			 0, RADEON_GEM_DOMAIN_VRAM | RADEON_GEM_DOMAIN_GTT, 0);
+    END_BATCH();
+
+    if ((context->radeon.radeonScreen->chip_family > CHIP_FAMILY_R600) &&
+	(context->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV770)) {
+	    BEGIN_BATCH_NO_AUTOSTATE(2);
+	    R600_OUT_BATCH(CP_PACKET3(R600_IT_SURFACE_BASE_UPDATE, 0));
+	    R600_OUT_BATCH((2 << id));
+	    END_BATCH();
+    }
+
+    /* Set CMASK & TILE buffer to the offset of color buffer as
+     * we don't use those this shouldn't cause any issue and we
+     * then have a valid cmd stream
+     */
+    BEGIN_BATCH_NO_AUTOSTATE(3 + 2);
+    R600_OUT_BATCH_REGSEQ(CB_COLOR0_TILE + (4 * id), 1);
+    R600_OUT_BATCH(cb_color0_base);
+    R600_OUT_BATCH_RELOC(0,
+			 bo,
+			 0,
+			 0, RADEON_GEM_DOMAIN_VRAM | RADEON_GEM_DOMAIN_GTT, 0);
+    END_BATCH();
+    BEGIN_BATCH_NO_AUTOSTATE(3 + 2);
+    R600_OUT_BATCH_REGSEQ(CB_COLOR0_FRAG + (4 * id), 1);
+    R600_OUT_BATCH(cb_color0_base);
+    R600_OUT_BATCH_RELOC(0,
+			 bo,
+			 0,
+			 0, RADEON_GEM_DOMAIN_VRAM | RADEON_GEM_DOMAIN_GTT, 0);
+    END_BATCH();
+
+    BEGIN_BATCH_NO_AUTOSTATE(12);
+    R600_OUT_BATCH_REGVAL(CB_COLOR0_SIZE + (4 * id), cb_color0_size);
+    R600_OUT_BATCH_REGVAL(CB_COLOR0_VIEW + (4 * id), cb_color0_view);
+    R600_OUT_BATCH_REGVAL(CB_COLOR0_INFO + (4 * id), cb_color0_info);
+    R600_OUT_BATCH_REGVAL(CB_COLOR0_MASK + (4 * id), 0);
+    END_BATCH();
+
+    COMMIT_BATCH();
+
+}
+
+static inline void load_shaders(GLcontext * ctx)
+{
+
+    radeonContextPtr radeonctx = RADEON_CONTEXT(ctx);
+    context_t *context = R700_CONTEXT(ctx);
+    int i, size;
+    uint32_t *shader;
+
+    if (context->blit_bo_loaded == 1)
+        return;
+
+    size = 4096;
+    context->blit_bo = radeon_bo_open(radeonctx->radeonScreen->bom, 0,
+                                      size, 256, RADEON_GEM_DOMAIN_GTT, 0);
+    radeon_bo_map(context->blit_bo, 1);
+    shader = context->blit_bo->ptr;
+
+    for(i=0; i<sizeof(r6xx_vs)/4; i++) {
+        shader[128+i] = r6xx_vs[i];
+    }
+    for(i=0; i<sizeof(r6xx_ps)/4; i++) {
+        shader[256+i] = r6xx_ps[i];
+    }
+
+    radeon_bo_unmap(context->blit_bo);
+    context->blit_bo_loaded = 1;
+
+}
+
+static inline void
+set_shaders(context_t *context)
+{
+    struct radeon_bo * pbo = context->blit_bo;
+    BATCH_LOCALS(&context->radeon);
+
+    uint32_t sq_pgm_start_fs = (512 >> 8);
+    uint32_t sq_pgm_resources_fs = 0;
+    uint32_t sq_pgm_cf_offset_fs = 0;
+
+    uint32_t sq_pgm_start_vs = (512 >> 8);
+    uint32_t sq_pgm_resources_vs = (1 << NUM_GPRS_shift);
+    uint32_t sq_pgm_cf_offset_vs = 0;
+
+    uint32_t sq_pgm_start_ps = (1024 >> 8);
+    uint32_t sq_pgm_resources_ps = (1 << NUM_GPRS_shift);
+    uint32_t sq_pgm_cf_offset_ps = 0;
+    uint32_t sq_pgm_exports_ps = (1 << 1);
+
+    r700SyncSurf(context, pbo, RADEON_GEM_DOMAIN_GTT, 0, SH_ACTION_ENA_bit);
+
+    /* FS */
+    BEGIN_BATCH_NO_AUTOSTATE(3 + 2);
+    R600_OUT_BATCH_REGSEQ(SQ_PGM_START_FS, 1);
+    R600_OUT_BATCH(sq_pgm_start_fs);
+    R600_OUT_BATCH_RELOC(sq_pgm_start_fs,
+			 pbo,
+			 sq_pgm_start_fs,
+			 RADEON_GEM_DOMAIN_GTT, 0, 0);
+    END_BATCH();
+
+    BEGIN_BATCH_NO_AUTOSTATE(6);
+    R600_OUT_BATCH_REGVAL(SQ_PGM_RESOURCES_FS, sq_pgm_resources_fs);
+    R600_OUT_BATCH_REGVAL(SQ_PGM_CF_OFFSET_FS, sq_pgm_cf_offset_fs);
+    END_BATCH();
+
+    /* VS */
+    BEGIN_BATCH_NO_AUTOSTATE(3 + 2);
+    R600_OUT_BATCH_REGSEQ(SQ_PGM_START_VS, 1);
+    R600_OUT_BATCH(sq_pgm_start_vs);
+    R600_OUT_BATCH_RELOC(sq_pgm_start_vs,
+		         pbo,
+		         sq_pgm_start_vs,
+		         RADEON_GEM_DOMAIN_GTT, 0, 0);
+    END_BATCH();
+
+    BEGIN_BATCH_NO_AUTOSTATE(6);
+    R600_OUT_BATCH_REGVAL(SQ_PGM_RESOURCES_VS, sq_pgm_resources_vs);
+    R600_OUT_BATCH_REGVAL(SQ_PGM_CF_OFFSET_VS, sq_pgm_cf_offset_vs);
+    END_BATCH();
+
+    /* PS */
+    BEGIN_BATCH_NO_AUTOSTATE(3 + 2);
+    R600_OUT_BATCH_REGSEQ(SQ_PGM_START_PS, 1);
+    R600_OUT_BATCH(sq_pgm_start_ps);
+    R600_OUT_BATCH_RELOC(sq_pgm_start_ps,
+		         pbo,
+		         sq_pgm_start_ps,
+		         RADEON_GEM_DOMAIN_GTT, 0, 0);
+    END_BATCH();
+
+    BEGIN_BATCH_NO_AUTOSTATE(9);
+    R600_OUT_BATCH_REGVAL(SQ_PGM_RESOURCES_PS, sq_pgm_resources_ps);
+    R600_OUT_BATCH_REGVAL(SQ_PGM_EXPORTS_PS, sq_pgm_exports_ps);
+    R600_OUT_BATCH_REGVAL(SQ_PGM_CF_OFFSET_PS, sq_pgm_cf_offset_ps);
+    END_BATCH();
+
+    BEGIN_BATCH_NO_AUTOSTATE(18);
+    R600_OUT_BATCH_REGVAL(SPI_VS_OUT_CONFIG, 0); //EXPORT_COUNT is - 1
+    R600_OUT_BATCH_REGVAL(SPI_VS_OUT_ID_0, 0);
+    R600_OUT_BATCH_REGVAL(SPI_PS_INPUT_CNTL_0, SEL_CENTROID_bit);
+    R600_OUT_BATCH_REGVAL(SPI_PS_IN_CONTROL_0, (1 << NUM_INTERP_shift));
+    R600_OUT_BATCH_REGVAL(SPI_PS_IN_CONTROL_1, 0);
+    R600_OUT_BATCH_REGVAL(SPI_INTERP_CONTROL_0, 0);
+    END_BATCH();
+
+    COMMIT_BATCH();
+
+}
+
+static inline void
+set_vtx_resource(context_t *context)
+{
+    struct radeon_bo *bo = context->blit_bo;
+    BATCH_LOCALS(&context->radeon);
+
+    BEGIN_BATCH_NO_AUTOSTATE(6);
+    R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_CTL_CONST, 1));
+    R600_OUT_BATCH(mmSQ_VTX_BASE_VTX_LOC - ASIC_CTL_CONST_BASE_INDEX);
+    R600_OUT_BATCH(0);
+
+    R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_CTL_CONST, 1));
+    R600_OUT_BATCH(mmSQ_VTX_START_INST_LOC - ASIC_CTL_CONST_BASE_INDEX);
+    R600_OUT_BATCH(0);
+    END_BATCH();
+    COMMIT_BATCH();
+
+    if ((context->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV610) ||
+	(context->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV620) ||
+	(context->radeon.radeonScreen->chip_family == CHIP_FAMILY_RS780) ||
+	(context->radeon.radeonScreen->chip_family == CHIP_FAMILY_RS880) ||
+	(context->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV710))
+	    r700SyncSurf(context, bo, RADEON_GEM_DOMAIN_GTT, 0, TC_ACTION_ENA_bit);
+    else
+	    r700SyncSurf(context, bo, RADEON_GEM_DOMAIN_GTT, 0, VC_ACTION_ENA_bit);
+
+    BEGIN_BATCH_NO_AUTOSTATE(9 + 2);
+
+    R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_RESOURCE, 7));
+    R600_OUT_BATCH(SQ_FETCH_RESOURCE_VS_OFFSET * FETCH_RESOURCE_STRIDE);
+    R600_OUT_BATCH(0);
+    R600_OUT_BATCH(48 - 1);
+    R600_OUT_BATCH(16 << SQ_VTX_CONSTANT_WORD2_0__STRIDE_shift);
+    R600_OUT_BATCH(1 << MEM_REQUEST_SIZE_shift);
+    R600_OUT_BATCH(0);
+    R600_OUT_BATCH(0);
+    R600_OUT_BATCH(SQ_TEX_VTX_VALID_BUFFER << SQ_TEX_RESOURCE_WORD6_0__TYPE_shift);
+    R600_OUT_BATCH_RELOC(SQ_VTX_CONSTANT_WORD0_0,
+                         bo,
+                         SQ_VTX_CONSTANT_WORD0_0,
+                         RADEON_GEM_DOMAIN_GTT, 0, 0);
+    END_BATCH();
+    COMMIT_BATCH();
+
+}
+
+static inline void
+set_tex_resource(context_t * context,
+		 gl_format mesa_format, struct radeon_bo *bo, int w, int h,
+		 int TexelPitch, intptr_t src_offset)
+{
+    uint32_t sq_tex_resource0, sq_tex_resource1, sq_tex_resource2, sq_tex_resource4, sq_tex_resource6;
+
+    sq_tex_resource0 = sq_tex_resource1 = sq_tex_resource2 = sq_tex_resource4 = sq_tex_resource6 = 0;
+    BATCH_LOCALS(&context->radeon);
+
+    SETfield(sq_tex_resource0, SQ_TEX_DIM_2D, DIM_shift, DIM_mask);
+    SETfield(sq_tex_resource0, ARRAY_LINEAR_GENERAL,
+                 SQ_TEX_RESOURCE_WORD0_0__TILE_MODE_shift,
+                 SQ_TEX_RESOURCE_WORD0_0__TILE_MODE_mask);
+
+    switch (mesa_format) {
+    case MESA_FORMAT_RGBA8888:
+    case MESA_FORMAT_SIGNED_RGBA8888:
+	    SETfield(sq_tex_resource1, FMT_8_8_8_8,
+		     SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+	    SETfield(sq_tex_resource4, SQ_SEL_W,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_Z,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_Y,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+	    if (mesa_format == MESA_FORMAT_SIGNED_RGBA8888) {
+		    SETfield(sq_tex_resource4, SQ_FORMAT_COMP_SIGNED,
+			     FORMAT_COMP_X_shift, FORMAT_COMP_X_mask);
+		    SETfield(sq_tex_resource4, SQ_FORMAT_COMP_SIGNED,
+			     FORMAT_COMP_Y_shift, FORMAT_COMP_Y_mask);
+		    SETfield(sq_tex_resource4, SQ_FORMAT_COMP_SIGNED,
+			     FORMAT_COMP_Z_shift, FORMAT_COMP_Z_mask);
+		    SETfield(sq_tex_resource4, SQ_FORMAT_COMP_SIGNED,
+			     FORMAT_COMP_W_shift, FORMAT_COMP_W_mask);
+	    }
+	    break;
+    case MESA_FORMAT_RGBA8888_REV:
+    case MESA_FORMAT_SIGNED_RGBA8888_REV:
+	    SETfield(sq_tex_resource1, FMT_8_8_8_8,
+		     SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_Y,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_Z,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_W,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+	    if (mesa_format == MESA_FORMAT_SIGNED_RGBA8888_REV) {
+		    SETfield(sq_tex_resource4, SQ_FORMAT_COMP_SIGNED,
+			     FORMAT_COMP_X_shift, FORMAT_COMP_X_mask);
+		    SETfield(sq_tex_resource4, SQ_FORMAT_COMP_SIGNED,
+			     FORMAT_COMP_Y_shift, FORMAT_COMP_Y_mask);
+		    SETfield(sq_tex_resource4, SQ_FORMAT_COMP_SIGNED,
+			     FORMAT_COMP_Z_shift, FORMAT_COMP_Z_mask);
+		    SETfield(sq_tex_resource4, SQ_FORMAT_COMP_SIGNED,
+			     FORMAT_COMP_W_shift, FORMAT_COMP_W_mask);
+	    }
+	    break;
+    case MESA_FORMAT_ARGB8888:
+	    SETfield(sq_tex_resource1, FMT_8_8_8_8,
+		     SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+	    SETfield(sq_tex_resource4, SQ_SEL_Z,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_Y,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_W,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+	    break;
+    case MESA_FORMAT_XRGB8888:
+	    SETfield(sq_tex_resource1, FMT_8_8_8_8,
+		     SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+	    SETfield(sq_tex_resource4, SQ_SEL_Z,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_Y,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_1,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+	    break;
+    case MESA_FORMAT_ARGB8888_REV:
+	    SETfield(sq_tex_resource1, FMT_8_8_8_8,
+		     SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+	    SETfield(sq_tex_resource4, SQ_SEL_Y,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_Z,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_W,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+	    break;
+    case MESA_FORMAT_XRGB8888_REV:
+	    SETfield(sq_tex_resource1, FMT_8_8_8_8,
+		     SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+	    SETfield(sq_tex_resource4, SQ_SEL_1,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_Z,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_W,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+	    break;
+    case MESA_FORMAT_RGB565:
+	    SETfield(sq_tex_resource1, FMT_5_6_5,
+		     SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+	    SETfield(sq_tex_resource4, SQ_SEL_Z,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_Y,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_1,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+	    break;
+    case MESA_FORMAT_RGB565_REV:
+	    SETfield(sq_tex_resource1, FMT_5_6_5,
+		     SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_Y,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_Z,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_1,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+	    break;
+    case MESA_FORMAT_ARGB4444:
+	    SETfield(sq_tex_resource1, FMT_4_4_4_4,
+		     SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+	    SETfield(sq_tex_resource4, SQ_SEL_Z,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_Y,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_W,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+	    break;
+    case MESA_FORMAT_ARGB4444_REV:
+	    SETfield(sq_tex_resource1, FMT_4_4_4_4,
+		     SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+	    SETfield(sq_tex_resource4, SQ_SEL_Y,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_Z,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_W,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+	    break;
+    case MESA_FORMAT_ARGB1555:
+	    SETfield(sq_tex_resource1, FMT_1_5_5_5,
+		     SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+	    SETfield(sq_tex_resource4, SQ_SEL_Z,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_Y,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_W,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+	    break;
+    case MESA_FORMAT_ARGB1555_REV:
+	    SETfield(sq_tex_resource1, FMT_1_5_5_5,
+		     SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+	    SETfield(sq_tex_resource4, SQ_SEL_Y,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_Z,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_W,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+	    break;
+    case MESA_FORMAT_AL88:
+    case MESA_FORMAT_AL88_REV: /* TODO : Check this. */
+	    SETfield(sq_tex_resource1, FMT_8_8,
+		     SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_Y,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+	    break;
+    case MESA_FORMAT_RGB332:
+	    SETfield(sq_tex_resource1, FMT_3_3_2,
+		     SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+	    SETfield(sq_tex_resource4, SQ_SEL_Z,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_Y,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_1,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+	    break;
+    case MESA_FORMAT_A8: /* ZERO, ZERO, ZERO, X */
+	    SETfield(sq_tex_resource1, FMT_8,
+		     SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+	    SETfield(sq_tex_resource4, SQ_SEL_0,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_0,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_0,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+	    break;
+    case MESA_FORMAT_L8: /* X, X, X, ONE */
+	    SETfield(sq_tex_resource1, FMT_8,
+		     SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_1,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+	    break;
+    case MESA_FORMAT_I8: /* X, X, X, X */
+    case MESA_FORMAT_CI8:
+	    SETfield(sq_tex_resource1, FMT_8,
+		     SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+	    break;
+    case MESA_FORMAT_RGBA_FLOAT32:
+	    SETfield(sq_tex_resource1, FMT_32_32_32_32_FLOAT,
+		     SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_Y,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_Z,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_W,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+	    break;
+    case MESA_FORMAT_RGBA_FLOAT16:
+	    SETfield(sq_tex_resource1, FMT_16_16_16_16_FLOAT,
+		     SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_Y,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_Z,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_W,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+	    break;
+    case MESA_FORMAT_ALPHA_FLOAT32: /* ZERO, ZERO, ZERO, X */
+	    SETfield(sq_tex_resource1, FMT_32_FLOAT,
+		     SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+	    SETfield(sq_tex_resource4, SQ_SEL_0,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_0,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_0,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+	    break;
+    case MESA_FORMAT_ALPHA_FLOAT16: /* ZERO, ZERO, ZERO, X */
+	    SETfield(sq_tex_resource1, FMT_16_FLOAT,
+		     SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+	    SETfield(sq_tex_resource4, SQ_SEL_0,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_0,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_0,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+	    break;
+    case MESA_FORMAT_LUMINANCE_FLOAT32: /* X, X, X, ONE */
+	    SETfield(sq_tex_resource1, FMT_32_FLOAT,
+		     SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_1,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+	    break;
+    case MESA_FORMAT_LUMINANCE_FLOAT16: /* X, X, X, ONE */
+	    SETfield(sq_tex_resource1, FMT_16_FLOAT,
+		     SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_1,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+	    break;
+    case MESA_FORMAT_LUMINANCE_ALPHA_FLOAT32:
+	    SETfield(sq_tex_resource1, FMT_32_32_FLOAT,
+		     SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_Y,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+	    break;
+    case MESA_FORMAT_LUMINANCE_ALPHA_FLOAT16:
+	    SETfield(sq_tex_resource1, FMT_16_16_FLOAT,
+		     SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_Y,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+	    break;
+    case MESA_FORMAT_INTENSITY_FLOAT32: /* X, X, X, X */
+	    SETfield(sq_tex_resource1, FMT_32_FLOAT,
+		     SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+	    break;
+    case MESA_FORMAT_INTENSITY_FLOAT16: /* X, X, X, X */
+	    SETfield(sq_tex_resource1, FMT_16_FLOAT,
+		     SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+	    break;
+    case MESA_FORMAT_Z16:
+	    SETbit(sq_tex_resource0, TILE_TYPE_bit);
+	    SETfield(sq_tex_resource0, ARRAY_1D_TILED_THIN1,
+		     SQ_TEX_RESOURCE_WORD0_0__TILE_MODE_shift,
+		     SQ_TEX_RESOURCE_WORD0_0__TILE_MODE_mask);
+	    SETfield(sq_tex_resource1, FMT_16,
+		     SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+	    break;
+    case MESA_FORMAT_X8_Z24:
+	    SETbit(sq_tex_resource0, TILE_TYPE_bit);
+	    SETfield(sq_tex_resource0, ARRAY_1D_TILED_THIN1,
+		     SQ_TEX_RESOURCE_WORD0_0__TILE_MODE_shift,
+		     SQ_TEX_RESOURCE_WORD0_0__TILE_MODE_mask);
+	    SETfield(sq_tex_resource1, FMT_8_24,
+		     SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_1,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_0,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_1,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+	    break;
+    case MESA_FORMAT_S8_Z24:
+	    SETbit(sq_tex_resource0, TILE_TYPE_bit);
+	    SETfield(sq_tex_resource0, ARRAY_1D_TILED_THIN1,
+		     SQ_TEX_RESOURCE_WORD0_0__TILE_MODE_shift,
+		     SQ_TEX_RESOURCE_WORD0_0__TILE_MODE_mask);
+	    SETfield(sq_tex_resource1, FMT_8_24,
+		     SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_Y,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_0,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_1,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+	    break;
+    case MESA_FORMAT_Z24_S8:
+	    SETbit(sq_tex_resource0, TILE_TYPE_bit);
+	    SETfield(sq_tex_resource0, ARRAY_1D_TILED_THIN1,
+		     SQ_TEX_RESOURCE_WORD0_0__TILE_MODE_shift,
+		     SQ_TEX_RESOURCE_WORD0_0__TILE_MODE_mask);
+	    SETfield(sq_tex_resource1, FMT_24_8,
+		     SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_Y,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_0,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_1,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+	    break;
+    case MESA_FORMAT_Z32:
+	    SETbit(sq_tex_resource0, TILE_TYPE_bit);
+	    SETfield(sq_tex_resource0, ARRAY_1D_TILED_THIN1,
+		     SQ_TEX_RESOURCE_WORD0_0__TILE_MODE_shift,
+		     SQ_TEX_RESOURCE_WORD0_0__TILE_MODE_mask);
+	    SETfield(sq_tex_resource1, FMT_32,
+		     SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+	    break;
+    case MESA_FORMAT_S8:
+	    SETbit(sq_tex_resource0, TILE_TYPE_bit);
+	    SETfield(sq_tex_resource0, ARRAY_1D_TILED_THIN1,
+		     SQ_TEX_RESOURCE_WORD0_0__TILE_MODE_shift,
+		     SQ_TEX_RESOURCE_WORD0_0__TILE_MODE_mask);
+	    SETfield(sq_tex_resource1, FMT_8,
+		     SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+	    break;
+    case MESA_FORMAT_SRGBA8:
+	    SETfield(sq_tex_resource1, FMT_8_8_8_8,
+		     SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+	    SETfield(sq_tex_resource4, SQ_SEL_W,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_Z,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_Y,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+	    SETbit(sq_tex_resource4, SQ_TEX_RESOURCE_WORD4_0__FORCE_DEGAMMA_bit);
+	    break;
+    case MESA_FORMAT_SLA8:
+	    SETfield(sq_tex_resource1, FMT_8_8,
+		     SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_Y,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+	    SETbit(sq_tex_resource4, SQ_TEX_RESOURCE_WORD4_0__FORCE_DEGAMMA_bit);
+	    break;
+    case MESA_FORMAT_SL8: /* X, X, X, ONE */
+	    SETfield(sq_tex_resource1, FMT_8,
+		     SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_1,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+	    SETbit(sq_tex_resource4, SQ_TEX_RESOURCE_WORD4_0__FORCE_DEGAMMA_bit);
+	    break;
+    default:
+            fprintf(stderr,"Invalid format for copy %s\n",_mesa_get_format_name(mesa_format));
+            assert("Invalid format for US output\n");
+            return;
+    };
+
+    SETfield(sq_tex_resource0, (TexelPitch/8)-1, PITCH_shift, PITCH_mask);
+    SETfield(sq_tex_resource0, w - 1, TEX_WIDTH_shift, TEX_WIDTH_mask);
+    SETfield(sq_tex_resource1, h - 1, TEX_HEIGHT_shift, TEX_HEIGHT_mask);
+
+    sq_tex_resource2 = src_offset / 256;
+
+    SETfield(sq_tex_resource6, SQ_TEX_VTX_VALID_TEXTURE,
+             SQ_TEX_RESOURCE_WORD6_0__TYPE_shift,
+             SQ_TEX_RESOURCE_WORD6_0__TYPE_mask);
+
+    r700SyncSurf(context, bo,
+                 RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM,
+		 0, TC_ACTION_ENA_bit);
+
+    BEGIN_BATCH_NO_AUTOSTATE(9 + 4);
+    R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_RESOURCE, 7));
+    R600_OUT_BATCH(0 * 7);
+
+    R600_OUT_BATCH(sq_tex_resource0);
+    R600_OUT_BATCH(sq_tex_resource1);
+    R600_OUT_BATCH(sq_tex_resource2);
+    R600_OUT_BATCH(0); //SQ_TEX_RESOURCE3
+    R600_OUT_BATCH(sq_tex_resource4);
+    R600_OUT_BATCH(0); //SQ_TEX_RESOURCE5
+    R600_OUT_BATCH(sq_tex_resource6);
+    R600_OUT_BATCH_RELOC(0,
+		     bo,
+		     0,
+		     RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
+    R600_OUT_BATCH_RELOC(0,
+		     bo,
+		     0,
+		     RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
+    END_BATCH();
+    COMMIT_BATCH();
+}
+
+static inline void
+set_tex_sampler(context_t * context)
+{
+    uint32_t sq_tex_sampler_word0 = 0, sq_tex_sampler_word1 = 0, sq_tex_sampler_word2 = 0;
+    int i = 0;
+
+    SETbit(sq_tex_sampler_word2, SQ_TEX_SAMPLER_WORD2_0__TYPE_bit);
+
+    BATCH_LOCALS(&context->radeon);
+
+    BEGIN_BATCH_NO_AUTOSTATE(5);
+    R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_SAMPLER, 3));
+    R600_OUT_BATCH(i * 3);
+    R600_OUT_BATCH(sq_tex_sampler_word0);
+    R600_OUT_BATCH(sq_tex_sampler_word1);
+    R600_OUT_BATCH(sq_tex_sampler_word2);
+    END_BATCH();
+
+}
+
+static inline void
+set_scissors(context_t *context, int x1, int y1, int x2, int y2)
+{
+    BATCH_LOCALS(&context->radeon);
+
+    BEGIN_BATCH_NO_AUTOSTATE(17);
+    R600_OUT_BATCH_REGSEQ(PA_SC_SCREEN_SCISSOR_TL, 2);
+    R600_OUT_BATCH((x1 << 0) | (y1 << 16));
+    R600_OUT_BATCH((x2 << 0) | (y2 << 16));
+
+    R600_OUT_BATCH_REGSEQ(PA_SC_WINDOW_OFFSET, 3);
+    R600_OUT_BATCH(0); //PA_SC_WINDOW_OFFSET
+    R600_OUT_BATCH((x1 << 0) | (y1 << 16) | (WINDOW_OFFSET_DISABLE_bit)); //PA_SC_WINDOW_SCISSOR_TL
+    R600_OUT_BATCH((x2 << 0) | (y2 << 16));
+
+    R600_OUT_BATCH_REGSEQ(PA_SC_GENERIC_SCISSOR_TL, 2);
+    R600_OUT_BATCH((x1 << 0) | (y1 << 16) | (WINDOW_OFFSET_DISABLE_bit));
+    R600_OUT_BATCH((x2 << 0) | (y2 << 16));
+
+    /* XXX 16 of these PA_SC_VPORT_SCISSOR_0_TL_num ... */
+    R600_OUT_BATCH_REGSEQ(PA_SC_VPORT_SCISSOR_0_TL, 2 );
+    R600_OUT_BATCH((x1 << 0) | (y1 << 16) | (WINDOW_OFFSET_DISABLE_bit));
+    R600_OUT_BATCH((x2 << 0) | (y2 << 16));
+    END_BATCH();
+
+    COMMIT_BATCH();
+
+}
+
+static inline void
+set_vb_data(context_t * context, int src_x, int src_y, int dst_x, int dst_y,
+            int w, int h, int src_h, unsigned flip_y)
+{
+    float *vb;
+    radeon_bo_map(context->blit_bo, 1);
+    vb = context->blit_bo->ptr;
+
+    vb[0] = (float)(dst_x);
+    vb[1] = (float)(dst_y);
+    vb[2] = (float)(src_x);
+    vb[3] = (flip_y) ? (float)(src_h - src_y) : (float)src_y;
+
+    vb[4] = (float)(dst_x);
+    vb[5] = (float)(dst_y + h);
+    vb[6] = (float)(src_x);
+    vb[7] = (flip_y) ? (float)(src_h - (src_y + h)) : (float)(src_y + h);
+
+    vb[8] = (float)(dst_x + w);
+    vb[9] = (float)(dst_y + h);
+    vb[10] = (float)(src_x + w);
+    vb[11] = (flip_y) ? (float)(src_h - (src_y + h)) : (float)(src_y + h);
+
+    radeon_bo_unmap(context->blit_bo);
+
+}
+
+static inline void
+draw_auto(context_t *context)
+{
+    BATCH_LOCALS(&context->radeon);
+    uint32_t vgt_primitive_type = 0, vgt_index_type = 0, vgt_draw_initiator = 0, vgt_num_indices;
+
+    SETfield(vgt_primitive_type, DI_PT_RECTLIST,
+             VGT_PRIMITIVE_TYPE__PRIM_TYPE_shift,
+             VGT_PRIMITIVE_TYPE__PRIM_TYPE_mask);
+    SETfield(vgt_index_type, DI_INDEX_SIZE_16_BIT, INDEX_TYPE_shift,
+             INDEX_TYPE_mask);
+    SETfield(vgt_draw_initiator, DI_MAJOR_MODE_0, MAJOR_MODE_shift,
+             MAJOR_MODE_mask);
+    SETfield(vgt_draw_initiator, DI_SRC_SEL_AUTO_INDEX, SOURCE_SELECT_shift,
+             SOURCE_SELECT_mask);
+
+    vgt_num_indices = 3;
+
+    BEGIN_BATCH_NO_AUTOSTATE(10);
+    // prim
+    R600_OUT_BATCH_REGSEQ(VGT_PRIMITIVE_TYPE, 1);
+    R600_OUT_BATCH(vgt_primitive_type);
+    // index type
+    R600_OUT_BATCH(CP_PACKET3(R600_IT_INDEX_TYPE, 0));
+    R600_OUT_BATCH(vgt_index_type);
+    // num instances
+    R600_OUT_BATCH(CP_PACKET3(R600_IT_NUM_INSTANCES, 0));
+    R600_OUT_BATCH(1);
+    //
+    R600_OUT_BATCH(CP_PACKET3(R600_IT_DRAW_INDEX_AUTO, 1));
+    R600_OUT_BATCH(vgt_num_indices);
+    R600_OUT_BATCH(vgt_draw_initiator);
+
+    END_BATCH();
+    COMMIT_BATCH();
+}
+
+static inline void
+set_default_state(context_t *context)
+{
+    int ps_prio = 0;
+    int vs_prio = 1;
+    int gs_prio = 2;
+    int es_prio = 3;
+    int num_ps_gprs;
+    int num_vs_gprs;
+    int num_gs_gprs;
+    int num_es_gprs;
+    int num_temp_gprs;
+    int num_ps_threads;
+    int num_vs_threads;
+    int num_gs_threads;
+    int num_es_threads;
+    int num_ps_stack_entries;
+    int num_vs_stack_entries;
+    int num_gs_stack_entries;
+    int num_es_stack_entries;
+    uint32_t sq_config, sq_gpr_resource_mgmt_1, sq_gpr_resource_mgmt_2;
+    uint32_t sq_thread_resource_mgmt, sq_stack_resource_mgmt_1, sq_stack_resource_mgmt_2;
+    uint32_t ta_cntl_aux, db_watermarks, sq_dyn_gpr_cntl_ps_flush_req, db_debug;
+    BATCH_LOCALS(&context->radeon);
+
+    switch (context->radeon.radeonScreen->chip_family) {
+    case CHIP_FAMILY_R600:
+	    num_ps_gprs = 192;
+	    num_vs_gprs = 56;
+	    num_temp_gprs = 4;
+	    num_gs_gprs = 0;
+	    num_es_gprs = 0;
+	    num_ps_threads = 136;
+	    num_vs_threads = 48;
+	    num_gs_threads = 4;
+	    num_es_threads = 4;
+	    num_ps_stack_entries = 128;
+	    num_vs_stack_entries = 128;
+	    num_gs_stack_entries = 0;
+	    num_es_stack_entries = 0;
+	    break;
+    case CHIP_FAMILY_RV630:
+    case CHIP_FAMILY_RV635:
+	    num_ps_gprs = 84;
+	    num_vs_gprs = 36;
+	    num_temp_gprs = 4;
+	    num_gs_gprs = 0;
+	    num_es_gprs = 0;
+	    num_ps_threads = 144;
+	    num_vs_threads = 40;
+	    num_gs_threads = 4;
+	    num_es_threads = 4;
+	    num_ps_stack_entries = 40;
+	    num_vs_stack_entries = 40;
+	    num_gs_stack_entries = 32;
+	    num_es_stack_entries = 16;
+	    break;
+    case CHIP_FAMILY_RV610:
+    case CHIP_FAMILY_RV620:
+    case CHIP_FAMILY_RS780:
+    case CHIP_FAMILY_RS880:
+    default:
+	    num_ps_gprs = 84;
+	    num_vs_gprs = 36;
+	    num_temp_gprs = 4;
+	    num_gs_gprs = 0;
+	    num_es_gprs = 0;
+	    num_ps_threads = 136;
+	    num_vs_threads = 48;
+	    num_gs_threads = 4;
+	    num_es_threads = 4;
+	    num_ps_stack_entries = 40;
+	    num_vs_stack_entries = 40;
+	    num_gs_stack_entries = 32;
+	    num_es_stack_entries = 16;
+	    break;
+    case CHIP_FAMILY_RV670:
+	    num_ps_gprs = 144;
+	    num_vs_gprs = 40;
+	    num_temp_gprs = 4;
+	    num_gs_gprs = 0;
+	    num_es_gprs = 0;
+	    num_ps_threads = 136;
+	    num_vs_threads = 48;
+	    num_gs_threads = 4;
+	    num_es_threads = 4;
+	    num_ps_stack_entries = 40;
+	    num_vs_stack_entries = 40;
+	    num_gs_stack_entries = 32;
+	    num_es_stack_entries = 16;
+	    break;
+    case CHIP_FAMILY_RV770:
+	    num_ps_gprs = 192;
+	    num_vs_gprs = 56;
+	    num_temp_gprs = 4;
+	    num_gs_gprs = 0;
+	    num_es_gprs = 0;
+	    num_ps_threads = 188;
+	    num_vs_threads = 60;
+	    num_gs_threads = 0;
+	    num_es_threads = 0;
+	    num_ps_stack_entries = 256;
+	    num_vs_stack_entries = 256;
+	    num_gs_stack_entries = 0;
+	    num_es_stack_entries = 0;
+	    break;
+    case CHIP_FAMILY_RV730:
+    case CHIP_FAMILY_RV740:
+	    num_ps_gprs = 84;
+	    num_vs_gprs = 36;
+	    num_temp_gprs = 4;
+	    num_gs_gprs = 0;
+	    num_es_gprs = 0;
+	    num_ps_threads = 188;
+	    num_vs_threads = 60;
+	    num_gs_threads = 0;
+	    num_es_threads = 0;
+	    num_ps_stack_entries = 128;
+	    num_vs_stack_entries = 128;
+	    num_gs_stack_entries = 0;
+	    num_es_stack_entries = 0;
+	    break;
+    case CHIP_FAMILY_RV710:
+	    num_ps_gprs = 192;
+	    num_vs_gprs = 56;
+	    num_temp_gprs = 4;
+	    num_gs_gprs = 0;
+	    num_es_gprs = 0;
+	    num_ps_threads = 144;
+	    num_vs_threads = 48;
+	    num_gs_threads = 0;
+	    num_es_threads = 0;
+	    num_ps_stack_entries = 128;
+	    num_vs_stack_entries = 128;
+	    num_gs_stack_entries = 0;
+	    num_es_stack_entries = 0;
+	    break;
+    }
+
+    sq_config = 0;
+    if ((context->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV610) ||
+        (context->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV620) ||
+	(context->radeon.radeonScreen->chip_family == CHIP_FAMILY_RS780) ||
+	(context->radeon.radeonScreen->chip_family == CHIP_FAMILY_RS880) ||
+        (context->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV710))
+	    CLEARbit(sq_config, VC_ENABLE_bit);
+    else
+	    SETbit(sq_config, VC_ENABLE_bit);
+    SETbit(sq_config, DX9_CONSTS_bit);
+    SETbit(sq_config, ALU_INST_PREFER_VECTOR_bit);
+    SETfield(sq_config, ps_prio, PS_PRIO_shift, PS_PRIO_mask);
+    SETfield(sq_config, vs_prio, VS_PRIO_shift, VS_PRIO_mask);
+    SETfield(sq_config, gs_prio, GS_PRIO_shift, GS_PRIO_mask);
+    SETfield(sq_config, es_prio, ES_PRIO_shift, ES_PRIO_mask);
+
+    sq_gpr_resource_mgmt_1 = 0;
+    SETfield(sq_gpr_resource_mgmt_1, num_ps_gprs, NUM_PS_GPRS_shift, NUM_PS_GPRS_mask);
+    SETfield(sq_gpr_resource_mgmt_1, num_vs_gprs, NUM_VS_GPRS_shift, NUM_VS_GPRS_mask);
+    SETfield(sq_gpr_resource_mgmt_1, num_temp_gprs,
+	     NUM_CLAUSE_TEMP_GPRS_shift, NUM_CLAUSE_TEMP_GPRS_mask);
+
+    sq_gpr_resource_mgmt_2 = 0;
+    SETfield(sq_gpr_resource_mgmt_2, num_gs_gprs, NUM_GS_GPRS_shift, NUM_GS_GPRS_mask);
+    SETfield(sq_gpr_resource_mgmt_2, num_es_gprs, NUM_ES_GPRS_shift, NUM_ES_GPRS_mask);
+
+    sq_thread_resource_mgmt = 0;
+    SETfield(sq_thread_resource_mgmt, num_ps_threads,
+	     NUM_PS_THREADS_shift, NUM_PS_THREADS_mask);
+    SETfield(sq_thread_resource_mgmt, num_vs_threads,
+	     NUM_VS_THREADS_shift, NUM_VS_THREADS_mask);
+    SETfield(sq_thread_resource_mgmt, num_gs_threads,
+	     NUM_GS_THREADS_shift, NUM_GS_THREADS_mask);
+    SETfield(sq_thread_resource_mgmt, num_es_threads,
+	     NUM_ES_THREADS_shift, NUM_ES_THREADS_mask);
+
+    sq_stack_resource_mgmt_1 = 0;
+    SETfield(sq_stack_resource_mgmt_1, num_ps_stack_entries,
+	     NUM_PS_STACK_ENTRIES_shift, NUM_PS_STACK_ENTRIES_mask);
+    SETfield(sq_stack_resource_mgmt_1, num_vs_stack_entries,
+	     NUM_VS_STACK_ENTRIES_shift, NUM_VS_STACK_ENTRIES_mask);
+
+    sq_stack_resource_mgmt_2 = 0;
+    SETfield(sq_stack_resource_mgmt_2, num_gs_stack_entries,
+	     NUM_GS_STACK_ENTRIES_shift, NUM_GS_STACK_ENTRIES_mask);
+    SETfield(sq_stack_resource_mgmt_2, num_es_stack_entries,
+	     NUM_ES_STACK_ENTRIES_shift, NUM_ES_STACK_ENTRIES_mask);
+
+    ta_cntl_aux = 0;
+    SETfield(ta_cntl_aux, 28, TD_FIFO_CREDIT_shift, TD_FIFO_CREDIT_mask);
+    db_watermarks = 0;
+    SETfield(db_watermarks, 4, DEPTH_FREE_shift, DEPTH_FREE_mask);
+    SETfield(db_watermarks, 16, DEPTH_FLUSH_shift, DEPTH_FLUSH_mask);
+    SETfield(db_watermarks, 0, FORCE_SUMMARIZE_shift, FORCE_SUMMARIZE_mask);
+    SETfield(db_watermarks, 4, DEPTH_PENDING_FREE_shift, DEPTH_PENDING_FREE_mask);
+    sq_dyn_gpr_cntl_ps_flush_req = 0;
+    db_debug = 0;
+    if (context->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV770) {
+	    SETfield(ta_cntl_aux, 3, GRADIENT_CREDIT_shift, GRADIENT_CREDIT_mask);
+	    db_debug = 0x82000000;
+	    SETfield(db_watermarks, 16, DEPTH_CACHELINE_FREE_shift, DEPTH_CACHELINE_FREE_mask);
+    } else {
+	    SETfield(ta_cntl_aux, 2, GRADIENT_CREDIT_shift, GRADIENT_CREDIT_mask);
+	    SETfield(db_watermarks, 4, DEPTH_CACHELINE_FREE_shift, DEPTH_CACHELINE_FREE_mask);
+	    SETbit(sq_dyn_gpr_cntl_ps_flush_req, VS_PC_LIMIT_ENABLE_bit);
+    }
+
+    BEGIN_BATCH_NO_AUTOSTATE(117);
+    R600_OUT_BATCH_REGSEQ(SQ_CONFIG, 6);
+    R600_OUT_BATCH(sq_config);
+    R600_OUT_BATCH(sq_gpr_resource_mgmt_1);
+    R600_OUT_BATCH(sq_gpr_resource_mgmt_2);
+    R600_OUT_BATCH(sq_thread_resource_mgmt);
+    R600_OUT_BATCH(sq_stack_resource_mgmt_1);
+    R600_OUT_BATCH(sq_stack_resource_mgmt_2);
+
+    R600_OUT_BATCH_REGVAL(TA_CNTL_AUX, ta_cntl_aux);
+    R600_OUT_BATCH_REGVAL(VC_ENHANCE, 0);
+    R600_OUT_BATCH_REGVAL(R7xx_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ, sq_dyn_gpr_cntl_ps_flush_req);
+    R600_OUT_BATCH_REGVAL(DB_DEBUG, db_debug);
+    R600_OUT_BATCH_REGVAL(DB_WATERMARKS, db_watermarks);
+
+    R600_OUT_BATCH_REGSEQ(SQ_ESGS_RING_ITEMSIZE, 9);
+    R600_OUT_BATCH(0);
+    R600_OUT_BATCH(0);
+    R600_OUT_BATCH(0);
+    R600_OUT_BATCH(0);
+    R600_OUT_BATCH(0);
+    R600_OUT_BATCH(0);
+    R600_OUT_BATCH(0);
+    R600_OUT_BATCH(0);
+    R600_OUT_BATCH(0);
+
+    R600_OUT_BATCH_REGVAL(CB_CLRCMP_CONTROL,
+                         (CLRCMP_SEL_SRC << CLRCMP_FCN_SEL_shift));
+    R600_OUT_BATCH_REGVAL(SQ_VTX_BASE_VTX_LOC, 0);
+    R600_OUT_BATCH_REGVAL(SQ_VTX_START_INST_LOC, 0);
+    R600_OUT_BATCH_REGVAL(DB_DEPTH_INFO, 0);
+    R600_OUT_BATCH_REGVAL(DB_DEPTH_CONTROL, 0);
+    R600_OUT_BATCH_REGVAL(CB_SHADER_MASK, (OUTPUT0_ENABLE_mask));
+    R600_OUT_BATCH_REGVAL(CB_TARGET_MASK, (TARGET0_ENABLE_mask));
+    R600_OUT_BATCH_REGVAL(R7xx_CB_SHADER_CONTROL, (RT0_ENABLE_bit));
+    R600_OUT_BATCH_REGVAL(CB_COLOR_CONTROL, (0xcc << ROP3_shift));
+
+    R600_OUT_BATCH_REGVAL(PA_CL_VTE_CNTL, VTX_XY_FMT_bit);
+    R600_OUT_BATCH_REGVAL(PA_CL_VS_OUT_CNTL, 0);
+    R600_OUT_BATCH_REGVAL(PA_CL_CLIP_CNTL, CLIP_DISABLE_bit);
+    R600_OUT_BATCH_REGVAL(PA_SU_SC_MODE_CNTL, (FACE_bit) |
+        (POLYMODE_PTYPE__TRIANGLES << POLYMODE_FRONT_PTYPE_shift) |
+        (POLYMODE_PTYPE__TRIANGLES << POLYMODE_BACK_PTYPE_shift));
+    R600_OUT_BATCH_REGVAL(PA_SU_VTX_CNTL, (PIX_CENTER_bit) |
+        (X_ROUND_TO_EVEN << PA_SU_VTX_CNTL__ROUND_MODE_shift) |
+        (X_1_256TH << QUANT_MODE_shift));
+
+    R600_OUT_BATCH_REGSEQ(VGT_MAX_VTX_INDX, 4);
+    R600_OUT_BATCH(2048);
+    R600_OUT_BATCH(0);
+    R600_OUT_BATCH(0);
+    R600_OUT_BATCH(0);
+
+    R600_OUT_BATCH_REGSEQ(VGT_OUTPUT_PATH_CNTL, 13);
+    R600_OUT_BATCH(0);
+    R600_OUT_BATCH(0);
+    R600_OUT_BATCH(0);
+    R600_OUT_BATCH(0);
+    R600_OUT_BATCH(0);
+    R600_OUT_BATCH(0);
+    R600_OUT_BATCH(0);
+    R600_OUT_BATCH(0);
+    R600_OUT_BATCH(0);
+    R600_OUT_BATCH(0);
+    R600_OUT_BATCH(0);
+    R600_OUT_BATCH(0);
+    R600_OUT_BATCH(0);
+
+    R600_OUT_BATCH_REGVAL(VGT_PRIMITIVEID_EN, 0);
+    R600_OUT_BATCH_REGVAL(VGT_MULTI_PRIM_IB_RESET_EN, 0);
+    R600_OUT_BATCH_REGVAL(VGT_INSTANCE_STEP_RATE_0, 0);
+    R600_OUT_BATCH_REGVAL(VGT_INSTANCE_STEP_RATE_1, 0);
+
+    R600_OUT_BATCH_REGSEQ(VGT_STRMOUT_EN, 3);
+    R600_OUT_BATCH(0);
+    R600_OUT_BATCH(0);
+    R600_OUT_BATCH(0);
+
+    R600_OUT_BATCH_REGVAL(VGT_STRMOUT_BUFFER_EN, 0);
+
+    END_BATCH();
+    COMMIT_BATCH();
+}
+
+static GLboolean validate_buffers(context_t *rmesa,
+                                  struct radeon_bo *src_bo,
+                                  struct radeon_bo *dst_bo)
+{
+    int ret;
+    radeon_cs_space_add_persistent_bo(rmesa->radeon.cmdbuf.cs,
+                                      src_bo, RADEON_GEM_DOMAIN_VRAM, 0);
+
+    radeon_cs_space_add_persistent_bo(rmesa->radeon.cmdbuf.cs,
+                                      dst_bo, 0, RADEON_GEM_DOMAIN_VRAM);
+
+    radeon_cs_space_add_persistent_bo(rmesa->radeon.cmdbuf.cs,
+                                      rmesa->blit_bo, RADEON_GEM_DOMAIN_GTT, 0);
+
+    ret = radeon_cs_space_check_with_bo(rmesa->radeon.cmdbuf.cs,
+					rmesa->blit_bo,
+					RADEON_GEM_DOMAIN_GTT, 0);
+    if (ret)
+        return GL_FALSE;
+
+    ret = radeon_cs_space_check_with_bo(rmesa->radeon.cmdbuf.cs,
+                                        first_elem(&rmesa->radeon.dma.reserved)->bo,
+                                        RADEON_GEM_DOMAIN_GTT, 0);
+    if (ret)
+        return GL_FALSE;
+
+    return GL_TRUE;
+}
+
+unsigned r600_blit(GLcontext *ctx,
+                   struct radeon_bo *src_bo,
+                   intptr_t src_offset,
+                   gl_format src_mesaformat,
+                   unsigned src_pitch,
+                   unsigned src_width,
+                   unsigned src_height,
+                   unsigned src_x,
+                   unsigned src_y,
+                   struct radeon_bo *dst_bo,
+                   intptr_t dst_offset,
+                   gl_format dst_mesaformat,
+                   unsigned dst_pitch,
+                   unsigned dst_width,
+                   unsigned dst_height,
+                   unsigned dst_x,
+                   unsigned dst_y,
+                   unsigned w,
+                   unsigned h,
+                   unsigned flip_y)
+{
+    context_t *context = R700_CONTEXT(ctx);
+    int id = 0;
+
+    if (!is_blit_supported(dst_mesaformat))
+        return GL_FALSE;
+
+    if (src_bo == dst_bo) {
+        return GL_FALSE;
+    }
+
+    if (src_offset % 256 || dst_offset % 256) {
+        return GL_FALSE;
+    }
+
+    if (0) {
+        fprintf(stderr, "src: width %d, height %d, pitch %d vs %d, format %s\n",
+                src_width, src_height, src_pitch,
+                _mesa_format_row_stride(src_mesaformat, src_width),
+                _mesa_get_format_name(src_mesaformat));
+        fprintf(stderr, "dst: width %d, height %d, pitch %d, format %s\n",
+                dst_width, dst_height,
+                _mesa_format_row_stride(dst_mesaformat, dst_width),
+                _mesa_get_format_name(dst_mesaformat));
+    }
+
+    /* Flush is needed to make sure that source buffer has correct data */
+    radeonFlush(ctx);
+
+    rcommonEnsureCmdBufSpace(&context->radeon, 304, __FUNCTION__);
+
+    /* load shaders */
+    load_shaders(context->radeon.glCtx);
+
+    if (!validate_buffers(context, src_bo, dst_bo))
+        return GL_FALSE;
+
+    /* set clear state */
+    /* 117 */
+    set_default_state(context);
+
+    /* shaders */
+    /* 72 */
+    set_shaders(context);
+
+    /* src */
+    /* 20 */
+    set_tex_resource(context, src_mesaformat, src_bo,
+		     src_width, src_height, src_pitch, src_offset);
+
+    /* 5 */
+    set_tex_sampler(context);
+
+    /* dst */
+    /* 27 */
+    set_render_target(context, dst_bo, dst_mesaformat,
+		      dst_pitch, dst_width, dst_height, dst_offset);
+    /* scissors */
+    /* 17 */
+    set_scissors(context, dst_x, dst_y, dst_x + dst_width, dst_y + dst_height);
+
+    set_vb_data(context, src_x, src_y, dst_x, dst_y, w, h, src_height, flip_y);
+    /* Vertex buffer setup */
+    /* 24 */
+    set_vtx_resource(context);
+
+    /* draw */
+    /* 10 */
+    draw_auto(context);
+
+    /* 7 */
+    r700SyncSurf(context, dst_bo, 0,
+                 RADEON_GEM_DOMAIN_VRAM|RADEON_GEM_DOMAIN_GTT,
+		 CB_ACTION_ENA_bit | (1 << (id + 6)));
+
+    /* 5 */
+    r700WaitForIdleClean(context);
+
+    radeonFlush(ctx);
+
+    return GL_TRUE;
+}
diff --git a/src/mesa/drivers/dri/r600/r600_blit.h b/src/mesa/drivers/dri/r600/r600_blit.h
new file mode 100644
index 0000000000..f280e23489
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r600_blit.h
@@ -0,0 +1,21 @@
+unsigned r600_blit(GLcontext *ctx,
+                   struct radeon_bo *src_bo,
+                   intptr_t src_offset,
+                   gl_format src_mesaformat,
+                   unsigned src_pitch,
+                   unsigned src_width,
+                   unsigned src_height,
+                   unsigned src_x_offset,
+                   unsigned src_y_offset,
+                   struct radeon_bo *dst_bo,
+                   intptr_t dst_offset,
+                   gl_format dst_mesaformat,
+                   unsigned dst_pitch,
+                   unsigned dst_width,
+                   unsigned dst_height,
+                   unsigned dst_x_offset,
+                   unsigned dst_y_offset,
+                   unsigned w,
+                   unsigned h,
+                   unsigned flip_y);
+
diff --git a/src/mesa/drivers/dri/r600/r600_blit_shaders.h b/src/mesa/drivers/dri/r600/r600_blit_shaders.h
new file mode 100644
index 0000000000..492dde9636
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r600_blit_shaders.h
@@ -0,0 +1,28 @@
+const uint32_t r6xx_vs[] =
+{
+        0x00000004, // CF_DWORD0(ADDR(4))
+        0x81000000, // SQ_CF_INST_VTX COUNT(1)
+        0x0000203c, // CF_EXP_IMP CF_POS0 SQ_EXPORT_POS RW_GPR(0) ELEM_SIZE(0)
+        0x94000b08, // SQ_CF_INST_EXPORT_DONE SWZ XY01 BARRIER(1)
+        0x00004000, // CF_EXP_IMP 0 SQ_EXPORT_PARAM RW_GPR(0) ELEM_SIZE(0)
+        0x14200b1a, // SQ_CF_INST_EXPORT_DONE SWZ ZW01 EOP(1) BARRIER(0)
+        0x00000000,
+        0x00000000,
+        0x3c000000, // SQ_VTX_INST_FETCH BUFFER_ID(0) MEGA_FETCH_COUNT(16)
+        0x68cd1000, // DST_GPR(0) DST_SWZ: XYZW DATA_FORMAT(35) SQ_NUM_FORMAT_SCALED SQ_FORMAT_COMP_SIGNED
+        0x00080000, // ENDIAN_SWAP(SQ_ENDIAN_NONE) MEGA_FETCH(1)
+        0x00000000, // VTX_DWORD_PAD
+};
+
+const uint32_t r6xx_ps[] =
+{
+        0x00000002, // CF_DWORD0 AADR(2)
+        0x80800000, // SQ_CF_INST_TEX COUNT(1)
+        0x00000000, // CF_ALLOC_IMP_EXP0 SQ_EXPORT_PIXEL RW_GPR(0) ELEM_SIZE(0)
+        0x94200688, // SQ_CF_INST_EXPORT_DONE EOP(1) BARRIER(1) SWZ: XYZW
+        0x00000010, // SQ_TEX_INST_SAMPLE SRC_GPR(0) RESOURCE_ID(0)
+        0x000d1000, // DST_GPR(0) SWZ: XYZW TEX_UNNORMALIZED
+        0xb0800000, // SAMPLER_ID(0) SRC_SWZ XYZW
+        0x00000000, // TEX_DWORD_PAD
+};
+
diff --git a/src/mesa/drivers/dri/r600/r600_context.c b/src/mesa/drivers/dri/r600/r600_context.c
index cb549497f5..68112c49dc 100644
--- a/src/mesa/drivers/dri/r600/r600_context.c
+++ b/src/mesa/drivers/dri/r600/r600_context.c
@@ -65,6 +65,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r600_emit.h"
 #include "radeon_bocs_wrapper.h"
 #include "radeon_queryobj.h"
+#include "r600_blit.h"
 
 #include "r700_state.h"
 #include "r700_ioctl.h"
@@ -240,6 +241,7 @@ static void r600_init_vtbl(radeonContextPtr radeon)
 	radeon->vtbl.pre_emit_atoms = r600_vtbl_pre_emit_atoms;
 	radeon->vtbl.fallback = r600_fallback;
 	radeon->vtbl.emit_query_finish = r600_emit_query_finish;
+	radeon->vtbl.blit = r600_blit;
 }
 
 static void r600InitConstValues(GLcontext *ctx, radeonScreenPtr screen)
@@ -378,7 +380,7 @@ GLboolean r600CreateContext(const __GLcontextModes * glVisual,
 	_mesa_init_driver_functions(&functions);
 
 	r700InitStateFuncs(&functions);
-	r600InitTextureFuncs(&functions);
+	r600InitTextureFuncs(&r600->radeon, &functions);
 	r700InitShaderFuncs(&functions);
 	radeonInitQueryObjFunctions(&functions);
 	r700InitIoctlFuncs(&functions);
diff --git a/src/mesa/drivers/dri/r600/r600_context.h b/src/mesa/drivers/dri/r600/r600_context.h
index a1b4af715e..72c8c869b7 100644
--- a/src/mesa/drivers/dri/r600/r600_context.h
+++ b/src/mesa/drivers/dri/r600/r600_context.h
@@ -148,6 +148,8 @@ struct r600_context {
 	GLint      nNumActiveAos;
 	StreamDesc stream_desc[VERT_ATTRIB_MAX];
     struct r700_index_buffer ind_buf;
+	struct radeon_bo *blit_bo;
+	GLboolean blit_bo_loaded;
 };
 
 #define R700_CONTEXT(ctx)		((context_t *)(ctx->DriverCtx))
@@ -178,6 +180,8 @@ extern GLboolean r700SyncSurf(context_t *context,
 			      uint32_t write_domain,
 			      uint32_t sync_type);
 
+extern void r700WaitForIdleClean(context_t *context);
+
 extern void r700Start3D(context_t *context);
 extern void r600InitAtoms(context_t *context);
 extern void r700InitDraw(GLcontext *ctx);
diff --git a/src/mesa/drivers/dri/r600/r600_tex.c b/src/mesa/drivers/dri/r600/r600_tex.c
index f745fe3e8a..71dfd7e059 100644
--- a/src/mesa/drivers/dri/r600/r600_tex.c
+++ b/src/mesa/drivers/dri/r600/r600_tex.c
@@ -396,7 +396,7 @@ static struct gl_texture_object *r600NewTextureObject(GLcontext * ctx,
 	return &t->base;
 }
 
-void r600InitTextureFuncs(struct dd_function_table *functions)
+void r600InitTextureFuncs(radeonContextPtr radeon, struct dd_function_table *functions)
 {
 	/* Note: we only plug in the functions we implement in the driver
 	 * since _mesa_init_driver_functions() was already called.
@@ -424,6 +424,11 @@ void r600InitTextureFuncs(struct dd_function_table *functions)
 	functions->CompressedTexImage2D = radeonCompressedTexImage2D;
 	functions->CompressedTexSubImage2D = radeonCompressedTexSubImage2D;
 
+	if (radeon->radeonScreen->kernel_mm) {
+		functions->CopyTexImage2D = radeonCopyTexImage2D;
+		functions->CopyTexSubImage2D = radeonCopyTexSubImage2D;
+	}
+
 	functions->GenerateMipmap = radeonGenerateMipmap;
 
 	driInitTextureFormats();
diff --git a/src/mesa/drivers/dri/r600/r600_tex.h b/src/mesa/drivers/dri/r600/r600_tex.h
index fb0e1a023e..c2141ef5e5 100644
--- a/src/mesa/drivers/dri/r600/r600_tex.h
+++ b/src/mesa/drivers/dri/r600/r600_tex.h
@@ -58,6 +58,6 @@ extern void r600SetTexOffset(__DRIcontext *pDRICtx, GLint texname,
 
 extern GLboolean r600ValidateBuffers(GLcontext * ctx);
 
-extern void r600InitTextureFuncs(struct dd_function_table *functions);
+extern void r600InitTextureFuncs(radeonContextPtr radeon, struct dd_function_table *functions);
 
 #endif				/* __r600_TEX_H__ */
diff --git a/src/mesa/drivers/dri/r600/r700_assembler.c b/src/mesa/drivers/dri/r600/r700_assembler.c
index 0ff16b4ddd..c01b2fbb14 100644
--- a/src/mesa/drivers/dri/r600/r700_assembler.c
+++ b/src/mesa/drivers/dri/r600/r700_assembler.c
@@ -4469,7 +4469,7 @@ GLboolean assemble_TEX(r700_AssemblerBase *pAsm)
 	}
 	pAsm->D2.dst2.SaturateMode = 1;
 
-	pAsm->S[0].src.rtype = pAsm->D.dst.rtype;
+	pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
 	pAsm->S[0].src.reg = pAsm->D.dst.reg;
 	noswizzle_PVSSRC(&(pAsm->S[0].src));
 	noneg_PVSSRC(&(pAsm->S[0].src));
@@ -5090,15 +5090,15 @@ void add_return_inst(r700_AssemblerBase *pAsm)
 {
     if(GL_FALSE == add_cf_instruction(pAsm) )
     {
-        return GL_FALSE;
+        return;
     }
     //pAsm->cf_current_cf_clause_ptr->m_Word1.f.pop_count        = 1;
     pAsm->cf_current_cf_clause_ptr->m_Word1.f.pop_count        = 0;
-    pAsm->cf_current_cf_clause_ptr->m_Word1.f.cf_const         = 0x0; 
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.cf_const         = 0x0;
     pAsm->cf_current_cf_clause_ptr->m_Word1.f.cond             = SQ_CF_COND_ACTIVE;
 
     pAsm->cf_current_cf_clause_ptr->m_Word1.f.end_of_program   = 0x0;
-    pAsm->cf_current_cf_clause_ptr->m_Word1.f.valid_pixel_mode = 0x0; 
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.valid_pixel_mode = 0x0;
     pAsm->cf_current_cf_clause_ptr->m_Word1.f.cf_inst          = SQ_CF_INST_RETURN;
     pAsm->cf_current_cf_clause_ptr->m_Word1.f.whole_quad_mode  = 0x0;
 
@@ -5302,7 +5302,7 @@ GLboolean assemble_CAL(r700_AssemblerBase *pAsm,
 
 GLboolean setRetInLoopFlag(r700_AssemblerBase *pAsm, GLuint flagValue)
 {
-    GLfloat fLiteral[2] = {0.1, 0.0};
+    /*GLfloat fLiteral[2] = {0.1, 0.0};*/
 
     pAsm->D.dst.opcode   = SQ_OP2_INST_MOV;
     pAsm->D.dst.op3      = 0;
@@ -5353,7 +5353,7 @@ GLboolean setRetInLoopFlag(r700_AssemblerBase *pAsm, GLuint flagValue)
 
 GLboolean testFlag(r700_AssemblerBase *pAsm)
 {
-    GLfloat fLiteral[2] = {0.1, 0.0};
+    /*GLfloat fLiteral[2] = {0.1, 0.0};*/
 
     //Test flag
     GLuint tmp = gethelpr(pAsm);
@@ -6123,7 +6123,7 @@ GLboolean callPreSub(r700_AssemblerBase* pAsm,
 
     R700ControlFlowGenericClause* prelude_cf_ptr = NULL;
 
-    /* copy srcs to presub inputs */  
+    /* copy srcs to presub inputs */
     pAsm->alu_x_opcode = SQ_CF_INST_ALU;
     for(i=0; i<uNumValidSrc; i++)
     {
diff --git a/src/mesa/drivers/dri/r600/r700_assembler.h b/src/mesa/drivers/dri/r600/r700_assembler.h
index 56baf5b0d9..0064d0814f 100644
--- a/src/mesa/drivers/dri/r600/r700_assembler.h
+++ b/src/mesa/drivers/dri/r600/r700_assembler.h
@@ -619,6 +619,7 @@ GLboolean assemble_RCP(r700_AssemblerBase *pAsm);
 GLboolean assemble_RSQ(r700_AssemblerBase *pAsm);
 GLboolean assemble_SCS(r700_AssemblerBase *pAsm);
 GLboolean assemble_SGE(r700_AssemblerBase *pAsm);
+GLboolean assemble_CONT(r700_AssemblerBase *pAsm);
 
 GLboolean assemble_LOGIC(r700_AssemblerBase *pAsm, BITS opcode);
 GLboolean assemble_LOGIC_PRED(r700_AssemblerBase *pAsm, BITS opcode); 
diff --git a/src/mesa/drivers/dri/r600/r700_chip.c b/src/mesa/drivers/dri/r600/r700_chip.c
index 3bc2d2ba02..1a1a87c3cf 100644
--- a/src/mesa/drivers/dri/r600/r700_chip.c
+++ b/src/mesa/drivers/dri/r600/r700_chip.c
@@ -453,13 +453,31 @@ static void r700SendRenderTargetState(GLcontext *ctx, struct radeon_state_atom *
 		R600_OUT_BATCH((2 << id));
 		END_BATCH();
 	}
+	/* Set CMASK & TILE buffer to the offset of color buffer as
+	 * we don't use those this shouldn't cause any issue and we
+	 * then have a valid cmd stream
+	 */
+	BEGIN_BATCH_NO_AUTOSTATE(3 + 2);
+	R600_OUT_BATCH_REGSEQ(CB_COLOR0_TILE + (4 * id), 1);
+	R600_OUT_BATCH(r700->render_target[id].CB_COLOR0_TILE.u32All);
+	R600_OUT_BATCH_RELOC(r700->render_target[id].CB_COLOR0_BASE.u32All,
+			     rrb->bo,
+			     r700->render_target[id].CB_COLOR0_BASE.u32All,
+			     0, RADEON_GEM_DOMAIN_VRAM, 0);
+	END_BATCH();
+	BEGIN_BATCH_NO_AUTOSTATE(3 + 2);
+	R600_OUT_BATCH_REGSEQ(CB_COLOR0_FRAG + (4 * id), 1);
+	R600_OUT_BATCH(r700->render_target[id].CB_COLOR0_FRAG.u32All);
+	R600_OUT_BATCH_RELOC(r700->render_target[id].CB_COLOR0_BASE.u32All,
+			     rrb->bo,
+			     r700->render_target[id].CB_COLOR0_BASE.u32All,
+			     0, RADEON_GEM_DOMAIN_VRAM, 0);
+        END_BATCH();
 
-        BEGIN_BATCH_NO_AUTOSTATE(18);
+        BEGIN_BATCH_NO_AUTOSTATE(12);
 	R600_OUT_BATCH_REGVAL(CB_COLOR0_SIZE + (4 * id), r700->render_target[id].CB_COLOR0_SIZE.u32All);
 	R600_OUT_BATCH_REGVAL(CB_COLOR0_VIEW + (4 * id), r700->render_target[id].CB_COLOR0_VIEW.u32All);
 	R600_OUT_BATCH_REGVAL(CB_COLOR0_INFO + (4 * id), r700->render_target[id].CB_COLOR0_INFO.u32All);
-	R600_OUT_BATCH_REGVAL(CB_COLOR0_TILE + (4 * id), r700->render_target[id].CB_COLOR0_TILE.u32All);
-	R600_OUT_BATCH_REGVAL(CB_COLOR0_FRAG + (4 * id), r700->render_target[id].CB_COLOR0_FRAG.u32All);
 	R600_OUT_BATCH_REGVAL(CB_COLOR0_MASK + (4 * id), r700->render_target[id].CB_COLOR0_MASK.u32All);
         END_BATCH();
 
diff --git a/src/mesa/drivers/dri/r600/r700_render.c b/src/mesa/drivers/dri/r600/r700_render.c
index eab27cbd84..3a6210c53a 100644
--- a/src/mesa/drivers/dri/r600/r700_render.c
+++ b/src/mesa/drivers/dri/r600/r700_render.c
@@ -422,7 +422,7 @@ static void r700RunRenderPrimitiveImmediate(GLcontext * ctx, int start, int end,
 }
 
 /* start 3d, idle, cb/db flush */
-#define PRE_EMIT_STATE_BUFSZ 10 + 5 + 14
+#define PRE_EMIT_STATE_BUFSZ 10 + 5 + 18
 
 static GLuint r700PredictRenderSize(GLcontext* ctx,
 				    const struct _mesa_prim *prim,
diff --git a/src/mesa/drivers/dri/r600/radeon_tex_copy.c b/src/mesa/drivers/dri/r600/radeon_tex_copy.c
new file mode 120000
index 0000000000..dfa5ba34e6
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/radeon_tex_copy.c
@@ -0,0 +1 @@
+../radeon/radeon_tex_copy.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/radeon/Makefile b/src/mesa/drivers/dri/radeon/Makefile
index 2b2f2c4aa7..c776be0e60 100644
--- a/src/mesa/drivers/dri/radeon/Makefile
+++ b/src/mesa/drivers/dri/radeon/Makefile
@@ -26,7 +26,8 @@ RADEON_COMMON_SOURCES = \
 	radeon_mipmap_tree.c \
 	radeon_queryobj.c \
 	radeon_span.c \
-	radeon_texture.c
+	radeon_texture.c \
+	radeon_tex_copy.c
 
 DRIVER_SOURCES = \
 	radeon_context.c \
@@ -40,6 +41,7 @@ DRIVER_SOURCES = \
 	radeon_swtcl.c \
 	radeon_maos.c \
 	radeon_sanity.c \
+	radeon_blit.c \
 	$(RADEON_COMMON_SOURCES)
 
 C_SOURCES = \
diff --git a/src/mesa/drivers/dri/radeon/radeon_blit.c b/src/mesa/drivers/dri/radeon/radeon_blit.c
new file mode 100644
index 0000000000..0df4fbb33c
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_blit.c
@@ -0,0 +1,403 @@
+/*
+ * Copyright (C) 2010 Advanced Micro Devices, Inc.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "radeon_common.h"
+#include "radeon_context.h"
+#include "radeon_blit.h"
+
+static inline uint32_t cmdpacket0(struct radeon_screen *rscrn,
+                                  int reg, int count)
+{
+    if (count)
+	    return CP_PACKET0(reg, count - 1);
+    return CP_PACKET2;
+}
+
+/* common formats supported as both textures and render targets */
+static unsigned is_blit_supported(gl_format mesa_format)
+{
+    /* XXX others?  BE/LE? */
+    switch (mesa_format) {
+    case MESA_FORMAT_ARGB8888:
+    case MESA_FORMAT_XRGB8888:
+    case MESA_FORMAT_RGB565:
+    case MESA_FORMAT_ARGB4444:
+    case MESA_FORMAT_ARGB1555:
+    case MESA_FORMAT_A8:
+	    break;
+    default:
+	    return 0;
+    }
+
+    /* ??? */
+    if (_mesa_get_format_bits(mesa_format, GL_DEPTH_BITS) > 0)
+	    return 0;
+
+    return 1;
+}
+
+static inline void emit_vtx_state(struct r100_context *r100)
+{
+    BATCH_LOCALS(&r100->radeon);
+
+    BEGIN_BATCH(8);
+    if (r100->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
+	    OUT_BATCH_REGVAL(RADEON_SE_CNTL_STATUS, 0);
+    } else {
+	    OUT_BATCH_REGVAL(RADEON_SE_CNTL_STATUS, RADEON_TCL_BYPASS);
+
+    }
+    OUT_BATCH_REGVAL(RADEON_SE_COORD_FMT, (RADEON_VTX_XY_PRE_MULT_1_OVER_W0 |
+					   RADEON_TEX1_W_ROUTING_USE_W0));
+    OUT_BATCH_REGVAL(RADEON_SE_VTX_FMT, RADEON_SE_VTX_FMT_XY | RADEON_SE_VTX_FMT_ST0);
+    OUT_BATCH_REGVAL(RADEON_SE_CNTL, (RADEON_DIFFUSE_SHADE_GOURAUD |
+				      RADEON_BFACE_SOLID |
+				      RADEON_FFACE_SOLID |
+				      RADEON_VTX_PIX_CENTER_OGL |
+				      RADEON_ROUND_MODE_ROUND |
+				      RADEON_ROUND_PREC_4TH_PIX));
+    END_BATCH();
+}
+
+static void inline emit_tx_setup(struct r100_context *r100,
+				 gl_format mesa_format,
+				 struct radeon_bo *bo,
+				 intptr_t offset,
+				 unsigned width,
+				 unsigned height,
+				 unsigned pitch)
+{
+    uint32_t txformat = RADEON_TXFORMAT_NON_POWER2;
+    BATCH_LOCALS(&r100->radeon);
+
+    assert(width <= 2047);
+    assert(height <= 2047);
+    assert(offset % 32 == 0);
+
+    /* XXX others?  BE/LE? */
+    switch (mesa_format) {
+    case MESA_FORMAT_ARGB8888:
+	    txformat |= RADEON_TXFORMAT_ARGB8888 | RADEON_TXFORMAT_ALPHA_IN_MAP;
+	    break;
+    case MESA_FORMAT_XRGB8888:
+	    txformat |= RADEON_TXFORMAT_ARGB8888;
+	    break;
+    case MESA_FORMAT_RGB565:
+	    txformat |= RADEON_TXFORMAT_RGB565;
+	    break;
+    case MESA_FORMAT_ARGB4444:
+	    txformat |= RADEON_TXFORMAT_ARGB4444 | RADEON_TXFORMAT_ALPHA_IN_MAP;
+	    break;
+    case MESA_FORMAT_ARGB1555:
+	    txformat |= RADEON_TXFORMAT_ARGB1555 | RADEON_TXFORMAT_ALPHA_IN_MAP;
+	    break;
+    case MESA_FORMAT_A8:
+	    txformat |= RADEON_TXFORMAT_I8 | RADEON_TXFORMAT_ALPHA_IN_MAP;
+	    break;
+    default:
+	    break;
+    }
+
+    BEGIN_BATCH(18);
+    OUT_BATCH_REGVAL(RADEON_PP_CNTL, RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE);
+    OUT_BATCH_REGVAL(RADEON_PP_TXCBLEND_0, (RADEON_COLOR_ARG_A_ZERO |
+					    RADEON_COLOR_ARG_B_ZERO |
+					    RADEON_COLOR_ARG_C_T0_COLOR |
+					    RADEON_BLEND_CTL_ADD |
+					    RADEON_CLAMP_TX));
+    OUT_BATCH_REGVAL(RADEON_PP_TXABLEND_0, (RADEON_ALPHA_ARG_A_ZERO |
+					    RADEON_ALPHA_ARG_B_ZERO |
+					    RADEON_ALPHA_ARG_C_T0_ALPHA |
+					    RADEON_BLEND_CTL_ADD |
+					    RADEON_CLAMP_TX));
+    OUT_BATCH_REGVAL(RADEON_PP_TXFILTER_0, (RADEON_CLAMP_S_CLAMP_LAST |
+					    RADEON_CLAMP_T_CLAMP_LAST |
+					    RADEON_MAG_FILTER_NEAREST |
+					    RADEON_MIN_FILTER_NEAREST));
+    OUT_BATCH_REGVAL(RADEON_PP_TXFORMAT_0, txformat);
+    OUT_BATCH_REGVAL(RADEON_PP_TEX_SIZE_0, ((width - 1) |
+					    ((height - 1) << RADEON_TEX_VSIZE_SHIFT)));
+    OUT_BATCH_REGVAL(RADEON_PP_TEX_PITCH_0, pitch * _mesa_get_format_bytes(mesa_format) - 32);
+
+    OUT_BATCH_REGSEQ(RADEON_PP_TXOFFSET_0, 1);
+    OUT_BATCH_RELOC(0, bo, 0, RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
+
+    END_BATCH();
+}
+
+static inline void emit_cb_setup(struct r100_context *r100,
+				 struct radeon_bo *bo,
+				 intptr_t offset,
+				 gl_format mesa_format,
+				 unsigned pitch,
+				 unsigned width,
+				 unsigned height)
+{
+    uint32_t dst_pitch = pitch;
+    uint32_t dst_format = 0;
+    BATCH_LOCALS(&r100->radeon);
+
+    /* XXX others?  BE/LE? */
+    switch (mesa_format) {
+    case MESA_FORMAT_ARGB8888:
+    case MESA_FORMAT_XRGB8888:
+	    dst_format = RADEON_COLOR_FORMAT_ARGB8888;
+	    break;
+    case MESA_FORMAT_RGB565:
+	    dst_format = RADEON_COLOR_FORMAT_RGB565;
+	    break;
+    case MESA_FORMAT_ARGB4444:
+	    dst_format = RADEON_COLOR_FORMAT_ARGB4444;
+	    break;
+    case MESA_FORMAT_ARGB1555:
+	    dst_format = RADEON_COLOR_FORMAT_ARGB1555;
+	    break;
+    case MESA_FORMAT_A8:
+	    dst_format = RADEON_COLOR_FORMAT_RGB8;
+	    break;
+    default:
+	    break;
+    }
+
+    BEGIN_BATCH_NO_AUTOSTATE(18);
+    OUT_BATCH_REGVAL(RADEON_RE_TOP_LEFT, 0);
+    OUT_BATCH_REGVAL(RADEON_RE_WIDTH_HEIGHT, ((width << RADEON_RE_WIDTH_SHIFT) |
+					      (height << RADEON_RE_HEIGHT_SHIFT)));
+    OUT_BATCH_REGVAL(RADEON_RB3D_PLANEMASK, 0xffffffff);
+    OUT_BATCH_REGVAL(RADEON_RB3D_BLENDCNTL, RADEON_SRC_BLEND_GL_ONE | RADEON_DST_BLEND_GL_ZERO);
+    OUT_BATCH_REGVAL(RADEON_RB3D_CNTL, dst_format);
+
+    OUT_BATCH_REGSEQ(RADEON_RB3D_COLOROFFSET, 1);
+    OUT_BATCH_RELOC(0, bo, 0, 0, RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0);
+    OUT_BATCH_REGSEQ(RADEON_RB3D_COLORPITCH, 1);
+    OUT_BATCH_RELOC(dst_pitch, bo, dst_pitch, 0, RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0);
+
+    END_BATCH();
+}
+
+static GLboolean validate_buffers(struct r100_context *r100,
+                                  struct radeon_bo *src_bo,
+                                  struct radeon_bo *dst_bo)
+{
+    int ret;
+    radeon_cs_space_add_persistent_bo(r100->radeon.cmdbuf.cs,
+                                      src_bo, RADEON_GEM_DOMAIN_VRAM, 0);
+
+    radeon_cs_space_add_persistent_bo(r100->radeon.cmdbuf.cs,
+                                      dst_bo, 0, RADEON_GEM_DOMAIN_VRAM);
+
+    ret = radeon_cs_space_check_with_bo(r100->radeon.cmdbuf.cs,
+                                        first_elem(&r100->radeon.dma.reserved)->bo,
+                                        RADEON_GEM_DOMAIN_GTT, 0);
+    if (ret)
+        return GL_FALSE;
+
+    return GL_TRUE;
+}
+
+/**
+ * Calculate texcoords for given image region.
+ * Output values are [minx, maxx, miny, maxy]
+ */
+static inline void calc_tex_coords(float img_width, float img_height,
+				   float x, float y,
+				   float reg_width, float reg_height,
+				   unsigned flip_y, float *buf)
+{
+    buf[0] = x / img_width;
+    buf[1] = buf[0] + reg_width / img_width;
+    buf[2] = y / img_height;
+    buf[3] = buf[2] + reg_height / img_height;
+    if (flip_y)
+    {
+        buf[2] = 1.0 - buf[2];
+        buf[3] = 1.0 - buf[3];
+    }
+}
+
+static inline void emit_draw_packet(struct r100_context *r100,
+				    unsigned src_width, unsigned src_height,
+				    unsigned src_x_offset, unsigned src_y_offset,
+				    unsigned dst_x_offset, unsigned dst_y_offset,
+				    unsigned reg_width, unsigned reg_height,
+				    unsigned flip_y)
+{
+    float texcoords[4];
+    float verts[12];
+    BATCH_LOCALS(&r100->radeon);
+
+    calc_tex_coords(src_width, src_height,
+                    src_x_offset, src_y_offset,
+                    reg_width, reg_height,
+                    flip_y, texcoords);
+
+    verts[0] = dst_x_offset;
+    verts[1] = dst_y_offset + reg_height;
+    verts[2] = texcoords[0];
+    verts[3] = texcoords[3];
+
+    verts[4] = dst_x_offset + reg_width;
+    verts[5] = dst_y_offset + reg_height;
+    verts[6] = texcoords[1];
+    verts[7] = texcoords[3];
+
+    verts[8] = dst_x_offset + reg_width;
+    verts[9] = dst_y_offset;
+    verts[10] = texcoords[1];
+    verts[11] = texcoords[2];
+
+    BEGIN_BATCH(15);
+    OUT_BATCH(RADEON_CP_PACKET3_3D_DRAW_IMMD | (13 << 16));
+    OUT_BATCH(RADEON_CP_VC_FRMT_XY | RADEON_CP_VC_FRMT_ST0);
+    OUT_BATCH(RADEON_CP_VC_CNTL_PRIM_WALK_RING |
+	      RADEON_CP_VC_CNTL_PRIM_TYPE_RECT_LIST |
+	      RADEON_CP_VC_CNTL_MAOS_ENABLE |
+	      RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE |
+              (3 << 16));
+    OUT_BATCH_TABLE(verts, 12);
+    END_BATCH();
+}
+
+/**
+ * Copy a region of [@a width x @a height] pixels from source buffer
+ * to destination buffer.
+ * @param[in] r100 r100 context
+ * @param[in] src_bo source radeon buffer object
+ * @param[in] src_offset offset of the source image in the @a src_bo
+ * @param[in] src_mesaformat source image format
+ * @param[in] src_pitch aligned source image width
+ * @param[in] src_width source image width
+ * @param[in] src_height source image height
+ * @param[in] src_x_offset x offset in the source image
+ * @param[in] src_y_offset y offset in the source image
+ * @param[in] dst_bo destination radeon buffer object
+ * @param[in] dst_offset offset of the destination image in the @a dst_bo
+ * @param[in] dst_mesaformat destination image format
+ * @param[in] dst_pitch aligned destination image width
+ * @param[in] dst_width destination image width
+ * @param[in] dst_height destination image height
+ * @param[in] dst_x_offset x offset in the destination image
+ * @param[in] dst_y_offset y offset in the destination image
+ * @param[in] width region width
+ * @param[in] height region height
+ * @param[in] flip_y set if y coords of the source image need to be flipped
+ */
+unsigned r100_blit(GLcontext *ctx,
+                   struct radeon_bo *src_bo,
+                   intptr_t src_offset,
+                   gl_format src_mesaformat,
+                   unsigned src_pitch,
+                   unsigned src_width,
+                   unsigned src_height,
+                   unsigned src_x_offset,
+                   unsigned src_y_offset,
+                   struct radeon_bo *dst_bo,
+                   intptr_t dst_offset,
+                   gl_format dst_mesaformat,
+                   unsigned dst_pitch,
+                   unsigned dst_width,
+                   unsigned dst_height,
+                   unsigned dst_x_offset,
+                   unsigned dst_y_offset,
+                   unsigned reg_width,
+                   unsigned reg_height,
+                   unsigned flip_y)
+{
+    struct r100_context *r100 = R100_CONTEXT(ctx);
+
+    if (!is_blit_supported(dst_mesaformat))
+        return GL_FALSE;
+
+    /* Make sure that colorbuffer has even width - hw limitation */
+    if (dst_pitch % 2 > 0)
+        ++dst_pitch;
+
+    /* Rendering to small buffer doesn't work.
+     * Looks like a hw limitation.
+     */
+    if (dst_pitch < 32)
+        return GL_FALSE;
+
+    /* Need to clamp the region size to make sure
+     * we don't read outside of the source buffer
+     * or write outside of the destination buffer.
+     */
+    if (reg_width + src_x_offset > src_width)
+        reg_width = src_width - src_x_offset;
+    if (reg_height + src_y_offset > src_height)
+        reg_height = src_height - src_y_offset;
+    if (reg_width + dst_x_offset > dst_width)
+        reg_width = dst_width - dst_x_offset;
+    if (reg_height + dst_y_offset > dst_height)
+        reg_height = dst_height - dst_y_offset;
+
+    if (src_bo == dst_bo) {
+        return GL_FALSE;
+    }
+
+    if (src_offset % 32 || dst_offset % 32) {
+        return GL_FALSE;
+    }
+
+    if (0) {
+        fprintf(stderr, "src: size [%d x %d], pitch %d, "
+                "offset [%d x %d], format %s, bo %p\n",
+                src_width, src_height, src_pitch,
+                src_x_offset, src_y_offset,
+                _mesa_get_format_name(src_mesaformat),
+                src_bo);
+        fprintf(stderr, "dst: pitch %d, offset[%d x %d], format %s, bo %p\n",
+                dst_pitch, dst_x_offset, dst_y_offset,
+                _mesa_get_format_name(dst_mesaformat), dst_bo);
+        fprintf(stderr, "region: %d x %d\n", reg_width, reg_height);
+    }
+
+    /* Flush is needed to make sure that source buffer has correct data */
+    radeonFlush(ctx);
+
+    rcommonEnsureCmdBufSpace(&r100->radeon, 59, __FUNCTION__);
+
+    if (!validate_buffers(r100, src_bo, dst_bo))
+        return GL_FALSE;
+
+    /* 8 */
+    emit_vtx_state(r100);
+    /* 18 */
+    emit_tx_setup(r100, src_mesaformat, src_bo, src_offset, src_width, src_height, src_pitch);
+    /* 18 */
+    emit_cb_setup(r100, dst_bo, dst_offset, dst_mesaformat, dst_pitch, dst_width, dst_height);
+    /* 15 */
+    emit_draw_packet(r100, src_width, src_height,
+                     src_x_offset, src_y_offset,
+                     dst_x_offset, dst_y_offset,
+                     reg_width, reg_height,
+                     flip_y);
+
+    radeonFlush(ctx);
+
+    return GL_TRUE;
+}
diff --git a/src/mesa/drivers/dri/radeon/radeon_blit.h b/src/mesa/drivers/dri/radeon/radeon_blit.h
new file mode 100644
index 0000000000..d36366ff79
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_blit.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (C) 2010 Advanced Micro Devices, Inc.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef RADEON_BLIT_H
+#define RADEON_BLIT_H
+
+void r100_blit_init(struct r100_context *r100);
+
+unsigned r100_blit(GLcontext *ctx,
+                   struct radeon_bo *src_bo,
+                   intptr_t src_offset,
+                   gl_format src_mesaformat,
+                   unsigned src_pitch,
+                   unsigned src_width,
+                   unsigned src_height,
+                   unsigned src_x_offset,
+                   unsigned src_y_offset,
+                   struct radeon_bo *dst_bo,
+                   intptr_t dst_offset,
+                   gl_format dst_mesaformat,
+                   unsigned dst_pitch,
+                   unsigned dst_width,
+                   unsigned dst_height,
+                   unsigned dst_x_offset,
+                   unsigned dst_y_offset,
+                   unsigned width,
+                   unsigned height,
+                   unsigned flip_y);
+
+#endif // RADEON_BLIT_H
diff --git a/src/mesa/drivers/dri/radeon/radeon_common_context.h b/src/mesa/drivers/dri/radeon/radeon_common_context.h
index ab79d2dc0f..e397ee8c22 100644
--- a/src/mesa/drivers/dri/radeon/radeon_common_context.h
+++ b/src/mesa/drivers/dri/radeon/radeon_common_context.h
@@ -518,6 +518,26 @@ struct radeon_context {
 	   void (*free_context)(GLcontext *ctx);
 	   void (*emit_query_finish)(radeonContextPtr radeon);
 	   void (*update_scissor)(GLcontext *ctx);
+	   unsigned (*blit)(GLcontext *ctx,
+                        struct radeon_bo *src_bo,
+                        intptr_t src_offset,
+                        gl_format src_mesaformat,
+                        unsigned src_pitch,
+                        unsigned src_width,
+                        unsigned src_height,
+                        unsigned src_x_offset,
+                        unsigned src_y_offset,
+                        struct radeon_bo *dst_bo,
+                        intptr_t dst_offset,
+                        gl_format dst_mesaformat,
+                        unsigned dst_pitch,
+                        unsigned dst_width,
+                        unsigned dst_height,
+                        unsigned dst_x_offset,
+                        unsigned dst_y_offset,
+                        unsigned reg_width,
+                        unsigned reg_height,
+                        unsigned flip_y);
    } vtbl;
 };
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_context.c b/src/mesa/drivers/dri/radeon/radeon_context.c
index 3cd305b0a2..6c08a90bbd 100644
--- a/src/mesa/drivers/dri/radeon/radeon_context.c
+++ b/src/mesa/drivers/dri/radeon/radeon_context.c
@@ -63,6 +63,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "radeon_tcl.h"
 #include "radeon_maos.h"
 #include "radeon_queryobj.h"
+#include "radeon_blit.h"
 
 #define need_GL_ARB_occlusion_query
 #define need_GL_EXT_blend_minmax
@@ -202,6 +203,7 @@ static void r100_init_vtbl(radeonContextPtr radeon)
    radeon->vtbl.fallback = radeonFallback;
    radeon->vtbl.free_context = r100_vtbl_free_context;
    radeon->vtbl.emit_query_finish = r100_emit_query_finish;
+   radeon->vtbl.blit = r100_blit;
 }
 
 /* Create the device specific context.
@@ -228,6 +230,7 @@ r100CreateContext( const __GLcontextModes *glVisual,
    if ( !rmesa )
       return GL_FALSE;
 
+   rmesa->radeon.radeonScreen = screen;
    r100_init_vtbl(&rmesa->radeon);
 
    /* init exp fog table data */
@@ -257,7 +260,7 @@ r100CreateContext( const __GLcontextModes *glVisual,
     * (the texture functions are especially important)
     */
    _mesa_init_driver_functions( &functions );
-   radeonInitTextureFuncs( &functions );
+   radeonInitTextureFuncs( &rmesa->radeon, &functions );
    radeonInitQueryObjFunctions(&functions);
 
    if (!radeonInitContext(&rmesa->radeon, &functions,
diff --git a/src/mesa/drivers/dri/radeon/radeon_context.h b/src/mesa/drivers/dri/radeon/radeon_context.h
index dfedc38bfd..d84760bf74 100644
--- a/src/mesa/drivers/dri/radeon/radeon_context.h
+++ b/src/mesa/drivers/dri/radeon/radeon_context.h
@@ -453,7 +453,6 @@ struct r100_context {
 extern GLboolean r100CreateContext( const __GLcontextModes *glVisual,
 				    __DRIcontext *driContextPriv,
 				    void *sharedContextPrivate);
-  
 
 
 #endif				/* __RADEON_CONTEXT_H__ */
diff --git a/src/mesa/drivers/dri/radeon/radeon_tex.c b/src/mesa/drivers/dri/radeon/radeon_tex.c
index 14163f13af..882ee5c194 100644
--- a/src/mesa/drivers/dri/radeon/radeon_tex.c
+++ b/src/mesa/drivers/dri/radeon/radeon_tex.c
@@ -434,7 +434,7 @@ radeonNewTextureObject( GLcontext *ctx, GLuint name, GLenum target )
 
 
 
-void radeonInitTextureFuncs( struct dd_function_table *functions )
+void radeonInitTextureFuncs( radeonContextPtr radeon, struct dd_function_table *functions )
 {
    functions->ChooseTextureFormat	= radeonChooseTextureFormat_mesa;
    functions->TexImage1D		= radeonTexImage1D;
@@ -455,6 +455,11 @@ void radeonInitTextureFuncs( struct dd_function_table *functions )
    functions->CompressedTexImage2D	= radeonCompressedTexImage2D;
    functions->CompressedTexSubImage2D	= radeonCompressedTexSubImage2D;
 
+   if (radeon->radeonScreen->kernel_mm) {
+      functions->CopyTexImage2D = radeonCopyTexImage2D;
+      functions->CopyTexSubImage2D = radeonCopyTexSubImage2D;
+   }
+
    functions->GenerateMipmap = radeonGenerateMipmap;
 
    functions->NewTextureImage = radeonNewTextureImage;
diff --git a/src/mesa/drivers/dri/radeon/radeon_tex.h b/src/mesa/drivers/dri/radeon/radeon_tex.h
index a4aaddc74f..0113ffd3da 100644
--- a/src/mesa/drivers/dri/radeon/radeon_tex.h
+++ b/src/mesa/drivers/dri/radeon/radeon_tex.h
@@ -52,6 +52,6 @@ extern int radeonUploadTexImages( r100ContextPtr rmesa, radeonTexObjPtr t,
 
 extern void radeonDestroyTexObj( r100ContextPtr rmesa, radeonTexObjPtr t );
 
-extern void radeonInitTextureFuncs( struct dd_function_table *functions );
+extern void radeonInitTextureFuncs( radeonContextPtr radeon, struct dd_function_table *functions );
 
 #endif /* __RADEON_TEX_H__ */
diff --git a/src/mesa/drivers/dri/r300/r300_texcopy.c b/src/mesa/drivers/dri/radeon/radeon_tex_copy.c
index ebc9c05b8a..44e144c80f 100644
--- a/src/mesa/drivers/dri/r300/r300_texcopy.c
+++ b/src/mesa/drivers/dri/radeon/radeon_tex_copy.c
@@ -26,7 +26,7 @@
  */
 
 #include "radeon_common.h"
-#include "r300_context.h"
+#include "radeon_texture.h"
 
 #include "main/image.h"
 #include "main/teximage.h"
@@ -34,11 +34,8 @@
 #include "drivers/common/meta.h"
 
 #include "radeon_mipmap_tree.h"
-#include "r300_blit.h"
 #include <main/debug.h>
 
-// TODO:
-// need to pass correct pitch for small dst textures!
 static GLboolean
 do_copy_texsubimage(GLcontext *ctx,
                     GLenum target, GLint level,
@@ -48,13 +45,13 @@ do_copy_texsubimage(GLcontext *ctx,
                     GLint x, GLint y,
                     GLsizei width, GLsizei height)
 {
-    struct r300_context *r300 = R300_CONTEXT(ctx);
+    radeonContextPtr radeon = RADEON_CONTEXT(ctx);
     struct radeon_renderbuffer *rrb;
 
     if (_mesa_get_format_bits(timg->base.TexFormat, GL_DEPTH_BITS) > 0) {
-        rrb = radeon_get_depthbuffer(&r300->radeon);
+        rrb = radeon_get_depthbuffer(radeon);
     } else {
-        rrb = radeon_get_colorbuffer(&r300->radeon);
+        rrb = radeon_get_colorbuffer(radeon);
     }
 
     if (!timg->mt) {
@@ -69,10 +66,6 @@ do_copy_texsubimage(GLcontext *ctx,
     intptr_t src_offset = rrb->draw_offset;
     intptr_t dst_offset = radeon_miptree_image_offset(timg->mt, _mesa_tex_target_to_face(target), level);
 
-    if (src_offset % 32 || dst_offset % 32) {
-        return GL_FALSE;
-    }
-
     if (0) {
         fprintf(stderr, "%s: copying to face %d, level %d\n",
                 __FUNCTION__, _mesa_tex_target_to_face(target), level);
@@ -84,18 +77,19 @@ do_copy_texsubimage(GLcontext *ctx,
     }
 
     /* blit from src buffer to texture */
-    return r300_blit(r300, rrb->bo, src_offset, rrb->base.Format, rrb->pitch/rrb->cpp,
-                     rrb->base.Width, rrb->base.Height, x, y,
-                     timg->mt->bo, dst_offset, timg->base.TexFormat,
-                     timg->base.Width, timg->base.Width, timg->base.Height,
-                     dstx, dsty, width, height, 1);
+    return radeon->vtbl.blit(ctx, rrb->bo, src_offset, rrb->base.Format, rrb->pitch/rrb->cpp,
+                             rrb->base.Width, rrb->base.Height, x, y,
+                             timg->mt->bo, dst_offset, timg->base.TexFormat,
+                             timg->mt->levels[level].rowstride / _mesa_get_format_bytes(timg->base.TexFormat),
+                             timg->base.Width, timg->base.Height,
+                             dstx, dsty, width, height, 1);
 }
 
-static void
-r300CopyTexImage2D(GLcontext *ctx, GLenum target, GLint level,
-                   GLenum internalFormat,
-                   GLint x, GLint y, GLsizei width, GLsizei height,
-                   GLint border)
+void
+radeonCopyTexImage2D(GLcontext *ctx, GLenum target, GLint level,
+                     GLenum internalFormat,
+                     GLint x, GLint y, GLsizei width, GLsizei height,
+                     GLint border)
 {
     struct gl_texture_unit *texUnit = _mesa_get_current_tex_unit(ctx);
     struct gl_texture_object *texObj =
@@ -139,11 +133,11 @@ fail:
                               width, height, border);
 }
 
-static void
-r300CopyTexSubImage2D(GLcontext *ctx, GLenum target, GLint level,
-                      GLint xoffset, GLint yoffset,
-                      GLint x, GLint y,
-                      GLsizei width, GLsizei height)
+void
+radeonCopyTexSubImage2D(GLcontext *ctx, GLenum target, GLint level,
+                        GLint xoffset, GLint yoffset,
+                        GLint x, GLint y,
+                        GLsizei width, GLsizei height)
 {
     struct gl_texture_unit *texUnit = _mesa_get_current_tex_unit(ctx);
     struct gl_texture_object *texObj = _mesa_select_tex_object(ctx, texUnit, target);
@@ -159,10 +153,3 @@ r300CopyTexSubImage2D(GLcontext *ctx, GLenum target, GLint level,
                                      xoffset, yoffset, x, y, width, height);
     }
 }
-
-
-void r300_init_texcopy_functions(struct dd_function_table *table)
-{
-    table->CopyTexImage2D = r300CopyTexImage2D;
-    table->CopyTexSubImage2D = r300CopyTexSubImage2D;
-}
-\ No newline at end of file
diff --git a/src/mesa/drivers/dri/radeon/radeon_texture.c b/src/mesa/drivers/dri/radeon/radeon_texture.c
index 03178116c1..20a27ad9a7 100644
--- a/src/mesa/drivers/dri/radeon/radeon_texture.c
+++ b/src/mesa/drivers/dri/radeon/radeon_texture.c
@@ -197,21 +197,6 @@ void radeonUnmapTexture(GLcontext *ctx, struct gl_texture_object *texObj)
 	radeon_bo_unmap(t->mt->bo);
 }
 
-GLuint radeon_face_for_target(GLenum target)
-{
-	switch (target) {
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
-		return (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
-	default:
-		return 0;
-	}
-}
-
 /**
  * Wraps Mesa's implementation to ensure that the base level image is mapped.
  *
@@ -248,7 +233,7 @@ static void radeon_generate_mipmap(GLcontext *ctx, GLenum target,
 
 void radeonGenerateMipmap(GLcontext* ctx, GLenum target, struct gl_texture_object *texObj)
 {
-	GLuint face = radeon_face_for_target(target);
+	GLuint face = _mesa_tex_target_to_face(target);
 	radeon_texture_image *baseimage = get_radeon_texture_image(texObj->Image[face][texObj->BaseLevel]);
 
 	radeon_teximage_map(baseimage, GL_FALSE);
@@ -710,7 +695,7 @@ static void radeon_teximage(
 	radeon_texture_image* image = get_radeon_texture_image(texImage);
 	GLint postConvWidth = width;
 	GLint postConvHeight = height;
-	GLuint face = radeon_face_for_target(target);
+	GLuint face = _mesa_tex_target_to_face(target);
 
 	{
 		struct radeon_bo *bo;
@@ -863,7 +848,7 @@ static void radeon_texsubimage(GLcontext* ctx, int dims, GLenum target, int leve
 
 	if (RADEON_DEBUG & RADEON_TEXTURE) {
 		fprintf(stderr, "radeon_texsubimage%dd: texObj %p, texImage %p, face %d, level %d\n",
-				dims, texObj, texImage, radeon_face_for_target(target), level);
+				dims, texObj, texImage, _mesa_tex_target_to_face(target), level);
 	}
 
 	t->validated = GL_FALSE;
diff --git a/src/mesa/drivers/dri/radeon/radeon_texture.h b/src/mesa/drivers/dri/radeon/radeon_texture.h
index 906daf12d0..f09dd65214 100644
--- a/src/mesa/drivers/dri/radeon/radeon_texture.h
+++ b/src/mesa/drivers/dri/radeon/radeon_texture.h
@@ -44,7 +44,6 @@ void radeonMapTexture(GLcontext *ctx, struct gl_texture_object *texObj);
 void radeonUnmapTexture(GLcontext *ctx, struct gl_texture_object *texObj);
 void radeonGenerateMipmap(GLcontext* ctx, GLenum target, struct gl_texture_object *texObj);
 int radeon_validate_texture_miptree(GLcontext * ctx, struct gl_texture_object *texObj);
-GLuint radeon_face_for_target(GLenum target);
 
 gl_format radeonChooseTextureFormat_mesa(GLcontext * ctx,
                                          GLint internalFormat,
@@ -126,4 +125,14 @@ void radeonGetCompressedTexImage(GLcontext *ctx, GLenum target, GLint level,
 				 struct gl_texture_object *texObj,
 				 struct gl_texture_image *texImage);
 
+void radeonCopyTexImage2D(GLcontext *ctx, GLenum target, GLint level,
+			GLenum internalFormat,
+			GLint x, GLint y, GLsizei width, GLsizei height,
+			GLint border);
+
+void radeonCopyTexSubImage2D(GLcontext *ctx, GLenum target, GLint level,
+			GLint xoffset, GLint yoffset,
+			GLint x, GLint y,
+			GLsizei width, GLsizei height);
+
 #endif
diff --git a/src/mesa/drivers/dri/radeon/server/radeon_reg.h b/src/mesa/drivers/dri/radeon/server/radeon_reg.h
index e81d7fdcd0..1b33de1edf 100644
--- a/src/mesa/drivers/dri/radeon/server/radeon_reg.h
+++ b/src/mesa/drivers/dri/radeon/server/radeon_reg.h
@@ -1959,7 +1959,30 @@
 #define RADEON_SE_ZBIAS_FACTOR              0x1db0
 #define RADEON_SE_ZBIAS_CONSTANT            0x1db4
 
-
+#define RADEON_SE_VTX_FMT                   0x2080
+#       define RADEON_SE_VTX_FMT_XY         0x00000000
+#       define RADEON_SE_VTX_FMT_W0         0x00000001
+#       define RADEON_SE_VTX_FMT_FPCOLOR    0x00000002
+#       define RADEON_SE_VTX_FMT_FPALPHA    0x00000004
+#       define RADEON_SE_VTX_FMT_PKCOLOR    0x00000008
+#       define RADEON_SE_VTX_FMT_FPSPEC     0x00000010
+#       define RADEON_SE_VTX_FMT_FPFOG      0x00000020
+#       define RADEON_SE_VTX_FMT_PKSPEC     0x00000040
+#       define RADEON_SE_VTX_FMT_ST0        0x00000080
+#       define RADEON_SE_VTX_FMT_ST1        0x00000100
+#       define RADEON_SE_VTX_FMT_Q1         0x00000200
+#       define RADEON_SE_VTX_FMT_ST2        0x00000400
+#       define RADEON_SE_VTX_FMT_Q2         0x00000800
+#       define RADEON_SE_VTX_FMT_ST3        0x00001000
+#       define RADEON_SE_VTX_FMT_Q3         0x00002000
+#       define RADEON_SE_VTX_FMT_Q0         0x00004000
+#       define RADEON_SE_VTX_FMT_BLND_WEIGHT_CNT_MASK  0x00038000
+#       define RADEON_SE_VTX_FMT_N0         0x00040000
+#       define RADEON_SE_VTX_FMT_XY1        0x08000000
+#       define RADEON_SE_VTX_FMT_Z1         0x10000000
+#       define RADEON_SE_VTX_FMT_W1         0x20000000
+#       define RADEON_SE_VTX_FMT_N1         0x40000000
+#       define RADEON_SE_VTX_FMT_Z          0x80000000
 
 				/* Registers for CP and Microcode Engine */
 #define RADEON_CP_ME_RAM_ADDR               0x07d4
diff --git a/src/mesa/glapi/gl_XML.py b/src/mesa/glapi/gl_XML.py
index bafb00306f..a10a35e513 100644
--- a/src/mesa/glapi/gl_XML.py
+++ b/src/mesa/glapi/gl_XML.py
@@ -184,7 +184,7 @@ class gl_print_base:
 		The name is also added to the file's undef_list.
 		"""
 		self.undef_list.append("PURE")
-		print """#  if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 96)
+		print """#  if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 96) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))
 #    define PURE __attribute__((pure))
 #  else
 #    define PURE
@@ -224,7 +224,7 @@ class gl_print_base:
 		"""
 
 		self.undef_list.append(S)
-		print """#  if (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3)) && defined(__ELF__)
+		print """#  if (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined(__ELF__)
 #    define %s  __attribute__((visibility("%s")))
 #  else
 #    define %s
@@ -244,7 +244,7 @@ class gl_print_base:
 		"""
 
 		self.undef_list.append("NOINLINE")
-		print """#  if defined(__GNUC__)
+		print """#  if defined(__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))
 #    define NOINLINE __attribute__((noinline))
 #  else
 #    define NOINLINE
diff --git a/src/mesa/glapi/glapitemp.h b/src/mesa/glapi/glapitemp.h
index 6767a07673..b8bfcc1a16 100644
--- a/src/mesa/glapi/glapitemp.h
+++ b/src/mesa/glapi/glapitemp.h
@@ -27,7 +27,7 @@
  */
 
 
-#  if (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3)) && defined(__ELF__)
+#  if (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined(__ELF__)
 #    define HIDDEN  __attribute__((visibility("hidden")))
 #  else
 #    define HIDDEN
diff --git a/src/mesa/main/compiler.h b/src/mesa/main/compiler.h
index 4eb249b4af..9eab1ead24 100644
--- a/src/mesa/main/compiler.h
+++ b/src/mesa/main/compiler.h
@@ -173,7 +173,8 @@ extern "C" {
  * We also need to define a USED attribute, so the optimizer doesn't 
  * inline a static function that we later use in an alias. - ajax
  */
-#if defined(__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__) >= 303
+#if (defined(__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__) >= 303) \
+	|| (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))
 #  define PUBLIC __attribute__((visibility("default")))
 #  define USED __attribute__((used))
 #else
diff --git a/src/mesa/main/fbobject.c b/src/mesa/main/fbobject.c
index 7b3599f932..4da245ab49 100644
--- a/src/mesa/main/fbobject.c
+++ b/src/mesa/main/fbobject.c
@@ -861,6 +861,9 @@ _mesa_GenRenderbuffersEXT(GLsizei n, GLuint *renderbuffers)
  *
  * \return one of GL_RGB, GL_RGBA, GL_STENCIL_INDEX, GL_DEPTH_COMPONENT
  *  GL_DEPTH_STENCIL_EXT or zero if error.
+ *
+ * XXX in the future when we support red-only and red-green formats
+ * we'll also return GL_RED and GL_RG.
  */
 GLenum
 _mesa_base_fbo_format(GLcontext *ctx, GLenum internalFormat)
diff --git a/src/mesa/state_tracker/st_atom_constbuf.c b/src/mesa/state_tracker/st_atom_constbuf.c
index 77153889b6..d70cf877e8 100644
--- a/src/mesa/state_tracker/st_atom_constbuf.c
+++ b/src/mesa/state_tracker/st_atom_constbuf.c
@@ -57,7 +57,7 @@ void st_upload_constants( struct st_context *st,
                           unsigned shader_type)
 {
    struct pipe_context *pipe = st->pipe;
-   struct pipe_constant_buffer *cbuf = &st->state.constants[shader_type];
+   struct pipe_buffer **cbuf = &st->state.constants[shader_type];
 
    assert(shader_type == PIPE_SHADER_VERTEX ||
           shader_type == PIPE_SHADER_FRAGMENT);
@@ -71,8 +71,8 @@ void st_upload_constants( struct st_context *st,
       /* We always need to get a new buffer, to keep the drivers simple and
        * avoid gratuitous rendering synchronization.
        */
-      pipe_buffer_reference(&cbuf->buffer, NULL );
-      cbuf->buffer = pipe_buffer_create(pipe->screen, 16,
+      pipe_buffer_reference(cbuf, NULL );
+      *cbuf = pipe_buffer_create(pipe->screen, 16,
                                         PIPE_BUFFER_USAGE_CONSTANT,
 					paramBytes );
 
@@ -84,12 +84,12 @@ void st_upload_constants( struct st_context *st,
       }
 
       /* load Mesa constants into the constant buffer */
-      if (cbuf->buffer)
-         st_no_flush_pipe_buffer_write(st, cbuf->buffer,
+      if (cbuf)
+         st_no_flush_pipe_buffer_write(st, *cbuf,
 				       0, paramBytes,
 				       params->ParameterValues);
 
-      st->pipe->set_constant_buffer(st->pipe, shader_type, 0, cbuf);
+      st->pipe->set_constant_buffer(st->pipe, shader_type, 0, *cbuf);
    }
    else {
       st->constants.tracked_state[shader_type].dirty.mesa = 0x0;
diff --git a/src/mesa/state_tracker/st_atom_shader.c b/src/mesa/state_tracker/st_atom_shader.c
index 46c8cbb309..176f3ea68d 100644
--- a/src/mesa/state_tracker/st_atom_shader.c
+++ b/src/mesa/state_tracker/st_atom_shader.c
@@ -35,8 +35,6 @@
  *   Brian Paul
  */
 
-
-
 #include "main/imports.h"
 #include "main/mtypes.h"
 #include "main/macros.h"
@@ -57,9 +55,7 @@
 
 
 
-
-
-/*
+/**
  * Translate fragment program if needed.
  */
 static void
@@ -155,8 +151,10 @@ find_translated_vp(struct st_context *st,
 }
 
 
-
-
+/**
+ * Return pointer to a pass-through fragment shader.
+ * This shader is used when a texture is missing/incomplete.
+ */
 static void *
 get_passthrough_fs(struct st_context *st)
 {
@@ -168,6 +166,11 @@ get_passthrough_fs(struct st_context *st)
    return st->passthrough_fs;
 }
 
+
+/**
+ * Update fragment program state/atom.  This involves translating the
+ * Mesa fragment program into a gallium fragment program and binding it.
+ */
 static void
 update_fp( struct st_context *st )
 {
@@ -191,6 +194,7 @@ update_fp( struct st_context *st )
    }
 }
 
+
 const struct st_tracked_state st_update_fp = {
    "st_update_fp",					/* name */
    {							/* dirty */
@@ -202,7 +206,10 @@ const struct st_tracked_state st_update_fp = {
 
 
 
-
+/**
+ * Update vertex program state/atom.  This involves translating the
+ * Mesa vertex program into a gallium fragment program and binding it.
+ */
 static void
 update_vp( struct st_context *st )
 {
diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index 145bd62b83..9e6ce30db0 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -218,8 +218,8 @@ static void st_destroy_context_priv( struct st_context *st )
    }
 
    for (i = 0; i < Elements(st->state.constants); i++) {
-      if (st->state.constants[i].buffer) {
-         pipe_buffer_reference(&st->state.constants[i].buffer, NULL);
+      if (st->state.constants[i]) {
+         pipe_buffer_reference(&st->state.constants[i], NULL);
       }
    }
 
diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h
index 831909a3f8..2c4943cfb0 100644
--- a/src/mesa/state_tracker/st_context.h
+++ b/src/mesa/state_tracker/st_context.h
@@ -92,7 +92,7 @@ struct st_context
       struct pipe_sampler_state             samplers[PIPE_MAX_SAMPLERS];
       struct pipe_sampler_state             *sampler_list[PIPE_MAX_SAMPLERS];
       struct pipe_clip_state clip;
-      struct pipe_constant_buffer constants[2];
+      struct pipe_buffer *constants[2];
       struct pipe_framebuffer_state framebuffer;
       struct pipe_texture *sampler_texture[PIPE_MAX_SAMPLERS];
       struct pipe_scissor_state scissor;
diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c
index e54f21be60..b0d5b993a7 100644
--- a/src/mesa/state_tracker/st_draw.c
+++ b/src/mesa/state_tracker/st_draw.c
@@ -365,6 +365,7 @@ setup_interleaved_attribs(GLcontext *ctx,
 
       velements[attr].src_offset =
          (unsigned) (arrays[mesaAttr]->Ptr - offset0);
+      velements[attr].instance_divisor = 0;
       velements[attr].vertex_buffer_index = 0;
       velements[attr].nr_components = arrays[mesaAttr]->Size;
       velements[attr].src_format =
@@ -454,6 +455,7 @@ setup_non_interleaved_attribs(GLcontext *ctx,
       /* common-case setup */
       vbuffer[attr].stride = stride; /* in bytes */
       vbuffer[attr].max_index = max_index;
+      velements[attr].instance_divisor = 0;
       velements[attr].vertex_buffer_index = attr;
       velements[attr].nr_components = arrays[mesaAttr]->Size;
       velements[attr].src_format
@@ -522,7 +524,6 @@ st_draw_vbo(GLcontext *ctx,
    struct pipe_context *pipe = ctx->st->pipe;
    const struct st_vertex_program *vp;
    const struct st_vp_varient *vpv;
-   const struct pipe_shader_state *vs;
    struct pipe_vertex_buffer vbuffer[PIPE_MAX_SHADER_INPUTS];
    GLuint attr;
    struct pipe_vertex_element velements[PIPE_MAX_ATTRIBS];
@@ -550,7 +551,6 @@ st_draw_vbo(GLcontext *ctx,
    /* must get these after state validation! */
    vp = ctx->st->vp;
    vpv = ctx->st->vp_varient;
-   vs = &vpv->state;
 
 #if 0
    if (MESA_VERBOSE & VERBOSE_GLSL) {
diff --git a/src/mesa/state_tracker/st_draw_feedback.c b/src/mesa/state_tracker/st_draw_feedback.c
index cfc0caac98..a05d6dd06b 100644
--- a/src/mesa/state_tracker/st_draw_feedback.c
+++ b/src/mesa/state_tracker/st_draw_feedback.c
@@ -177,6 +177,7 @@ st_feedback_draw_vbo(GLcontext *ctx,
       /* common-case setup */
       vbuffers[attr].stride = arrays[mesaAttr]->StrideB; /* in bytes */
       vbuffers[attr].max_index = max_index;
+      velements[attr].instance_divisor = 0;
       velements[attr].vertex_buffer_index = attr;
       velements[attr].nr_components = arrays[mesaAttr]->Size;
       velements[attr].src_format = 
@@ -239,11 +240,11 @@ st_feedback_draw_vbo(GLcontext *ctx,
 
    /* map constant buffers */
    mapped_constants = pipe_buffer_map(pipe->screen,
-                                      st->state.constants[PIPE_SHADER_VERTEX].buffer,
+                                      st->state.constants[PIPE_SHADER_VERTEX],
                                       PIPE_BUFFER_USAGE_CPU_READ);
    draw_set_mapped_constant_buffer(st->draw, PIPE_SHADER_VERTEX,
                                    mapped_constants,
-                                   st->state.constants[PIPE_SHADER_VERTEX].buffer->size);
+                                   st->state.constants[PIPE_SHADER_VERTEX]->size);
 
 
    /* draw here */
@@ -253,7 +254,7 @@ st_feedback_draw_vbo(GLcontext *ctx,
 
 
    /* unmap constant buffers */
-   pipe_buffer_unmap(pipe->screen, st->state.constants[PIPE_SHADER_VERTEX].buffer);
+   pipe_buffer_unmap(pipe->screen, st->state.constants[PIPE_SHADER_VERTEX]);
 
    /*
     * unmap vertex/index buffers
diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index e25a613d8a..2a5fb27d8f 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -168,6 +168,7 @@ void st_init_extensions(struct st_context *st)
    ctx->Extensions.EXT_blend_subtract = GL_TRUE;
    ctx->Extensions.EXT_framebuffer_blit = GL_TRUE;
    ctx->Extensions.EXT_framebuffer_object = GL_TRUE;
+   ctx->Extensions.EXT_framebuffer_multisample = GL_TRUE;
    ctx->Extensions.EXT_fog_coord = GL_TRUE;
    ctx->Extensions.EXT_multi_draw_arrays = GL_TRUE;
    ctx->Extensions.EXT_pixel_buffer_object = GL_TRUE;
diff --git a/src/mesa/state_tracker/st_mesa_to_tgsi.c b/src/mesa/state_tracker/st_mesa_to_tgsi.c
index e788008dfe..05b56c9b58 100644
--- a/src/mesa/state_tracker/st_mesa_to_tgsi.c
+++ b/src/mesa/state_tracker/st_mesa_to_tgsi.c
@@ -48,6 +48,10 @@ struct label {
    unsigned token;
 };
 
+
+/**
+ * Intermediate state used during shader translation.
+ */
 struct st_translate {
    struct ureg_program *ureg;
 
@@ -730,6 +734,7 @@ emit_face_var( struct st_translate *t,
    t->inputs[t->inputMapping[FRAG_ATTRIB_FACE]] = ureg_src(face_temp);
 }
 
+
 static void
 emit_edgeflags( struct st_translate *t,
                  const struct gl_program *program )
@@ -741,6 +746,7 @@ emit_edgeflags( struct st_translate *t,
    ureg_MOV( ureg, edge_dst, edge_src );
 }
 
+
 /**
  * Translate Mesa program to TGSI format.
  * \param program  the program to translate
@@ -758,7 +764,7 @@ emit_edgeflags( struct st_translate *t,
  * \param outputSemanticIndex  the semantic index (ex: which texcoord) for
  *                             each output
  *
- * \return  array of translated tokens, caller's responsibility to free
+ * \return  PIPE_OK or PIPE_ERROR_OUT_OF_MEMORY
  */
 enum pipe_error
 st_translate_mesa_program(
@@ -779,6 +785,7 @@ st_translate_mesa_program(
 {
    struct st_translate translate, *t;
    unsigned i;
+   enum pipe_error ret = PIPE_OK;
 
    t = &translate;
    memset(t, 0, sizeof *t);
@@ -865,8 +872,10 @@ st_translate_mesa_program(
       
       t->constants = CALLOC( program->Parameters->NumParameters,
                              sizeof t->constants[0] );
-      if (t->constants == NULL)
+      if (t->constants == NULL) {
+         ret = PIPE_ERROR_OUT_OF_MEMORY;
          goto out;
+      }
       
       for (i = 0; i < program->Parameters->NumParameters; i++) {
          switch (program->Parameters->Parameters[i].Type) {
@@ -920,8 +929,6 @@ st_translate_mesa_program(
                         t->insn[t->labels[i].branch_target] );
    }
 
-   return PIPE_OK;
-
 out:
    FREE(t->insn);
    FREE(t->labels);
@@ -931,7 +938,7 @@ out:
       debug_printf("%s: translate error flag set\n", __FUNCTION__);
    }
 
-   return PIPE_ERROR_OUT_OF_MEMORY;
+   return ret;
 }
 
 
diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c
index 6a869fae90..5c87e47ca3 100644
--- a/src/mesa/state_tracker/st_program.c
+++ b/src/mesa/state_tracker/st_program.c
@@ -44,7 +44,6 @@
 
 #include "st_debug.h"
 #include "st_context.h"
-#include "st_atom.h"
 #include "st_program.h"
 #include "st_mesa_to_tgsi.h"
 #include "cso_cache/cso_context.h"
diff --git a/src/mesa/state_tracker/st_texture.c b/src/mesa/state_tracker/st_texture.c
index 8a3e4cd3ac..b210ac9187 100644
--- a/src/mesa/state_tracker/st_texture.c
+++ b/src/mesa/state_tracker/st_texture.c
@@ -35,7 +35,6 @@
 #include "main/texfetch.h"
 #include "main/teximage.h"
 #include "main/texobj.h"
-#include "main/texstore.h"
 
 #undef Elements  /* fix re-defined macro warning */
 
diff --git a/src/mesa/swrast/s_accum.c b/src/mesa/swrast/s_accum.c
index 0e0876efcb..cf53f01b7c 100644
--- a/src/mesa/swrast/s_accum.c
+++ b/src/mesa/swrast/s_accum.c
@@ -27,7 +27,6 @@
 #include "main/context.h"
 #include "main/macros.h"
 #include "main/imports.h"
-#include "main/fbobject.h"
 
 #include "s_accum.h"
 #include "s_context.h"
diff --git a/src/mesa/swrast/s_atifragshader.c b/src/mesa/swrast/s_atifragshader.c
index e88ff19123..353e9999d6 100644
--- a/src/mesa/swrast/s_atifragshader.c
+++ b/src/mesa/swrast/s_atifragshader.c
@@ -23,7 +23,6 @@
 #include "main/colormac.h"
 #include "main/context.h"
 #include "main/macros.h"
-#include "shader/program.h"
 #include "shader/atifragshader.h"
 #include "swrast/s_atifragshader.h"
 
diff --git a/src/mesa/swrast/s_bitmap.c b/src/mesa/swrast/s_bitmap.c
index 46c63aa645..59e26e9ea3 100644
--- a/src/mesa/swrast/s_bitmap.c
+++ b/src/mesa/swrast/s_bitmap.c
@@ -33,7 +33,6 @@
 #include "main/condrender.h"
 #include "main/image.h"
 #include "main/macros.h"
-#include "main/pixel.h"
 
 #include "s_context.h"
 #include "s_span.h"
diff --git a/src/mesa/swrast/s_copypix.c b/src/mesa/swrast/s_copypix.c
index 986b6aff4f..e881d1be30 100644
--- a/src/mesa/swrast/s_copypix.c
+++ b/src/mesa/swrast/s_copypix.c
@@ -28,11 +28,9 @@
 #include "main/colormac.h"
 #include "main/condrender.h"
 #include "main/convolve.h"
-#include "main/histogram.h"
 #include "main/image.h"
 #include "main/macros.h"
 #include "main/imports.h"
-#include "main/pixel.h"
 
 #include "s_context.h"
 #include "s_depth.h"
diff --git a/src/mesa/swrast/s_depth.c b/src/mesa/swrast/s_depth.c
index c37a54eb3e..0b6bb7e3ec 100644
--- a/src/mesa/swrast/s_depth.c
+++ b/src/mesa/swrast/s_depth.c
@@ -28,7 +28,6 @@
 #include "main/formats.h"
 #include "main/macros.h"
 #include "main/imports.h"
-#include "main/fbobject.h"
 
 #include "s_depth.h"
 #include "s_context.h"
diff --git a/src/mesa/swrast/s_drawpix.c b/src/mesa/swrast/s_drawpix.c
index 55a4c4c3c6..248d6cc1c0 100644
--- a/src/mesa/swrast/s_drawpix.c
+++ b/src/mesa/swrast/s_drawpix.c
@@ -31,7 +31,6 @@
 #include "main/image.h"
 #include "main/macros.h"
 #include "main/imports.h"
-#include "main/pixel.h"
 #include "main/state.h"
 
 #include "s_context.h"
diff --git a/src/mesa/swrast/s_feedback.c b/src/mesa/swrast/s_feedback.c
index 47ed25ee10..2e6066983d 100644
--- a/src/mesa/swrast/s_feedback.c
+++ b/src/mesa/swrast/s_feedback.c
@@ -25,7 +25,6 @@
 #include "main/glheader.h"
 #include "main/colormac.h"
 #include "main/context.h"
-#include "main/enums.h"
 #include "main/feedback.h"
 #include "main/macros.h"
 
diff --git a/src/mesa/swrast/s_fragprog.c b/src/mesa/swrast/s_fragprog.c
index a22d34415d..9ac33a26a6 100644
--- a/src/mesa/swrast/s_fragprog.c
+++ b/src/mesa/swrast/s_fragprog.c
@@ -25,7 +25,6 @@
 #include "main/glheader.h"
 #include "main/colormac.h"
 #include "main/context.h"
-#include "main/texstate.h"
 #include "shader/prog_instruction.h"
 
 #include "s_fragprog.h"
diff --git a/src/mesa/swrast/s_lines.c b/src/mesa/swrast/s_lines.c
index 23cb9b57ef..5411229d70 100644
--- a/src/mesa/swrast/s_lines.c
+++ b/src/mesa/swrast/s_lines.c
@@ -29,7 +29,6 @@
 #include "main/macros.h"
 #include "s_aaline.h"
 #include "s_context.h"
-#include "s_depth.h"
 #include "s_feedback.h"
 #include "s_lines.h"
 #include "s_span.h"
diff --git a/src/mesa/swrast/s_points.c b/src/mesa/swrast/s_points.c
index 50ec2063a5..6b955429e9 100644
--- a/src/mesa/swrast/s_points.c
+++ b/src/mesa/swrast/s_points.c
@@ -27,7 +27,6 @@
 #include "main/colormac.h"
 #include "main/context.h"
 #include "main/macros.h"
-#include "main/texstate.h"
 #include "s_context.h"
 #include "s_feedback.h"
 #include "s_points.h"
diff --git a/src/mesa/swrast/s_readpix.c b/src/mesa/swrast/s_readpix.c
index 44a11cd6dd..94fb974eab 100644
--- a/src/mesa/swrast/s_readpix.c
+++ b/src/mesa/swrast/s_readpix.c
@@ -33,7 +33,6 @@
 #include "main/image.h"
 #include "main/macros.h"
 #include "main/imports.h"
-#include "main/pixel.h"
 #include "main/state.h"
 
 #include "s_context.h"
diff --git a/src/mesa/swrast/s_texcombine.c b/src/mesa/swrast/s_texcombine.c
index 889164b986..594b71a03c 100644
--- a/src/mesa/swrast/s_texcombine.c
+++ b/src/mesa/swrast/s_texcombine.c
@@ -29,7 +29,6 @@
 #include "main/colormac.h"
 #include "main/image.h"
 #include "main/imports.h"
-#include "main/pixel.h"
 #include "shader/prog_instruction.h"
 
 #include "s_context.h"
diff --git a/src/mesa/tnl/t_context.c b/src/mesa/tnl/t_context.c
index db21b4589d..5a14e595a0 100644
--- a/src/mesa/tnl/t_context.c
+++ b/src/mesa/tnl/t_context.c
@@ -38,7 +38,6 @@
 #include "tnl.h"
 #include "t_context.h"
 #include "t_pipeline.h"
-#include "t_vp_build.h"
 
 #include "vbo/vbo.h"
 
diff --git a/src/mesa/tnl/t_draw.c b/src/mesa/tnl/t_draw.c
index d31b29b9b4..38757a0e28 100644
--- a/src/mesa/tnl/t_draw.c
+++ b/src/mesa/tnl/t_draw.c
@@ -29,15 +29,11 @@
 #include "main/condrender.h"
 #include "main/context.h"
 #include "main/imports.h"
-#include "main/state.h"
 #include "main/mtypes.h"
 #include "main/macros.h"
 #include "main/enums.h"
 
 #include "t_context.h"
-#include "t_pipeline.h"
-#include "t_vp_build.h"
-#include "t_vertex.h"
 #include "tnl.h"
 
 
diff --git a/src/mesa/tnl/t_pipeline.c b/src/mesa/tnl/t_pipeline.c
index 01b30babb4..946b29e250 100644
--- a/src/mesa/tnl/t_pipeline.c
+++ b/src/mesa/tnl/t_pipeline.c
@@ -28,7 +28,6 @@
 #include "main/glheader.h"
 #include "main/context.h"
 #include "main/imports.h"
-#include "main/state.h"
 #include "main/mtypes.h"
 
 #include "t_context.h"
diff --git a/src/mesa/tnl/t_rasterpos.c b/src/mesa/tnl/t_rasterpos.c
index 99b6787455..13b84a7d77 100644
--- a/src/mesa/tnl/t_rasterpos.c
+++ b/src/mesa/tnl/t_rasterpos.c
@@ -29,7 +29,6 @@
 #include "main/feedback.h"
 #include "main/light.h"
 #include "main/macros.h"
-#include "main/rastpos.h"
 #include "main/simple_list.h"
 #include "main/mtypes.h"
 
diff --git a/src/mesa/tnl/t_vb_program.c b/src/mesa/tnl/t_vb_program.c
index 15a8a67b91..5396548666 100644
--- a/src/mesa/tnl/t_vb_program.c
+++ b/src/mesa/tnl/t_vb_program.c
@@ -40,7 +40,6 @@
 #include "shader/prog_statevars.h"
 #include "shader/prog_execute.h"
 #include "swrast/s_context.h"
-#include "swrast/s_texfilter.h"
 
 #include "tnl/tnl.h"
 #include "tnl/t_context.h"
diff --git a/src/mesa/vbo/vbo_exec.c b/src/mesa/vbo/vbo_exec.c
index e168a89ea5..a057befed0 100644
--- a/src/mesa/vbo/vbo_exec.c
+++ b/src/mesa/vbo/vbo_exec.c
@@ -28,9 +28,6 @@
 
 #include "main/api_arrayelt.h"
 #include "main/glheader.h"
-#include "main/imports.h"
-#include "main/context.h"
-#include "main/macros.h"
 #include "main/mtypes.h"
 #include "main/vtxfmt.h"
 
diff --git a/src/mesa/vbo/vbo_exec_array.c b/src/mesa/vbo/vbo_exec_array.c
index 6de8f059b7..2c82f7c9c5 100644
--- a/src/mesa/vbo/vbo_exec_array.c
+++ b/src/mesa/vbo/vbo_exec_array.c
@@ -35,7 +35,6 @@
 #include "main/bufferobj.h"
 #include "main/enums.h"
 #include "main/macros.h"
-#include "glapi/dispatch.h"
 
 #include "vbo_context.h"
 
diff --git a/src/mesa/vbo/vbo_exec_draw.c b/src/mesa/vbo/vbo_exec_draw.c
index 4f43856016..d7dbbceb1b 100644
--- a/src/mesa/vbo/vbo_exec_draw.c
+++ b/src/mesa/vbo/vbo_exec_draw.c
@@ -30,7 +30,6 @@
 #include "main/context.h"
 #include "main/enums.h"
 #include "main/state.h"
-#include "main/macros.h"
 
 #include "vbo_context.h"
 
diff --git a/src/mesa/vbo/vbo_save.c b/src/mesa/vbo/vbo_save.c
index 9757c3d9f6..10f705cf84 100644
--- a/src/mesa/vbo/vbo_save.c
+++ b/src/mesa/vbo/vbo_save.c
@@ -28,8 +28,6 @@
 
 #include "main/mtypes.h"
 #include "main/bufferobj.h"
-#include "main/dlist.h"
-#include "main/vtxfmt.h"
 #include "main/imports.h"
 
 #include "vbo_context.h"
diff --git a/src/mesa/vbo/vbo_save_loopback.c b/src/mesa/vbo/vbo_save_loopback.c
index b7a74e4535..f13a16e3b5 100644
--- a/src/mesa/vbo/vbo_save_loopback.c
+++ b/src/mesa/vbo/vbo_save_loopback.c
@@ -29,7 +29,6 @@
 #include "main/glheader.h"
 #include "main/enums.h"
 #include "main/imports.h"
-#include "main/macros.h"
 #include "main/mtypes.h"
 #include "glapi/dispatch.h"
 #include "glapi/glapi.h"
diff --git a/src/mesa/vbo/vbo_split_copy.c b/src/mesa/vbo/vbo_split_copy.c
index c45190b9dd..2ca111217c 100644
--- a/src/mesa/vbo/vbo_split_copy.c
+++ b/src/mesa/vbo/vbo_split_copy.c
@@ -34,7 +34,6 @@
 #include "main/imports.h"
 #include "main/image.h"
 #include "main/macros.h"
-#include "main/enums.h"
 #include "main/mtypes.h"
 
 #include "vbo_split.h"
@@ -221,8 +220,6 @@ begin( struct copy_context *copy, GLenum mode, GLboolean begin_flag )
 {
    struct _mesa_prim *prim = &copy->dstprim[copy->dstprim_nr];
 
-/*    _mesa_printf("begin %s (%d)\n", _mesa_lookup_prim_by_nr(mode), begin_flag); */
-		
    prim->mode = mode;
    prim->begin = begin_flag;
 }
diff --git a/src/mesa/x86/x86_xform.c b/src/mesa/x86/x86_xform.c
index 52f6b25d81..c834e2b468 100644
--- a/src/mesa/x86/x86_xform.c
+++ b/src/mesa/x86/x86_xform.c
@@ -30,7 +30,6 @@
 #include "main/glheader.h"
 #include "main/context.h"
 #include "math/m_xform.h"
-#include "tnl/t_context.h"
 
 #include "x86_xform.h"
 #include "common_x86_asm.h"