303 files changed, 18498 insertions, 8515 deletions
diff --git a/src/gallium/auxiliary/Makefile b/src/gallium/auxiliary/Makefile
index 9544e90a96..eb86d83d2a 100644
--- a/src/gallium/auxiliary/Makefile
+++ b/src/gallium/auxiliary/Makefile
@@ -4,8 +4,8 @@ include $(TOP)/configs/current
 LIBNAME = gallium
 
 C_SOURCES = \
-	cso_cache/cso_context.c \
 	cso_cache/cso_cache.c \
+	cso_cache/cso_context.c \
 	cso_cache/cso_hash.c \
 	draw/draw_context.c \
 	draw/draw_gs.c \
@@ -26,7 +26,6 @@ C_SOURCES = \
 	draw/draw_pipe_wide_line.c \
 	draw/draw_pipe_wide_point.c \
 	draw/draw_pt.c \
-	draw/draw_pt_elts.c \
 	draw/draw_pt_emit.c \
 	draw/draw_pt_fetch.c \
 	draw/draw_pt_fetch_emit.c \
@@ -35,24 +34,24 @@ C_SOURCES = \
 	draw/draw_pt_post_vs.c \
 	draw/draw_pt_so_emit.c \
 	draw/draw_pt_util.c \
-	draw/draw_pt_varray.c \
-	draw/draw_pt_vcache.c \
+	draw/draw_pt_vsplit.c \
 	draw/draw_vertex.c \
 	draw/draw_vs.c \
-	draw/draw_vs_varient.c \
 	draw/draw_vs_aos.c \
 	draw/draw_vs_aos_io.c \
 	draw/draw_vs_aos_machine.c \
 	draw/draw_vs_exec.c \
 	draw/draw_vs_ppc.c \
 	draw/draw_vs_sse.c \
+	draw/draw_vs_varient.c \
 	indices/u_indices_gen.c \
 	indices/u_unfilled_gen.c \
 	os/os_misc.c \
+	os/os_stream.c \
 	os/os_stream_log.c \
+	os/os_stream_null.c \
 	os/os_stream_stdc.c \
 	os/os_stream_str.c \
-	os/os_stream_null.c \
 	os/os_time.c \
 	pipebuffer/pb_buffer_fenced.c \
 	pipebuffer/pb_buffer_malloc.c \
@@ -65,17 +64,16 @@ C_SOURCES = \
 	pipebuffer/pb_bufmgr_slab.c \
 	pipebuffer/pb_validate.c \
 	rbug/rbug_connection.c \
+	rbug/rbug_context.c \
 	rbug/rbug_core.c \
+	rbug/rbug_demarshal.c \
 	rbug/rbug_texture.c \
-	rbug/rbug_context.c \
 	rbug/rbug_shader.c \
-	rbug/rbug_demarshal.c \
 	rtasm/rtasm_cpu.c \
 	rtasm/rtasm_execmem.c \
-	rtasm/rtasm_x86sse.c \
 	rtasm/rtasm_ppc.c \
 	rtasm/rtasm_ppc_spe.c \
-	tgsi/tgsi_sanity.c \
+	rtasm/rtasm_x86sse.c \
 	tgsi/tgsi_build.c \
 	tgsi/tgsi_dump.c \
 	tgsi/tgsi_exec.c \
@@ -83,19 +81,22 @@ C_SOURCES = \
 	tgsi/tgsi_iterate.c \
 	tgsi/tgsi_parse.c \
 	tgsi/tgsi_ppc.c \
+	tgsi/tgsi_sanity.c \
 	tgsi/tgsi_scan.c \
 	tgsi/tgsi_sse2.c \
 	tgsi/tgsi_text.c \
 	tgsi/tgsi_transform.c \
 	tgsi/tgsi_ureg.c \
 	tgsi/tgsi_util.c \
-	translate/translate_generic.c \
-	translate/translate_sse.c \
 	translate/translate.c \
 	translate/translate_cache.c \
+	translate/translate_generic.c \
+	translate/translate_sse.c \
 	util/u_debug.c \
-	util/u_debug_symbol.c \
+	util/u_debug_describe.c \
+	util/u_debug_refcnt.c \
 	util/u_debug_stack.c \
+	util/u_debug_symbol.c \
 	util/u_dump_defines.c \
 	util/u_dump_state.c \
 	util/u_bitmask.c \
@@ -118,10 +119,11 @@ C_SOURCES = \
 	util/u_gen_mipmap.c \
 	util/u_half.c \
 	util/u_handle_table.c \
-	util/u_hash_table.c \
 	util/u_hash.c \
+	util/u_hash_table.c \
 	util/u_keymap.c \
 	util/u_linear.c \
+	util/u_linkage.c \
 	util/u_network.c \
 	util/u_math.c \
 	util/u_mempool.c \
@@ -172,10 +174,10 @@ GALLIVM_SOURCES = \
         gallivm/lp_bld_tgsi_soa.c \
         gallivm/lp_bld_type.c \
         draw/draw_llvm.c \
-        draw/draw_vs_llvm.c \
-        draw/draw_pt_fetch_shade_pipeline_llvm.c \
+        draw/draw_llvm_sample.c \
         draw/draw_llvm_translate.c \
-        draw/draw_llvm_sample.c
+        draw/draw_vs_llvm.c \
+        draw/draw_pt_fetch_shade_pipeline_llvm.c
 
 GALLIVM_CPP_SOURCES = \
     gallivm/lp_bld_misc.cpp
diff --git a/src/gallium/auxiliary/SConscript b/src/gallium/auxiliary/SConscript
index 3124e20ce8..6210ada990 100644
--- a/src/gallium/auxiliary/SConscript
+++ b/src/gallium/auxiliary/SConscript
@@ -50,10 +50,11 @@ env.Depends('util/u_format_table.c', [
 ])
 
 source = [
-    'cso_cache/cso_context.c',
     'cso_cache/cso_cache.c',
+    'cso_cache/cso_context.c',
     'cso_cache/cso_hash.c',
     'draw/draw_context.c',
+    'draw/draw_gs.c',
     'draw/draw_pipe.c',
     'draw/draw_pipe_aaline.c',
     'draw/draw_pipe_aapoint.c',
@@ -71,7 +72,6 @@ source = [
     'draw/draw_pipe_wide_line.c',
     'draw/draw_pipe_wide_point.c',
     'draw/draw_pt.c',
-    'draw/draw_pt_elts.c',
     'draw/draw_pt_emit.c',
     'draw/draw_pt_fetch.c',
     'draw/draw_pt_fetch_emit.c',
@@ -80,8 +80,7 @@ source = [
     'draw/draw_pt_post_vs.c',
     'draw/draw_pt_so_emit.c',
     'draw/draw_pt_util.c',
-    'draw/draw_pt_varray.c',
-    'draw/draw_pt_vcache.c',
+    'draw/draw_pt_vsplit.c',
     'draw/draw_vertex.c',
     'draw/draw_vs.c',
     'draw/draw_vs_aos.c',
@@ -91,16 +90,16 @@ source = [
     'draw/draw_vs_ppc.c',
     'draw/draw_vs_sse.c',
     'draw/draw_vs_varient.c',
-    'draw/draw_gs.c',
     #'indices/u_indices.c',
     #'indices/u_unfilled_indices.c',
     'indices/u_indices_gen.c',
     'indices/u_unfilled_gen.c',
     'os/os_misc.c',
+    'os/os_stream.c',
     'os/os_stream_log.c',
+    'os/os_stream_null.c',
     'os/os_stream_stdc.c',
     'os/os_stream_str.c',
-    'os/os_stream_null.c',
     'os/os_time.c',
     'pipebuffer/pb_buffer_fenced.c',
     'pipebuffer/pb_buffer_malloc.c',
@@ -112,35 +111,35 @@ source = [
     'pipebuffer/pb_bufmgr_pool.c',
     'pipebuffer/pb_bufmgr_slab.c',
     'pipebuffer/pb_validate.c',
+    'rbug/rbug_connection.c',
+    'rbug/rbug_context.c',
     'rbug/rbug_core.c',
+    'rbug/rbug_demarshal.c',
     'rbug/rbug_shader.c',
-    'rbug/rbug_context.c',
     'rbug/rbug_texture.c',
-    'rbug/rbug_demarshal.c',
-    'rbug/rbug_connection.c',
     'rtasm/rtasm_cpu.c',
     'rtasm/rtasm_execmem.c',
-    'rtasm/rtasm_x86sse.c',
     'rtasm/rtasm_ppc.c',
     'rtasm/rtasm_ppc_spe.c',
+    'rtasm/rtasm_x86sse.c',
     'tgsi/tgsi_build.c',
     'tgsi/tgsi_dump.c',
     'tgsi/tgsi_exec.c',
     'tgsi/tgsi_info.c',
     'tgsi/tgsi_iterate.c',
     'tgsi/tgsi_parse.c',
+    'tgsi/tgsi_ppc.c',
     'tgsi/tgsi_sanity.c',
     'tgsi/tgsi_scan.c',
-    'tgsi/tgsi_ppc.c',
     'tgsi/tgsi_sse2.c',
     'tgsi/tgsi_text.c',
     'tgsi/tgsi_transform.c',
     'tgsi/tgsi_ureg.c',
     'tgsi/tgsi_util.c',
-    'translate/translate_generic.c',
-    'translate/translate_sse.c',
     'translate/translate.c',
     'translate/translate_cache.c',
+    'translate/translate_generic.c',
+    'translate/translate_sse.c',
     'util/u_bitmask.c',
     'util/u_blit.c',
     'util/u_blitter.c',
@@ -148,7 +147,9 @@ source = [
     'util/u_caps.c',
     'util/u_cpu_detect.c',
     'util/u_debug.c',
+    'util/u_debug_describe.c',
     'util/u_debug_memory.c',
+    'util/u_debug_refcnt.c',
     'util/u_debug_stack.c',
     'util/u_debug_symbol.c',
     'util/u_dump_defines.c',
@@ -170,6 +171,8 @@ source = [
     'util/u_hash.c',
     'util/u_hash_table.c',
     'util/u_keymap.c',
+    'util/u_linear.c',
+    'util/u_linkage.c',
     'util/u_network.c',
     'util/u_math.c',
     'util/u_mempool.c',
@@ -208,9 +211,9 @@ if env['llvm']:
     'gallivm/lp_bld_format_soa.c',
     'gallivm/lp_bld_format_yuv.c',
     'gallivm/lp_bld_gather.c',
+    'gallivm/lp_bld_init.c',
     'gallivm/lp_bld_intr.c',
     'gallivm/lp_bld_logic.c',
-    'gallivm/lp_bld_init.c',
     'gallivm/lp_bld_misc.cpp',
     'gallivm/lp_bld_pack.c',
     'gallivm/lp_bld_printf.c',
@@ -222,10 +225,10 @@ if env['llvm']:
     'gallivm/lp_bld_tgsi_soa.c',
     'gallivm/lp_bld_type.c',
     'draw/draw_llvm.c',
-    'draw/draw_pt_fetch_shade_pipeline_llvm.c',
+    'draw/draw_llvm_sample.c',
     'draw/draw_llvm_translate.c',
-    'draw/draw_vs_llvm.c',
-    'draw/draw_llvm_sample.c'
+    'draw/draw_pt_fetch_shade_pipeline_llvm.c',
+    'draw/draw_vs_llvm.c'
     ]
 
 gallium = env.ConvenienceLibrary(
diff --git a/src/gallium/auxiliary/draw/draw_cliptest_tmp.h b/src/gallium/auxiliary/draw/draw_cliptest_tmp.h
new file mode 100644
index 0000000000..958ed20dc8
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_cliptest_tmp.h
@@ -0,0 +1,114 @@
+/**************************************************************************
+ * 
+ * Copyright 2010, VMware, inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Per-vertex clip test, viewport mapping and edgeflag extraction, selected by
+ * FLAGS.  Returns TRUE if any vertex has a nonzero clipmask or a cleared edgeflag. */
+static boolean TAG(do_cliptest)( struct pt_post_vs *pvs,
+                                 struct draw_vertex_info *info )
+{
+   struct vertex_header *out = info->verts;
+   const float *scale = pvs->draw->viewport.scale;
+   const float *trans = pvs->draw->viewport.translate;
+   /* const */ float (*plane)[4] = pvs->draw->plane;
+   const unsigned pos = draw_current_shader_position_output(pvs->draw);
+   const unsigned ef = pvs->draw->vs.edgeflag_output;
+   const unsigned nr = pvs->draw->nr_planes;
+   const unsigned flags = (FLAGS);
+   unsigned need_pipeline = 0;
+   unsigned j;
+
+   for (j = 0; j < info->count; j++) {
+      float *position = out->data[pos];
+      unsigned mask = 0x0;
+  
+      initialize_vertex_header(out);
+
+      if (flags & (DO_CLIP_XY | DO_CLIP_FULL_Z | DO_CLIP_HALF_Z | DO_CLIP_USER)) {
+         out->clip[0] = position[0];
+         out->clip[1] = position[1];
+         out->clip[2] = position[2];
+         out->clip[3] = position[3];
+
+         /* Do the hardwired planes first:
+          */
+         if (flags & DO_CLIP_XY) {
+            if (-position[0] + position[3] < 0) mask |= (1<<0);
+            if ( position[0] + position[3] < 0) mask |= (1<<1);
+            if (-position[1] + position[3] < 0) mask |= (1<<2);
+            if ( position[1] + position[3] < 0) mask |= (1<<3);
+         }
+
+         /* Clip Z planes according to full cube, half cube or none.
+          */
+         if (flags & DO_CLIP_FULL_Z) {
+            if ( position[2] + position[3] < 0) mask |= (1<<4);
+            if (-position[2] + position[3] < 0) mask |= (1<<5);
+         }
+         else if (flags & DO_CLIP_HALF_Z) {
+            if ( position[2]               < 0) mask |= (1<<4);
+            if (-position[2] + position[3] < 0) mask |= (1<<5);
+         }
+
+         if (flags & DO_CLIP_USER) {
+            unsigned i;
+            for (i = 6; i < nr; i++) {
+               if (dot4(position, plane[i]) < 0) 
+                  mask |= (1<<i);
+            }
+         }
+
+         out->clipmask = mask;
+         need_pipeline |= out->clipmask;
+      }
+      /* Only vertices that passed every enabled clip test are mapped */
+      if ((flags & DO_VIEWPORT) && mask == 0)
+      {
+	 /* divide by w */
+	 float w = 1.0f / position[3];
+
+	 /* Viewport mapping */
+	 position[0] = position[0] * w * scale[0] + trans[0];
+	 position[1] = position[1] * w * scale[1] + trans[1];
+	 position[2] = position[2] * w * scale[2] + trans[2];
+	 position[3] = w;
+      }
+      /* ef == 0 disables this; the flag is set only for a value of exactly 1.0f */
+      if ((flags & DO_EDGEFLAG) && ef) {
+         const float *edgeflag = out->data[ef];
+         out->edgeflag = !(edgeflag[0] != 1.0f);
+         need_pipeline |= !out->edgeflag;
+      }
+      /* step to the next vertex using the byte stride supplied in info */
+      out = (struct vertex_header *)( (char *)out + info->stride );
+   }
+
+   return need_pipeline != 0;
+}
+
+
+#undef FLAGS
+#undef TAG
diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c
index 995b675b9a..937b093479 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -34,6 +34,7 @@
 #include "pipe/p_context.h"
 #include "util/u_memory.h"
 #include "util/u_math.h"
+#include "util/u_cpu_detect.h"
 #include "draw_context.h"
 #include "draw_vs.h"
 #include "draw_gs.h"
@@ -41,6 +42,25 @@
 #if HAVE_LLVM
 #include "gallivm/lp_bld_init.h"
 #include "draw_llvm.h"
+
+static boolean
+draw_get_option_use_llvm(void)
+{
+   /* Evaluated once: DRAW_USE_LLVM (default TRUE), forced off on x86 without SSE2 */
+   static boolean cached = FALSE;
+   static boolean use_llvm;
+
+   if (cached)
+      return use_llvm;
+   cached = TRUE;
+   use_llvm = debug_get_bool_option("DRAW_USE_LLVM", TRUE);
+#ifdef PIPE_ARCH_X86
+   util_cpu_detect();
+   if (!util_cpu_caps.has_sse2)
+      use_llvm = FALSE;    /* require SSE2 due to LLVM PR6960. */
+#endif
+   return use_llvm;
+}
 #endif
 
 struct draw_context *draw_create( struct pipe_context *pipe )
@@ -50,10 +70,13 @@ struct draw_context *draw_create( struct pipe_context *pipe )
       goto fail;
 
 #if HAVE_LLVM
-   lp_build_init();
-   assert(lp_build_engine);
-   draw->engine = lp_build_engine;
-   draw->llvm = draw_llvm_create(draw);
+   if(draw_get_option_use_llvm())
+   {
+      lp_build_init();
+      assert(lp_build_engine);
+      draw->engine = lp_build_engine;
+      draw->llvm = draw_llvm_create(draw);
+   }
 #endif
 
    if (!draw_init(draw))
@@ -83,6 +106,8 @@ boolean draw_init(struct draw_context *draw)
    ASSIGN_4V( draw->plane[4],  0,  0,  1, 1 ); /* yes these are correct */
    ASSIGN_4V( draw->plane[5],  0,  0, -1, 1 ); /* mesa's a bit wonky */
    draw->nr_planes = 6;
+   draw->clip_xy = 1;
+   draw->clip_z = 1;
 
 
    draw->reduced_prim = ~0; /* != any of PIPE_PRIM_x */
@@ -135,7 +160,8 @@ void draw_destroy( struct draw_context *draw )
    draw_vs_destroy( draw );
    draw_gs_destroy( draw );
 #ifdef HAVE_LLVM
-   draw_llvm_destroy( draw->llvm );
+   if(draw->llvm)
+      draw_llvm_destroy( draw->llvm );
 #endif
 
    FREE( draw );
@@ -162,6 +188,14 @@ void draw_set_mrd(struct draw_context *draw, double mrd)
 }
 
 
+/* Recompute clip_xy/clip_z/clip_user from driver bypass flags and clip state. */
+static void update_clip_flags( struct draw_context *draw )
+{
+   draw->clip_user = draw->nr_planes > 6;   /* planes past the 6 fixed ones */
+   draw->clip_xy = !draw->driver.bypass_clip_xy;
+   draw->clip_z = !(draw->driver.bypass_clip_z || draw->depth_clamp);
+}
+
 /**
  * Register new primitive rasterization/rendering state.
  * This causes the drawing pipeline to be rebuilt.
@@ -176,18 +210,25 @@ void draw_set_rasterizer_state( struct draw_context *draw,
       draw->rasterizer = raster;
       draw->rast_handle = rast_handle;
 
-      draw->bypass_clipping = draw->driver.bypass_clipping;
-   }
+  }
 }
 
-
+/* With a little more work, llvmpipe will be able to turn this off and
+ * do its own x/y clipping.  
+ *
+ * Some hardware can turn off clipping altogether - in particular any
+ * hardware with a TNL unit can do its own clipping, even if it is
+ * relying on the draw module for some other reason.
+ */
 void draw_set_driver_clipping( struct draw_context *draw,
-                               boolean bypass_clipping )
+                               boolean bypass_clip_xy,
+                               boolean bypass_clip_z )
 {
    draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
 
-   draw->driver.bypass_clipping = bypass_clipping;
-   draw->bypass_clipping = draw->driver.bypass_clipping;
+   draw->driver.bypass_clip_xy = bypass_clip_xy;
+   draw->driver.bypass_clip_z = bypass_clip_z;
+   update_clip_flags(draw);
 }
 
 
@@ -217,6 +258,8 @@ void draw_set_clip_state( struct draw_context *draw,
    memcpy(&draw->plane[6], clip->ucp, clip->nr * sizeof(clip->ucp[0]));
    draw->nr_planes = 6 + clip->nr;
    draw->depth_clamp = clip->depth_clamp;
+
+   update_clip_flags(draw);
 }
 
 
@@ -472,47 +515,28 @@ void draw_set_render( struct draw_context *draw,
 }
 
 
-
-/**
- * Tell the drawing context about the index/element buffer to use
- * (ala glDrawElements)
- * If no element buffer is to be used (i.e. glDrawArrays) then this
- * should be called with eltSize=0 and elements=NULL.
- *
- * \param draw  the drawing context
- * \param eltSize  size of each element (1, 2 or 4 bytes)
- * \param elements  the element buffer ptr
- */
 void
-draw_set_mapped_element_buffer_range( struct draw_context *draw,
-                                      unsigned eltSize,
-                                      int eltBias,
-                                      unsigned min_index,
-                                      unsigned max_index,
-                                      const void *elements )
+draw_set_index_buffer(struct draw_context *draw,
+                      const struct pipe_index_buffer *ib)
 {
-   draw->pt.user.elts = elements;
-   draw->pt.user.eltSize = eltSize;
-   draw->pt.user.eltBias = eltBias;
-   draw->pt.user.min_index = min_index;
-   draw->pt.user.max_index = max_index;
+   if (ib)
+      memcpy(&draw->pt.index_buffer, ib, sizeof(draw->pt.index_buffer));
+   else
+      memset(&draw->pt.index_buffer, 0, sizeof(draw->pt.index_buffer));
 }
 
 
+/**
+ * Tell drawing context where to find mapped index/element buffer.
+ */
 void
-draw_set_mapped_element_buffer( struct draw_context *draw,
-                                unsigned eltSize,
-                                int eltBias,
-                                const void *elements )
+draw_set_mapped_index_buffer(struct draw_context *draw,
+                             const void *elements)
 {
-   draw->pt.user.elts = elements;
-   draw->pt.user.eltSize = eltSize;
-   draw->pt.user.eltBias = eltBias;
-   draw->pt.user.min_index = 0;
-   draw->pt.user.max_index = 0xffffffff;
+    draw->pt.user.elts = elements;
 }
 
- 
+
 /* Revamp me please:
  */
 void draw_do_flush( struct draw_context *draw, unsigned flags )
@@ -659,7 +683,8 @@ draw_set_mapped_texture(struct draw_context *draw,
                         const void *data[DRAW_MAX_TEXTURE_LEVELS])
 {
 #ifdef HAVE_LLVM
-   draw_llvm_set_mapped_texture(draw,
+   if(draw->llvm)
+      draw_llvm_set_mapped_texture(draw,
                                 sampler_idx,
                                 width, height, depth, last_level,
                                 row_stride, img_stride, data);
diff --git a/src/gallium/auxiliary/draw/draw_context.h b/src/gallium/auxiliary/draw/draw_context.h
index 116716af6f..4c780e4dcb 100644
--- a/src/gallium/auxiliary/draw/draw_context.h
+++ b/src/gallium/auxiliary/draw/draw_context.h
@@ -160,18 +160,11 @@ void draw_set_vertex_elements(struct draw_context *draw,
 			      unsigned count,
                               const struct pipe_vertex_element *elements);
 
-void
-draw_set_mapped_element_buffer_range( struct draw_context *draw,
-                                      unsigned eltSize,
-                                      int eltBias,
-                                      unsigned min_index,
-                                      unsigned max_index,
-                                      const void *elements );
-
-void draw_set_mapped_element_buffer( struct draw_context *draw,
-                                     unsigned eltSize, 
-                                     int eltBias,
-                                     const void *elements );
+void draw_set_index_buffer(struct draw_context *draw,
+                           const struct pipe_index_buffer *ib);
+
+void draw_set_mapped_index_buffer(struct draw_context *draw,
+                                  const void *elements);
 
 void draw_set_mapped_vertex_buffer(struct draw_context *draw,
                                    unsigned attr, const void *buffer);
@@ -196,6 +189,9 @@ draw_set_so_state(struct draw_context *draw,
  * draw_pt.c 
  */
 
+void draw_vbo(struct draw_context *draw,
+              const struct pipe_draw_info *info);
+
 void draw_arrays(struct draw_context *draw, unsigned prim,
 		 unsigned start, unsigned count);
 
@@ -216,7 +212,8 @@ void draw_set_render( struct draw_context *draw,
 		      struct vbuf_render *render );
 
 void draw_set_driver_clipping( struct draw_context *draw,
-                               boolean bypass_clipping );
+                               boolean bypass_clip_xy,
+                               boolean bypass_clip_z );
 
 void draw_set_force_passthrough( struct draw_context *draw, 
                                  boolean enable );
diff --git a/src/gallium/auxiliary/draw/draw_decompose_tmp.h b/src/gallium/auxiliary/draw/draw_decompose_tmp.h
index a52d2b5058..a142563af9 100644
--- a/src/gallium/auxiliary/draw/draw_decompose_tmp.h
+++ b/src/gallium/auxiliary/draw/draw_decompose_tmp.h
@@ -54,10 +54,10 @@ FUNC(FUNC_VARS)
 
    FUNC_ENTER;
 
-   /* prim, count, and last_vertex_last should have been defined */
+   /* prim, prim_flags, count, and last_vertex_last should have been defined */
    if (0) {
-      debug_printf("%s: prim 0x%x, count %d, last_vertex_last %d\n",
-            __FUNCTION__, prim, count, last_vertex_last);
+      debug_printf("%s: prim 0x%x, prim_flags 0x%x, count %d, last_vertex_last %d\n",
+            __FUNCTION__, prim, prim_flags, count, last_vertex_last);
    }
 
    switch (prim) {
@@ -80,7 +80,7 @@ FUNC(FUNC_VARS)
    case PIPE_PRIM_LINE_LOOP:
    case PIPE_PRIM_LINE_STRIP:
       if (count >= 2) {
-         flags = DRAW_PIPE_RESET_STIPPLE;
+         flags = (prim_flags & DRAW_SPLIT_BEFORE) ? 0 : DRAW_PIPE_RESET_STIPPLE;
          idx[1] = GET_ELT(0);
          idx[2] = idx[1];
 
@@ -90,7 +90,7 @@ FUNC(FUNC_VARS)
             LINE(flags, idx[0], idx[1]);
          }
          /* close the loop */
-         if (prim == PIPE_PRIM_LINE_LOOP)
+         if (prim == PIPE_PRIM_LINE_LOOP && !prim_flags)
             LINE(flags, idx[1], idx[2]);
       }
       break;
@@ -255,17 +255,23 @@ FUNC(FUNC_VARS)
 
          if (last_vertex_last) {
             flags = (DRAW_PIPE_RESET_STIPPLE |
-                     DRAW_PIPE_EDGE_FLAG_2 |
                      DRAW_PIPE_EDGE_FLAG_0);
+            if (!(prim_flags & DRAW_SPLIT_BEFORE))
+               flags |= DRAW_PIPE_EDGE_FLAG_2;
+
             edge_next = DRAW_PIPE_EDGE_FLAG_0;
-            edge_finish = DRAW_PIPE_EDGE_FLAG_1;
+            edge_finish =
+               (prim_flags & DRAW_SPLIT_AFTER) ? 0 : DRAW_PIPE_EDGE_FLAG_1;
          }
          else {
             flags = (DRAW_PIPE_RESET_STIPPLE |
-                     DRAW_PIPE_EDGE_FLAG_0 |
                      DRAW_PIPE_EDGE_FLAG_1);
+            if (!(prim_flags & DRAW_SPLIT_BEFORE))
+               flags |= DRAW_PIPE_EDGE_FLAG_0;
+
             edge_next = DRAW_PIPE_EDGE_FLAG_1;
-            edge_finish = DRAW_PIPE_EDGE_FLAG_2;
+            edge_finish =
+               (prim_flags & DRAW_SPLIT_AFTER) ? 0 : DRAW_PIPE_EDGE_FLAG_2;
          }
 
          idx[0] = GET_ELT(0);
@@ -300,7 +306,7 @@ FUNC(FUNC_VARS)
 
    case PIPE_PRIM_LINE_STRIP_ADJACENCY:
       if (count >= 4) {
-         flags = DRAW_PIPE_RESET_STIPPLE;
+         flags = (prim_flags & DRAW_SPLIT_BEFORE) ? 0 : DRAW_PIPE_RESET_STIPPLE;
          idx[1] = GET_ELT(0);
          idx[2] = GET_ELT(1);
          idx[3] = GET_ELT(2);
diff --git a/src/gallium/auxiliary/draw/draw_gs.c b/src/gallium/auxiliary/draw/draw_gs.c
index 4a1013e79a..50a03ac95a 100644
--- a/src/gallium/auxiliary/draw/draw_gs.c
+++ b/src/gallium/auxiliary/draw/draw_gs.c
@@ -380,7 +380,7 @@ static void gs_tri_adj(struct draw_geometry_shader *shader,
 
 #define FUNC         gs_run_elts
 #define LOCAL_VARS   const ushort *elts = input_prims->elts;
-#define GET_ELT(idx) (elts[idx] & ~DRAW_PIPE_FLAG_MASK)
+#define GET_ELT(idx) (elts[idx])
 #include "draw_gs_tmp.h"
 
 
@@ -457,6 +457,7 @@ int draw_geometry_shader_run(struct draw_geometry_shader *shader,
    output_prims->start = 0;
    output_prims->count = shader->emitted_vertices;
    output_prims->prim = shader->output_primitive;
+   output_prims->flags = 0x0;
    output_prims->primitive_lengths = shader->primitive_lengths;
    output_prims->primitive_count = shader->emitted_primitives;
    output_verts->count = shader->emitted_vertices;
diff --git a/src/gallium/auxiliary/draw/draw_gs_tmp.h b/src/gallium/auxiliary/draw/draw_gs_tmp.h
index 4a17af0dea..de7b02655a 100644
--- a/src/gallium/auxiliary/draw/draw_gs_tmp.h
+++ b/src/gallium/auxiliary/draw/draw_gs_tmp.h
@@ -6,12 +6,10 @@
 
 #define FUNC_ENTER                                                \
    /* declare more local vars */                                  \
-   struct draw_context *draw = gs->draw;                          \
    const unsigned prim = input_prims->prim;                       \
+   const unsigned prim_flags = input_prims->flags;                \
    const unsigned count = input_prims->count;                     \
-   const boolean last_vertex_last =                               \
-      !(draw->rasterizer->flatshade &&                            \
-        draw->rasterizer->flatshade_first);                       \
+   const boolean last_vertex_last = TRUE;                         \
    do {                                                           \
       debug_assert(input_prims->primitive_count == 1);            \
       switch (prim) {                                             \
diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c
index 8d53601d19..8759c38cab 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -210,13 +210,6 @@ draw_llvm_create(struct draw_context *draw)
 {
    struct draw_llvm *llvm;
 
-#ifdef PIPE_ARCH_X86
-   util_cpu_detect();
-   /* require SSE2 due to LLVM PR6960. */
-   if (!util_cpu_caps.has_sse2)
-       return NULL;
-#endif
-
    llvm = CALLOC_STRUCT( draw_llvm );
    if (!llvm)
       return NULL;
@@ -292,15 +285,23 @@ draw_llvm_destroy(struct draw_llvm *llvm)
 }
 
 struct draw_llvm_variant *
-draw_llvm_create_variant(struct draw_llvm *llvm, int num_inputs)
+draw_llvm_create_variant(struct draw_llvm *llvm,
+			 unsigned num_inputs,
+			 const struct draw_llvm_variant_key *key)
 {
-   struct draw_llvm_variant *variant = MALLOC(sizeof(struct draw_llvm_variant));
+   struct draw_llvm_variant *variant;
    struct llvm_vertex_shader *shader =
       llvm_vertex_shader(llvm->draw->vs.vertex_shader);
 
+   variant = MALLOC(sizeof *variant +
+		    shader->variant_key_size -
+		    sizeof variant->key);
+   if (variant == NULL)
+      return NULL;
+
    variant->llvm = llvm;
 
-   draw_llvm_make_variant_key(llvm, &variant->key);
+   memcpy(&variant->key, key, shader->variant_key_size);
 
    llvm->vertex_header_ptr_type = create_vertex_header(llvm, num_inputs);
 
@@ -738,8 +739,9 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
    step = LLVMConstInt(LLVMInt32Type(), max_vertices, 0);
 
    /* code generated texture sampling */
-   sampler = draw_llvm_sampler_soa_create(variant->key.sampler,
-                                          context_ptr);
+   sampler = draw_llvm_sampler_soa_create(
+      draw_llvm_variant_key_samplers(&variant->key),
+      context_ptr);
 
 #if DEBUG_STORE
    lp_build_printf(builder, "start = %d, end = %d, step = %d\n",
@@ -901,8 +903,9 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian
    step = LLVMConstInt(LLVMInt32Type(), max_vertices, 0);
 
    /* code generated texture sampling */
-   sampler = draw_llvm_sampler_soa_create(variant->key.sampler,
-                                          context_ptr);
+   sampler = draw_llvm_sampler_soa_create(
+      draw_llvm_variant_key_samplers(&variant->key),
+      context_ptr);
 
    fetch_max = LLVMBuildSub(builder, fetch_count,
                             LLVMConstInt(LLVMInt32Type(), 1, 0),
@@ -1002,35 +1005,42 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian
    lp_func_delete_body(variant->function_elts);
 }
 
-void
-draw_llvm_make_variant_key(struct draw_llvm *llvm,
-                           struct draw_llvm_variant_key *key)
+
+struct draw_llvm_variant_key *
+draw_llvm_make_variant_key(struct draw_llvm *llvm, char *store)
 {
    unsigned i;
+   struct draw_llvm_variant_key *key;
+   struct lp_sampler_static_state *sampler;
 
-   memset(key, 0, sizeof(struct draw_llvm_variant_key));
+   key = (struct draw_llvm_variant_key *)store;
 
+   /* Presumably all variants of the shader should have the same
+    * number of vertex elements - ie the number of shader inputs.
+    */
    key->nr_vertex_elements = llvm->draw->pt.nr_vertex_elements;
 
+   /* All variants of this shader will have the same value for
+    * nr_samplers.  Not yet trying to compact away holes in the
+    * sampler array.
+    */
+   key->nr_samplers = llvm->draw->vs.vertex_shader->info.file_max[TGSI_FILE_SAMPLER] + 1;
+
+   sampler = draw_llvm_variant_key_samplers(key);
+
    memcpy(key->vertex_element,
           llvm->draw->pt.vertex_element,
           sizeof(struct pipe_vertex_element) * key->nr_vertex_elements);
+   
+   memset(sampler, 0, key->nr_samplers * sizeof *sampler);
 
-   memcpy(&key->vs,
-          &llvm->draw->vs.vertex_shader->state,
-          sizeof(struct pipe_shader_state));
-
-   /* if the driver implemented the sampling hooks then
-    * setup our sampling state */
-   if (llvm->draw->num_sampler_views && llvm->draw->num_samplers) {
-      for(i = 0; i < PIPE_MAX_VERTEX_SAMPLERS; ++i) {
-         struct draw_vertex_shader *shader = llvm->draw->vs.vertex_shader;
-         if(shader->info.file_mask[TGSI_FILE_SAMPLER] & (1 << i))
-            lp_sampler_static_state(&key->sampler[i],
-                                    llvm->draw->sampler_views[i],
-                                    llvm->draw->samplers[i]);
-      }
+   for (i = 0 ; i < key->nr_samplers; i++) {
+      lp_sampler_static_state(&sampler[i],
+			      llvm->draw->sampler_views[i],
+			      llvm->draw->samplers[i]);
    }
+
+   return key;
 }
 
 void
diff --git a/src/gallium/auxiliary/draw/draw_llvm.h b/src/gallium/auxiliary/draw/draw_llvm.h
index 4addb47d2d..6196b2f983 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.h
+++ b/src/gallium/auxiliary/draw/draw_llvm.h
@@ -151,12 +151,43 @@ typedef void
 
 struct draw_llvm_variant_key
 {
-   struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS];
-   unsigned                   nr_vertex_elements;
-   struct pipe_shader_state   vs;
-   struct lp_sampler_static_state sampler[PIPE_MAX_VERTEX_SAMPLERS];
+   unsigned nr_vertex_elements:16;
+   unsigned nr_samplers:16;
+
+   /* Variable number of vertex elements:
+    */
+   struct pipe_vertex_element vertex_element[1];
+
+   /* Followed by variable number of samplers:
+    */
+/*   struct lp_sampler_static_state sampler; */
 };
 
+#define DRAW_LLVM_MAX_VARIANT_KEY_SIZE \
+   (sizeof(struct draw_llvm_variant_key) +	\
+    PIPE_MAX_VERTEX_SAMPLERS * sizeof(struct lp_sampler_static_state) +	\
+    (PIPE_MAX_ATTRIBS-1) * sizeof(struct pipe_vertex_element))
+
+/* Key size in bytes; adds before subtracting so a 0 element count cannot wrap. */
+static INLINE size_t
+draw_llvm_variant_key_size(unsigned nr_vertex_elements, unsigned nr_samplers)
+{
+   return (sizeof(struct draw_llvm_variant_key) +
+	   nr_samplers * sizeof(struct lp_sampler_static_state) +
+	   nr_vertex_elements * sizeof(struct pipe_vertex_element) -
+	   sizeof(struct pipe_vertex_element));
+}
+
+
+static INLINE struct lp_sampler_static_state *
+draw_llvm_variant_key_samplers(struct draw_llvm_variant_key *key)
+{
+   struct pipe_vertex_element *end = key->vertex_element + key->nr_vertex_elements;
+   return (struct lp_sampler_static_state *)end;   /* samplers follow the elements */
+}
+
+
+
 struct draw_llvm_variant_list_item
 {
    struct draw_llvm_variant *base;
@@ -165,7 +196,6 @@ struct draw_llvm_variant_list_item
 
 struct draw_llvm_variant
 {
-   struct draw_llvm_variant_key key;
    LLVMValueRef function;
    LLVMValueRef function_elts;
    draw_jit_vert_func jit_func;
@@ -176,11 +206,16 @@ struct draw_llvm_variant
    struct draw_llvm *llvm;
    struct draw_llvm_variant_list_item list_item_global;
    struct draw_llvm_variant_list_item list_item_local;
+
+   /* key is variable-sized, must be last */
+   struct draw_llvm_variant_key key;
+   /* key is variable-sized, must be last */
 };
 
 struct llvm_vertex_shader {
    struct draw_vertex_shader base;
 
+   unsigned variant_key_size;
    struct draw_llvm_variant_list_item variants;
    unsigned variants_created;
    unsigned variants_cached;
@@ -220,14 +255,15 @@ void
 draw_llvm_destroy(struct draw_llvm *llvm);
 
 struct draw_llvm_variant *
-draw_llvm_create_variant(struct draw_llvm *llvm, int num_inputs);
+draw_llvm_create_variant(struct draw_llvm *llvm,
+			 unsigned num_vertex_header_attribs,
+			 const struct draw_llvm_variant_key *key);
 
 void
 draw_llvm_destroy_variant(struct draw_llvm_variant *variant);
 
-void
-draw_llvm_make_variant_key(struct draw_llvm *llvm,
-                           struct draw_llvm_variant_key *key);
+struct draw_llvm_variant_key *
+draw_llvm_make_variant_key(struct draw_llvm *llvm, char *store);
 
 LLVMValueRef
 draw_llvm_translate_from(LLVMBuilderRef builder,
diff --git a/src/gallium/auxiliary/draw/draw_pipe.c b/src/gallium/auxiliary/draw/draw_pipe.c
index 58995e0724..6206197dae 100644
--- a/src/gallium/auxiliary/draw/draw_pipe.c
+++ b/src/gallium/auxiliary/draw/draw_pipe.c
@@ -169,35 +169,27 @@ static void do_triangle( struct draw_context *draw,
 /*
  * Set up macros for draw_pt_decompose.h template code.
  * This code uses vertex indexes / elements.
- *
- * Flags are needed by the stipple and unfilled stages.  When the two stages
- * are active, vcache_run_extras is called and the flags are stored in the
- * higher bits of i0.  Otherwise, flags do not matter.
  */
 
 #define TRIANGLE(flags,i0,i1,i2)                                  \
    do {                                                           \
-      assert(!((i1) & DRAW_PIPE_FLAG_MASK));                      \
-      assert(!((i2) & DRAW_PIPE_FLAG_MASK));                      \
       do_triangle( draw,                                          \
-                   i0,  /* flags */                               \
-                   verts + stride * (i0 & ~DRAW_PIPE_FLAG_MASK),  \
+                   flags,                                         \
+                   verts + stride * (i0),                         \
                    verts + stride * (i1),                         \
                    verts + stride * (i2) );                       \
    } while (0)
 
 #define LINE(flags,i0,i1)                                         \
    do {                                                           \
-      assert(!((i1) & DRAW_PIPE_FLAG_MASK));                      \
       do_line( draw,                                              \
-               i0, /* flags */                                    \
-               verts + stride * (i0 & ~DRAW_PIPE_FLAG_MASK),      \
+               flags,                                             \
+               verts + stride * (i0),                             \
                verts + stride * (i1) );                           \
    } while (0)
 
 #define POINT(i0)                               \
    do {                                         \
-      assert(!((i0) & DRAW_PIPE_FLAG_MASK));    \
       do_point( draw, verts + stride * (i0) );  \
    } while (0)
 
@@ -207,6 +199,7 @@ static void do_triangle( struct draw_context *draw,
 #define FUNC_VARS                               \
     struct draw_context *draw,                  \
     unsigned prim,                              \
+    unsigned prim_flags,                        \
     struct vertex_header *vertices,             \
     unsigned stride,                            \
     const ushort *elts,                         \
@@ -245,22 +238,27 @@ void draw_pipeline_run( struct draw_context *draw,
       const unsigned count = prim_info->primitive_lengths[i];
 
 #if DEBUG
-      /* make sure none of the element indexes go outside the vertex buffer */
+      /* Warn if one of the element indexes goes outside the vertex buffer */
       {
          unsigned max_index = 0x0, i;
          /* find the largest element index */
          for (i = 0; i < count; i++) {
-            unsigned int index = (prim_info->elts[start + i]
-                                  & ~DRAW_PIPE_FLAG_MASK);
+            unsigned int index = prim_info->elts[start + i];
             if (index > max_index)
                max_index = index;
          }
-         assert(max_index <= vert_info->count);
+         if (max_index >= vert_info->count) {
+            debug_printf("%s: max_index (%u) outside vertex buffer (%u)\n",
+                         __FUNCTION__,
+                         max_index,
+                         vert_info->count);
+         }
       }
 #endif
 
       pipe_run_elts(draw,
                     prim_info->prim,
+                    prim_info->flags,
                     vert_info->verts,
                     vert_info->stride,
                     prim_info->elts + start,
@@ -298,6 +296,7 @@ void draw_pipeline_run( struct draw_context *draw,
 #define FUNC_VARS                      \
     struct draw_context *draw,         \
     unsigned prim,                     \
+    unsigned prim_flags,               \
     struct vertex_header *vertices,    \
     unsigned stride,                   \
     unsigned count
@@ -330,6 +329,7 @@ void draw_pipeline_run_linear( struct draw_context *draw,
 
       pipe_run_linear(draw,
                       prim_info->prim,
+                      prim_info->flags,
                       (struct vertex_header*)verts,
                       vert_info->stride,
                       count);
diff --git a/src/gallium/auxiliary/draw/draw_pipe_validate.c b/src/gallium/auxiliary/draw/draw_pipe_validate.c
index eafa29276f..8b92543987 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_validate.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_validate.c
@@ -265,7 +265,7 @@ static struct draw_stage *validate_pipeline( struct draw_stage *stage )
 
    /* Clip stage
     */
-   if (!draw->bypass_clipping)
+   if (draw->clip_xy || draw->clip_z || draw->clip_user)
    {
       draw->pipeline.clip->next = next;
       next = draw->pipeline.clip;
diff --git a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
index 3c93c9014a..58c5858734 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
@@ -353,9 +353,6 @@ vbuf_alloc_vertices( struct vbuf_stage *vbuf )
    /* Allocate a new vertex buffer */
    vbuf->max_vertices = vbuf->render->max_vertex_buffer_bytes / vbuf->vertex_size;
 
-   /* even number */
-   vbuf->max_vertices = vbuf->max_vertices & ~1;
-
    if(vbuf->max_vertices >= UNDEFINED_VERTEX_ID)
       vbuf->max_vertices = UNDEFINED_VERTEX_ID - 1;
 
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index 397d4bf653..362f563ba6 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -140,8 +140,7 @@ struct draw_context
       } middle;
 
       struct {
-         struct draw_pt_front_end *vcache;
-         struct draw_pt_front_end *varray;
+         struct draw_pt_front_end *vsplit;
       } front;
 
       struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
@@ -150,6 +149,8 @@ struct draw_context
       struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS];
       unsigned nr_vertex_elements;
 
+      struct pipe_index_buffer index_buffer;
+
       /* user-space vertex data, buffers */
       struct {
          /** vertex element/index buffer (ex: glDrawElements) */
@@ -175,13 +176,19 @@ struct draw_context
    } pt;
 
    struct {
-      boolean bypass_clipping;
-      boolean bypass_vs;
+      boolean bypass_clip_xy;
+      boolean bypass_clip_z;
    } driver;
 
    boolean flushing;         /**< debugging/sanity */
    boolean suspend_flushing; /**< internally set */
-   boolean bypass_clipping;  /**< set if either api or driver bypass_clipping true */
+
+   /* Flags set if API requires clipping in these planes and the
+    * driver doesn't indicate that it can do it for us.
+    */
+   boolean clip_xy;
+   boolean clip_z;
+   boolean clip_user;
 
    boolean force_passthrough; /**< never clip or shade */
 
@@ -296,6 +303,10 @@ struct draw_vertex_info {
    unsigned count;
 };
 
+/* these flags are set if the primitive is a segment of a larger one */
+#define DRAW_SPLIT_BEFORE 0x1
+#define DRAW_SPLIT_AFTER  0x2
+
 struct draw_prim_info {
    boolean linear;
    unsigned start;
@@ -304,6 +315,7 @@ struct draw_prim_info {
    unsigned count;
 
    unsigned prim;
+   unsigned flags;
    unsigned *primitive_lengths;
    unsigned primitive_count;
 };
@@ -369,21 +381,15 @@ void draw_pipeline_destroy( struct draw_context *draw );
 
 
 
-/* We use the top few bits in the elts[] parameter to convey a little
- * API information.  This limits the number of vertices we can address
- * to only 4096 -- if that becomes a problem, we can switch to 32-bit
- * draw indices.
- *
- * These flags expected at first vertex of lines & triangles when
- * unfilled and/or line stipple modes are operational.
+/*
+ * These flags are used by the pipeline when unfilled and/or line stipple modes
+ * are operational.
  */
-#define DRAW_PIPE_MAX_VERTICES  (0x1<<12)
-#define DRAW_PIPE_EDGE_FLAG_0   (0x1<<12)
-#define DRAW_PIPE_EDGE_FLAG_1   (0x2<<12)
-#define DRAW_PIPE_EDGE_FLAG_2   (0x4<<12)
-#define DRAW_PIPE_EDGE_FLAG_ALL (0x7<<12)
-#define DRAW_PIPE_RESET_STIPPLE (0x8<<12)
-#define DRAW_PIPE_FLAG_MASK     (0xf<<12)
+#define DRAW_PIPE_EDGE_FLAG_0   0x1
+#define DRAW_PIPE_EDGE_FLAG_1   0x2
+#define DRAW_PIPE_EDGE_FLAG_2   0x4
+#define DRAW_PIPE_EDGE_FLAG_ALL 0x7
+#define DRAW_PIPE_RESET_STIPPLE 0x8
 
 void draw_pipeline_run( struct draw_context *draw,
                         const struct draw_vertex_info *vert,
diff --git a/src/gallium/auxiliary/draw/draw_pt.c b/src/gallium/auxiliary/draw/draw_pt.c
index 248927505d..f44bf2507c 100644
--- a/src/gallium/auxiliary/draw/draw_pt.c
+++ b/src/gallium/auxiliary/draw/draw_pt.c
@@ -39,25 +39,14 @@
 #include "util/u_math.h"
 #include "util/u_prim.h"
 #include "util/u_format.h"
+#include "util/u_draw.h"
 
 
 DEBUG_GET_ONCE_BOOL_OPTION(draw_fse, "DRAW_FSE", FALSE)
 DEBUG_GET_ONCE_BOOL_OPTION(draw_no_fse, "DRAW_NO_FSE", FALSE)
-#ifdef HAVE_LLVM
-DEBUG_GET_ONCE_BOOL_OPTION(draw_use_llvm, "DRAW_USE_LLVM", TRUE)
-#endif
-
-static unsigned trim( unsigned count, unsigned first, unsigned incr )
-{
-   if (count < first)
-      return 0;
-   return count - (count - first) % incr; 
-}
-
-
 
 /* Overall we split things into:
- *     - frontend -- prepare fetch_elts, draw_elts - eg vcache
+ *     - frontend -- prepare fetch_elts, draw_elts - eg vsplit
  *     - middle   -- fetch, shade, cliptest, viewport
  *     - pipeline -- the prim pipeline: clipping, wide lines, etc 
  *     - backend  -- the vbuf_render provided by the driver.
@@ -77,7 +66,7 @@ draw_pt_arrays(struct draw_context *draw,
    {
       unsigned first, incr;
       draw_pt_split_prim(prim, &first, &incr);
-      count = trim(count, first, incr);
+      count = draw_pt_trim_count(count, first, incr);
       if (count < first)
          return TRUE;
    }
@@ -97,7 +86,9 @@ draw_pt_arrays(struct draw_context *draw,
          opt |= PT_PIPELINE;
       }
 
-      if (!draw->bypass_clipping && !draw->pt.test_fse) {
+      if ((draw->clip_xy ||
+           draw->clip_z ||
+           draw->clip_user) && !draw->pt.test_fse) {
          opt |= PT_CLIPTEST;
       }
 
@@ -115,22 +106,11 @@ draw_pt_arrays(struct draw_context *draw,
          middle = draw->pt.middle.general;
    }
 
-
-   /* Pick the right frontend
-    */
-   if (draw->pt.user.elts || (opt & PT_PIPELINE)) {
-      frontend = draw->pt.front.vcache;
-   } else {
-      frontend = draw->pt.front.varray;
-   }
+   frontend = draw->pt.front.vsplit;
 
    frontend->prepare( frontend, prim, middle, opt );
 
-   frontend->run(frontend,
-                 draw_pt_elt_func(draw),
-                 draw_pt_elt_ptr(draw, start),
-                 draw->pt.user.eltBias,
-                 count);
+   frontend->run(frontend, start, count);
 
    frontend->finish( frontend );
 
@@ -143,12 +123,8 @@ boolean draw_pt_init( struct draw_context *draw )
    draw->pt.test_fse = debug_get_option_draw_fse();
    draw->pt.no_fse = debug_get_option_draw_no_fse();
 
-   draw->pt.front.vcache = draw_pt_vcache( draw );
-   if (!draw->pt.front.vcache)
-      return FALSE;
-
-   draw->pt.front.varray = draw_pt_varray(draw);
-   if (!draw->pt.front.varray)
+   draw->pt.front.vsplit = draw_pt_vsplit(draw);
+   if (!draw->pt.front.vsplit)
       return FALSE;
 
    draw->pt.middle.fetch_emit = draw_pt_fetch_emit( draw );
@@ -164,7 +140,7 @@ boolean draw_pt_init( struct draw_context *draw )
       return FALSE;
 
 #if HAVE_LLVM
-   if (debug_get_option_draw_use_llvm())
+   if (draw->llvm)
       draw->pt.middle.llvm = draw_pt_fetch_pipeline_or_emit_llvm( draw );
 #endif
 
@@ -194,14 +170,9 @@ void draw_pt_destroy( struct draw_context *draw )
       draw->pt.middle.fetch_shade_emit = NULL;
    }
 
-   if (draw->pt.front.vcache) {
-      draw->pt.front.vcache->destroy( draw->pt.front.vcache );
-      draw->pt.front.vcache = NULL;
-   }
-
-   if (draw->pt.front.varray) {
-      draw->pt.front.varray->destroy( draw->pt.front.varray );
-      draw->pt.front.varray = NULL;
+   if (draw->pt.front.vsplit) {
+      draw->pt.front.vsplit->destroy( draw->pt.front.vsplit );
+      draw->pt.front.vsplit = NULL;
    }
 }
 
@@ -221,24 +192,29 @@ draw_print_arrays(struct draw_context *draw, uint prim, int start, uint count)
       uint ii = 0;
       uint j;
 
-      if (draw->pt.user.elts) {
+      if (draw->pt.user.eltSize) {
+         const char *elts;
+
          /* indexed arrays */
+         elts = (const char *) draw->pt.user.elts;
+         elts += draw->pt.index_buffer.offset;
+
          switch (draw->pt.user.eltSize) {
          case 1:
             {
-               const ubyte *elem = (const ubyte *) draw->pt.user.elts;
+               const ubyte *elem = (const ubyte *) elts;
                ii = elem[start + i];
             }
             break;
          case 2:
             {
-               const ushort *elem = (const ushort *) draw->pt.user.elts;
+               const ushort *elem = (const ushort *) elts;
                ii = elem[start + i];
             }
             break;
          case 4:
             {
-               const uint *elem = (const uint *) draw->pt.user.elts;
+               const uint *elem = (const uint *) elts;
                ii = elem[start + i];
             }
             break;
@@ -324,17 +300,8 @@ draw_arrays(struct draw_context *draw, unsigned prim,
 
 
 /**
- * Draw vertex arrays.
- * This is the main entrypoint into the drawing module.
- * If drawing an indexed primitive, the draw_set_mapped_element_buffer_range()
- * function should have already been called to specify the element/index buffer
- * information.
- *
- * \param prim  one of PIPE_PRIM_x
- * \param start  index of first vertex to draw
- * \param count  number of vertices to draw
- * \param startInstance  number for the first primitive instance (usually 0).
- * \param instanceCount  number of instances to draw (1=non-instanced)
+ * Instanced drawing.
+ * \sa draw_vbo
  */
 void
 draw_arrays_instanced(struct draw_context *draw,
@@ -344,10 +311,50 @@ draw_arrays_instanced(struct draw_context *draw,
                       unsigned startInstance,
                       unsigned instanceCount)
 {
-   unsigned reduced_prim = u_reduced_prim(mode);
+   struct pipe_draw_info info;
+
+   util_draw_init_info(&info);
+
+   info.mode = mode;
+   info.start = start;
+   info.count = count;
+   info.start_instance = startInstance;
+   info.instance_count = instanceCount;
+
+   info.indexed = (draw->pt.user.elts != NULL);
+   if (!info.indexed) {
+      info.min_index = start;
+      info.max_index = start + count - 1;
+   }
+
+   draw_vbo(draw, &info);
+}
+
+
+/**
+ * Draw vertex arrays.
+ * This is the main entrypoint into the drawing module.  If drawing an indexed
+ * primitive, the draw_set_index_buffer() and draw_set_mapped_index_buffer()
+ * functions should have already been called to specify the element/index
+ * buffer information.
+ */
+void
+draw_vbo(struct draw_context *draw,
+         const struct pipe_draw_info *info)
+{
+   unsigned reduced_prim = u_reduced_prim(info->mode);
    unsigned instance;
 
-   assert(instanceCount > 0);
+   assert(info->instance_count > 0);
+   if (info->indexed)
+      assert(draw->pt.user.elts);
+
+   draw->pt.user.eltSize =
+      (info->indexed) ? draw->pt.index_buffer.index_size : 0;
+
+   draw->pt.user.eltBias = info->index_bias;
+   draw->pt.user.min_index = info->min_index;
+   draw->pt.user.max_index = info->max_index;
 
    if (reduced_prim != draw->reduced_prim) {
       draw_do_flush(draw, DRAW_FLUSH_STATE_CHANGE);
@@ -355,8 +362,8 @@ draw_arrays_instanced(struct draw_context *draw,
    }
 
    if (0)
-      debug_printf("draw_arrays(mode=%u start=%u count=%u):\n",
-                   mode, start, count);
+      debug_printf("draw_vbo(mode=%u start=%u count=%u):\n",
+                   info->mode, info->start, info->count);
 
    if (0)
       tgsi_dump(draw->vs.vertex_shader->state.tokens, 0);
@@ -384,10 +391,10 @@ draw_arrays_instanced(struct draw_context *draw,
    }
 
    if (0)
-      draw_print_arrays(draw, mode, start, MIN2(count, 20));
+      draw_print_arrays(draw, info->mode, info->start, MIN2(info->count, 20));
 
-   for (instance = 0; instance < instanceCount; instance++) {
-      draw->instance_id = instance + startInstance;
-      draw_pt_arrays(draw, mode, start, count);
+   for (instance = 0; instance < info->instance_count; instance++) {
+      draw->instance_id = instance + info->start_instance;
+      draw_pt_arrays(draw, info->mode, info->start, info->count);
    }
 }
diff --git a/src/gallium/auxiliary/draw/draw_pt.h b/src/gallium/auxiliary/draw/draw_pt.h
index 44356fba4c..5fbb424291 100644
--- a/src/gallium/auxiliary/draw/draw_pt.h
+++ b/src/gallium/auxiliary/draw/draw_pt.h
@@ -35,8 +35,6 @@
 
 #include "pipe/p_compiler.h"
 
-typedef unsigned (*pt_elt_func)( const void *elts, unsigned idx );
-
 struct draw_pt_middle_end;
 struct draw_context;
 struct draw_prim_info;
@@ -52,13 +50,18 @@ struct draw_vertex_info;
 /* The "front end" - prepare sets of fetch, draw elements for the
  * middle end.
  *
- * Currenly one version of this:
- *    - vcache - catchall implementation, decomposes to TRI/LINE/POINT prims
- * Later:
- *    - varray, varray_split
- *    - velement, velement_split
+ * The fetch elements are indices to the vertices.  The draw elements are
+ * indices to the fetched vertices.  When both arrays of elements are
+ * linear, middle->run_linear is called; when only the fetch elements are
+ * linear, middle->run_linear_elts is called; otherwise, middle->run is
+ * called.
+ *
+ * When the number of draw elements exceeds max_vertex of the middle end,
+ * the draw elements (as well as the fetch elements) are split and the
+ * middle end is called multiple times.
  *
- * Currenly only using the vcache version.
+ * Currently there is:
+ *    - vsplit - catchall implementation, splits big prims
  */
 struct draw_pt_front_end {
    void (*prepare)( struct draw_pt_front_end *,
@@ -67,9 +70,7 @@ struct draw_pt_front_end {
 		    unsigned opt );
 
    void (*run)( struct draw_pt_front_end *,
-                pt_elt_func elt_func,
-                const void *elt_ptr,
-                int elt_bias,
+                unsigned start,
                 unsigned count );
 
    void (*finish)( struct draw_pt_front_end * );
@@ -80,6 +81,8 @@ struct draw_pt_front_end {
 /* The "middle end" - prepares actual hardware vertices for the
  * hardware backend.
  *
+ * prim_flags is as defined by pipe_draw_info::flags.
+ *
  * Currently two versions of this:
  *     - fetch, vertex shade, cliptest, prim-pipeline
  *     - fetch, emit (ie passthrough)
@@ -94,11 +97,13 @@ struct draw_pt_middle_end {
                 const unsigned *fetch_elts,
                 unsigned fetch_count,
                 const ushort *draw_elts,
-                unsigned draw_count );
+                unsigned draw_count,
+                unsigned prim_flags );
 
    void (*run_linear)(struct draw_pt_middle_end *,
                       unsigned start,
-                      unsigned count);
+                      unsigned count,
+                      unsigned prim_flags );
 
    /* Transform all vertices in a linear range and then draw them with
     * the supplied element list.  May fail and return FALSE.
@@ -107,7 +112,8 @@ struct draw_pt_middle_end {
                             unsigned fetch_start,
                             unsigned fetch_count,
                             const ushort *draw_elts,
-                            unsigned draw_count );
+                            unsigned draw_count,
+                            unsigned prim_flags );
 
    int (*get_max_vertex_count)( struct draw_pt_middle_end * );
 
@@ -122,19 +128,11 @@ struct vbuf_render;
 struct vertex_header;
 
 
-/* Helper functions.
- */
-pt_elt_func draw_pt_elt_func( struct draw_context *draw );
-const void *draw_pt_elt_ptr( struct draw_context *draw,
-                             unsigned start );
-
 /* Frontends: 
  *
- * Currently only the general-purpose vcache implementation, could add
- * a special case for tiny vertex buffers.
+ * Currently only the general-purpose vsplit implementation.
  */
-struct draw_pt_front_end *draw_pt_vcache( struct draw_context *draw );
-struct draw_pt_front_end *draw_pt_varray(struct draw_context *draw);
+struct draw_pt_front_end *draw_pt_vsplit(struct draw_context *draw);
 
 
 /* Middle-ends:
@@ -223,7 +221,9 @@ boolean draw_pt_post_vs_run( struct pt_post_vs *pvs,
 			     struct draw_vertex_info *info );
 
 void draw_pt_post_vs_prepare( struct pt_post_vs *pvs,
-			      boolean bypass_clipping,
+			      boolean clip_xy,
+			      boolean clip_z,
+			      boolean clip_user,
 			      boolean bypass_viewport,
 			      boolean opengl,
 			      boolean need_edgeflags );
@@ -237,6 +237,7 @@ void draw_pt_post_vs_destroy( struct pt_post_vs *pvs );
  * Utils: 
  */
 void draw_pt_split_prim(unsigned prim, unsigned *first, unsigned *incr);
+unsigned draw_pt_trim_count(unsigned count, unsigned first, unsigned incr);
 
 
 #endif
diff --git a/src/gallium/auxiliary/draw/draw_pt_elts.c b/src/gallium/auxiliary/draw/draw_pt_elts.c
deleted file mode 100644
index 88f4d9f495..0000000000
--- a/src/gallium/auxiliary/draw/draw_pt_elts.c
+++ /dev/null
@@ -1,89 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-#include "draw/draw_pt.h"
-#include "draw/draw_private.h"
-
-/* Neat get_elt func that also works for varrays drawing by encoding
- * the start value into a pointer.  
- */
-
-static unsigned elt_uint( const void *elts, unsigned idx )
-{
-   return *(((const uint *)elts) + idx);
-}
-
-static unsigned elt_ushort( const void *elts, unsigned idx )
-{
-   return *(((const ushort *)elts) + idx);
-}
-
-static unsigned elt_ubyte( const void *elts, unsigned idx )
-{
-   return *(((const ubyte *)elts) + idx);
-}
-
-static unsigned elt_vert( const void *elts, unsigned idx )
-{
-   /* unsigned index is packed in the pointer */
-   return (unsigned)(uintptr_t)elts + idx;
-}
-
-pt_elt_func draw_pt_elt_func( struct draw_context *draw )
-{
-   switch (draw->pt.user.eltSize) {
-   case 0: return &elt_vert;
-   case 1: return &elt_ubyte;
-   case 2: return &elt_ushort; 
-   case 4: return &elt_uint;
-   default: return NULL;
-   }
-}     
-
-const void *draw_pt_elt_ptr( struct draw_context *draw,
-                             unsigned start )
-{
-   const char *elts = draw->pt.user.elts;
-
-   switch (draw->pt.user.eltSize) {
-   case 0: 
-      return (const void *)(((const ubyte *)NULL) + start);
-   case 1: 
-      return (const void *)(((const ubyte *)elts) + start);
-   case 2: 
-      return (const void *)(((const ushort *)elts) + start);
-   case 4: 
-      return (const void *)(((const uint *)elts) + start);
-   default:
-      return NULL;
-   }
-}
diff --git a/src/gallium/auxiliary/draw/draw_pt_emit.c b/src/gallium/auxiliary/draw/draw_pt_emit.c
index 5568fbb9f8..c8dfc16911 100644
--- a/src/gallium/auxiliary/draw/draw_pt_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_emit.c
@@ -120,9 +120,6 @@ void draw_pt_emit_prepare( struct pt_emit *emit,
 
    *max_vertices = (draw->render->max_vertex_buffer_bytes / 
                     (vinfo->size * 4));
-
-   /* even number */
-   *max_vertices = *max_vertices & ~1;
 }
 
 
@@ -147,11 +144,6 @@ void draw_pt_emit( struct pt_emit *emit,
    if (vertex_count == 0)
       return;
 
-   if (vertex_count >= UNDEFINED_VERTEX_ID) {
-      assert(0);
-      return;
-   }
-
    /* XXX: and work out some way to coordinate the render primitive
     * between vbuf.c and here...
     */
@@ -226,9 +218,6 @@ void draw_pt_emit_linear(struct pt_emit *emit,
     */
    draw_do_flush( draw, DRAW_FLUSH_BACKEND );
 
-   if (count >= UNDEFINED_VERTEX_ID)
-      goto fail;
-
    /* XXX: and work out some way to coordinate the render primitive
     * between vbuf.c and here...
     */
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
index 5c8af17c8e..e706b7796f 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
@@ -191,15 +191,6 @@ static void fetch_emit_prepare( struct draw_pt_middle_end *middle,
 
    *max_vertices = (draw->render->max_vertex_buffer_bytes / 
                     (vinfo->size * 4));
-
-   /* Return an even number of verts.
-    * This prevents "parity" errors when splitting long triangle strips which
-    * can lead to front/back culling mix-ups.
-    * Every other triangle in a strip has an alternate front/back orientation
-    * so splitting at an odd position can cause the orientation of subsequent
-    * triangles to get reversed.
-    */
-   *max_vertices = *max_vertices & ~1;
 }
 
 
@@ -210,7 +201,8 @@ static void fetch_emit_run( struct draw_pt_middle_end *middle,
                             const unsigned *fetch_elts,
                             unsigned fetch_count,
                             const ushort *draw_elts,
-                            unsigned draw_count )
+                            unsigned draw_count,
+                            unsigned prim_flags )
 {
    struct fetch_emit_middle_end *feme = (struct fetch_emit_middle_end *)middle;
    struct draw_context *draw = feme->draw;
@@ -220,11 +212,6 @@ static void fetch_emit_run( struct draw_pt_middle_end *middle,
     */
    draw_do_flush( draw, DRAW_FLUSH_BACKEND );
 
-   if (fetch_count >= UNDEFINED_VERTEX_ID) {
-      assert(0);
-      return;
-   }
-
    draw->render->allocate_vertices( draw->render,
                                     (ushort)feme->translate->key.output_stride,
                                     (ushort)fetch_count );
@@ -273,7 +260,8 @@ static void fetch_emit_run( struct draw_pt_middle_end *middle,
 
 static void fetch_emit_run_linear( struct draw_pt_middle_end *middle,
                                    unsigned start,
-                                   unsigned count )
+                                   unsigned count,
+                                   unsigned prim_flags )
 {
    struct fetch_emit_middle_end *feme = (struct fetch_emit_middle_end *)middle;
    struct draw_context *draw = feme->draw;
@@ -283,9 +271,6 @@ static void fetch_emit_run_linear( struct draw_pt_middle_end *middle,
     */
    draw_do_flush( draw, DRAW_FLUSH_BACKEND );
 
-   if (count >= UNDEFINED_VERTEX_ID) 
-      goto fail;
-
    if (!draw->render->allocate_vertices( draw->render,
                                          (ushort)feme->translate->key.output_stride,
                                          (ushort)count )) 
@@ -334,7 +319,8 @@ static boolean fetch_emit_run_linear_elts( struct draw_pt_middle_end *middle,
                                         unsigned start,
                                         unsigned count,
                                         const ushort *draw_elts,
-                                        unsigned draw_count )
+                                        unsigned draw_count,
+                                        unsigned prim_flags )
 {
    struct fetch_emit_middle_end *feme = (struct fetch_emit_middle_end *)middle;
    struct draw_context *draw = feme->draw;
@@ -344,9 +330,6 @@ static boolean fetch_emit_run_linear_elts( struct draw_pt_middle_end *middle,
     */
    draw_do_flush( draw, DRAW_FLUSH_BACKEND );
 
-   if (count >= UNDEFINED_VERTEX_ID)
-      return FALSE;
-
    if (!draw->render->allocate_vertices( draw->render,
                                          (ushort)feme->translate->key.output_stride,
                                          (ushort)count ))
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
index b8270280b6..7c198c6026 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
@@ -102,7 +102,7 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
                                fse->key.nr_inputs);     /* inputs - fetch from api format */
 
    fse->key.viewport = !draw->identity_viewport;
-   fse->key.clip = !draw->bypass_clipping;
+   fse->key.clip = draw->clip_xy || draw->clip_z || draw->clip_user;
    fse->key.const_vbuffers = 0;
 
    memset(fse->key.element, 0, 
@@ -175,15 +175,6 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
    *max_vertices = (draw->render->max_vertex_buffer_bytes / 
                     (vinfo->size * 4));
 
-   /* Return an even number of verts.
-    * This prevents "parity" errors when splitting long triangle strips which
-    * can lead to front/back culling mix-ups.
-    * Every other triangle in a strip has an alternate front/back orientation
-    * so splitting at an odd position can cause the orientation of subsequent
-    * triangles to get reversed.
-    */
-   *max_vertices = *max_vertices & ~1;
-
    /* Probably need to do this somewhere (or fix exec shader not to
     * need it):
     */
@@ -197,7 +188,8 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
 
 static void fse_run_linear( struct draw_pt_middle_end *middle, 
                             unsigned start, 
-                            unsigned count )
+                            unsigned count,
+                            unsigned prim_flags )
 {
    struct fetch_shade_emit *fse = (struct fetch_shade_emit *)middle;
    struct draw_context *draw = fse->draw;
@@ -207,9 +199,6 @@ static void fse_run_linear( struct draw_pt_middle_end *middle,
     */
    draw_do_flush( draw, DRAW_FLUSH_BACKEND );
 
-   if (count >= UNDEFINED_VERTEX_ID) 
-      goto fail;
-
    if (!draw->render->allocate_vertices( draw->render,
                                          (ushort)fse->key.output_stride,
                                          (ushort)count ))
@@ -265,7 +254,8 @@ fse_run(struct draw_pt_middle_end *middle,
         const unsigned *fetch_elts,
         unsigned fetch_count,
         const ushort *draw_elts,
-        unsigned draw_count )
+        unsigned draw_count,
+        unsigned prim_flags )
 {
    struct fetch_shade_emit *fse = (struct fetch_shade_emit *)middle;
    struct draw_context *draw = fse->draw;
@@ -275,9 +265,6 @@ fse_run(struct draw_pt_middle_end *middle,
     */
    draw_do_flush( draw, DRAW_FLUSH_BACKEND );
 
-   if (fetch_count >= UNDEFINED_VERTEX_ID) 
-      goto fail;
-
    if (!draw->render->allocate_vertices( draw->render,
                                          (ushort)fse->key.output_stride,
                                          (ushort)fetch_count ))
@@ -327,7 +314,8 @@ static boolean fse_run_linear_elts( struct draw_pt_middle_end *middle,
                                  unsigned start, 
                                  unsigned count,
                                  const ushort *draw_elts,
-                                 unsigned draw_count )
+                                 unsigned draw_count,
+                                 unsigned prim_flags )
 {
    struct fetch_shade_emit *fse = (struct fetch_shade_emit *)middle;
    struct draw_context *draw = fse->draw;
@@ -337,9 +325,6 @@ static boolean fse_run_linear_elts( struct draw_pt_middle_end *middle,
     */
    draw_do_flush( draw, DRAW_FLUSH_BACKEND );
 
-   if (count >= UNDEFINED_VERTEX_ID)
-      return FALSE;
-
    if (!draw->render->allocate_vertices( draw->render,
                                          (ushort)fse->key.output_stride,
                                          (ushort)count ))
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
index 5b16c3788e..b72fd61245 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
@@ -100,8 +100,10 @@ static void fetch_pipeline_prepare( struct draw_pt_middle_end *middle,
     * but gl vs dx9 clip spaces.
     */
    draw_pt_post_vs_prepare( fpme->post_vs,
-			    (boolean)draw->bypass_clipping,
-			    (boolean)draw->identity_viewport,
+			    draw->clip_xy,
+			    draw->clip_z,
+			    draw->clip_user,
+			    draw->identity_viewport,
 			    (boolean)draw->rasterizer->gl_rasterization_rules,
 			    (draw->vs.edgeflag_output ? TRUE : FALSE) );
 
@@ -112,16 +114,13 @@ static void fetch_pipeline_prepare( struct draw_pt_middle_end *middle,
 			    gs_out_prim,
                             max_vertices );
 
-      *max_vertices = MAX2( *max_vertices,
-                            DRAW_PIPE_MAX_VERTICES );
+      *max_vertices = MAX2( *max_vertices, 4096 );
    }
    else {
-      *max_vertices = DRAW_PIPE_MAX_VERTICES; 
+      /* limit max fetches by limiting max_vertices */
+      *max_vertices = 4096;
    }
 
-   /* return even number */
-   *max_vertices = *max_vertices & ~1;
-
    /* No need to prepare the shader.
     */
    vs->prepare(vs, draw);
@@ -295,7 +294,8 @@ static void fetch_pipeline_run( struct draw_pt_middle_end *middle,
                                 const unsigned *fetch_elts,
                                 unsigned fetch_count,
                                 const ushort *draw_elts,
-                                unsigned draw_count )
+                                unsigned draw_count,
+                                unsigned prim_flags )
 {
    struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle;
    struct draw_fetch_info fetch_info;
@@ -311,6 +311,7 @@ static void fetch_pipeline_run( struct draw_pt_middle_end *middle,
    prim_info.count = draw_count;
    prim_info.elts = draw_elts;
    prim_info.prim = fpme->input_prim;
+   prim_info.flags = prim_flags;
    prim_info.primitive_count = 1;
    prim_info.primitive_lengths = &draw_count;
 
@@ -320,7 +321,8 @@ static void fetch_pipeline_run( struct draw_pt_middle_end *middle,
 
 static void fetch_pipeline_linear_run( struct draw_pt_middle_end *middle,
                                        unsigned start,
-                                       unsigned count)
+                                       unsigned count,
+                                       unsigned prim_flags)
 {
    struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle;
    struct draw_fetch_info fetch_info;
@@ -336,6 +338,7 @@ static void fetch_pipeline_linear_run( struct draw_pt_middle_end *middle,
    prim_info.count = count;
    prim_info.elts = NULL;
    prim_info.prim = fpme->input_prim;
+   prim_info.flags = prim_flags;
    prim_info.primitive_count = 1;
    prim_info.primitive_lengths = &count;
 
@@ -348,7 +351,8 @@ static boolean fetch_pipeline_linear_run_elts( struct draw_pt_middle_end *middle
                                                unsigned start,
                                                unsigned count,
                                                const ushort *draw_elts,
-                                               unsigned draw_count )
+                                               unsigned draw_count,
+                                               unsigned prim_flags )
 {
    struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle;
    struct draw_fetch_info fetch_info;
@@ -364,6 +368,7 @@ static boolean fetch_pipeline_linear_run_elts( struct draw_pt_middle_end *middle
    prim_info.count = draw_count;
    prim_info.elts = draw_elts;
    prim_info.prim = fpme->input_prim;
+   prim_info.flags = prim_flags;
    prim_info.primitive_count = 1;
    prim_info.primitive_lengths = &draw_count;
 
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
index 4b99bee86a..77291e304e 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
@@ -66,7 +66,8 @@ llvm_middle_end_prepare( struct draw_pt_middle_end *middle,
    struct draw_context *draw = fpme->draw;
    struct llvm_vertex_shader *shader =
       llvm_vertex_shader(draw->vs.vertex_shader);
-   struct draw_llvm_variant_key key;
+   char store[DRAW_LLVM_MAX_VARIANT_KEY_SIZE];
+   struct draw_llvm_variant_key *key;
    struct draw_llvm_variant *variant = NULL;
    struct draw_llvm_variant_list_item *li;
    unsigned i;
@@ -106,8 +107,10 @@ llvm_middle_end_prepare( struct draw_pt_middle_end *middle,
     * but gl vs dx9 clip spaces.
     */
    draw_pt_post_vs_prepare( fpme->post_vs,
-			    (boolean)draw->bypass_clipping,
-			    (boolean)(draw->identity_viewport),
+			    draw->clip_xy,
+			    draw->clip_z,
+			    draw->clip_user,
+			    draw->identity_viewport,
 			    (boolean)draw->rasterizer->gl_rasterization_rules,
 			    (draw->vs.edgeflag_output ? TRUE : FALSE) );
 
@@ -118,21 +121,21 @@ llvm_middle_end_prepare( struct draw_pt_middle_end *middle,
 			    out_prim,
                             max_vertices );
 
-      *max_vertices = MAX2( *max_vertices,
-                            DRAW_PIPE_MAX_VERTICES );
+      *max_vertices = MAX2( *max_vertices, 4096 );
    }
    else {
-      *max_vertices = DRAW_PIPE_MAX_VERTICES;
+      /* limit max fetches by limiting max_vertices */
+      *max_vertices = 4096;
    }
 
    /* return even number */
    *max_vertices = *max_vertices & ~1;
-
-   draw_llvm_make_variant_key(fpme->llvm, &key);
+   
+   key = draw_llvm_make_variant_key(fpme->llvm, store);
 
    li = first_elem(&shader->variants);
    while(!at_end(&shader->variants, li)) {
-      if(memcmp(&li->base->key, &key, sizeof key) == 0) {
+      if(memcmp(&li->base->key, key, shader->variant_key_size) == 0) {
          variant = li->base;
          break;
       }
@@ -155,7 +158,7 @@ llvm_middle_end_prepare( struct draw_pt_middle_end *middle,
          }
       }
 
-      variant = draw_llvm_create_variant(fpme->llvm, nr);
+      variant = draw_llvm_create_variant(fpme->llvm, nr, key);
 
       if (variant) {
          insert_at_head(&shader->variants, &variant->list_item_local);
@@ -294,7 +297,8 @@ static void llvm_middle_end_run( struct draw_pt_middle_end *middle,
                                  const unsigned *fetch_elts,
                                  unsigned fetch_count,
                                  const ushort *draw_elts,
-                                 unsigned draw_count )
+                                 unsigned draw_count,
+                                 unsigned prim_flags )
 {
    struct llvm_middle_end *fpme = (struct llvm_middle_end *)middle;
    struct draw_fetch_info fetch_info;
@@ -310,6 +314,7 @@ static void llvm_middle_end_run( struct draw_pt_middle_end *middle,
    prim_info.count = draw_count;
    prim_info.elts = draw_elts;
    prim_info.prim = fpme->input_prim;
+   prim_info.flags = prim_flags;
    prim_info.primitive_count = 1;
    prim_info.primitive_lengths = &draw_count;
 
@@ -319,7 +324,8 @@ static void llvm_middle_end_run( struct draw_pt_middle_end *middle,
 
 static void llvm_middle_end_linear_run( struct draw_pt_middle_end *middle,
                                        unsigned start,
-                                       unsigned count)
+                                       unsigned count,
+                                       unsigned prim_flags)
 {
    struct llvm_middle_end *fpme = (struct llvm_middle_end *)middle;
    struct draw_fetch_info fetch_info;
@@ -335,6 +341,7 @@ static void llvm_middle_end_linear_run( struct draw_pt_middle_end *middle,
    prim_info.count = count;
    prim_info.elts = NULL;
    prim_info.prim = fpme->input_prim;
+   prim_info.flags = prim_flags;
    prim_info.primitive_count = 1;
    prim_info.primitive_lengths = &count;
 
@@ -348,7 +355,8 @@ llvm_middle_end_linear_run_elts( struct draw_pt_middle_end *middle,
                                  unsigned start,
                                  unsigned count,
                                  const ushort *draw_elts,
-                                 unsigned draw_count )
+                                 unsigned draw_count,
+                                 unsigned prim_flags )
 {
    struct llvm_middle_end *fpme = (struct llvm_middle_end *)middle;
    struct draw_fetch_info fetch_info;
@@ -364,6 +372,7 @@ llvm_middle_end_linear_run_elts( struct draw_pt_middle_end *middle,
    prim_info.count = draw_count;
    prim_info.elts = draw_elts;
    prim_info.prim = fpme->input_prim;
+   prim_info.flags = prim_flags;
    prim_info.primitive_count = 1;
    prim_info.primitive_lengths = &draw_count;
 
diff --git a/src/gallium/auxiliary/draw/draw_pt_post_vs.c b/src/gallium/auxiliary/draw/draw_pt_post_vs.c
index 308f927b77..769409cfd6 100644
--- a/src/gallium/auxiliary/draw/draw_pt_post_vs.c
+++ b/src/gallium/auxiliary/draw/draw_pt_post_vs.c
@@ -26,14 +26,26 @@
  **************************************************************************/
 
 #include "util/u_memory.h"
+#include "util/u_math.h"
 #include "pipe/p_context.h"
 #include "draw/draw_context.h"
 #include "draw/draw_private.h"
 #include "draw/draw_pt.h"
 
+
+#define DO_CLIP_XY           0x1   /* set by draw_pt_post_vs_prepare() when clip_xy is true */
+#define DO_CLIP_FULL_Z       0x2   /* set when clip_z is true and opengl is true */
+#define DO_CLIP_HALF_Z       0x4   /* set when clip_z is true and opengl is false */
+#define DO_CLIP_USER         0x8   /* set when clip_user is true */
+#define DO_VIEWPORT          0x10  /* set unless bypass_viewport is true */
+#define DO_EDGEFLAG          0x20  /* set when need_edgeflags is true */
+
+
 struct pt_post_vs {
    struct draw_context *draw;
 
+   unsigned flags;
+
    boolean (*run)( struct pt_post_vs *pvs,
                    struct draw_vertex_info *info );
 };
@@ -56,186 +68,47 @@ dot4(const float *a, const float *b)
            a[3]*b[3]);
 }
 
-static INLINE unsigned
-compute_clipmask_gl(const float *clip, /*const*/ float plane[][4], unsigned nr,
-                    boolean clip_depth)
-{
-   unsigned mask = 0x0;
-   unsigned i;
+#define FLAGS (0)
+#define TAG(x) x##_none
+#include "draw_cliptest_tmp.h"
 
-#if 0
-   debug_printf("compute clipmask %f %f %f %f\n",
-                clip[0], clip[1], clip[2], clip[3]);
-   assert(clip[3] != 0.0);
-#endif
+#define FLAGS (DO_CLIP_XY | DO_CLIP_FULL_Z | DO_VIEWPORT)
+#define TAG(x) x##_xy_fullz_viewport
+#include "draw_cliptest_tmp.h"
 
-   /* Do the hardwired planes first:
-    */
-   if (-clip[0] + clip[3] < 0) mask |= (1<<0);
-   if ( clip[0] + clip[3] < 0) mask |= (1<<1);
-   if (-clip[1] + clip[3] < 0) mask |= (1<<2);
-   if ( clip[1] + clip[3] < 0) mask |= (1<<3);
-   if (clip_depth) {
-      if ( clip[2] + clip[3] < 0) mask |= (1<<4); /* match mesa clipplane numbering - for now */
-      if (-clip[2] + clip[3] < 0) mask |= (1<<5); /* match mesa clipplane numbering - for now */
-   }
+#define FLAGS (DO_CLIP_XY | DO_CLIP_HALF_Z | DO_VIEWPORT)
+#define TAG(x) x##_xy_halfz_viewport
+#include "draw_cliptest_tmp.h"
 
-   /* Followed by any remaining ones:
-    */
-   for (i = 6; i < nr; i++) {
-      if (dot4(clip, plane[i]) < 0) 
-         mask |= (1<<i);
-   }
+#define FLAGS (DO_CLIP_FULL_Z | DO_VIEWPORT)
+#define TAG(x) x##_fullz_viewport
+#include "draw_cliptest_tmp.h"
 
-   return mask;
-}
+#define FLAGS (DO_CLIP_HALF_Z | DO_VIEWPORT)
+#define TAG(x) x##_halfz_viewport
+#include "draw_cliptest_tmp.h"
 
+#define FLAGS (DO_CLIP_XY | DO_CLIP_FULL_Z | DO_CLIP_USER | DO_VIEWPORT)
+#define TAG(x) x##_xy_fullz_user_viewport
+#include "draw_cliptest_tmp.h"
 
-/* The normal case - cliptest, rhw divide, viewport transform.
- *
- * Also handle identity viewport here at the expense of a few wasted
- * instructions
- */
-static boolean post_vs_cliptest_viewport_gl( struct pt_post_vs *pvs,
-                                             struct draw_vertex_info *info )
-{
-   struct vertex_header *out = info->verts;
-   const float *scale = pvs->draw->viewport.scale;
-   const float *trans = pvs->draw->viewport.translate;
-   const unsigned pos = draw_current_shader_position_output(pvs->draw);
-   unsigned clipped = 0;
-   unsigned j;
-
-   if (0) debug_printf("%s count, %d\n", __FUNCTION__, info->count);
-
-   for (j = 0; j < info->count; j++) {
-      float *position = out->data[pos];
-
-      initialize_vertex_header(out);
-#if 0
-      debug_printf("%d) io = %p, data = %p = [%f, %f, %f, %f]\n",
-                   j, out, position, position[0], position[1], position[2], position[3]);
-#endif
-
-      out->clip[0] = position[0];
-      out->clip[1] = position[1];
-      out->clip[2] = position[2];
-      out->clip[3] = position[3];
-
-      out->vertex_id = 0xffff;
-      /* Disable depth clipping if depth clamping is enabled. */
-      out->clipmask = compute_clipmask_gl(out->clip, 
-					  pvs->draw->plane,
-                                          pvs->draw->nr_planes,
-                                          !pvs->draw->depth_clamp);
-      clipped += out->clipmask;
-
-      if (out->clipmask == 0)
-      {
-	 /* divide by w */
-	 float w = 1.0f / position[3];
-
-	 /* Viewport mapping */
-	 position[0] = position[0] * w * scale[0] + trans[0];
-	 position[1] = position[1] * w * scale[1] + trans[1];
-	 position[2] = position[2] * w * scale[2] + trans[2];
-	 position[3] = w;
-#if 0
-         debug_printf("post viewport: %f %f %f %f\n",
-                      position[0],
-                      position[1],
-                      position[2],
-                      position[3]);
-#endif
-      }
-
-      out = (struct vertex_header *)( (char *)out + info->stride );
-   }
-
-   return clipped != 0;
-}
+#define FLAGS (DO_CLIP_XY | DO_CLIP_FULL_Z | DO_CLIP_USER | DO_VIEWPORT | DO_EDGEFLAG)
+#define TAG(x) x##_xy_fullz_user_viewport_edgeflag
+#include "draw_cliptest_tmp.h"
 
 
 
-/* As above plus edgeflags
+/* Don't want to create 64 versions of this function, so catch the
+ * less common ones here.  This is looking like something which should
+ * be code-generated, perhaps appended to the end of the vertex
+ * shader.
  */
-static boolean
-post_vs_cliptest_viewport_gl_edgeflag(struct pt_post_vs *pvs,
-                                      struct draw_vertex_info *info)
-{
-   unsigned j;
-   boolean needpipe;
-
-   needpipe = post_vs_cliptest_viewport_gl(pvs, info);
-
-   /* If present, copy edgeflag VS output into vertex header.
-    * Otherwise, leave header as is.
-    */
-   if (pvs->draw->vs.edgeflag_output) {
-      struct vertex_header *out = info->verts;
-      int ef = pvs->draw->vs.edgeflag_output;
-
-      for (j = 0; j < info->count; j++) {
-         const float *edgeflag = out->data[ef];
-         out->edgeflag = !(edgeflag[0] != 1.0f);
-         needpipe |= !out->edgeflag;
-         out = (struct vertex_header *)( (char *)out + info->stride );
-      }
-   }
-   return needpipe;
-}
-
+#define FLAGS (pvs->flags)
+#define TAG(x) x##_generic
+#include "draw_cliptest_tmp.h"
 
 
 
-/* If bypass_clipping is set, skip cliptest and rhw divide.
- */
-static boolean post_vs_viewport( struct pt_post_vs *pvs,
-                                 struct draw_vertex_info *info )
-{
-   struct vertex_header *out = info->verts;
-   const float *scale = pvs->draw->viewport.scale;
-   const float *trans = pvs->draw->viewport.translate;
-   const unsigned pos = draw_current_shader_position_output(pvs->draw);
-   unsigned j;
-
-   if (0) debug_printf("%s\n", __FUNCTION__);
-   for (j = 0; j < info->count; j++) {
-      float *position = out->data[pos];
-
-      initialize_vertex_header(out);
-      /* Viewport mapping only, no cliptest/rhw divide
-       */
-      position[0] = position[0] * scale[0] + trans[0];
-      position[1] = position[1] * scale[1] + trans[1];
-      position[2] = position[2] * scale[2] + trans[2];
-
-      out = (struct vertex_header *)((char *)out + info->stride);
-   }
-
-   return FALSE;
-}
-
-
-/* If bypass_clipping is set and we have an identity viewport, nothing
- * to do.
- */
-static boolean post_vs_none( struct pt_post_vs *pvs,
-			     struct draw_vertex_info *info )
-{
-   struct vertex_header *out = info->verts;
-   unsigned j;
-
-   if (0) debug_printf("%s\n", __FUNCTION__);
-   /* just initialize the vertex_id in all headers */
-   for (j = 0; j < info->count; j++) {
-      initialize_vertex_header(out);
-
-      out = (struct vertex_header *)((char *)out + info->stride);
-   }
-   return FALSE;
-}
-
 boolean draw_pt_post_vs_run( struct pt_post_vs *pvs,
 			     struct draw_vertex_info *info )
 {
@@ -244,31 +117,72 @@ boolean draw_pt_post_vs_run( struct pt_post_vs *pvs,
 
 
 void draw_pt_post_vs_prepare( struct pt_post_vs *pvs,
-			      boolean bypass_clipping,
+			      boolean clip_xy,   /* TRUE adds DO_CLIP_XY to pvs->flags */
+			      boolean clip_z,   /* TRUE adds a Z-clip flag; which one depends on opengl */
+                              boolean clip_user,   /* TRUE adds DO_CLIP_USER to pvs->flags */
 			      boolean bypass_viewport,
 			      boolean opengl,
 			      boolean need_edgeflags )
 {
-   if (!need_edgeflags) {
-      if (bypass_clipping) {
-         if (bypass_viewport)
-            pvs->run = post_vs_none;
-         else
-            pvs->run = post_vs_viewport;
-      }
-      else {
-         /* if (opengl) */
-         pvs->run = post_vs_cliptest_viewport_gl;
-      }
+   pvs->flags = 0;   /* the flag word is rebuilt from scratch on every call */
+
+   if (clip_xy)
+      pvs->flags |= DO_CLIP_XY;
+   
+   if (clip_z && opengl) {   /* opengl selects the "full Z" flag */
+      pvs->flags |= DO_CLIP_FULL_Z;
+      ASSIGN_4V( pvs->draw->plane[4],  0,  0,  1, 1 );   /* side effect on the draw context: plane[4] is overwritten */
+   }
+
+   if (clip_z && !opengl) {   /* !opengl selects the "half Z" flag */
+      pvs->flags |= DO_CLIP_HALF_Z;
+      ASSIGN_4V( pvs->draw->plane[4],  0,  0,  1, 0 );   /* same as the full-Z case except the last value is 0 */
    }
-   else {
-      /* If we need to copy edgeflags to the vertex header, it should
-       * mean we're running the primitive pipeline.  Hence the bypass
-       * flags should be false.
-       */
-      assert(!bypass_clipping);
-      assert(!bypass_viewport);
-      pvs->run = post_vs_cliptest_viewport_gl_edgeflag;
+
+   if (clip_user)
+      pvs->flags |= DO_CLIP_USER;
+
+   if (!bypass_viewport)   /* inverted sense: the flag is set when the viewport is NOT bypassed */
+      pvs->flags |= DO_VIEWPORT;
+
+   if (need_edgeflags)
+      pvs->flags |= DO_EDGEFLAG;
+
+   /* Now select the relevant function:
+    */
+   switch (pvs->flags) {   /* exact match on the whole flag word, not a per-bit test */
+   case 0:   /* none of the flags above was set */
+      pvs->run = do_cliptest_none;
+      break;
+
+   case DO_CLIP_XY | DO_CLIP_FULL_Z | DO_VIEWPORT:
+      pvs->run = do_cliptest_xy_fullz_viewport;
+      break;
+
+   case DO_CLIP_XY | DO_CLIP_HALF_Z | DO_VIEWPORT:
+      pvs->run = do_cliptest_xy_halfz_viewport;
+      break;
+
+   case DO_CLIP_FULL_Z | DO_VIEWPORT:
+      pvs->run = do_cliptest_fullz_viewport;
+      break;
+
+   case DO_CLIP_HALF_Z | DO_VIEWPORT:
+      pvs->run = do_cliptest_halfz_viewport;
+      break;
+
+   case DO_CLIP_XY | DO_CLIP_FULL_Z | DO_CLIP_USER | DO_VIEWPORT:
+      pvs->run = do_cliptest_xy_fullz_user_viewport;
+      break;
+
+   case (DO_CLIP_XY | DO_CLIP_FULL_Z | DO_CLIP_USER |
+         DO_VIEWPORT | DO_EDGEFLAG):
+      pvs->run = do_cliptest_xy_fullz_user_viewport_edgeflag;
+      break;
+      
+   default:   /* every flag combination without a dedicated instantiation */
+      pvs->run = do_cliptest_generic;   /* instantiated with FLAGS defined as pvs->flags rather than a constant */
+      break;
    }
 }
 
diff --git a/src/gallium/auxiliary/draw/draw_pt_so_emit.c b/src/gallium/auxiliary/draw/draw_pt_so_emit.c
index f7f4f24d35..c86bdd99a3 100644
--- a/src/gallium/auxiliary/draw/draw_pt_so_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_so_emit.c
@@ -225,7 +225,7 @@ static void so_tri(struct pt_so_emit *so, int i0, int i1, int i2)
 
 #define FUNC         so_run_elts
 #define LOCAL_VARS   const ushort *elts = input_prims->elts;
-#define GET_ELT(idx) (elts[start + (idx)] & ~DRAW_PIPE_FLAG_MASK)
+#define GET_ELT(idx) (elts[start + (idx)])
 #include "draw_so_emit_tmp.h"
 
 
diff --git a/src/gallium/auxiliary/draw/draw_pt_util.c b/src/gallium/auxiliary/draw/draw_pt_util.c
index 182a597cca..513bbbed21 100644
--- a/src/gallium/auxiliary/draw/draw_pt_util.c
+++ b/src/gallium/auxiliary/draw/draw_pt_util.c
@@ -92,3 +92,10 @@ void draw_pt_split_prim(unsigned prim, unsigned *first, unsigned *incr)
       break;
    }
 }
+
+unsigned draw_pt_trim_count(unsigned count, unsigned first, unsigned incr)
+{
+   if (count < first)   /* fewer than 'first' vertices: nothing is kept */
+      return 0;
+   return count - (count - first) % incr;   /* round count down so (result - first) is a multiple of incr; incr must be non-zero */
+}
diff --git a/src/gallium/auxiliary/draw/draw_pt_varray.c b/src/gallium/auxiliary/draw/draw_pt_varray.c
deleted file mode 100644
index cd7bb7bf25..0000000000
--- a/src/gallium/auxiliary/draw/draw_pt_varray.c
+++ /dev/null
@@ -1,200 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-#include "util/u_math.h"
-#include "util/u_memory.h"
-
-#include "draw/draw_context.h"
-#include "draw/draw_private.h"
-#include "draw/draw_pt.h"
-
-#define FETCH_MAX 256
-#define DRAW_MAX (FETCH_MAX+8)
-
-struct varray_frontend {
-   struct draw_pt_front_end base;
-   struct draw_context *draw;
-
-   ushort draw_elts[DRAW_MAX];
-   unsigned fetch_elts[FETCH_MAX];
-
-   unsigned driver_fetch_max;
-   unsigned fetch_max;
-
-   struct draw_pt_middle_end *middle;
-
-   unsigned input_prim;
-   unsigned output_prim;
-};
-
-
-static void varray_flush_linear(struct varray_frontend *varray,
-                                unsigned start, unsigned count)
-{
-   if (count) {
-      assert(varray->middle->run_linear);
-      varray->middle->run_linear(varray->middle, start, count);
-   }
-}
-
-static void varray_line_loop_segment(struct varray_frontend *varray,
-                                     unsigned start,
-                                     unsigned segment_start,
-                                     unsigned segment_count,
-                                     boolean end )
-{
-   assert(segment_count < varray->fetch_max);
-   if (segment_count >= 1) {
-      unsigned nr = 0, i;
-
-      for (i = 0; i < segment_count; i++) 
-         varray->fetch_elts[nr++] = start + segment_start + i;
-
-      if (end) 
-         varray->fetch_elts[nr++] = start;
-
-      assert(nr <= FETCH_MAX);
-
-      varray->middle->run(varray->middle, 
-                          varray->fetch_elts,
-                          nr,
-                          varray->draw_elts, /* ie. linear */
-                          nr);
-   }
-}
-
-
-
-static void varray_fan_segment(struct varray_frontend *varray,
-                               unsigned start, 
-                               unsigned segment_start,
-                               unsigned segment_count )
-{
-   assert(segment_count < varray->fetch_max);
-   if (segment_count >= 2) {
-      unsigned nr = 0, i;
-
-      if (segment_start != 0)
-         varray->fetch_elts[nr++] = start;
-
-      for (i = 0 ; i < segment_count; i++) 
-         varray->fetch_elts[nr++] = start + segment_start + i;
-
-      assert(nr <= FETCH_MAX);
-
-      varray->middle->run(varray->middle, 
-                          varray->fetch_elts,
-                          nr,
-                          varray->draw_elts, /* ie. linear */
-                          nr);
-   }
-}
-
-
-
-
-#define FUNC varray_run
-#include "draw_pt_varray_tmp_linear.h"
-
-static unsigned decompose_prim[PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY + 1] = {
-   PIPE_PRIM_POINTS,
-   PIPE_PRIM_LINES,
-   PIPE_PRIM_LINE_STRIP,        /* decomposed LINELOOP */
-   PIPE_PRIM_LINE_STRIP,
-   PIPE_PRIM_TRIANGLES,
-   PIPE_PRIM_TRIANGLE_STRIP,
-   PIPE_PRIM_TRIANGLE_FAN,
-   PIPE_PRIM_QUADS,
-   PIPE_PRIM_QUAD_STRIP,
-   PIPE_PRIM_POLYGON,
-   PIPE_PRIM_LINES_ADJACENCY,
-   PIPE_PRIM_LINE_STRIP_ADJACENCY,
-   PIPE_PRIM_TRIANGLES_ADJACENCY,
-   PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY
-};
-
-
-
-static void varray_prepare(struct draw_pt_front_end *frontend,
-                           unsigned in_prim,
-                           struct draw_pt_middle_end *middle,
-                           unsigned opt)
-{
-   struct varray_frontend *varray = (struct varray_frontend *)frontend;
-
-   varray->base.run = varray_run;
-
-   varray->input_prim = in_prim;
-   assert(in_prim < Elements(decompose_prim));
-   varray->output_prim = decompose_prim[in_prim];
-
-   varray->middle = middle;
-   middle->prepare(middle,
-                   varray->output_prim,
-                   opt, &varray->driver_fetch_max );
-
-   /* check that the max is even */
-   assert((varray->driver_fetch_max & 1) == 0);
-
-   varray->fetch_max = MIN2(FETCH_MAX, varray->driver_fetch_max);
-}
-
-
-
-
-static void varray_finish(struct draw_pt_front_end *frontend)
-{
-   struct varray_frontend *varray = (struct varray_frontend *)frontend;
-   varray->middle->finish(varray->middle);
-   varray->middle = NULL;
-}
-
-static void varray_destroy(struct draw_pt_front_end *frontend)
-{
-   FREE(frontend);
-}
-
-
-struct draw_pt_front_end *draw_pt_varray(struct draw_context *draw)
-{
-   ushort i;
-   struct varray_frontend *varray = CALLOC_STRUCT(varray_frontend);
-   if (varray == NULL)
-      return NULL;
-
-   varray->base.prepare = varray_prepare;
-   varray->base.run     = NULL;
-   varray->base.finish  = varray_finish;
-   varray->base.destroy = varray_destroy;
-   varray->draw = draw;
-
-   for (i = 0; i < DRAW_MAX; i++) {
-      varray->draw_elts[i] = i;
-   }
-
-   return &varray->base;
-}
diff --git a/src/gallium/auxiliary/draw/draw_pt_varray_tmp.h b/src/gallium/auxiliary/draw/draw_pt_varray_tmp.h
deleted file mode 100644
index 7c722457c3..0000000000
--- a/src/gallium/auxiliary/draw/draw_pt_varray_tmp.h
+++ /dev/null
@@ -1,238 +0,0 @@
-
-static void FUNC(struct draw_pt_front_end *frontend,
-                 pt_elt_func get_elt,
-                 const void *elts,
-                 unsigned count)
-{
-   struct varray_frontend *varray = (struct varray_frontend *)frontend;
-   struct draw_context *draw = varray->draw;
-   unsigned start = (unsigned)elts;
-
-   boolean flatfirst = (draw->rasterizer->flatshade &&
-                        draw->rasterizer->flatshade_first);
-   unsigned i, j;
-   ushort flags;
-   unsigned first, incr;
-
-   varray->fetch_start = start;
-
-   draw_pt_split_prim(varray->input_prim, &first, &incr);
-
-#if 0
-   debug_printf("%s (%d) %d/%d\n", __FUNCTION__,
-                varray->input_prim,
-                start, count);
-#endif
-
-   switch (varray->input_prim) {
-   case PIPE_PRIM_POINTS:
-      for (j = 0; j + first <= count; j += i) {
-         unsigned end = MIN2(FETCH_MAX, count - j);
-         end -= (end % incr);
-         for (i = 0; i < end; i++) {
-            POINT(varray, i + 0);
-         }
-         i = end;
-         fetch_init(varray, end);
-         varray_flush(varray);
-      }
-      break;
-
-   case PIPE_PRIM_LINES:
-      for (j = 0; j + first <= count; j += i) {
-         unsigned end = MIN2(FETCH_MAX, count - j);
-         end -= (end % incr);
-         for (i = 0; i+1 < end; i += 2) {
-            LINE(varray, DRAW_PIPE_RESET_STIPPLE,
-                 i + 0, i + 1);
-         }
-         i = end;
-         fetch_init(varray, end);
-         varray_flush(varray);
-      }
-      break;
-
-   case PIPE_PRIM_LINE_LOOP:
-      if (count >= 2) {
-         flags = DRAW_PIPE_RESET_STIPPLE;
-
-         for (j = 0; j + first <= count; j += i) {
-            unsigned end = MIN2(FETCH_MAX, count - j);
-            end -= (end % incr);
-            for (i = 1; i < end; i++, flags = 0) {
-               LINE(varray, flags, i - 1, i);
-            }
-            LINE(varray, flags, i - 1, 0);
-            i = end;
-            fetch_init(varray, end);
-            varray_flush(varray);
-         }
-      }
-      break;
-
-   case PIPE_PRIM_LINE_STRIP:
-      flags = DRAW_PIPE_RESET_STIPPLE;
-      for (j = 0; j + first <= count; j += i) {
-         unsigned end = MIN2(FETCH_MAX, count - j);
-         end -= (end % incr);
-         for (i = 1; i < end; i++, flags = 0) {
-            LINE(varray, flags, i - 1, i);
-         }
-         i = end;
-         fetch_init(varray, end);
-         varray_flush(varray);
-      }
-      break;
-
-   case PIPE_PRIM_TRIANGLES:
-      for (j = 0; j + first <= count; j += i) {
-         unsigned end = MIN2(FETCH_MAX, count - j);
-         end -= (end % incr);
-         for (i = 0; i+2 < end; i += 3) {
-            TRIANGLE(varray, DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL,
-                     i + 0, i + 1, i + 2);
-         }
-         i = end;
-         fetch_init(varray, end);
-         varray_flush(varray);
-      }
-      break;
-
-   case PIPE_PRIM_TRIANGLE_STRIP:
-      if (flatfirst) {
-         for (j = 0; j + first <= count; j += i) {
-            unsigned end = MIN2(FETCH_MAX, count - j);
-            end -= (end % incr);
-            for (i = 0; i+2 < end; i++) {
-               TRIANGLE(varray, DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL,
-                        i + 0, i + 1 + (i&1), i + 2 - (i&1));
-            }
-            i = end;
-            fetch_init(varray, end);
-            varray_flush(varray);
-            if (j + first + i <= count) {
-               varray->fetch_start -= 2;
-               i -= 2;
-            }
-         }
-      }
-      else {
-         for (j = 0; j + first <= count; j += i) {
-            unsigned end = MIN2(FETCH_MAX, count - j);
-            end -= (end  % incr);
-            for (i = 0; i + 2 < end; i++) {
-               TRIANGLE(varray, DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL,
-                        i + 0 + (i&1), i + 1 - (i&1), i + 2);
-            }
-            i = end;
-            fetch_init(varray, end);
-            varray_flush(varray);
-            if (j + first + i <= count) {
-               varray->fetch_start -= 2;
-               i -= 2;
-            }
-         }
-      }
-      break;
-
-   case PIPE_PRIM_TRIANGLE_FAN:
-      if (count >= 3) {
-         if (flatfirst) {
-            flags = DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL;
-            for (j = 0; j + first <= count; j += i) {
-               unsigned end = MIN2(FETCH_MAX, count - j);
-               end -= (end % incr);
-               for (i = 0; i+2 < end; i++) {
-                  TRIANGLE(varray, flags, i + 1, i + 2, 0);
-               }
-               i = end;
-               fetch_init(varray, end);
-               varray_flush(varray);
-            }
-         }
-         else {
-            flags = DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL;
-            for (j = 0; j + first <= count; j += i) {
-               unsigned end = MIN2(FETCH_MAX, count - j);
-               end -= (end % incr);
-               for (i = 0; i+2 < end; i++) {
-                  TRIANGLE(varray, flags, 0, i + 1, i + 2);
-               }
-               i = end;
-               fetch_init(varray, end);
-               varray_flush(varray);
-            }
-         }
-      }
-      break;
-
-   case PIPE_PRIM_QUADS:
-      for (j = 0; j + first <= count; j += i) {
-         unsigned end = MIN2(FETCH_MAX, count - j);
-         end -= (end % incr);
-         for (i = 0; i+3 < end; i += 4) {
-            QUAD(varray, i + 0, i + 1, i + 2, i + 3);
-         }
-         i = end;
-         fetch_init(varray, end);
-         varray_flush(varray);
-      }
-      break;
-
-   case PIPE_PRIM_QUAD_STRIP:
-      for (j = 0; j + first <= count; j += i) {
-         unsigned end = MIN2(FETCH_MAX, count - j);
-         end -= (end % incr);
-         for (i = 0; i+3 < end; i += 2) {
-            QUAD(varray, i + 2, i + 0, i + 1, i + 3);
-         }
-         i = end;
-         fetch_init(varray, end);
-         varray_flush(varray);
-         if (j + first + i <= count) {
-            varray->fetch_start -= 2;
-            i -= 2;
-         }
-      }
-      break;
-
-   case PIPE_PRIM_POLYGON:
-   {
-      /* These bitflags look a little odd because we submit the
-       * vertices as (1,2,0) to satisfy flatshade requirements.
-       */
-      const ushort edge_first  = DRAW_PIPE_EDGE_FLAG_2;
-      const ushort edge_middle = DRAW_PIPE_EDGE_FLAG_0;
-      const ushort edge_last   = DRAW_PIPE_EDGE_FLAG_1;
-
-      flags = DRAW_PIPE_RESET_STIPPLE | edge_first | edge_middle;
-      for (j = 0; j + first <= count; j += i) {
-         unsigned end = MIN2(FETCH_MAX, count - j);
-         end -= (end % incr);
-         for (i = 0; i+2 < end; i++, flags = edge_middle) {
-
-            if (i + 3 == count)
-               flags |= edge_last;
-
-            TRIANGLE(varray, flags, i + 1, i + 2, 0);
-         }
-         i = end;
-         fetch_init(varray, end);
-         varray_flush(varray);
-      }
-   }
-   break;
-
-   default:
-      assert(0);
-      break;
-   }
-
-   varray_flush(varray);
-}
-
-#undef TRIANGLE
-#undef QUAD
-#undef POINT
-#undef LINE
-#undef FUNC
diff --git a/src/gallium/auxiliary/draw/draw_pt_varray_tmp_linear.h b/src/gallium/auxiliary/draw/draw_pt_varray_tmp_linear.h
deleted file mode 100644
index 55e43b2a71..0000000000
--- a/src/gallium/auxiliary/draw/draw_pt_varray_tmp_linear.h
+++ /dev/null
@@ -1,103 +0,0 @@
-static unsigned trim( unsigned count, unsigned first, unsigned incr )
-{
-   /*
-    * count either has been trimmed in draw_pt_arrays or is set to
-    * (driver)_fetch_max which is hopefully always larger than first.
-    */
-   assert(count >= first);
-   return count - (count - first) % incr;
-}
-
-static void FUNC(struct draw_pt_front_end *frontend,
-                 pt_elt_func get_elt,
-                 const void *elts,
-                 int elt_bias,
-                 unsigned count)
-{
-   struct varray_frontend *varray = (struct varray_frontend *)frontend;
-   unsigned start = (unsigned) ((char *) elts - (char *) NULL);
-
-   unsigned j;
-   unsigned first, incr;
-
-   assert(elt_bias == 0);
-
-   draw_pt_split_prim(varray->input_prim, &first, &incr);
-   
-   /* Sanitize primitive length:
-    */
-   count = trim(count, first, incr); 
-   if (count < first)
-      return;
-
-#if 0
-   debug_printf("%s (%d) %d/%d\n", __FUNCTION__,
-                varray->input_prim,
-                start, count);
-#endif
-
-   switch (varray->input_prim) {
-   case PIPE_PRIM_POINTS:
-   case PIPE_PRIM_LINES:
-   case PIPE_PRIM_TRIANGLES:
-   case PIPE_PRIM_LINE_STRIP:
-   case PIPE_PRIM_TRIANGLE_STRIP:
-   case PIPE_PRIM_QUADS:
-   case PIPE_PRIM_QUAD_STRIP:
-   case PIPE_PRIM_LINES_ADJACENCY:
-   case PIPE_PRIM_LINE_STRIP_ADJACENCY:
-   case PIPE_PRIM_TRIANGLES_ADJACENCY:
-   case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY:
-      for (j = 0; j < count;) {
-         unsigned remaining = count - j;
-         unsigned nr = trim( MIN2(varray->driver_fetch_max, remaining), first, incr );
-         varray_flush_linear(varray, start + j, nr);
-         j += nr;
-         if (nr != remaining) 
-            j -= (first - incr);
-      }
-      break;
-
-   case PIPE_PRIM_LINE_LOOP:
-      /* Always have to decompose as we've stated that this will be
-       * emitted as a line-strip.
-       */
-      for (j = 0; j < count;) {
-         unsigned remaining = count - j;
-         unsigned nr = trim( MIN2(varray->fetch_max-1, remaining), first, incr );
-         varray_line_loop_segment(varray, start, j, nr, nr == remaining);
-         j += nr;
-         if (nr != remaining) 
-            j -= (first - incr);
-      }
-      break;
-
-
-   case PIPE_PRIM_POLYGON:
-   case PIPE_PRIM_TRIANGLE_FAN: 
-      if (count < varray->driver_fetch_max) {
-         varray_flush_linear(varray, start, count);
-      }
-      else {
-         for ( j = 0; j < count;) {
-            unsigned remaining = count - j;
-            unsigned nr = trim( MIN2(varray->fetch_max-1, remaining), first, incr );
-            varray_fan_segment(varray, start, j, nr);
-            j += nr;
-            if (nr != remaining) 
-               j -= (first - incr);
-         }
-      }
-      break;
-
-   default:
-      assert(0);
-      break;
-   }
-}
-
-#undef TRIANGLE
-#undef QUAD
-#undef POINT
-#undef LINE
-#undef FUNC
diff --git a/src/gallium/auxiliary/draw/draw_pt_vcache.c b/src/gallium/auxiliary/draw/draw_pt_vcache.c
deleted file mode 100644
index a848b54f7d..0000000000
--- a/src/gallium/auxiliary/draw/draw_pt_vcache.c
+++ /dev/null
@@ -1,610 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-#include "util/u_memory.h"
-#include "util/u_prim.h"
-#include "draw/draw_context.h"
-#include "draw/draw_private.h"
-#include "draw/draw_pt.h"
-
-
-#define CACHE_MAX 256
-#define FETCH_MAX 256
-#define DRAW_MAX (16*1024)
-
-
-struct vcache_frontend {
-   struct draw_pt_front_end base;
-   struct draw_context *draw;
-
-   unsigned in[CACHE_MAX];
-   ushort out[CACHE_MAX];
-
-   ushort draw_elts[DRAW_MAX];
-   unsigned fetch_elts[FETCH_MAX];
-
-   unsigned draw_count;
-   unsigned fetch_count;
-   unsigned fetch_max;
-
-   struct draw_pt_middle_end *middle;
-
-   unsigned input_prim;
-   unsigned output_prim;
-
-   unsigned middle_prim;
-   unsigned opt;
-};
-
-
-static INLINE void
-vcache_flush( struct vcache_frontend *vcache )
-{
-   if (vcache->middle_prim != vcache->output_prim) {
-      vcache->middle_prim = vcache->output_prim;
-      vcache->middle->prepare( vcache->middle,
-                               vcache->middle_prim,
-                               vcache->opt,
-                               &vcache->fetch_max );
-   }
-
-   if (vcache->draw_count) {
-      vcache->middle->run( vcache->middle,
-                           vcache->fetch_elts,
-                           vcache->fetch_count,
-                           vcache->draw_elts,
-                           vcache->draw_count );
-   }
-
-   memset(vcache->in, ~0, sizeof(vcache->in));
-   vcache->fetch_count = 0;
-   vcache->draw_count = 0;
-}
-
-
-static INLINE void 
-vcache_check_flush( struct vcache_frontend *vcache )
-{
-   if (vcache->draw_count + 6 >= DRAW_MAX ||
-       vcache->fetch_count + 6 >= FETCH_MAX) {
-      vcache_flush( vcache );
-   }
-}
-
-
-static INLINE void 
-vcache_elt( struct vcache_frontend *vcache,
-            unsigned felt,
-            ushort flags )
-{
-   unsigned idx = felt % CACHE_MAX;
-
-   if (vcache->in[idx] != felt) {
-      assert(vcache->fetch_count < FETCH_MAX);
-
-      vcache->in[idx] = felt;
-      vcache->out[idx] = (ushort)vcache->fetch_count;
-      vcache->fetch_elts[vcache->fetch_count++] = felt;
-   }
-
-   vcache->draw_elts[vcache->draw_count++] = vcache->out[idx] | flags;
-}
-
-
-                   
-static INLINE void 
-vcache_triangle( struct vcache_frontend *vcache,
-                 unsigned i0,
-                 unsigned i1,
-                 unsigned i2 )
-{
-   vcache_elt(vcache, i0, 0);
-   vcache_elt(vcache, i1, 0);
-   vcache_elt(vcache, i2, 0);
-   vcache_check_flush(vcache);
-}
-
-			  
-static INLINE void 
-vcache_triangle_flags( struct vcache_frontend *vcache,
-                       ushort flags,
-                       unsigned i0,
-                       unsigned i1,
-                       unsigned i2 )
-{
-   vcache_elt(vcache, i0, flags);
-   vcache_elt(vcache, i1, 0);
-   vcache_elt(vcache, i2, 0);
-   vcache_check_flush(vcache);
-}
-
-
-static INLINE void 
-vcache_line( struct vcache_frontend *vcache,
-             unsigned i0,
-             unsigned i1 )
-{
-   vcache_elt(vcache, i0, 0);
-   vcache_elt(vcache, i1, 0);
-   vcache_check_flush(vcache);
-}
-
-
-static INLINE void 
-vcache_line_flags( struct vcache_frontend *vcache,
-                   ushort flags,
-                   unsigned i0,
-                   unsigned i1 )
-{
-   vcache_elt(vcache, i0, flags);
-   vcache_elt(vcache, i1, 0);
-   vcache_check_flush(vcache);
-}
-
-
-static INLINE void 
-vcache_point( struct vcache_frontend *vcache,
-              unsigned i0 )
-{
-   vcache_elt(vcache, i0, 0);
-   vcache_check_flush(vcache);
-}
-
-
-static INLINE void
-vcache_line_adj_flags( struct vcache_frontend *vcache,
-                       unsigned flags,
-                       unsigned a0, unsigned i0, unsigned i1, unsigned a1 )
-{
-   vcache_elt(vcache, a0, 0);
-   vcache_elt(vcache, i0, flags);
-   vcache_elt(vcache, i1, 0);
-   vcache_elt(vcache, a1, 0);
-   vcache_check_flush(vcache);
-}
-
-
-static INLINE void
-vcache_line_adj( struct vcache_frontend *vcache,
-                 unsigned a0, unsigned i0, unsigned i1, unsigned a1 )
-{
-   vcache_elt(vcache, a0, 0);
-   vcache_elt(vcache, i0, 0);
-   vcache_elt(vcache, i1, 0);
-   vcache_elt(vcache, a1, 0);
-   vcache_check_flush(vcache);
-}
-
-
-static INLINE void
-vcache_triangle_adj_flags( struct vcache_frontend *vcache,
-                           unsigned flags,
-                           unsigned i0, unsigned a0,
-                           unsigned i1, unsigned a1,
-                           unsigned i2, unsigned a2 )
-{
-   vcache_elt(vcache, i0, flags);
-   vcache_elt(vcache, a0, 0);
-   vcache_elt(vcache, i1, 0);
-   vcache_elt(vcache, a1, 0);
-   vcache_elt(vcache, i2, 0);
-   vcache_elt(vcache, a2, 0);
-   vcache_check_flush(vcache);
-}
-
-
-static INLINE void
-vcache_triangle_adj( struct vcache_frontend *vcache,
-                     unsigned i0, unsigned a0,
-                     unsigned i1, unsigned a1,
-                     unsigned i2, unsigned a2 )
-{
-   vcache_elt(vcache, i0, 0);
-   vcache_elt(vcache, a0, 0);
-   vcache_elt(vcache, i1, 0);
-   vcache_elt(vcache, a1, 0);
-   vcache_elt(vcache, i2, 0);
-   vcache_elt(vcache, a2, 0);
-   vcache_check_flush(vcache);
-}
-
-
-/* At least for now, we're back to using a template include file for
- * this.  The two paths aren't too different though - it may be
- * possible to reunify them.
- */
-#define TRIANGLE(flags,i0,i1,i2) vcache_triangle_flags(vcache,flags,i0,i1,i2)
-#define LINE(flags,i0,i1)        vcache_line_flags(vcache,flags,i0,i1)
-#define POINT(i0)                vcache_point(vcache,i0)
-#define LINE_ADJ(flags,a0,i0,i1,a1) \
-   vcache_line_adj_flags(vcache,flags,a0,i0,i1,a1)
-#define TRIANGLE_ADJ(flags,i0,a0,i1,a1,i2,a2) \
-   vcache_triangle_adj_flags(vcache,flags,i0,a0,i1,a1,i2,a2)
-#define FUNC vcache_run_extras
-#include "draw_pt_vcache_tmp.h"
-
-#define TRIANGLE(flags,i0,i1,i2) vcache_triangle(vcache,i0,i1,i2)
-#define LINE(flags,i0,i1)        vcache_line(vcache,i0,i1)
-#define POINT(i0)                vcache_point(vcache,i0)
-#define LINE_ADJ(flags,a0,i0,i1,a1) \
-   vcache_line_adj(vcache,a0,i0,i1,a1)
-#define TRIANGLE_ADJ(flags,i0,a0,i1,a1,i2,a2) \
-   vcache_triangle_adj(vcache,i0,a0,i1,a1,i2,a2)
-#define FUNC vcache_run
-#include "draw_pt_vcache_tmp.h"
-
-static INLINE void 
-rebase_uint_elts( const unsigned *src,
-                  unsigned count,
-                  int delta,
-                  ushort *dest )
-{
-   unsigned i;
-   for (i = 0; i < count; i++) 
-      dest[i] = (ushort)(src[i] + delta);
-}
-
-
-static INLINE void 
-rebase_ushort_elts( const ushort *src,
-                    unsigned count,
-                    int delta,
-                    ushort *dest )
-{
-   unsigned i;
-   for (i = 0; i < count; i++) 
-      dest[i] = (ushort)(src[i] + delta);
-}
-
-
-static INLINE void 
-rebase_ubyte_elts( const ubyte *src,
-                   unsigned count,
-                   int delta,
-                   ushort *dest )
-{
-   unsigned i;
-   for (i = 0; i < count; i++) 
-      dest[i] = (ushort)(src[i] + delta);
-}
-
-
-static INLINE void 
-translate_uint_elts( const unsigned *src,
-                     unsigned count,
-                     ushort *dest )
-{
-   unsigned i;
-   for (i = 0; i < count; i++) 
-      dest[i] = (ushort)(src[i]);
-}
-
-
-static INLINE void 
-translate_ushort_elts( const ushort *src,
-                       unsigned count,
-                       ushort *dest )
-{
-   unsigned i;
-   for (i = 0; i < count; i++) 
-      dest[i] = (ushort)(src[i]);
-}
-
-
-static INLINE void 
-translate_ubyte_elts( const ubyte *src,
-                      unsigned count,
-                      ushort *dest )
-{
-   unsigned i;
-   for (i = 0; i < count; i++) 
-      dest[i] = (ushort)(src[i]);
-}
-
-
-
-
-#if 0
-static INLINE enum pipe_format 
-format_from_get_elt( pt_elt_func get_elt )
-{
-   switch (draw->pt.user.eltSize) {
-   case 1: return PIPE_FORMAT_R8_UNORM;
-   case 2: return PIPE_FORMAT_R16_UNORM;
-   case 4: return PIPE_FORMAT_R32_UNORM;
-   default: return PIPE_FORMAT_NONE;
-   }
-}
-#endif
-
-
-/**
- * Check if any vertex attributes use instance divisors.
- * Note that instance divisors complicate vertex fetching so we need
- * to take the vcache path when they're in use.
- */
-static boolean
-any_instance_divisors(const struct draw_context *draw)
-{
-   uint i;
-
-   for (i = 0; i < draw->pt.nr_vertex_elements; i++) {
-      uint div = draw->pt.vertex_element[i].instance_divisor;
-      if (div)
-         return TRUE;
-   }
-   return FALSE;
-}
-
-
-static INLINE void 
-vcache_check_run( struct draw_pt_front_end *frontend, 
-                  pt_elt_func get_elt,
-                  const void *elts,
-                  int elt_bias,
-                  unsigned draw_count )
-{
-   struct vcache_frontend *vcache = (struct vcache_frontend *)frontend; 
-   struct draw_context *draw = vcache->draw;
-   const unsigned min_index = draw->pt.user.min_index;
-   const unsigned max_index = draw->pt.user.max_index;
-   const unsigned index_size = draw->pt.user.eltSize;
-   unsigned fetch_count;
-   const ushort *transformed_elts;
-   ushort *storage = NULL;
-   boolean ok = FALSE;
-
-   /* debug: verify indexes are in range [min_index, max_index] */
-   if (0) {
-      unsigned i;
-      for (i = 0; i < draw_count; i++) {
-         if (index_size == 1) {
-            assert( ((const ubyte *) elts)[i] >= min_index);
-            assert( ((const ubyte *) elts)[i] <= max_index);
-         }
-         else if (index_size == 2) {
-            assert( ((const ushort *) elts)[i] >= min_index);
-            assert( ((const ushort *) elts)[i] <= max_index);
-         }
-         else {
-            assert(index_size == 4);
-            assert( ((const uint *) elts)[i] >= min_index);
-            assert( ((const uint *) elts)[i] <= max_index);
-         }
-      }
-   }
-
-   /* Note: max_index is frequently 0xffffffff so we have to be sure
-    * that any arithmetic involving max_index doesn't overflow!
-    */
-   if (max_index >= (unsigned) DRAW_PIPE_MAX_VERTICES)
-      goto fail;
-
-   if (any_instance_divisors(draw))
-      goto fail;
-
-   fetch_count = max_index + 1 - min_index;
-
-   if (0)
-      debug_printf("fetch_count %d fetch_max %d draw_count %d\n", fetch_count, 
-                   vcache->fetch_max,
-                   draw_count);
-
-   if (elt_bias + max_index >= DRAW_PIPE_MAX_VERTICES ||
-       fetch_count >= UNDEFINED_VERTEX_ID ||
-       fetch_count > draw_count) {
-      if (0) debug_printf("fail\n");
-      goto fail;
-   }
-
-   if (vcache->middle_prim != vcache->input_prim) {
-      vcache->middle_prim = vcache->input_prim;
-      vcache->middle->prepare( vcache->middle,
-                               vcache->middle_prim,
-                               vcache->opt,
-                               &vcache->fetch_max );
-   }
-
-   assert((elt_bias >= 0 && min_index + elt_bias >= min_index) ||
-          (elt_bias <  0 && min_index + elt_bias <  min_index));
-
-   if (min_index == 0 &&
-       index_size == 2) {
-      transformed_elts = (const ushort *)elts;
-   }
-   else {
-      storage = MALLOC( draw_count * sizeof(ushort) );
-      if (!storage)
-         goto fail;
-      
-      if (min_index == 0) {
-         switch(index_size) {
-         case 1:
-            translate_ubyte_elts( (const ubyte *)elts,
-                                  draw_count,
-                                  storage );
-            break;
-
-         case 2:
-            translate_ushort_elts( (const ushort *)elts,
-                                   draw_count,
-                                   storage );
-            break;
-
-         case 4:
-            translate_uint_elts( (const uint *)elts,
-                                 draw_count,
-                                 storage );
-            break;
-
-         default:
-            assert(0);
-            FREE(storage);
-            return;
-         }
-      }
-      else {
-         switch(index_size) {
-         case 1:
-            rebase_ubyte_elts( (const ubyte *)elts,
-                               draw_count,
-                               0 - (int)min_index,
-                               storage );
-            break;
-
-         case 2:
-            rebase_ushort_elts( (const ushort *)elts,
-                                draw_count,
-                                0 - (int)min_index,
-                                storage );
-            break;
-
-         case 4:
-            rebase_uint_elts( (const uint *)elts,
-                              draw_count,
-                              0 - (int)min_index,
-                              storage );
-            break;
-
-         default:
-            assert(0);
-            FREE(storage);
-            return;
-         }
-      }
-      transformed_elts = storage;
-   }
-
-   if (fetch_count < UNDEFINED_VERTEX_ID)
-      ok = vcache->middle->run_linear_elts( vcache->middle,
-                                            min_index + elt_bias, /* start */
-                                            fetch_count,
-                                            transformed_elts,
-                                            draw_count );
-   
-   FREE(storage);
-
-   if (ok)
-      return;
-
-   debug_printf("failed to execute atomic draw elts for %d/%d, splitting up\n",
-                fetch_count, draw_count);
-
-fail:
-   vcache_run( frontend, get_elt, elts, elt_bias, draw_count );
-}
-
-
-
-
-static void
-vcache_prepare( struct draw_pt_front_end *frontend,
-                unsigned in_prim,
-                struct draw_pt_middle_end *middle,
-                unsigned opt )
-{
-   struct vcache_frontend *vcache = (struct vcache_frontend *)frontend;
-
-   if (opt & PT_PIPELINE) {
-      vcache->base.run = vcache_run_extras;
-   }
-   else {
-      vcache->base.run = vcache_check_run;
-   }
-
-   /* VCache will always emit the reduced version of its input
-    * primitive, ie STRIP/FANS become TRIS, etc.
-    *
-    * This is not to be confused with what the GS might be up to,
-    * which is a separate issue.
-    */
-   vcache->input_prim = in_prim;
-   switch (in_prim) {
-   case PIPE_PRIM_LINES_ADJACENCY:
-   case PIPE_PRIM_LINE_STRIP_ADJACENCY:
-      vcache->output_prim = PIPE_PRIM_LINES_ADJACENCY;
-      break;
-   case PIPE_PRIM_TRIANGLES_ADJACENCY:
-   case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY:
-      vcache->output_prim = PIPE_PRIM_TRIANGLES_ADJACENCY;
-      break;
-   default:
-      vcache->output_prim = u_reduced_prim(in_prim);
-   }
-
-   vcache->middle = middle;
-   vcache->opt = opt;
-
-   /* Have to run prepare here, but try and guess a good prim for
-    * doing so:
-    */
-   vcache->middle_prim = (opt & PT_PIPELINE)
-      ? vcache->output_prim : vcache->input_prim;
-
-   middle->prepare( middle,
-                    vcache->middle_prim,
-                    opt, &vcache->fetch_max );
-}
-
-
-static void 
-vcache_finish( struct draw_pt_front_end *frontend )
-{
-   struct vcache_frontend *vcache = (struct vcache_frontend *)frontend;
-   vcache->middle->finish( vcache->middle );
-   vcache->middle = NULL;
-}
-
-
-static void 
-vcache_destroy( struct draw_pt_front_end *frontend )
-{
-   FREE(frontend);
-}
-
-
-struct draw_pt_front_end *draw_pt_vcache( struct draw_context *draw )
-{
-   struct vcache_frontend *vcache = CALLOC_STRUCT( vcache_frontend );
-   if (vcache == NULL)
-      return NULL;
- 
-   vcache->base.prepare = vcache_prepare;
-   vcache->base.run     = NULL;
-   vcache->base.finish  = vcache_finish;
-   vcache->base.destroy = vcache_destroy;
-   vcache->draw = draw;
-   
-   memset(vcache->in, ~0, sizeof(vcache->in));
-  
-   return &vcache->base;
-}
diff --git a/src/gallium/auxiliary/draw/draw_pt_vcache_tmp.h b/src/gallium/auxiliary/draw/draw_pt_vcache_tmp.h
deleted file mode 100644
index 1a3748d5f0..0000000000
--- a/src/gallium/auxiliary/draw/draw_pt_vcache_tmp.h
+++ /dev/null
@@ -1,19 +0,0 @@
-#define FUNC_VARS                      \
-   struct draw_pt_front_end *frontend, \
-   pt_elt_func get_elt,                \
-   const void *elts,                   \
-   int elt_bias,                       \
-   unsigned count
-
-#define LOCAL_VARS \
-   struct vcache_frontend *vcache = (struct vcache_frontend *) frontend;   \
-   struct draw_context *draw = vcache->draw;                               \
-   const unsigned prim = vcache->input_prim;                               \
-   const boolean last_vertex_last = !(draw->rasterizer->flatshade &&       \
-                                      draw->rasterizer->flatshade_first);
-
-#define GET_ELT(idx) (get_elt(elts, idx) + elt_bias)
-
-#define FUNC_EXIT do { vcache_flush(vcache); } while (0)
-
-#include "draw_decompose_tmp.h"
diff --git a/src/gallium/auxiliary/draw/draw_pt_vsplit.c b/src/gallium/auxiliary/draw/draw_pt_vsplit.c
new file mode 100644
index 0000000000..a687525309
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_pt_vsplit.c
@@ -0,0 +1,208 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  7.9
+ *
+ * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright (C) 2010 LunarG Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#include "draw/draw_context.h"
+#include "draw/draw_private.h"
+#include "draw/draw_pt.h"
+
+#define SEGMENT_SIZE 1024
+#define MAP_SIZE     256
+
+struct vsplit_frontend {
+   struct draw_pt_front_end base; /* callbacks cast the base pointer back */
+   struct draw_context *draw;
+
+   unsigned prim; /* primitive type given to prepare() */
+
+   struct draw_pt_middle_end *middle; /* set by prepare(), reset by finish() */
+
+   unsigned max_vertices; /* filled in by middle->prepare() */
+   ushort segment_size; /* MIN2(SEGMENT_SIZE, max_vertices) */
+
+   /* buffers for splitting */
+   unsigned fetch_elts[SEGMENT_SIZE];
+   ushort draw_elts[SEGMENT_SIZE];
+   ushort identity_draw_elts[SEGMENT_SIZE]; /* identity_draw_elts[i] == i */
+
+   struct {
+      /* map a fetch element to a draw element */
+      unsigned fetches[MAP_SIZE]; /* indexed by fetch % MAP_SIZE */
+      ushort draws[MAP_SIZE]; /* position of fetches[i] in fetch_elts */
+      boolean has_max_fetch; /* 0xffffffff was forced into the map */
+
+      ushort num_fetch_elts; /* used entries of fetch_elts */
+      ushort num_draw_elts; /* used entries of draw_elts */
+   } cache;
+};
+
+
+/** Fill the fetch map with 0xff bytes and reset the element counters. */
+static void vsplit_clear_cache(struct vsplit_frontend *vsplit)
+{
+   vsplit->cache.num_draw_elts = 0;
+   vsplit->cache.num_fetch_elts = 0;
+   vsplit->cache.has_max_fetch = FALSE;
+   memset(vsplit->cache.fetches, 0xff, sizeof(vsplit->cache.fetches));
+}
+
+/** Pass the accumulated fetch and draw elements to the middle end. */
+static void vsplit_flush_cache(struct vsplit_frontend *vsplit, unsigned flags)
+{
+   struct draw_pt_middle_end *middle = vsplit->middle;
+   middle->run(middle, vsplit->fetch_elts, vsplit->cache.num_fetch_elts,
+               vsplit->draw_elts, vsplit->cache.num_draw_elts, flags);
+}
+
+/**
+ * Emit a draw element for fetch, adding it to fetch_elts on a cache miss.
+ */
+static INLINE void
+vsplit_add_cache(struct vsplit_frontend *vsplit, unsigned fetch)
+{
+   const unsigned slot = fetch % MAP_SIZE;
+
+   if (vsplit->cache.fetches[slot] != fetch) {
+      const ushort nr = vsplit->cache.num_fetch_elts;
+      /* miss: append the fetch element, then remember its position */
+      assert(nr < vsplit->segment_size);
+      vsplit->fetch_elts[nr] = fetch;
+      vsplit->cache.num_fetch_elts = nr + 1;
+      vsplit->cache.draws[slot] = nr;
+      vsplit->cache.fetches[slot] = fetch;
+   }
+
+   vsplit->draw_elts[vsplit->cache.num_draw_elts++] = vsplit->cache.draws[slot];
+}
+
+
+/**
+ * Like vsplit_add_cache(), for full-range (uint) fetch elements: 0xffffffff
+ * matches a slot filled with 0xff bytes, so its first use must force a miss.
+ */
+static INLINE void
+vsplit_add_cache_uint(struct vsplit_frontend *vsplit, unsigned fetch)
+{
+   if (!vsplit->cache.has_max_fetch && fetch == 0xffffffff) {
+      const unsigned slot = 0xffffffff % MAP_SIZE;
+      /* store a different value so the lookup below misses */
+      vsplit->cache.fetches[slot] = 0xfffffffe;
+      vsplit->cache.has_max_fetch = TRUE;
+   }
+
+   vsplit_add_cache(vsplit, fetch);
+}
+
+
+#define FUNC vsplit_run_linear
+#include "draw_pt_vsplit_tmp.h"
+
+#define FUNC vsplit_run_ubyte
+#define ELT_TYPE ubyte
+#define ADD_CACHE(vsplit, fetch) vsplit_add_cache(vsplit, fetch)
+#include "draw_pt_vsplit_tmp.h"
+
+#define FUNC vsplit_run_ushort
+#define ELT_TYPE ushort
+#define ADD_CACHE(vsplit, fetch) vsplit_add_cache(vsplit, fetch)
+#include "draw_pt_vsplit_tmp.h"
+
+#define FUNC vsplit_run_uint
+#define ELT_TYPE uint
+#define ADD_CACHE(vsplit, fetch) vsplit_add_cache_uint(vsplit, fetch)
+#include "draw_pt_vsplit_tmp.h"
+
+
+/** Select the run function for the index size and prepare the middle end. */
+static void vsplit_prepare(struct draw_pt_front_end *frontend,
+                           unsigned in_prim,
+                           struct draw_pt_middle_end *middle, unsigned opt)
+{
+   struct vsplit_frontend *vsplit = (struct vsplit_frontend *) frontend;
+   const unsigned elt_size = vsplit->draw->pt.user.eltSize;
+
+   /* one run function is generated per index size; 0 selects the linear one */
+   if (elt_size == 0) {
+      vsplit->base.run = vsplit_run_linear;
+   }
+   else if (elt_size == 1) {
+      vsplit->base.run = vsplit_run_ubyte;
+   }
+   else if (elt_size == 2) {
+      vsplit->base.run = vsplit_run_ushort;
+   }
+   else if (elt_size == 4) {
+      vsplit->base.run = vsplit_run_uint;
+   }
+   else {
+      assert(0);
+   }
+
+   /* the primitive is passed through unchanged: this front end only splits */
+   vsplit->prim = in_prim;
+   vsplit->middle = middle;
+   middle->prepare(middle, in_prim, opt, &vsplit->max_vertices);
+
+   /* segments go through fixed-size buffers, so cap them at SEGMENT_SIZE */
+   vsplit->segment_size = MIN2(SEGMENT_SIZE, vsplit->max_vertices);
+}
+
+
+static void vsplit_finish(struct draw_pt_front_end *frontend)
+{
+   struct vsplit_frontend *vsplit = (struct vsplit_frontend *) frontend;
+   vsplit->middle->finish(vsplit->middle);
+   vsplit->middle = NULL; /* a new one is set by the next prepare() */
+}
+
+
+static void vsplit_destroy(struct draw_pt_front_end *frontend)
+{
+   FREE(frontend); /* allocated with CALLOC_STRUCT in draw_pt_vsplit() */
+}
+
+
+/** Allocate a vsplit front end for draw; returns NULL if allocation fails. */
+struct draw_pt_front_end *draw_pt_vsplit(struct draw_context *draw)
+{
+   struct vsplit_frontend *fe = CALLOC_STRUCT(vsplit_frontend);
+   unsigned n;
+
+   if (fe == NULL)
+      return NULL;
+
+   fe->draw = draw;
+   fe->base.destroy = vsplit_destroy;
+   fe->base.finish = vsplit_finish;
+   fe->base.run = NULL; /* chosen by vsplit_prepare() */
+   fe->base.prepare = vsplit_prepare;
+
+   for (n = 0; n < SEGMENT_SIZE; n++)
+      fe->identity_draw_elts[n] = (ushort) n;
+   return &fe->base;
+}
diff --git a/src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h b/src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h
new file mode 100644
index 0000000000..3f66f962e1
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h
@@ -0,0 +1,309 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  7.9
+ *
+ * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright (C) 2010 LunarG Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#define CONCAT2(name, elt_type) name ## elt_type
+#define CONCAT(name, elt_type) CONCAT2(name, elt_type)
+
+#ifdef ELT_TYPE
+
+/**
+ * Fetch [min_index, max_index] (biased) linearly and draw ib[istart..] rebased
+ * to min_index in one run; return FALSE when this fast path is rejected.
+ */
+static boolean
+CONCAT(vsplit_primitive_, ELT_TYPE)(struct vsplit_frontend *vsplit,
+                                    unsigned istart, unsigned icount)
+{
+   struct draw_context *draw = vsplit->draw;
+   const ELT_TYPE *ib = (const ELT_TYPE *)
+      ((const char *) draw->pt.user.elts + draw->pt.index_buffer.offset);
+   const unsigned min_index = draw->pt.user.min_index;
+   const unsigned max_index = draw->pt.user.max_index;
+   const int elt_bias = draw->pt.user.eltBias;
+   unsigned fetch_start, fetch_count;
+   const ushort *draw_elts = NULL;
+   unsigned i;
+
+   /* ushort-sized indices that need no rebasing: use the ib directly */
+   if (min_index == 0 && sizeof(ib[0]) == sizeof(draw_elts[0])) {
+      if (icount > vsplit->max_vertices)
+         return FALSE;
+
+      for (i = 0; i < icount; i++) {
+         ELT_TYPE idx = ib[istart + i];
+         assert(idx >= min_index && idx <= max_index);
+      }
+      draw_elts = (const ushort *) (ib + istart); /* skip to the first index */
+   }
+   else {
+      /* have to go through vsplit->draw_elts */
+      if (icount > vsplit->segment_size)
+         return FALSE;
+   }
+
+   /* this is faster only when we fetch fewer elements than the normal path */
+   if (max_index - min_index > icount - 1)
+      return FALSE;
+
+   if (elt_bias < 0 && min_index < -elt_bias)
+      return FALSE;
+
+   /* bail out on instanced vertex elements (XXX: why this check?) */
+   for (i = 0; i < draw->pt.nr_vertex_elements; i++) {
+      if (draw->pt.vertex_element[i].instance_divisor)
+         return FALSE;
+   }
+
+   fetch_start = min_index + elt_bias;
+   fetch_count = max_index - min_index + 1;
+
+   if (!draw_elts) {
+      if (min_index == 0) {
+         for (i = 0; i < icount; i++) {
+            ELT_TYPE idx = ib[istart + i];
+
+            assert(idx >= min_index && idx <= max_index);
+            vsplit->draw_elts[i] = (ushort) idx;
+         }
+      }
+      else {
+         for (i = 0; i < icount; i++) {
+            ELT_TYPE idx = ib[istart + i];
+
+            assert(idx >= min_index && idx <= max_index);
+            vsplit->draw_elts[i] = (ushort) (idx - min_index);
+         }
+      }
+
+      draw_elts = vsplit->draw_elts;
+   }
+
+   return vsplit->middle->run_linear_elts(vsplit->middle,
+                                          fetch_start, fetch_count,
+                                          draw_elts, icount, 0x0);
+}
+
+/**
+ * Build the fetch and draw elements of one segment through the cache and flush
+ * them.  ispoken, istart and iclose are positions in the index buffer.
+ * When spoken is TRUE, ib[ispoken] replaces the first index of the segment;
+ * when close is TRUE, ib[iclose] is appended after the last one.
+ */
+static INLINE void
+CONCAT(vsplit_segment_cache_, ELT_TYPE)(struct vsplit_frontend *vsplit,
+                                        unsigned flags,
+                                        unsigned istart, unsigned icount,
+                                        boolean spoken, unsigned ispoken,
+                                        boolean close, unsigned iclose)
+{
+   struct draw_context *draw = vsplit->draw;
+   const ELT_TYPE *ib = (const ELT_TYPE *)
+      ((const char *) draw->pt.user.elts + draw->pt.index_buffer.offset);
+   const int ibias = draw->pt.user.eltBias;
+   unsigned i;
+
+   assert(icount + !!close <= vsplit->segment_size);
+
+   vsplit_clear_cache(vsplit);
+
+   spoken = !!spoken; /* normalize to 0/1: also the first loop index below */
+   if (ibias == 0) {
+      if (spoken)
+         ADD_CACHE(vsplit, ib[ispoken]);
+
+      for (i = spoken; i < icount; i++)
+         ADD_CACHE(vsplit, ib[istart + i]);
+
+      if (close)
+         ADD_CACHE(vsplit, ib[iclose]);
+   }
+   else if (ibias > 0) {
+      if (spoken)
+         ADD_CACHE(vsplit, (uint) ib[ispoken] + ibias);
+
+      for (i = spoken; i < icount; i++)
+         ADD_CACHE(vsplit, (uint) ib[istart + i] + ibias);
+
+      if (close)
+         ADD_CACHE(vsplit, (uint) ib[iclose] + ibias);
+   }
+   else { /* ibias < 0: return without flushing if a sum would be negative */
+      if (spoken) {
+         if (ib[ispoken] < -ibias)
+            return;
+         ADD_CACHE(vsplit, ib[ispoken] + ibias);
+      }
+
+      for (i = spoken; i < icount; i++) {
+         if (ib[istart + i] < -ibias)
+            return;
+         ADD_CACHE(vsplit, ib[istart + i] + ibias);
+      }
+
+      if (close) {
+         if (ib[iclose] < -ibias)
+            return;
+         ADD_CACHE(vsplit, ib[iclose] + ibias);
+      }
+   }
+
+   vsplit_flush_cache(vsplit, flags);
+}
+
+/* Flush a segment as is: no index is replaced and none is appended. */
+static void
+CONCAT(vsplit_segment_simple_, ELT_TYPE)(struct vsplit_frontend *vsplit,
+                                         unsigned flags,
+                                         unsigned istart, unsigned icount)
+{
+   CONCAT(vsplit_segment_cache_, ELT_TYPE)(vsplit, flags, istart, icount,
+                                           FALSE, 0, FALSE, 0);
+}
+
+/* Flush a line loop segment.  ib[i0] is appended to close the loop only when
+ * flags == DRAW_SPLIT_BEFORE. */
+static void
+CONCAT(vsplit_segment_loop_, ELT_TYPE)(struct vsplit_frontend *vsplit,
+                                       unsigned flags, unsigned istart,
+                                       unsigned icount, unsigned i0)
+{
+   const boolean append_i0 = (flags == DRAW_SPLIT_BEFORE);
+
+   CONCAT(vsplit_segment_cache_, ELT_TYPE)(vsplit, flags, istart, icount,
+                                           FALSE, 0, append_i0, i0);
+}
+
+/* Flush a fan/polygon segment.  When DRAW_SPLIT_BEFORE is set, ib[i0] is used
+ * in place of the first index of the segment. */
+static void
+CONCAT(vsplit_segment_fan_, ELT_TYPE)(struct vsplit_frontend *vsplit,
+                                      unsigned flags, unsigned istart,
+                                      unsigned icount, unsigned i0)
+{
+   const boolean replace_first = ((flags & DRAW_SPLIT_BEFORE) != 0);
+
+   CONCAT(vsplit_segment_cache_, ELT_TYPE)(vsplit, flags, istart, icount,
+                                           replace_first, i0, FALSE, 0);
+}
+
+#define LOCAL_VARS                                                         \
+   struct vsplit_frontend *vsplit = (struct vsplit_frontend *) frontend;   \
+   const unsigned prim = vsplit->prim;                                     \
+   const unsigned max_count_simple = vsplit->segment_size;                 \
+   const unsigned max_count_loop = vsplit->segment_size - 1;               \
+   const unsigned max_count_fan = vsplit->segment_size;
+
+#define PRIMITIVE(istart, icount)   \
+   CONCAT(vsplit_primitive_, ELT_TYPE)(vsplit, istart, icount)
+
+#else /* ELT_TYPE */
+
+static void
+vsplit_segment_simple_linear(struct vsplit_frontend *vsplit, unsigned flags,
+                             unsigned istart, unsigned icount)
+{
+   assert(icount <= vsplit->max_vertices); /* limit set by middle->prepare() */
+   vsplit->middle->run_linear(vsplit->middle, istart, icount, flags);
+}
+
+/* Flush a linear line loop segment.  Only a segment with flags ==
+ * DRAW_SPLIT_BEFORE appends i0, which needs explicit fetch elements. */
+static void
+vsplit_segment_loop_linear(struct vsplit_frontend *vsplit, unsigned flags,
+                           unsigned istart, unsigned icount, unsigned i0)
+{
+   const boolean append_i0 = (flags == DRAW_SPLIT_BEFORE);
+   unsigned k;
+
+   assert(icount + !!append_i0 <= vsplit->segment_size);
+
+   if (!append_i0) {
+      vsplit->middle->run_linear(vsplit->middle, istart, icount, flags);
+      return;
+   }
+   for (k = 0; k < icount; k++)
+      vsplit->fetch_elts[k] = istart + k;
+   vsplit->fetch_elts[icount] = i0;
+   vsplit->middle->run(vsplit->middle, vsplit->fetch_elts, icount + 1,
+                       vsplit->identity_draw_elts, icount + 1, flags);
+}
+
+/* Flush a linear fan/polygon segment.  When DRAW_SPLIT_BEFORE is set, i0 takes
+ * the place of the first vertex, which needs explicit fetch elements. */
+static void
+vsplit_segment_fan_linear(struct vsplit_frontend *vsplit, unsigned flags,
+                          unsigned istart, unsigned icount, unsigned i0)
+{
+   const boolean replace_first = ((flags & DRAW_SPLIT_BEFORE) != 0);
+   unsigned k;
+
+   assert(icount + !!replace_first <= vsplit->segment_size);
+
+   if (!replace_first) {
+      vsplit->middle->run_linear(vsplit->middle, istart, icount, flags);
+      return;
+   }
+   vsplit->fetch_elts[0] = i0;
+   for (k = 1; k < icount; k++)
+      vsplit->fetch_elts[k] = istart + k;
+   vsplit->middle->run(vsplit->middle, vsplit->fetch_elts, k,
+                       vsplit->identity_draw_elts, k, flags);
+}
+
+#define LOCAL_VARS                                                         \
+   struct vsplit_frontend *vsplit = (struct vsplit_frontend *) frontend;   \
+   const unsigned prim = vsplit->prim;                                     \
+   const unsigned max_count_simple = vsplit->max_vertices;                 \
+   const unsigned max_count_loop = vsplit->segment_size - 1;               \
+   const unsigned max_count_fan = vsplit->segment_size;
+
+#define PRIMITIVE(istart, icount) FALSE
+
+#define ELT_TYPE linear
+
+#endif /* ELT_TYPE */
+
+#define FUNC_VARS                      \
+   struct draw_pt_front_end *frontend, \
+   unsigned start,                     \
+   unsigned count
+
+#define SEGMENT_SIMPLE(flags, istart, icount)   \
+   CONCAT(vsplit_segment_simple_, ELT_TYPE)(vsplit, flags, istart, icount)
+
+#define SEGMENT_LOOP(flags, istart, icount, i0) \
+   CONCAT(vsplit_segment_loop_, ELT_TYPE)(vsplit, flags, istart, icount, i0)
+
+#define SEGMENT_FAN(flags, istart, icount, i0)  \
+   CONCAT(vsplit_segment_fan_, ELT_TYPE)(vsplit, flags, istart, icount, i0)
+
+#include "draw_split_tmp.h"
+
+#undef CONCAT2
+#undef CONCAT
+
+#undef ELT_TYPE
+#undef ADD_CACHE
diff --git a/src/gallium/auxiliary/draw/draw_so_emit_tmp.h b/src/gallium/auxiliary/draw/draw_so_emit_tmp.h
index 6d8937a0b4..7fafde9d5e 100644
--- a/src/gallium/auxiliary/draw/draw_so_emit_tmp.h
+++ b/src/gallium/auxiliary/draw/draw_so_emit_tmp.h
@@ -7,11 +7,9 @@
 
 #define FUNC_ENTER                                                \
    /* declare more local vars */                                  \
-   struct draw_context *draw = so->draw;                          \
    const unsigned prim = input_prims->prim;                       \
-   const boolean last_vertex_last =                               \
-      !(draw->rasterizer->flatshade &&                            \
-        draw->rasterizer->flatshade_first);                       \
+   const unsigned prim_flags = input_prims->flags;                \
+   const boolean last_vertex_last = TRUE;                         \
    do {                                                           \
       debug_assert(input_prims->primitive_count == 1);            \
       switch (prim) {                                             \
diff --git a/src/gallium/auxiliary/draw/draw_split_tmp.h b/src/gallium/auxiliary/draw/draw_split_tmp.h
new file mode 100644
index 0000000000..47defc62b9
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_split_tmp.h
@@ -0,0 +1,176 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  7.9
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright (C) 2010 LunarG Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+static void
+FUNC(FUNC_VARS)
+{
+   unsigned first, incr;
+   LOCAL_VARS
+
+   /*
+    * prim, start, count, and max_count_{simple,loop,fan} should have been
+    * defined
+    */
+   if (0) {
+      debug_printf("%s: prim 0x%x, start %d, count %d, max_count_simple %d, "
+                   "max_count_loop %d, max_count_fan %d\n",
+                   __FUNCTION__, prim, start, count, max_count_simple,
+                   max_count_loop, max_count_fan);
+   }
+
+   draw_pt_split_prim(prim, &first, &incr);
+   /* sanitize primitive length */
+   count = draw_pt_trim_count(count, first, incr);
+   if (count < first)
+      return;
+
+   /* try flushing the entire primitive */
+   if (PRIMITIVE(start, count))
+      return;
+
+   /* must be able to at least flush two complete primitives */
+   assert(max_count_simple >= first + incr &&
+          max_count_loop >= first + incr &&
+          max_count_fan >= first + incr);
+
+   /* no splitting required */
+   if (count <= max_count_simple) {
+      SEGMENT_SIMPLE(0x0, start, count);
+   }
+   else {
+      const unsigned rollback = first - incr;
+      unsigned flags = DRAW_SPLIT_AFTER, seg_start = 0, seg_max;
+
+      /*
+       * Both count and seg_max below are explicitly trimmed.  Because
+       *
+       *   seg_start = N * (seg_max - rollback) = N' * incr,
+       *
+       * we have
+       *
+       *   remaining = count - seg_start = first + N'' * incr.
+       *
+       * That is, remaining is implicitly trimmed.
+       */
+      switch (prim) {
+      case PIPE_PRIM_POINTS:
+      case PIPE_PRIM_LINES:
+      case PIPE_PRIM_LINE_STRIP:
+      case PIPE_PRIM_TRIANGLES:
+      case PIPE_PRIM_TRIANGLE_STRIP:
+      case PIPE_PRIM_QUADS:
+      case PIPE_PRIM_QUAD_STRIP:
+      case PIPE_PRIM_LINES_ADJACENCY:
+      case PIPE_PRIM_LINE_STRIP_ADJACENCY:
+      case PIPE_PRIM_TRIANGLES_ADJACENCY:
+      case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY:
+         seg_max =
+            draw_pt_trim_count(MIN2(max_count_simple, count), first, incr);
+         if (prim == PIPE_PRIM_TRIANGLE_STRIP ||
+             prim == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY) {
+            /* make sure we flush an even number of triangles at a time */
+            if (seg_max < count && !(((seg_max - first) / incr) & 1))
+               seg_max -= incr;
+         }
+
+         do {
+            const unsigned remaining = count - seg_start;
+
+            if (remaining > seg_max) {
+               SEGMENT_SIMPLE(flags, start + seg_start, seg_max);
+               seg_start += seg_max - rollback;
+
+               flags |= DRAW_SPLIT_BEFORE;
+            }
+            else {
+               flags &= ~DRAW_SPLIT_AFTER;
+
+               SEGMENT_SIMPLE(flags, start + seg_start, remaining);
+               seg_start += remaining;
+            }
+         } while (seg_start < count);
+         break;
+
+      case PIPE_PRIM_LINE_LOOP:
+         seg_max =
+            draw_pt_trim_count(MIN2(max_count_loop, count), first, incr);
+
+         do {
+            const unsigned remaining = count - seg_start;
+
+            if (remaining > seg_max) {
+               SEGMENT_LOOP(flags, start + seg_start, seg_max, start);
+               seg_start += seg_max - rollback;
+
+               flags |= DRAW_SPLIT_BEFORE;
+            }
+            else {
+               flags &= ~DRAW_SPLIT_AFTER;
+
+               SEGMENT_LOOP(flags, start + seg_start, remaining, start);
+               seg_start += remaining;
+            }
+         } while (seg_start < count);
+         break;
+
+      case PIPE_PRIM_TRIANGLE_FAN:
+      case PIPE_PRIM_POLYGON:
+         seg_max =
+            draw_pt_trim_count(MIN2(max_count_fan, count), first, incr);
+
+         do {
+            const unsigned remaining = count - seg_start;
+
+            if (remaining > seg_max) {
+               SEGMENT_FAN(flags, start + seg_start, seg_max, start);
+               seg_start += seg_max - rollback;
+
+               flags |= DRAW_SPLIT_BEFORE;
+            }
+            else {
+               flags &= ~DRAW_SPLIT_AFTER;
+
+               SEGMENT_FAN(flags, start + seg_start, remaining, start);
+               seg_start += remaining;
+            }
+         } while (seg_start < count);
+         break;
+
+      default:
+         assert(0);
+         break;
+      }
+   }
+}
+
+#undef FUNC
+#undef FUNC_VARS
+#undef LOCAL_VARS
+
+#undef PRIMITIVE
+#undef SEGMENT_SIMPLE
+#undef SEGMENT_LOOP
+#undef SEGMENT_FAN
diff --git a/src/gallium/auxiliary/draw/draw_vs_llvm.c b/src/gallium/auxiliary/draw/draw_vs_llvm.c
index d13ad24fff..fa9992db78 100644
--- a/src/gallium/auxiliary/draw/draw_vs_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_vs_llvm.c
@@ -28,6 +28,7 @@
 #include "util/u_math.h"
 #include "util/u_memory.h"
 #include "pipe/p_shader_tokens.h"
+#include "pipe/p_screen.h"
 
 #include "draw_private.h"
 #include "draw_context.h"
@@ -109,6 +110,11 @@ draw_create_vs_llvm(struct draw_context *draw,
 
    tgsi_scan_shader(state->tokens, &vs->base.info);
 
+   vs->variant_key_size = 
+      draw_llvm_variant_key_size(
+	 vs->base.info.file_max[TGSI_FILE_INPUT]+1,
+	 vs->base.info.file_max[TGSI_FILE_SAMPLER]+1);
+
    vs->base.draw = draw;
    vs->base.prepare = vs_llvm_prepare;
    vs->base.run_linear = vs_llvm_run_linear;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index 7b35dd4bb4..e0d30be98d 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -59,14 +59,6 @@
 #include "lp_bld_arit.h"
 
 
-/*
- * XXX: Increasing eliminates some artifacts, but adds others, most
- * noticeably corruption in the Earth halo in Google Earth.
- */
-#define RCP_NEWTON_STEPS 0
-
-#define RSQRT_NEWTON_STEPS 0
-
 #define EXP_POLY_DEGREE 3
 
 #define LOG_POLY_DEGREE 5
@@ -267,7 +259,7 @@ lp_build_add(struct lp_build_context *bld,
 }
 
 
-/** Return the sum of the elements of a */
+/** Return the scalar sum of the elements of a */
 LLVMValueRef
 lp_build_sum_vector(struct lp_build_context *bld,
                     LLVMValueRef a)
@@ -278,11 +270,9 @@ lp_build_sum_vector(struct lp_build_context *bld,
 
    assert(lp_check_value(type, a));
 
-   if (a == bld->zero)
-      return bld->zero;
-   if (a == bld->undef)
-      return bld->undef;
-   assert(type.length > 1);
+   if (type.length == 1) {
+      return a;
+   }
 
    assert(!bld->type.norm);
 
@@ -546,7 +536,7 @@ lp_build_mul_imm(struct lp_build_context *bld,
    if(b == 2 && bld->type.floating)
       return lp_build_add(bld, a, a);
 
-   if(util_is_pot(b)) {
+   if(util_is_power_of_two(b)) {
       unsigned shift = ffs(b) - 1;
 
       if(bld->type.floating) {
@@ -1266,6 +1256,11 @@ lp_build_sqrt(struct lp_build_context *bld,
  *
  *   x_{i+1} = x_i * (2 - a * x_i)
  *
+ * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
+ * +/-Inf, giving NaN instead.  Certain applications rely on the IEEE-754
+ * behavior, such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the
+ * Earth's halo. It would be necessary to clamp the argument to prevent this.
+ *
  * See also:
  * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
  * - http://softwarecommunity.intel.com/articles/eng/1818.htm
@@ -1306,13 +1301,27 @@ lp_build_rcp(struct lp_build_context *bld,
    if(LLVMIsConstant(a))
       return LLVMConstFDiv(bld->one, a);
 
-   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
+   /*
+    * We don't use RCPPS because:
+    * - it only has 10 bits of precision
+    * - it doesn't even get the reciprocal of 1.0 exactly
+    * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
+    * - for recent processors the benefit over DIVPS is marginal, and case
+    *   dependent
+    *
+    * We could still use it on certain processors if benchmarks show that the
+    * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
+    * particular uses that require fewer workarounds.
+    */
+
+   if (FALSE && util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
+      const unsigned num_iterations = 0;
       LLVMValueRef res;
       unsigned i;
 
       res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a);
 
-      for (i = 0; i < RCP_NEWTON_STEPS; ++i) {
+      for (i = 0; i < num_iterations; ++i) {
          res = lp_build_rcp_refine(bld, a, res);
       }
 
@@ -1363,13 +1372,14 @@ lp_build_rsqrt(struct lp_build_context *bld,
 
    assert(type.floating);
 
-   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
+   if (util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
+      const unsigned num_iterations = 0;
       LLVMValueRef res;
       unsigned i;
 
       res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a);
 
-      for (i = 0; i < RSQRT_NEWTON_STEPS; ++i) {
+      for (i = 0; i < num_iterations; ++i) {
          res = lp_build_rsqrt_refine(bld, a, res);
       }
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.c b/src/gallium/auxiliary/gallivm/lp_bld_debug.c
index 39dfc51e50..d3a5afff8c 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_debug.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.c
@@ -46,7 +46,7 @@
 boolean
 lp_check_alignment(const void *ptr, unsigned alignment)
 {
-   assert(util_is_pot(alignment));
+   assert(util_is_power_of_two(alignment));
    return ((uintptr_t)ptr & (alignment - 1)) == 0;
 }
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
index 247cb83ce6..92123e09d3 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
@@ -388,7 +388,7 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder,
 
    if (format_matches_type(format_desc, type) &&
        format_desc->block.bits <= type.width * 4 &&
-       util_is_pot(format_desc->block.bits)) {
+       util_is_power_of_two(format_desc->block.bits)) {
       LLVMValueRef packed;
 
       /*
@@ -416,7 +416,7 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder,
         format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
        format_desc->block.width == 1 &&
        format_desc->block.height == 1 &&
-       util_is_pot(format_desc->block.bits) &&
+       util_is_power_of_two(format_desc->block.bits) &&
        format_desc->block.bits <= 32 &&
        format_desc->is_bitmask &&
        !format_desc->is_mixed &&
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
index 6d5410d970..48baf7c425 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
+++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
@@ -40,6 +40,7 @@
 #include <llvm/ExecutionEngine/ExecutionEngine.h>
 #include <llvm/ExecutionEngine/JITEventListener.h>
 #include <llvm/Support/CommandLine.h>
+#include <llvm/Support/PrettyStackTrace.h>
 
 #include "pipe/p_config.h"
 #include "util/u_debug.h"
@@ -143,7 +144,6 @@ lp_set_target_options(void)
    llvm::UnsafeFPMath = true;
 #endif
 
-#if 0
    /*
     * LLVM will generate MMX instructions for vectors <= 64 bits, leading to
     * innefficient code, and in 32bit systems, to the corruption of the FPU
@@ -152,10 +152,8 @@ lp_set_target_options(void)
     * See also:
     * - http://llvm.org/bugs/show_bug.cgi?id=3287
     * - http://l4.me.uk/post/2009/06/07/llvm-wrinkle-3-configuration-what-configuration/
-    *
-    * XXX: Unfortunately this is not working.
     */
-   static boolean first = FALSE;
+   static boolean first = TRUE;
    if (first) {
       static const char* options[] = {
          "prog",
@@ -164,7 +162,13 @@ lp_set_target_options(void)
       llvm::cl::ParseCommandLineOptions(2, const_cast<char**>(options));
       first = FALSE;
    }
-#endif
+
+   /*
+    * By default LLVM adds a signal handler to output a pretty stack trace.
+    * This signal handler is never removed, causing problems when unloading the
+    * shared object where the gallium driver resides.
+    */
+   llvm::DisablePrettyStackTrace = true;
 }
 
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.h b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
index e470082b97..e947b90d16 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
@@ -37,6 +37,8 @@
 #define LP_BLD_PACK_H
 
 
+#include "pipe/p_compiler.h"
+
 #include "gallivm/lp_bld.h"
 
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index 0fd014ab9b..259b1142e3 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -82,9 +82,9 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
    state->swizzle_a         = view->swizzle_a;
 
    state->target            = texture->target;
-   state->pot_width         = util_is_pot(texture->width0);
-   state->pot_height        = util_is_pot(texture->height0);
-   state->pot_depth         = util_is_pot(texture->depth0);
+   state->pot_width         = util_is_power_of_two(texture->width0);
+   state->pot_height        = util_is_power_of_two(texture->height0);
+   state->pot_depth         = util_is_power_of_two(texture->depth0);
 
    state->wrap_s            = sampler->wrap_s;
    state->wrap_t            = sampler->wrap_t;
@@ -124,6 +124,52 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
 
 
 /**
+ * Compute the partial offset of a pixel block along an arbitrary axis.
+ *
+ * @param bld           build context supplying the builder and vector type
+ * @param block_length  number of pixels in a pixel block along the axis
+ * @param coord         coordinate in pixels
+ * @param stride        number of bytes between successive pixel blocks
+ *                      along the coordinate axis
+ * @param out_offset    resulting relative offset of the pixel block in bytes
+ * @param out_subcoord  resulting sub-block pixel coordinate
+ */
+void
+lp_build_sample_partial_offset(struct lp_build_context *bld,
+                               unsigned block_length,
+                               LLVMValueRef coord,
+                               LLVMValueRef stride,
+                               LLVMValueRef *out_offset,
+                               LLVMValueRef *out_subcoord)
+{
+   LLVMValueRef offset;
+   LLVMValueRef subcoord;
+
+   if (block_length == 1) {
+      subcoord = bld->zero; /* one pixel per block: no position inside the block */
+   }
+   else {
+      /*
+       * Pixel blocks have power of two dimensions. LLVM should convert the
+       * rem/div to bit arithmetic.
+       * TODO: Verify this.
+       */
+
+      LLVMValueRef block_width = lp_build_const_int_vec(bld->type, block_length);
+      subcoord = LLVMBuildURem(bld->builder, coord, block_width, "");
+      coord    = LLVMBuildUDiv(bld->builder, coord, block_width, "");
+   }
+   /* coord now counts whole blocks along the axis; scale it by the stride */
+   offset = lp_build_mul(bld, coord, stride);
+
+   assert(out_offset);
+   assert(out_subcoord);
+
+   *out_offset = offset;
+   *out_subcoord = subcoord;
+}
+
+
+/**
  * Compute the offset of a pixel block.
  *
  * x, y, z, y_stride, z_stride are vectors, and they refer to pixels.
@@ -144,48 +190,35 @@ lp_build_sample_offset(struct lp_build_context *bld,
 {
    LLVMValueRef x_stride;
    LLVMValueRef offset;
-   LLVMValueRef i;
-   LLVMValueRef j;
-
-   /*
-    * Describe the coordinates in terms of pixel blocks.
-    *
-    * TODO: pixel blocks are power of two. LLVM should convert rem/div to
-    * bit arithmetic. Verify this.
-    */
-
-   if (format_desc->block.width == 1) {
-      i = bld->zero;
-   }
-   else {
-      LLVMValueRef block_width = lp_build_const_int_vec(bld->type, format_desc->block.width);
-      i = LLVMBuildURem(bld->builder, x, block_width, "");
-      x = LLVMBuildUDiv(bld->builder, x, block_width, "");
-   }
-
-   if (format_desc->block.height == 1) {
-      j = bld->zero;
-   }
-   else {
-      LLVMValueRef block_height = lp_build_const_int_vec(bld->type, format_desc->block.height);
-      j = LLVMBuildURem(bld->builder, y, block_height, "");
-      y = LLVMBuildUDiv(bld->builder, y, block_height, "");
-   }
 
    x_stride = lp_build_const_vec(bld->type, format_desc->block.bits/8);
-   offset = lp_build_mul(bld, x, x_stride);
+
+   lp_build_sample_partial_offset(bld,
+                                  format_desc->block.width,
+                                  x, x_stride,
+                                  &offset, out_i);
 
    if (y && y_stride) {
-      LLVMValueRef y_offset = lp_build_mul(bld, y, y_stride);
+      LLVMValueRef y_offset;
+      lp_build_sample_partial_offset(bld,
+                                     format_desc->block.height,
+                                     y, y_stride,
+                                     &y_offset, out_j);
       offset = lp_build_add(bld, offset, y_offset);
    }
+   else {
+      *out_j = bld->zero;
+   }
 
    if (z && z_stride) {
-      LLVMValueRef z_offset = lp_build_mul(bld, z, z_stride);
+      LLVMValueRef z_offset;
+      LLVMValueRef k;
+      lp_build_sample_partial_offset(bld,
+                                     1, /* pixel blocks are always 2D */
+                                     z, z_stride,
+                                     &z_offset, &k);
       offset = lp_build_add(bld, offset, z_offset);
    }
 
    *out_offset = offset;
-   *out_i = i;
-   *out_j = j;
 }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
index 5b8f478094..caafc4eca0 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
@@ -36,6 +36,8 @@
 #define LP_BLD_SAMPLE_H
 
 
+#include "pipe/p_format.h"
+
 #include "gallivm/lp_bld.h"
 
 struct pipe_resource;
@@ -147,6 +149,15 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
 
 
 void
+lp_build_sample_partial_offset(struct lp_build_context *bld,
+                               unsigned block_length,
+                               LLVMValueRef coord,
+                               LLVMValueRef stride,
+                               LLVMValueRef *out_offset,   /* block offset in bytes */
+                               LLVMValueRef *out_subcoord); /* sub-block pixel coordinate */
+
+
+void
 lp_build_sample_offset(struct lp_build_context *bld,
                        const struct util_format_description *format_desc,
                        LLVMValueRef x,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index 806c7d56a8..1f39d9c98b 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -176,6 +176,7 @@ texture_dims(enum pipe_texture_target tex)
    case PIPE_TEXTURE_1D:
       return 1;
    case PIPE_TEXTURE_2D:
+   case PIPE_TEXTURE_RECT:
    case PIPE_TEXTURE_CUBE:
       return 2;
    case PIPE_TEXTURE_3D:
@@ -322,59 +323,6 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
 
 
 /**
- * Fetch the texels as <4n x i8> in AoS form.
- */
-static LLVMValueRef
-lp_build_sample_packed(struct lp_build_sample_context *bld,
-                       LLVMValueRef x,
-                       LLVMValueRef y,
-                       LLVMValueRef y_stride,
-                       LLVMValueRef data_array)
-{
-   LLVMValueRef offset, i, j;
-   LLVMValueRef data_ptr;
-   LLVMValueRef res;
-
-   /* convert x,y,z coords to linear offset from start of texture, in bytes */
-   lp_build_sample_offset(&bld->uint_coord_bld,
-                          bld->format_desc,
-                          x, y, NULL, y_stride, NULL,
-                          &offset, &i, &j);
-
-   /* get pointer to mipmap level 0 data */
-   data_ptr = lp_build_get_const_mipmap_level(bld, data_array, 0);
-
-   if (util_format_is_rgba8_variant(bld->format_desc)) {
-      /* Just fetch the data directly without swizzling */
-      assert(bld->format_desc->block.width == 1);
-      assert(bld->format_desc->block.height == 1);
-      assert(bld->format_desc->block.bits <= bld->texel_type.width);
-
-      res = lp_build_gather(bld->builder,
-                            bld->texel_type.length,
-                            bld->format_desc->block.bits,
-                            bld->texel_type.width,
-                            data_ptr, offset);
-   }
-   else {
-      struct lp_type type;
-
-      assert(bld->texel_type.width == 32);
-
-      memset(&type, 0, sizeof type);
-      type.width = 8;
-      type.length = bld->texel_type.length*4;
-      type.norm = TRUE;
-
-      res = lp_build_fetch_rgba_aos(bld->builder, bld->format_desc, type,
-                                    data_ptr, offset, i, j);
-   }
-
-   return res;
-}
-
-
-/**
  * Helper to compute the mirror function for the PIPE_WRAP_MIRROR modes.
  */
 static LLVMValueRef
@@ -408,7 +356,7 @@ lp_build_coord_mirror(struct lp_build_sample_context *bld,
 
 
 /**
- * We only support a few wrap modes in lp_build_sample_wrap_int() at this time.
+ * We only support a few wrap modes in lp_build_sample_wrap_linear_int() at this time.
  * Return whether the given mode is supported by that function.
  */
 static boolean
@@ -430,13 +378,18 @@ is_simple_wrap_mode(unsigned mode)
  * \param length  the texture size along one dimension
  * \param is_pot  if TRUE, length is a power of two
  * \param wrap_mode  one of PIPE_TEX_WRAP_x
+ * \param out_i  resulting sub-block pixel coordinate for coord
  */
-static LLVMValueRef
-lp_build_sample_wrap_int(struct lp_build_sample_context *bld,
-                         LLVMValueRef coord,
-                         LLVMValueRef length,
-                         boolean is_pot,
-                         unsigned wrap_mode)
+static void
+lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld,
+                                 unsigned block_length,
+                                 LLVMValueRef coord,
+                                 LLVMValueRef length,
+                                 LLVMValueRef stride,
+                                 boolean is_pot,
+                                 unsigned wrap_mode,
+                                 LLVMValueRef *out_offset,
+                                 LLVMValueRef *out_i)
 {
    struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld;
    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
@@ -469,7 +422,134 @@ lp_build_sample_wrap_int(struct lp_build_sample_context *bld,
       assert(0);
    }
 
-   return coord;
+   lp_build_sample_partial_offset(uint_coord_bld, block_length, coord, stride,
+                                  out_offset, out_i);
+}
+
+
+/**
+ * Build LLVM code applying a texture wrap mode to the two neighbouring
+ * integer texcoords coord0 and coord0 + 1, as used by linear filtering.
+ * \param block_length  length of the pixel block along the coordinate axis
+ * \param coord0  the incoming texcoord (s,t,r or q) scaled to the texture size
+ * \param length  the texture size along one dimension
+ * \param stride  stride between successive pixel blocks along the axis
+ * \param is_pot  if TRUE, length is a power of two
+ * \param wrap_mode  one of PIPE_TEX_WRAP_x
+ * \param offset0  resulting relative offset for coord0
+ * \param offset1  resulting relative offset for coord0 + 1
+ * \param i0  resulting sub-block pixel coordinate for coord0
+ * \param i1  resulting sub-block pixel coordinate for coord0 + 1
+ */
+static void
+lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
+                                unsigned block_length,
+                                LLVMValueRef coord0,
+                                LLVMValueRef length,
+                                LLVMValueRef stride,
+                                boolean is_pot,
+                                unsigned wrap_mode,
+                                LLVMValueRef *offset0,
+                                LLVMValueRef *offset1,
+                                LLVMValueRef *i0,
+                                LLVMValueRef *i1)
+{
+   struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld;
+   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
+   LLVMValueRef length_minus_one;
+   LLVMValueRef lmask, umask, mask;
+
+   if (block_length != 1) {
+      /*
+       * If the pixel block covers more than one pixel then there is no easy
+       * way to calculate offset1 relative to offset0. Instead, compute them
+       * independently.
+       */
+
+      LLVMValueRef coord1;
+
+      lp_build_sample_wrap_nearest_int(bld,
+                                       block_length,
+                                       coord0,
+                                       length,
+                                       stride,
+                                       is_pot,
+                                       wrap_mode,
+                                       offset0, i0);
+
+      coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+
+      lp_build_sample_wrap_nearest_int(bld,
+                                       block_length,
+                                       coord1,
+                                       length,
+                                       stride,
+                                       is_pot,
+                                       wrap_mode,
+                                       offset1, i1);
+
+      return;
+   }
+
+   /*
+    * Scalar pixels -- try to compute offset0 and offset1 with a single stride
+    * multiplication.
+    */
+
+   *i0 = uint_coord_bld->zero;
+   *i1 = uint_coord_bld->zero;
+
+   length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
+
+   switch(wrap_mode) {
+   case PIPE_TEX_WRAP_REPEAT:
+      if (is_pot) {
+         coord0 = LLVMBuildAnd(bld->builder, coord0, length_minus_one, "");
+      }
+      else {
+         /* Signed remainder won't give the right results for negative
+          * dividends but unsigned remainder does.*/
+         coord0 = LLVMBuildURem(bld->builder, coord0, length, "");
+      }
+      /* mask is set except at the last texel, where coord0 + 1 wraps to 0 */
+      mask = lp_build_compare(bld->builder, int_coord_bld->type,
+                              PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
+      /* offset1 is offset0 + stride, forced to 0 by the mask on wrap-around */
+      *offset0 = lp_build_mul(uint_coord_bld, coord0, stride);
+      *offset1 = LLVMBuildAnd(bld->builder,
+                              lp_build_add(uint_coord_bld, *offset0, stride),
+                              mask, "");
+      break;
+
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+      lmask = lp_build_compare(int_coord_bld->builder, int_coord_bld->type,
+                               PIPE_FUNC_GEQUAL, coord0, int_coord_bld->zero);
+      umask = lp_build_compare(int_coord_bld->builder, int_coord_bld->type,
+                               PIPE_FUNC_LESS, coord0, length_minus_one);
+      /* clamp coord0 to [0, length - 1] using the two comparison masks */
+      coord0 = lp_build_select(int_coord_bld, lmask, coord0, int_coord_bld->zero);
+      coord0 = lp_build_select(int_coord_bld, umask, coord0, length_minus_one);
+      /* coord0 + 1 is a different texel only when 0 <= coord0 < length - 1 */
+      mask = LLVMBuildAnd(bld->builder, lmask, umask, "");
+      /* advance by one stride where mask is set; otherwise offset1 == offset0 */
+      *offset0 = lp_build_mul(uint_coord_bld, coord0, stride);
+      *offset1 = lp_build_add(uint_coord_bld,
+                              *offset0,
+                              LLVMBuildAnd(bld->builder, stride, mask, ""));
+      break;
+
+   case PIPE_TEX_WRAP_CLAMP:
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+   case PIPE_TEX_WRAP_MIRROR_REPEAT:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+   default:
+      assert(0); /* wrap mode not handled by this scalar-pixel path */
+      *offset0 = uint_coord_bld->zero;
+      *offset1 = uint_coord_bld->zero;
+      break;
+   }
 }
 
 
@@ -1740,16 +1820,21 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
    LLVMValueRef i32_c8, i32_c128, i32_c255;
    LLVMValueRef s_ipart, s_fpart, s_fpart_lo, s_fpart_hi;
    LLVMValueRef t_ipart, t_fpart, t_fpart_lo, t_fpart_hi;
-   LLVMValueRef x0, x1;
-   LLVMValueRef y0, y1;
-   LLVMValueRef neighbors[2][2];
+   LLVMValueRef data_ptr;
+   LLVMValueRef x_stride, y_stride;
+   LLVMValueRef x_offset0, x_offset1;
+   LLVMValueRef y_offset0, y_offset1;
+   LLVMValueRef offset[2][2];
+   LLVMValueRef x_subcoord[2], y_subcoord[2];
    LLVMValueRef neighbors_lo[2][2];
    LLVMValueRef neighbors_hi[2][2];
    LLVMValueRef packed, packed_lo, packed_hi;
    LLVMValueRef unswizzled[4];
-   LLVMValueRef stride;
+   const unsigned level = 0;
+   unsigned i, j;
 
-   assert(bld->static_state->target == PIPE_TEXTURE_2D);
+   assert(bld->static_state->target == PIPE_TEXTURE_2D
+         || bld->static_state->target == PIPE_TEXTURE_RECT);
    assert(bld->static_state->min_img_filter == PIPE_TEX_FILTER_LINEAR);
    assert(bld->static_state->mag_img_filter == PIPE_TEX_FILTER_LINEAR);
    assert(bld->static_state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE);
@@ -1793,21 +1878,30 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
    s_fpart = LLVMBuildAnd(builder, s, i32_c255, "");
    t_fpart = LLVMBuildAnd(builder, t, i32_c255, "");
 
-   x0 = s_ipart;
-   y0 = t_ipart;
-
-   x1 = lp_build_add(&bld->int_coord_bld, x0, bld->int_coord_bld.one);
-   y1 = lp_build_add(&bld->int_coord_bld, y0, bld->int_coord_bld.one);
-
-   x0 = lp_build_sample_wrap_int(bld, x0, width,  bld->static_state->pot_width,
-                                 bld->static_state->wrap_s);
-   y0 = lp_build_sample_wrap_int(bld, y0, height, bld->static_state->pot_height,
-                                 bld->static_state->wrap_t);
-
-   x1 = lp_build_sample_wrap_int(bld, x1, width,  bld->static_state->pot_width,
-                                 bld->static_state->wrap_s);
-   y1 = lp_build_sample_wrap_int(bld, y1, height, bld->static_state->pot_height,
-                                 bld->static_state->wrap_t);
+   x_stride = lp_build_const_vec(bld->uint_coord_bld.type,
+                                 bld->format_desc->block.bits/8);
+
+   y_stride = lp_build_get_const_level_stride_vec(bld, stride_array, level);
+
+   lp_build_sample_wrap_linear_int(bld,
+                                   bld->format_desc->block.width,
+                                   s_ipart, width, x_stride,
+                                   bld->static_state->pot_width,
+                                   bld->static_state->wrap_s,
+                                   &x_offset0, &x_offset1,
+                                   &x_subcoord[0], &x_subcoord[1]);
+   lp_build_sample_wrap_linear_int(bld,
+                                   bld->format_desc->block.height,
+                                   t_ipart, height, y_stride,
+                                   bld->static_state->pot_height,
+                                   bld->static_state->wrap_t,
+                                   &y_offset0, &y_offset1,
+                                   &y_subcoord[0], &y_subcoord[1]);
+
+   offset[0][0] = lp_build_add(&bld->uint_coord_bld, x_offset0, y_offset0);
+   offset[0][1] = lp_build_add(&bld->uint_coord_bld, x_offset1, y_offset0);
+   offset[1][0] = lp_build_add(&bld->uint_coord_bld, x_offset0, y_offset1);
+   offset[1][1] = lp_build_add(&bld->uint_coord_bld, x_offset1, y_offset1);
 
    /*
     * Transform 4 x i32 in
@@ -1836,7 +1930,6 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
       LLVMValueRef shuffles_hi[LP_MAX_VECTOR_LENGTH];
       LLVMValueRef shuffle_lo;
       LLVMValueRef shuffle_hi;
-      unsigned i, j;
 
       for(j = 0; j < h16.type.length; j += 4) {
 #ifdef PIPE_ARCH_LITTLE_ENDIAN
@@ -1864,7 +1957,10 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
       t_fpart_hi = LLVMBuildShuffleVector(builder, t_fpart, h16.undef, shuffle_hi, "");
    }
 
-   stride = lp_build_get_const_level_stride_vec(bld, stride_array, 0);
+   /*
+    * get pointer to mipmap level 0 data
+    */
+   data_ptr = lp_build_get_const_mipmap_level(bld, data_array, level);
 
    /*
     * Fetch the pixels as 4 x 32bit (rgba order might differ):
@@ -1883,20 +1979,38 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
     * The higher 8 bits of the resulting elements will be zero.
     */
 
-   neighbors[0][0] = lp_build_sample_packed(bld, x0, y0, stride, data_array);
-   neighbors[0][1] = lp_build_sample_packed(bld, x1, y0, stride, data_array);
-   neighbors[1][0] = lp_build_sample_packed(bld, x0, y1, stride, data_array);
-   neighbors[1][1] = lp_build_sample_packed(bld, x1, y1, stride, data_array);
+   for (j = 0; j < 2; ++j) {
+      for (i = 0; i < 2; ++i) {
+         LLVMValueRef rgba8;
 
-   neighbors[0][0] = LLVMBuildBitCast(builder, neighbors[0][0], u8n_vec_type, "");
-   neighbors[0][1] = LLVMBuildBitCast(builder, neighbors[0][1], u8n_vec_type, "");
-   neighbors[1][0] = LLVMBuildBitCast(builder, neighbors[1][0], u8n_vec_type, "");
-   neighbors[1][1] = LLVMBuildBitCast(builder, neighbors[1][1], u8n_vec_type, "");
+         if (util_format_is_rgba8_variant(bld->format_desc)) {
+            /*
+             * Given the format is a rgba8, just read the pixels as is,
+             * without any swizzling. Swizzling will be done later.
+             */
+            rgba8 = lp_build_gather(bld->builder,
+                                    bld->texel_type.length,
+                                    bld->format_desc->block.bits,
+                                    bld->texel_type.width,
+                                    data_ptr, offset[j][i]);
 
-   lp_build_unpack2(builder, u8n.type, h16.type, neighbors[0][0], &neighbors_lo[0][0], &neighbors_hi[0][0]);
-   lp_build_unpack2(builder, u8n.type, h16.type, neighbors[0][1], &neighbors_lo[0][1], &neighbors_hi[0][1]);
-   lp_build_unpack2(builder, u8n.type, h16.type, neighbors[1][0], &neighbors_lo[1][0], &neighbors_hi[1][0]);
-   lp_build_unpack2(builder, u8n.type, h16.type, neighbors[1][1], &neighbors_lo[1][1], &neighbors_hi[1][1]);
+            rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
+
+         }
+         else {
+            rgba8 = lp_build_fetch_rgba_aos(bld->builder,
+                                            bld->format_desc,
+                                            u8n.type,
+                                            data_ptr, offset[j][i],
+                                            x_subcoord[i],
+                                            y_subcoord[j]);
+         }
+
+         lp_build_unpack2(builder, u8n.type, h16.type,
+                          rgba8,
+                          &neighbors_lo[j][i], &neighbors_hi[j][i]);
+      }
+   }
 
    /*
     * Linear interpolate with 8.8 fixed point.
@@ -2077,7 +2191,8 @@ lp_build_sample_soa(LLVMBuilderRef builder,
    }
    else if (util_format_fits_8unorm(bld.format_desc) &&
             bld.format_desc->nr_channels > 1 &&
-            static_state->target == PIPE_TEXTURE_2D &&
+            (static_state->target == PIPE_TEXTURE_2D ||
+                  static_state->target == PIPE_TEXTURE_RECT) &&
             static_state->min_img_filter == PIPE_TEX_FILTER_LINEAR &&
             static_state->mag_img_filter == PIPE_TEX_FILTER_LINEAR &&
             static_state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE &&
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index 0aa64affac..0e07f7f3f3 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -200,8 +200,10 @@ static void lp_exec_mask_cond_push(struct lp_exec_mask *mask,
    }
    mask->cond_stack[mask->cond_stack_size++] = mask->cond_mask;
    assert(LLVMTypeOf(val) == mask->int_vec_type);
-   mask->cond_mask = val;
-
+   mask->cond_mask = LLVMBuildAnd(mask->bld->builder,
+                                  mask->cond_mask,
+                                  val,
+                                  "");
    lp_exec_mask_update(mask);
 }
 
@@ -802,7 +804,7 @@ emit_store(
 
    case TGSI_FILE_PREDICATE:
       lp_exec_mask_store(&bld->exec_mask, pred, value,
-                         bld->preds[index][chan_index]);
+                         bld->preds[reg->Register.Index][chan_index]);
       break;
 
    default:
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_type.h b/src/gallium/auxiliary/gallivm/lp_bld_type.h
index 3ffe916f8e..fec1d3dfbc 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_type.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_type.h
@@ -128,16 +128,16 @@ struct lp_build_context
     */
    struct lp_type type;
 
-   /** Same as lp_build_undef(type) */
+   /** Same as lp_build_elem_type(type) */
    LLVMTypeRef elem_type;
 
-   /** Same as lp_build_undef(type) */
+   /** Same as lp_build_vec_type(type) */
    LLVMTypeRef vec_type;
 
-   /** Same as lp_build_undef(type) */
+   /** Same as lp_build_int_elem_type(type) */
    LLVMTypeRef int_elem_type;
 
-   /** Same as lp_build_undef(type) */
+   /** Same as lp_build_int_vec_type(type) */
    LLVMTypeRef int_vec_type;
 
    /** Same as lp_build_undef(type) */
diff --git a/src/gallium/auxiliary/os/os_stream.c b/src/gallium/auxiliary/os/os_stream.c
new file mode 100644
index 0000000000..3c55fc00d9
--- /dev/null
+++ b/src/gallium/auxiliary/os/os_stream.c
@@ -0,0 +1,58 @@
+/**************************************************************************
+ *
+ * Copyright 2010 Luca Barbieri
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "pipe/p_config.h"
+
+#include "os_stream.h"
+#include "util/u_memory.h"
+#include "util/u_string.h"
+/** Fallback vprintf: formats into memory, then passes the bytes to stream->write(). */
+int
+os_default_stream_vprintf (struct os_stream* stream, const char *format, va_list ap)
+{
+   char buf[1024]; /* the first attempt formats into this fixed-size stack buffer */
+   int retval;
+   va_list ap2;
+   va_copy(ap2, ap); /* format from a copy so ap is still untouched for the retry */
+   retval = util_vsnprintf(buf, sizeof(buf), format, ap2);
+   va_end(ap2);
+   if(retval <= 0)
+   {} /* nothing to write; retval is returned unchanged */
+   else if(retval < sizeof(buf))
+      stream->write(stream, buf, retval);
+   else
+   {
+      char* str = MALLOC(retval + 1); /* did not fit in buf: retry with retval + 1 bytes */
+      if(!str)
+         return -1; /* allocation failed; nothing was written */
+      retval = util_vsnprintf(str, retval + 1, format, ap);
+      if(retval > 0)
+         stream->write(stream, str, retval);
+      FREE(str);
+   }
+
+   return retval;
+}
diff --git a/src/gallium/auxiliary/os/os_stream.h b/src/gallium/auxiliary/os/os_stream.h
index 693a0621e2..6c6050bb02 100644
--- a/src/gallium/auxiliary/os/os_stream.h
+++ b/src/gallium/auxiliary/os/os_stream.h
@@ -50,6 +50,9 @@ struct os_stream
 
    void
    (*flush)(struct os_stream *stream);
+
+   int
+   (*vprintf)(struct os_stream *stream, const char* format, va_list ap);
 };
 
 
@@ -90,6 +93,27 @@ os_stream_flush(struct os_stream *stream)
    stream->flush(stream);
 }
 
+int
+os_default_stream_vprintf (struct os_stream* stream, const char *format, va_list ap);
+/** Format into the stream by calling its vprintf function pointer. */
+static INLINE int
+os_stream_vprintf (struct os_stream* stream, const char *format, va_list ap)
+{
+   return stream->vprintf(stream, format, ap);
+}
+/** printf-style wrapper: collects the varargs and calls stream->vprintf(). */
+static INLINE int
+os_stream_printf (struct os_stream* stream, const char *format, ...)
+{
+   int retval;
+   va_list args;
+
+   va_start (args, format);
+   retval = stream->vprintf(stream, format, args);
+   va_end (args);
+
+   return retval; /* whatever the stream's vprintf returned */
+}
 
 struct os_stream *
 os_file_stream_create(const char *filename);
@@ -118,5 +142,4 @@ os_str_stream_get_and_close(struct os_stream *stream);
 #define os_file_stream_create(_filename) os_null_stream_create()
 #endif
 
-
 #endif /* _OS_STREAM_H_ */
diff --git a/src/gallium/auxiliary/os/os_stream_log.c b/src/gallium/auxiliary/os/os_stream_log.c
index 7cc2028a22..b01377c346 100644
--- a/src/gallium/auxiliary/os/os_stream_log.c
+++ b/src/gallium/auxiliary/os/os_stream_log.c
@@ -73,7 +73,8 @@ static struct os_stream
 os_log_stream_struct = {
    &os_log_stream_close,
    &os_log_stream_write,
-   &os_log_stream_flush
+   &os_log_stream_flush,
+   &os_default_stream_vprintf,
 };
 
 
diff --git a/src/gallium/auxiliary/os/os_stream_null.c b/src/gallium/auxiliary/os/os_stream_null.c
index 128c4e8f0e..a549a789e6 100644
--- a/src/gallium/auxiliary/os/os_stream_null.c
+++ b/src/gallium/auxiliary/os/os_stream_null.c
@@ -56,12 +56,18 @@ os_null_stream_flush(struct os_stream *stream)
    (void)stream;
 }
 
+static int
+os_null_stream_vprintf (struct os_stream* stream, const char *format, va_list ap)
+{
+   return 0; /* null stream: all arguments are ignored and 0 is returned */
+}
 
 static struct os_stream
 os_null_stream = {
    &os_null_stream_close,
    &os_null_stream_write,
-   &os_null_stream_flush
+   &os_null_stream_flush,
+   &os_null_stream_vprintf
 };
 
 
diff --git a/src/gallium/auxiliary/os/os_stream_stdc.c b/src/gallium/auxiliary/os/os_stream_stdc.c
index 9e7ed71107..37e7d063e2 100644
--- a/src/gallium/auxiliary/os/os_stream_stdc.c
+++ b/src/gallium/auxiliary/os/os_stream_stdc.c
@@ -83,6 +83,14 @@ os_stdc_stream_flush(struct os_stream *_stream)
    fflush(stream->file);
 }
 
+static int
+os_stdc_stream_vprintf (struct os_stream* _stream, const char *format, va_list ap)
+{
+   struct os_stdc_stream *stream = os_stdc_stream(_stream);
+   /* format directly into the stream's file with the C library's vfprintf */
+   return vfprintf(stream->file, format, ap);
+}
+
 
 struct os_stream *
 os_file_stream_create(const char *filename)
@@ -96,6 +104,7 @@ os_file_stream_create(const char *filename)
    stream->base.close = &os_stdc_stream_close;
    stream->base.write = &os_stdc_stream_write;
    stream->base.flush = &os_stdc_stream_flush;
+   stream->base.vprintf = &os_stdc_stream_vprintf;
 
    stream->file = fopen(filename, "w");
    if(!stream->file)
diff --git a/src/gallium/auxiliary/os/os_stream_str.c b/src/gallium/auxiliary/os/os_stream_str.c
index b5c7270d2a..be9478b2a1 100644
--- a/src/gallium/auxiliary/os/os_stream_str.c
+++ b/src/gallium/auxiliary/os/os_stream_str.c
@@ -118,6 +118,7 @@ os_str_stream_create(size_t size)
    stream->base.close = &os_str_stream_close;
    stream->base.write = &os_str_stream_write;
    stream->base.flush = &os_str_stream_flush;
+   stream->base.vprintf = &os_default_stream_vprintf;
 
    stream->str = os_malloc(size);
    if(!stream->str)
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h b/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h
index cec2524da2..2ef02160f2 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h
@@ -50,8 +50,7 @@
 #define PB_BUFMGR_H_
 
 
-#include "pipe/p_compiler.h"
-#include "pipe/p_defines.h"
+#include "pb_buffer.h"
 
 
 #ifdef __cplusplus
diff --git a/src/gallium/auxiliary/rtasm/rtasm_cpu.c b/src/gallium/auxiliary/rtasm/rtasm_cpu.c
index 2e15751e50..0461c81550 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_cpu.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_cpu.c
@@ -30,7 +30,7 @@
 #include "rtasm_cpu.h"
 
 
-#if defined(PIPE_ARCH_X86)
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
 static boolean rtasm_sse_enabled(void)
 {
    static boolean firsttime = 1;
@@ -49,7 +49,7 @@ static boolean rtasm_sse_enabled(void)
 int rtasm_cpu_has_sse(void)
 {
    /* FIXME: actually detect this at run-time */
-#if defined(PIPE_ARCH_X86)
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
    return rtasm_sse_enabled();
 #else
    return 0;
@@ -59,7 +59,7 @@ int rtasm_cpu_has_sse(void)
 int rtasm_cpu_has_sse2(void) 
 {
    /* FIXME: actually detect this at run-time */
-#if defined(PIPE_ARCH_X86)
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
    return rtasm_sse_enabled();
 #else
    return 0;
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index 9f70b73698..75b0f6a68e 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -22,8 +22,9 @@
  **************************************************************************/
 
 #include "pipe/p_config.h"
+#include "util/u_cpu_detect.h"
 
-#if defined(PIPE_ARCH_X86)
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
 
 #include "pipe/p_compiler.h"
 #include "util/u_debug.h"
@@ -231,6 +232,10 @@ static void emit_modrm( struct x86_function *p,
    
    assert(reg.mod == mod_REG);
    
+   /* TODO: support extended x86-64 registers */
+   assert(reg.idx < 8);
+   assert(regmem.idx < 8);
+
    val |= regmem.mod << 6;     	/* mod field */
    val |= reg.idx << 3;		/* reg field */
    val |= regmem.idx;		/* r/m field */
@@ -363,6 +368,12 @@ int x86_get_label( struct x86_function *p )
  */
 
 
+void x64_rexw(struct x86_function *p)
+{
+   if(x86_target(p) != X86_32)
+      emit_1ub(p, 0x48);
+}
+
 void x86_jcc( struct x86_function *p,
 	      enum x86_cc cc,
 	      int label )
@@ -449,6 +460,52 @@ void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm )
    emit_1i(p, imm);
 }
 
+void x86_mov_imm( struct x86_function *p, struct x86_reg dst, int imm )
+{
+   DUMP_RI( dst, imm );
+   if(dst.mod == mod_REG)
+      x86_mov_reg_imm(p, dst, imm);
+   else
+   {
+      emit_1ub(p, 0xc7);
+      emit_modrm_noreg(p, 0, dst);
+      emit_1i(p, imm);
+   }
+}
+
+void x86_mov16_imm( struct x86_function *p, struct x86_reg dst, uint16_t imm )
+{
+   DUMP_RI( dst, imm );
+   emit_1ub(p, 0x66);
+   if(dst.mod == mod_REG)
+   {
+      emit_1ub(p, 0xb8 + dst.idx);
+      emit_2ub(p, imm & 0xff, imm >> 8);
+   }
+   else
+   {
+      emit_1ub(p, 0xc7);
+      emit_modrm_noreg(p, 0, dst);
+      emit_2ub(p, imm & 0xff, imm >> 8);
+   }
+}
+
+void x86_mov8_imm( struct x86_function *p, struct x86_reg dst, uint8_t imm )
+{
+   DUMP_RI( dst, imm );
+   if(dst.mod == mod_REG)
+   {
+      emit_1ub(p, 0xb0 + dst.idx);
+      emit_1ub(p, imm);
+   }
+   else
+   {
+      emit_1ub(p, 0xc6);
+      emit_modrm_noreg(p, 0, dst);
+      emit_1ub(p, imm);
+   }
+}
+
 /**
  * Immediate group 1 instructions.
  */
@@ -520,7 +577,7 @@ void x86_push( struct x86_function *p,
    }
 
 
-   p->stack_offset += 4;
+   p->stack_offset += sizeof(void*);
 }
 
 void x86_push_imm32( struct x86_function *p,
@@ -530,7 +587,7 @@ void x86_push_imm32( struct x86_function *p,
    emit_1ub(p, 0x68);
    emit_1i(p,  imm32);
 
-   p->stack_offset += 4;
+   p->stack_offset += sizeof(void*);
 }
 
 
@@ -540,23 +597,33 @@ void x86_pop( struct x86_function *p,
    DUMP_R( reg );
    assert(reg.mod == mod_REG);
    emit_1ub(p, 0x58 + reg.idx);
-   p->stack_offset -= 4;
+   p->stack_offset -= sizeof(void*);
 }
 
 void x86_inc( struct x86_function *p,
 	      struct x86_reg reg )
 {
    DUMP_R( reg );
-   assert(reg.mod == mod_REG);
-   emit_1ub(p, 0x40 + reg.idx);
+   if(x86_target(p) == X86_32 && reg.mod == mod_REG)
+   {
+      emit_1ub(p, 0x40 + reg.idx);
+      return;
+   }
+   emit_1ub(p, 0xff);
+   emit_modrm_noreg(p, 0, reg);
 }
 
 void x86_dec( struct x86_function *p,
 	      struct x86_reg reg )
 {
    DUMP_R( reg );
-   assert(reg.mod == mod_REG);
-   emit_1ub(p, 0x48 + reg.idx);
+   if(x86_target(p) == X86_32 && reg.mod == mod_REG)
+   {
+      emit_1ub(p, 0x48 + reg.idx);
+      return;
+   }
+   emit_1ub(p, 0xff);
+   emit_modrm_noreg(p, 1, reg);
 }
 
 void x86_ret( struct x86_function *p )
@@ -583,9 +650,82 @@ void x86_mov( struct x86_function *p,
 	      struct x86_reg src )
 {
    DUMP_RR( dst, src );
+   /* special hack for reading arguments until we support x86-64 registers everywhere */
+   if(src.mod == mod_REG && dst.mod == mod_REG && (src.idx >= 8 || dst.idx >= 8))
+   {
+      uint8_t rex = 0x40;
+      if(dst.idx >= 8)
+      {
+         rex |= 4;
+         dst.idx -= 8;
+      }
+      if(src.idx >= 8)
+      {
+         rex |= 1;
+         src.idx -= 8;
+      }
+      emit_1ub(p, rex);
+   }
+   emit_op_modrm( p, 0x8b, 0x89, dst, src );
+}
+
+void x86_mov16( struct x86_function *p,
+	      struct x86_reg dst,
+	      struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   emit_1ub(p, 0x66);
+   emit_op_modrm( p, 0x8b, 0x89, dst, src );
+}
+
+void x86_mov8( struct x86_function *p,
+	      struct x86_reg dst,
+	      struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   emit_op_modrm( p, 0x8a, 0x88, dst, src );
+}
+
+void x64_mov64( struct x86_function *p,
+	      struct x86_reg dst,
+	      struct x86_reg src )
+{
+   uint8_t rex = 0x48;
+   DUMP_RR( dst, src );
+   assert(x86_target(p) != X86_32);
+
+   /* special hack for reading arguments until we support x86-64 registers everywhere */
+   if(src.mod == mod_REG && dst.mod == mod_REG && (src.idx >= 8 || dst.idx >= 8))
+   {
+      if(dst.idx >= 8)
+      {
+         rex |= 4;
+         dst.idx -= 8;
+      }
+      if(src.idx >= 8)
+      {
+         rex |= 1;
+         src.idx -= 8;
+      }
+   }
+   emit_1ub(p, rex);
    emit_op_modrm( p, 0x8b, 0x89, dst, src );
 }
 
+void x86_movzx8(struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   emit_2ub(p, 0x0f, 0xb6);
+   emit_modrm(p, dst, src);
+}
+
+void x86_movzx16(struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   emit_2ub(p, 0x0f, 0xb7);
+   emit_modrm(p, dst, src);
+}
+
 void x86_xor( struct x86_function *p,
 	      struct x86_reg dst,
 	      struct x86_reg src )
@@ -680,6 +820,61 @@ void x86_div( struct x86_function *p,
    emit_op_modrm(p, 0xf7, 0, x86_make_reg(file_REG32, 6), src);
 }
 
+void x86_bswap( struct x86_function *p, struct x86_reg reg )
+{
+   DUMP_R(reg);
+   assert(reg.file == file_REG32);
+   assert(reg.mod == mod_REG);
+   emit_2ub(p, 0x0f, 0xc8 + reg.idx);
+}
+
+void x86_shr_imm( struct x86_function *p, struct x86_reg reg, unsigned imm )
+{
+   DUMP_RI(reg, imm);
+   if(imm == 1)
+   {
+      emit_1ub(p, 0xd1);
+      emit_modrm_noreg(p, 5, reg);
+   }
+   else
+   {
+      emit_1ub(p, 0xc1);
+      emit_modrm_noreg(p, 5, reg);
+      emit_1ub(p, imm);
+   }
+}
+
+void x86_sar_imm( struct x86_function *p, struct x86_reg reg, unsigned imm )
+{
+   DUMP_RI(reg, imm);
+   if(imm == 1)
+   {
+      emit_1ub(p, 0xd1);
+      emit_modrm_noreg(p, 7, reg);
+   }
+   else
+   {
+      emit_1ub(p, 0xc1);
+      emit_modrm_noreg(p, 7, reg);
+      emit_1ub(p, imm);
+   }
+}
+
+void x86_shl_imm( struct x86_function *p, struct x86_reg reg, unsigned imm  )
+{
+   DUMP_RI(reg, imm);
+   if(imm == 1)
+   {
+      emit_1ub(p, 0xd1);
+      emit_modrm_noreg(p, 4, reg);
+   }
+   else
+   {
+      emit_1ub(p, 0xc1);
+      emit_modrm_noreg(p, 4, reg);
+      emit_1ub(p, imm);
+   }
+}
 
 
 /***********************************************************************
@@ -1013,6 +1208,77 @@ void sse_movmskps( struct x86_function *p,
  * SSE2 instructions
  */
 
+void sse2_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR(dst, src);
+   emit_2ub(p, 0x66, 0x0f);
+   if(dst.mod == mod_REG && dst.file == file_REG32)
+   {
+      emit_1ub(p, 0x7e);
+      emit_modrm(p, src, dst);
+   }
+   else
+   {
+      emit_op_modrm(p, 0x6e, 0x7e, dst, src);
+   }
+}
+
+void sse2_movq( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR(dst, src);
+   switch (dst.mod) {
+   case mod_REG:
+      emit_3ub(p, 0xf3, 0x0f, 0x7e);
+      emit_modrm(p, dst, src);
+      break;
+   case mod_INDIRECT:
+   case mod_DISP32:
+   case mod_DISP8:
+      assert(src.mod == mod_REG);
+      emit_3ub(p, 0x66, 0x0f, 0xd6);
+      emit_modrm(p, src, dst);
+      break;
+   default:
+      assert(0);
+      break;
+   }
+}
+
+void sse2_movdqu( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR(dst, src);
+   emit_2ub(p, 0xf3, 0x0f);
+   emit_op_modrm(p, 0x6f, 0x7f, dst, src);
+}
+
+void sse2_movdqa( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR(dst, src);
+   emit_2ub(p, 0x66, 0x0f);
+   emit_op_modrm(p, 0x6f, 0x7f, dst, src);
+}
+
+void sse2_movsd( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR(dst, src);
+   emit_2ub(p, 0xf2, 0x0f);
+   emit_op_modrm(p, 0x10, 0x11, dst, src);
+}
+
+void sse2_movupd( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR(dst, src);
+   emit_2ub(p, 0x66, 0x0f);
+   emit_op_modrm(p, 0x10, 0x11, dst, src);
+}
+
+void sse2_movapd( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR(dst, src);
+   emit_2ub(p, 0x66, 0x0f);
+   emit_op_modrm(p, 0x28, 0x29, dst, src);
+}
+
 /**
  * Perform a reduced swizzle:
  */
@@ -1027,6 +1293,28 @@ void sse2_pshufd( struct x86_function *p,
    emit_1ub(p, shuf); 
 }
 
+void sse2_pshuflw( struct x86_function *p,
+                  struct x86_reg dst,
+                  struct x86_reg src,
+                  unsigned char shuf)
+{
+   DUMP_RRI( dst, src, shuf );
+   emit_3ub(p, 0xf2, X86_TWOB, 0x70);
+   emit_modrm(p, dst, src);
+   emit_1ub(p, shuf);
+}
+
+void sse2_pshufhw( struct x86_function *p,
+                  struct x86_reg dst,
+                  struct x86_reg src,
+                  unsigned char shuf)
+{
+   DUMP_RRI( dst, src, shuf );
+   emit_3ub(p, 0xf3, X86_TWOB, 0x70);
+   emit_modrm(p, dst, src);
+   emit_1ub(p, shuf);
+}
+
 void sse2_cvttps2dq( struct x86_function *p,
                      struct x86_reg dst,
                      struct x86_reg src )
@@ -1045,6 +1333,24 @@ void sse2_cvtps2dq( struct x86_function *p,
    emit_modrm( p, dst, src );
 }
 
+void sse2_cvtsd2ss( struct x86_function *p,
+                    struct x86_reg dst,
+                    struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   emit_3ub(p, 0xf2, 0x0f, 0x5a);
+   emit_modrm( p, dst, src );
+}
+
+void sse2_cvtpd2ps( struct x86_function *p,
+                    struct x86_reg dst,
+                    struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   emit_3ub(p, 0x66, 0x0f, 0x5a);
+   emit_modrm( p, dst, src );
+}
+
 void sse2_packssdw( struct x86_function *p,
 		    struct x86_reg dst,
 		    struct x86_reg src )
@@ -1081,6 +1387,97 @@ void sse2_punpcklbw( struct x86_function *p,
    emit_modrm( p, dst, src );
 }
 
+void sse2_punpcklwd( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   emit_3ub(p, 0x66, 0x0f, 0x61);
+   emit_modrm( p, dst, src );
+}
+
+void sse2_punpckldq( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   emit_3ub(p, 0x66, 0x0f, 0x62);
+   emit_modrm( p, dst, src );
+}
+
+void sse2_punpcklqdq( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   emit_3ub(p, 0x66, 0x0f, 0x6c);
+   emit_modrm( p, dst, src );
+}
+
+void sse2_psllw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm )
+{
+   DUMP_RI(dst, imm);
+   emit_3ub(p, 0x66, 0x0f, 0x71);
+   emit_modrm_noreg(p, 6, dst);
+   emit_1ub(p, imm);
+}
+
+void sse2_pslld_imm( struct x86_function *p, struct x86_reg dst, unsigned imm )
+{
+   DUMP_RI(dst, imm);
+   emit_3ub(p, 0x66, 0x0f, 0x72);
+   emit_modrm_noreg(p, 6, dst);
+   emit_1ub(p, imm);
+}
+
+void sse2_psllq_imm( struct x86_function *p, struct x86_reg dst, unsigned imm )
+{
+   DUMP_RI(dst, imm);
+   emit_3ub(p, 0x66, 0x0f, 0x73);
+   emit_modrm_noreg(p, 6, dst);
+   emit_1ub(p, imm);
+}
+
+void sse2_psrlw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm )
+{
+   DUMP_RI(dst, imm);
+   emit_3ub(p, 0x66, 0x0f, 0x71);
+   emit_modrm_noreg(p, 2, dst);
+   emit_1ub(p, imm);
+}
+
+void sse2_psrld_imm( struct x86_function *p, struct x86_reg dst, unsigned imm )
+{
+   DUMP_RI(dst, imm);
+   emit_3ub(p, 0x66, 0x0f, 0x72);
+   emit_modrm_noreg(p, 2, dst);
+   emit_1ub(p, imm);
+}
+
+void sse2_psrlq_imm( struct x86_function *p, struct x86_reg dst, unsigned imm )
+{
+   DUMP_RI(dst, imm);
+   emit_3ub(p, 0x66, 0x0f, 0x73);
+   emit_modrm_noreg(p, 2, dst);
+   emit_1ub(p, imm);
+}
+
+void sse2_psraw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm )
+{
+   DUMP_RI(dst, imm);
+   emit_3ub(p, 0x66, 0x0f, 0x71);
+   emit_modrm_noreg(p, 4, dst);
+   emit_1ub(p, imm);
+}
+
+void sse2_psrad_imm( struct x86_function *p, struct x86_reg dst, unsigned imm )
+{
+   DUMP_RI(dst, imm);
+   emit_3ub(p, 0x66, 0x0f, 0x72);
+   emit_modrm_noreg(p, 4, dst);
+   emit_1ub(p, imm);
+}
+
+void sse2_por( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR(dst, src);
+   emit_3ub(p, 0x66, 0x0f, 0xeb);
+   emit_modrm(p, dst, src);
+}
 
 void sse2_rcpps( struct x86_function *p,
                  struct x86_reg dst,
@@ -1100,18 +1497,6 @@ void sse2_rcpss( struct x86_function *p,
    emit_modrm( p, dst, src );
 }
 
-void sse2_movd( struct x86_function *p,
-		struct x86_reg dst,
-		struct x86_reg src )
-{
-   DUMP_RR( dst, src );
-   emit_2ub(p, 0x66, X86_TWOB);
-   emit_op_modrm( p, 0x6e, 0x7e, dst, src );
-}
-
-
-
-
 /***********************************************************************
  * x87 instructions
  */
@@ -1702,23 +2087,80 @@ void x86_cdecl_caller_pop_regs( struct x86_function *p )
 }
 
 
-/* Retreive a reference to one of the function arguments, taking into
- * account any push/pop activity:
- */
+/* Return the location of function argument 'arg' (numbered from 1); stack slots account for push/pop activity. */
 struct x86_reg x86_fn_arg( struct x86_function *p,
-			   unsigned arg )
+                           unsigned arg )
 {
-   return x86_make_disp(x86_make_reg(file_REG32, reg_SP), 
+   switch(x86_target(p)) {
+   case X86_64_WIN64_ABI:
+      /* Microsoft uses a different calling convention than the rest of the world */
+      switch(arg)
+      {
+      case 1:
+         return x86_make_reg(file_REG32, reg_CX);
+      case 2:
+         return x86_make_reg(file_REG32, reg_DX);
+      case 3:
+         return x86_make_reg(file_REG32, reg_R8);
+      case 4:
+         return x86_make_reg(file_REG32, reg_R9);
+      default:
+         /* Win64 allocates stack slots as if it pushed the first 4 arguments too */
+         return x86_make_disp(x86_make_reg(file_REG32, reg_SP),
+               p->stack_offset + arg * 8);
+      }
+   case X86_64_STD_ABI:
+      switch(arg)
+      {
+      case 1:
+         return x86_make_reg(file_REG32, reg_DI);
+      case 2:
+         return x86_make_reg(file_REG32, reg_SI);
+      case 3:
+         return x86_make_reg(file_REG32, reg_DX);
+      case 4:
+         return x86_make_reg(file_REG32, reg_CX);
+      case 5:
+         return x86_make_reg(file_REG32, reg_R8);
+      case 6:
+         return x86_make_reg(file_REG32, reg_R9);
+      default:
+         return x86_make_disp(x86_make_reg(file_REG32, reg_SP),
+               p->stack_offset + (arg - 6) * 8);     /* NOTE(review): assumes args 7+ are the only stack args -- confirm */
+      }
+   case X86_32:
+      return x86_make_disp(x86_make_reg(file_REG32, reg_SP),
 			p->stack_offset + arg * 4);	/* ??? */
+   default:
+      abort();
+   }
 }
 
+static void x86_init_func_common( struct x86_function *p )
+{
+   util_cpu_detect();
+   p->caps = 0;
+   if(util_cpu_caps.has_mmx)
+      p->caps |= X86_MMX;
+   if(util_cpu_caps.has_mmx2)
+      p->caps |= X86_MMX2;
+   if(util_cpu_caps.has_sse)
+      p->caps |= X86_SSE;
+   if(util_cpu_caps.has_sse2)
+      p->caps |= X86_SSE2;
+   if(util_cpu_caps.has_sse3)
+      p->caps |= X86_SSE3;
+   if(util_cpu_caps.has_sse4_1)
+      p->caps |= X86_SSE4_1;
+   p->csr = p->store;
+   DUMP_START();
+}
 
 void x86_init_func( struct x86_function *p )
 {
    p->size = 0;
    p->store = NULL;
-   p->csr = p->store;
-   DUMP_START();
+   x86_init_func_common(p);
 }
 
 void x86_init_func_size( struct x86_function *p, unsigned code_size )
@@ -1728,8 +2170,7 @@ void x86_init_func_size( struct x86_function *p, unsigned code_size )
    if (p->store == NULL) {
       p->store = p->error_overflow;
    }
-   p->csr = p->store;
-   DUMP_START();
+   x86_init_func_common(p);
 }
 
 void x86_release_func( struct x86_function *p )
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
index 6208e8f707..2b9678b176 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
@@ -24,22 +24,31 @@
 #ifndef _RTASM_X86SSE_H_
 #define _RTASM_X86SSE_H_
 
+#include "pipe/p_compiler.h"
 #include "pipe/p_config.h"
 
-#if defined(PIPE_ARCH_X86)
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
 
 /* It is up to the caller to ensure that instructions issued are
  * suitable for the host cpu.  There are no checks made in this module
  * for mmx/sse/sse2 support on the cpu.
  */
 struct x86_reg {
-   unsigned file:3;
-   unsigned idx:3;
+   unsigned file:2;
+   unsigned idx:4;
    unsigned mod:2;		/* mod_REG if this is just a register */
    int      disp:24;		/* only +/- 23bits of offset - should be enough... */
 };
 
+#define X86_MMX 1
+#define X86_MMX2 2
+#define X86_SSE 4
+#define X86_SSE2 8
+#define X86_SSE3 0x10
+#define X86_SSE4_1 0x20
+
 struct x86_function {
+   unsigned caps;
    unsigned size;
    unsigned char *store;
    unsigned char *csr;
@@ -75,7 +84,15 @@ enum x86_reg_name {
    reg_SP,
    reg_BP,
    reg_SI,
-   reg_DI
+   reg_DI,
+   reg_R8,
+   reg_R9,
+   reg_R10,
+   reg_R11,
+   reg_R12,
+   reg_R13,
+   reg_R14,
+   reg_R15
 };
 
 
@@ -110,6 +127,29 @@ typedef void (*x86_func)(void);
 /* Begin/end/retrieve function creation:
  */
 
+enum x86_target
+{
+   X86_32,
+   X86_64_STD_ABI,
+   X86_64_WIN64_ABI
+};
+
+/* make this read a member of x86_function if target != host is desired */
+static INLINE enum x86_target x86_target( struct x86_function* p )
+{
+#ifdef PIPE_ARCH_X86
+   return X86_32;
+#elif defined(_WIN64)
+   return X86_64_WIN64_ABI;
+#elif defined(PIPE_ARCH_X86_64)
+   return X86_64_STD_ABI;
+#endif
+}
+
+static INLINE unsigned x86_target_caps( struct x86_function* p )
+{
+   return p->caps;
+}
 
 void x86_init_func( struct x86_function *p );
 void x86_init_func_size( struct x86_function *p, unsigned code_size );
@@ -138,6 +178,8 @@ struct x86_reg x86_get_base_reg( struct x86_reg reg );
  */
 int x86_get_label( struct x86_function *p );
 
+void x64_rexw(struct x86_function *p);
+
 void x86_jcc( struct x86_function *p,
 	      enum x86_cc cc,
 	      int label );
@@ -178,18 +220,54 @@ void mmx_movq( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void mmx_packssdw( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void mmx_packuswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 
+void sse2_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_movq( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_movdqu( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_movdqa( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_movsd( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_movupd( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_movapd( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+
 void sse2_cvtps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse2_cvttps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse2_cvtdq2ps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_cvtsd2ss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_cvtpd2ps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+
 void sse2_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse2_packssdw( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse2_packsswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse2_packuswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse2_pshufd( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0,
                   unsigned char shuf );
+void sse2_pshuflw( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0,
+                  unsigned char shuf );
+void sse2_pshufhw( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0,
+                  unsigned char shuf );
 void sse2_rcpps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse2_rcpss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 
+void sse2_punpcklbw( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_punpcklwd( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_punpckldq( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_punpcklqdq( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+
+void sse2_psllw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm );
+void sse2_pslld_imm( struct x86_function *p, struct x86_reg dst, unsigned imm );
+void sse2_psllq_imm( struct x86_function *p, struct x86_reg dst, unsigned imm );
+
+void sse2_psrlw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm );
+void sse2_psrld_imm( struct x86_function *p, struct x86_reg dst, unsigned imm );
+void sse2_psrlq_imm( struct x86_function *p, struct x86_reg dst, unsigned imm );
+
+void sse2_psraw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm );
+void sse2_psrad_imm( struct x86_function *p, struct x86_reg dst, unsigned imm );
+
+void sse2_por( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+
+void sse2_pshuflw( struct x86_function *p, struct x86_reg dst, struct x86_reg src, uint8_t imm );
+void sse2_pshufhw( struct x86_function *p, struct x86_reg dst, struct x86_reg src, uint8_t imm );
+void sse2_pshufd( struct x86_function *p, struct x86_reg dst, struct x86_reg src, uint8_t imm );
 
 void sse_prefetchnta( struct x86_function *p, struct x86_reg ptr);
 void sse_prefetch0( struct x86_function *p, struct x86_reg ptr);
@@ -227,7 +305,6 @@ void sse_shufps( struct x86_function *p, struct x86_reg dest, struct x86_reg arg
 void sse_unpckhps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_unpcklps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_pmovmskb( struct x86_function *p, struct x86_reg dest, struct x86_reg src );
-void sse2_punpcklbw( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_movmskps( struct x86_function *p, struct x86_reg dst, struct x86_reg src);
 
 void x86_add( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
@@ -237,6 +314,14 @@ void x86_dec( struct x86_function *p, struct x86_reg reg );
 void x86_inc( struct x86_function *p, struct x86_reg reg );
 void x86_lea( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void x86_mov( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x64_mov64( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_mov8( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_mov16( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_movzx8(struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_movzx16(struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_mov_imm(struct x86_function *p, struct x86_reg dst, int imm );
+void x86_mov8_imm(struct x86_function *p, struct x86_reg dst, uint8_t imm );
+void x86_mov16_imm(struct x86_function *p, struct x86_reg dst, uint16_t imm );
 void x86_mul( struct x86_function *p, struct x86_reg src );
 void x86_imul( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void x86_or( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
@@ -250,7 +335,10 @@ void x86_test( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void x86_xor( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void x86_sahf( struct x86_function *p );
 void x86_div( struct x86_function *p, struct x86_reg src );
-
+void x86_bswap( struct x86_function *p, struct x86_reg src );
+void x86_shr_imm( struct x86_function *p, struct x86_reg reg, unsigned imm );
+void x86_sar_imm( struct x86_function *p, struct x86_reg reg, unsigned imm );
+void x86_shl_imm( struct x86_function *p, struct x86_reg reg, unsigned imm  );
 
 void x86_cdecl_caller_push_regs( struct x86_function *p );
 void x86_cdecl_caller_pop_regs( struct x86_function *p );
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.h b/src/gallium/auxiliary/tgsi/tgsi_dump.h
index 4cd27317b3..dd78b36100 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump.h
@@ -28,6 +28,7 @@
 #ifndef TGSI_DUMP_H
 #define TGSI_DUMP_H
 
+#include "pipe/p_compiler.h"
 #include "pipe/p_shader_tokens.h"
 
 #if defined __cplusplus
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index 298f3d0a8b..0757f05dfa 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -3239,6 +3239,8 @@ exec_instruction(
 
          if (mach->CallStackTop == 0) {
             /* returning from main() */
+            mach->CondStackTop = 0;
+            mach->LoopStackTop = 0;
             *pc = -1;
             return;
          }
@@ -3767,6 +3769,9 @@ tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
    }
 #endif
 
+   /* Strictly speaking, these assertions aren't really needed but they
+    * can potentially catch some bugs in the control flow code.
+    */
    assert(mach->CondStackTop == 0);
    assert(mach->LoopStackTop == 0);
    assert(mach->ContStackTop == 0);
diff --git a/src/gallium/auxiliary/tgsi/tgsi_info.h b/src/gallium/auxiliary/tgsi/tgsi_info.h
index 50248884fd..1992d11bbe 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_info.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_info.h
@@ -28,6 +28,7 @@
 #ifndef TGSI_INFO_H
 #define TGSI_INFO_H
 
+#include "pipe/p_compiler.h"
 #include "pipe/p_shader_tokens.h"
 
 #if defined __cplusplus
diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.c b/src/gallium/auxiliary/tgsi/tgsi_parse.c
index db9a342220..1891203abe 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_parse.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.c
@@ -282,17 +282,6 @@ tgsi_parse_token(
 }
 
 
-unsigned
-tgsi_num_tokens(const struct tgsi_token *tokens)
-{
-   struct tgsi_parse_context ctx;
-   if (tgsi_parse_init(&ctx, tokens) == TGSI_PARSE_OK) {
-      unsigned len = (ctx.FullHeader.Header.HeaderSize +
-                      ctx.FullHeader.Header.BodySize);
-      return len;
-   }
-   return 0;
-}
 
 
 /**
@@ -319,3 +308,19 @@ tgsi_alloc_tokens(unsigned num_tokens)
    unsigned bytes = num_tokens * sizeof(struct tgsi_token);
    return (struct tgsi_token *) MALLOC(bytes);
 }
+
+
+void
+tgsi_dump_tokens(const struct tgsi_token *tokens)
+{
+   const unsigned *dwords = (const unsigned *)tokens;
+   int nr = tgsi_num_tokens(tokens);
+   int i;
+   
+   assert(sizeof(*tokens) == sizeof(unsigned));
+
+   debug_printf("const unsigned tokens[%d] = {\n", nr);
+   for (i = 0; i < nr; i++)
+      debug_printf("0x%08x,\n", dwords[i]);
+   debug_printf("};\n");
+}
diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.h b/src/gallium/auxiliary/tgsi/tgsi_parse.h
index 36de8807b4..d4df585176 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_parse.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.h
@@ -28,6 +28,7 @@
 #ifndef TGSI_PARSE_H
 #define TGSI_PARSE_H
 
+#include "pipe/p_compiler.h"
 #include "pipe/p_shader_tokens.h"
 
 #if defined __cplusplus
@@ -132,8 +133,15 @@ void
 tgsi_parse_token(
    struct tgsi_parse_context *ctx );
 
-unsigned
-tgsi_num_tokens(const struct tgsi_token *tokens);
+static INLINE unsigned
+tgsi_num_tokens(const struct tgsi_token *tokens)
+{
+   struct tgsi_header header = *(const struct tgsi_header *) tokens;
+   return header.HeaderSize + header.BodySize;
+}
+
+void
+tgsi_dump_tokens(const struct tgsi_token *tokens);
 
 struct tgsi_token *
 tgsi_dup_tokens(const struct tgsi_token *tokens);
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.h b/src/gallium/auxiliary/tgsi/tgsi_sse2.h
index d81ee3d00e..00aa8b84fe 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.h
@@ -32,9 +32,12 @@
 extern "C" {
 #endif
 
+#include "pipe/p_compiler.h"
+
+struct tgsi_exec_machine;
+struct tgsi_interp_coef;
 struct tgsi_token;
 struct x86_function;
-struct tgsi_interp_coef;
 
 unsigned
 tgsi_emit_sse2(
diff --git a/src/gallium/auxiliary/translate/translate.c b/src/gallium/auxiliary/translate/translate.c
index fe638e211f..73287b667d 100644
--- a/src/gallium/auxiliary/translate/translate.c
+++ b/src/gallium/auxiliary/translate/translate.c
@@ -38,7 +38,7 @@ struct translate *translate_create( const struct translate_key *key )
 {
    struct translate *translate = NULL;
 
-#if defined(PIPE_ARCH_X86)
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
    translate = translate_sse2_create( key );
    if (translate)
       return translate;
diff --git a/src/gallium/auxiliary/translate/translate.h b/src/gallium/auxiliary/translate/translate.h
index eb6f2cc486..a75380228b 100644
--- a/src/gallium/auxiliary/translate/translate.h
+++ b/src/gallium/auxiliary/translate/translate.h
@@ -85,6 +85,18 @@ struct translate {
                                 unsigned instance_id,
                                 void *output_buffer);
 
+   void (PIPE_CDECL *run_elts16)( struct translate *,
+                                const uint16_t *elts,
+                                unsigned count,
+                                unsigned instance_id,
+                                void *output_buffer);
+
+   void (PIPE_CDECL *run_elts8)( struct translate *,
+                                const uint8_t *elts,
+                                unsigned count,
+                                unsigned instance_id,
+                                void *output_buffer);
+
    void (PIPE_CDECL *run)( struct translate *,
                            unsigned start,
                            unsigned count,
diff --git a/src/gallium/auxiliary/translate/translate_generic.c b/src/gallium/auxiliary/translate/translate_generic.c
index 42cfd763e9..ad809db720 100644
--- a/src/gallium/auxiliary/translate/translate_generic.c
+++ b/src/gallium/auxiliary/translate/translate_generic.c
@@ -64,6 +64,14 @@ struct translate_generic {
       unsigned input_stride;
       unsigned max_index;
 
+      /* this value is set to -1 if this is a normal element with output_format != input_format:
+       * in this case, u_format is used to do a full conversion
+       *
+       * this value is set to the format size in bytes if output_format == input_format or for 32-bit instance ids:
+       * in this case, memcpy is used to copy this amount of bytes
+       */
+      int copy_size;
+
    } attrib[PIPE_MAX_ATTRIBS];
 
    unsigned nr_attrib;
@@ -354,7 +362,65 @@ static emit_func get_emit_func( enum pipe_format format )
    }
 }
 
+/**
+ * Translate a single vertex.  For every attribute, fetch element 'elt' (or
+ * instance_id / instance_divisor when a divisor is set) and store it at the
+ * attribute's output_offset inside 'vert'.  copy_size >= 0 selects a raw
+ * memcpy; a negative copy_size selects the fetch()/emit() conversion path.
+ */
+static ALWAYS_INLINE void PIPE_CDECL generic_run_one( struct translate_generic *tg,
+                                         unsigned elt,
+                                         unsigned instance_id,
+                                         void *vert )
+{
+   unsigned nr_attrs = tg->nr_attrib;
+   unsigned attr;
+
+   for (attr = 0; attr < nr_attrs; attr++) {
+      float data[4];
+      uint8_t *dst = (uint8_t *)vert + tg->attrib[attr].output_offset;
+
+      if (tg->attrib[attr].type == TRANSLATE_ELEMENT_NORMAL) {
+         const uint8_t *src;
+         unsigned index;
+         int copy_size;
+
+         if (tg->attrib[attr].instance_divisor)
+            index = instance_id / tg->attrib[attr].instance_divisor;
+         else
+            index = elt;
+
+         /* clamp to avoid going out of bounds */
+         index = MIN2(index, tg->attrib[attr].max_index);
 
+         src = tg->attrib[attr].input_ptr +
+               tg->attrib[attr].input_stride * index;
+         copy_size = tg->attrib[attr].copy_size;
+         if(likely(copy_size >= 0))
+            memcpy(dst, src, copy_size);
+         else {
+            tg->attrib[attr].fetch( data, src, 0, 0 );
+            if (0)
+               debug_printf("Fetch linear attr %d  from %p  stride %d  index %d: "
+                         " %f, %f, %f, %f \n",
+                         attr,
+                         tg->attrib[attr].input_ptr,
+                         tg->attrib[attr].input_stride,
+                         index,
+                         data[0], data[1],data[2], data[3]);
+            tg->attrib[attr].emit( data, dst );
+         }
+      } else {
+         /* instance id element: write to the output vertex, not the scratch array */
+         if(likely(tg->attrib[attr].copy_size >= 0))
+            memcpy(dst, &instance_id, 4);
+         else {
+            data[0] = (float)instance_id;
+            tg->attrib[attr].emit( data, dst );
+         }
+      }
+   }
+}
 
 /**
  * Fetch vertex attributes for 'count' vertices.
@@ -367,62 +433,45 @@ static void PIPE_CDECL generic_run_elts( struct translate *translate,
 {
    struct translate_generic *tg = translate_generic(translate);
    char *vert = output_buffer;
-   unsigned nr_attrs = tg->nr_attrib;
-   unsigned attr;
    unsigned i;
 
-   /* loop over vertex attributes (vertex shader inputs)
-    */
    for (i = 0; i < count; i++) {
-      const unsigned elt = *elts++;
-
-      for (attr = 0; attr < nr_attrs; attr++) {
-	 float data[4];
-	 char *dst = vert + tg->attrib[attr].output_offset;
-
-         if (tg->attrib[attr].type == TRANSLATE_ELEMENT_NORMAL) {
-            const uint8_t *src;
-            unsigned index;
-
-            if (tg->attrib[attr].instance_divisor) {
-               index = instance_id / tg->attrib[attr].instance_divisor;
-            } else {
-               index = elt;
-            }
-
-            /* clamp to void going out of bounds */
-            index = MIN2(index, tg->attrib[attr].max_index);
-
-            src = tg->attrib[attr].input_ptr +
-                  tg->attrib[attr].input_stride * index;
-
-            tg->attrib[attr].fetch( data, src, 0, 0 );
-
-            if (0)
-               debug_printf("Fetch elt attr %d  from %p  stride %d  div %u  max %u  index %d:  "
-                            " %f, %f, %f, %f \n",
-                            attr,
-                            tg->attrib[attr].input_ptr,
-                            tg->attrib[attr].input_stride,
-                            tg->attrib[attr].instance_divisor,
-                            tg->attrib[attr].max_index,
-                            index,
-                            data[0], data[1],data[2], data[3]);
-         } else {
-            data[0] = (float)instance_id;
-         }
+      generic_run_one(tg, *elts++, instance_id, vert);
+      vert += tg->translate.key.output_stride;
+   }
+}
 
-         if (0)
-            debug_printf("vert %d/%d attr %d: %f %f %f %f\n",
-                         i, elt, attr, data[0], data[1], data[2], data[3]);
+static void PIPE_CDECL generic_run_elts16( struct translate *translate,
+                                         const uint16_t *elts,      /* 16-bit vertex indices, consumed one per output vertex */
+                                         unsigned count,            /* number of indices / output vertices */
+                                         unsigned instance_id,      /* forwarded unchanged to generic_run_one */
+                                         void *output_buffer )      /* vertices are written output_stride bytes apart */
+{  /* Indexed translate with 16-bit indices: one generic_run_one call per index. */
+   struct translate_generic *tg = translate_generic(translate);
+   char *vert = output_buffer;
+   unsigned i;
 
-	 tg->attrib[attr].emit( data, dst );
-      }
+   for (i = 0; i < count; i++) {
+      generic_run_one(tg, *elts++, instance_id, vert);   /* the uint16_t index converts to the unsigned elt parameter */
       vert += tg->translate.key.output_stride;
    }
 }
 
+static void PIPE_CDECL generic_run_elts8( struct translate *translate,
+                                         const uint8_t *elts,       /* 8-bit vertex indices, consumed one per output vertex */
+                                         unsigned count,            /* number of indices / output vertices */
+                                         unsigned instance_id,      /* forwarded unchanged to generic_run_one */
+                                         void *output_buffer )      /* vertices are written output_stride bytes apart */
+{  /* Indexed translate with 8-bit indices: one generic_run_one call per index. */
+   struct translate_generic *tg = translate_generic(translate);
+   char *vert = output_buffer;
+   unsigned i;
 
+   for (i = 0; i < count; i++) {
+      generic_run_one(tg, *elts++, instance_id, vert);   /* the uint8_t index converts to the unsigned elt parameter */
+      vert += tg->translate.key.output_stride;            /* advance to the next output vertex */
+   }
+}
 
 static void PIPE_CDECL generic_run( struct translate *translate,
                                     unsigned start,
@@ -432,57 +481,10 @@ static void PIPE_CDECL generic_run( struct translate *translate,
 {
    struct translate_generic *tg = translate_generic(translate);
    char *vert = output_buffer;
-   unsigned nr_attrs = tg->nr_attrib;
-   unsigned attr;
    unsigned i;
 
-   /* loop over vertex attributes (vertex shader inputs)
-    */
    for (i = 0; i < count; i++) {
-      unsigned elt = start + i;
-
-      for (attr = 0; attr < nr_attrs; attr++) {
-	 float data[4];
-	 char *dst = vert + tg->attrib[attr].output_offset;
-
-         if (tg->attrib[attr].type == TRANSLATE_ELEMENT_NORMAL) {
-            const uint8_t *src;
-            unsigned index;
-
-            if (tg->attrib[attr].instance_divisor) {
-               index = instance_id / tg->attrib[attr].instance_divisor;
-            }
-            else {
-               index = elt;
-            }
-
-            /* clamp to void going out of bounds */
-            index = MIN2(index, tg->attrib[attr].max_index);
-
-            src = tg->attrib[attr].input_ptr +
-                  tg->attrib[attr].input_stride * index;
-
-            tg->attrib[attr].fetch( data, src, 0, 0 );
-
-            if (0)
-               debug_printf("Fetch linear attr %d  from %p  stride %d  index %d: "
-                            " %f, %f, %f, %f \n",
-                            attr,
-                            tg->attrib[attr].input_ptr,
-                            tg->attrib[attr].input_stride,
-                            index,
-                            data[0], data[1],data[2], data[3]);
-         } else {
-            data[0] = (float)instance_id;
-         }
-
-         if (0)
-            debug_printf("vert %d attr %d: %f %f %f %f\n",
-                         i, attr, data[0], data[1], data[2], data[3]);
-
-	 tg->attrib[attr].emit( data, dst );
-      }
-      
+      generic_run_one(tg, start + i, instance_id, vert);
       vert += tg->translate.key.output_stride;
    }
 }
@@ -528,6 +530,8 @@ struct translate *translate_generic_create( const struct translate_key *key )
    tg->translate.release = generic_release;
    tg->translate.set_buffer = generic_set_buffer;
    tg->translate.run_elts = generic_run_elts;
+   tg->translate.run_elts16 = generic_run_elts16;
+   tg->translate.run_elts8 = generic_run_elts8;
    tg->translate.run = generic_run;
 
    for (i = 0; i < key->nr_elements; i++) {
@@ -544,9 +548,28 @@ struct translate *translate_generic_create( const struct translate_key *key )
       tg->attrib[i].input_offset = key->element[i].input_offset;
       tg->attrib[i].instance_divisor = key->element[i].instance_divisor;
 
-      tg->attrib[i].emit = get_emit_func(key->element[i].output_format);
       tg->attrib[i].output_offset = key->element[i].output_offset;
 
+      tg->attrib[i].copy_size = -1;
+      if (tg->attrib[i].type == TRANSLATE_ELEMENT_INSTANCE_ID)
+      {
+            if(key->element[i].output_format == PIPE_FORMAT_R32_USCALED
+                  || key->element[i].output_format == PIPE_FORMAT_R32_SSCALED)
+               tg->attrib[i].copy_size = 4;
+      }
+      else
+      {
+         if(key->element[i].input_format == key->element[i].output_format
+               && format_desc->block.width == 1
+               && format_desc->block.height == 1
+               && !(format_desc->block.bits & 7))
+            tg->attrib[i].copy_size = format_desc->block.bits >> 3;
+      }
+
+      if(tg->attrib[i].copy_size < 0)
+	      tg->attrib[i].emit = get_emit_func(key->element[i].output_format);
+      else
+	      tg->attrib[i].emit  = NULL;
    }
 
    tg->nr_attrib = key->nr_elements;
diff --git a/src/gallium/auxiliary/translate/translate_sse.c b/src/gallium/auxiliary/translate/translate_sse.c
index ef3aa674a3..f8bf5b4669 100644
--- a/src/gallium/auxiliary/translate/translate_sse.c
+++ b/src/gallium/auxiliary/translate/translate_sse.c
@@ -30,11 +30,12 @@
 #include "pipe/p_compiler.h"
 #include "util/u_memory.h"
 #include "util/u_math.h"
+#include "util/u_format.h"
 
 #include "translate.h"
 
 
-#if defined(PIPE_ARCH_X86)
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
 
 #include "rtasm/rtasm_cpu.h"
 #include "rtasm/rtasm_x86sse.h"
@@ -46,21 +47,9 @@
 #define W    3
 
 
-typedef void (PIPE_CDECL *run_func)( struct translate *translate,
-                                     unsigned start,
-                                     unsigned count,
-                                     unsigned instance_id,
-                                     void *output_buffer);
-
-typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate,
-                                          const unsigned *elts,
-                                          unsigned count,
-                                          unsigned instance_id,
-                                          void *output_buffer);
-
 struct translate_buffer {
    const void *base_ptr;
-   unsigned stride;
+   uintptr_t stride;
    unsigned max_index;
 };
 
@@ -73,21 +62,43 @@ struct translate_buffer_varient {
 
 #define ELEMENT_BUFFER_INSTANCE_ID  1001
 
+#define NUM_CONSTS 7   /* number of rows in consts[]; equals the number of enumerators below */
+/* Row indices into the constant table; the order must match the consts[] initializer. */
+enum
+{
+   CONST_IDENTITY,         /* {0, 0, 0, 1} */
+   CONST_INV_127,          /* 1/127 in all four lanes */
+   CONST_INV_255,          /* 1/255 in all four lanes */
+   CONST_INV_32767,        /* 1/32767 in all four lanes */
+   CONST_INV_65535,        /* 1/65535 in all four lanes */
+   CONST_INV_2147483647,   /* 1/2147483647 in all four lanes */
+   CONST_255               /* 255 in all four lanes */
+};
+/* C(v) expands to an initializer with v converted to float in all four lanes. */
+#define C(v) {(float)(v), (float)(v), (float)(v), (float)(v)}
+static float consts[NUM_CONSTS][4] = {
+      {0, 0, 0, 1},
+      C(1.0 / 127.0),
+      C(1.0 / 255.0),
+      C(1.0 / 32767.0),
+      C(1.0 / 65535.0),
+      C(1.0 / 2147483647.0),
+      C(255.0)
+};
+#undef C
 
 struct translate_sse {
    struct translate translate;
 
    struct x86_function linear_func;
    struct x86_function elt_func;
+   struct x86_function elt16_func;
+   struct x86_function elt8_func;
    struct x86_function *func;
 
-   boolean loaded_identity;
-   boolean loaded_255;
-   boolean loaded_inv_255;
-
-   float identity[4];
-   float float_255[4];
-   float inv_255[4];
+   PIPE_ALIGN_VAR(16) float consts[NUM_CONSTS][4];
+   int8_t reg_to_const[16];
+   int8_t const_to_reg[NUM_CONSTS];
 
    struct translate_buffer buffer[PIPE_MAX_ATTRIBS];
    unsigned nr_buffers;
@@ -102,17 +113,16 @@ struct translate_sse {
    boolean use_instancing;
    unsigned instance_id;
 
-   run_func      gen_run;
-   run_elts_func gen_run_elts;
-
    /* these are actually known values, but putting them in a struct
     * like this is helpful to keep them in sync across the file.
     */
    struct x86_reg tmp_EAX;
-   struct x86_reg idx_EBX;     /* either start+i or &elt[i] */
-   struct x86_reg outbuf_ECX;
-   struct x86_reg machine_EDX;
-   struct x86_reg count_ESI;    /* decrements to zero */
+   struct x86_reg tmp2_EDX;
+   struct x86_reg src_ECX;
+   struct x86_reg idx_ESI;     /* either start+i or &elt[i] */
+   struct x86_reg machine_EDI;
+   struct x86_reg outbuf_EBX;
+   struct x86_reg count_EBP;    /* decrements to zero */
 };
 
 static int get_offset( const void *a, const void *b )
@@ -120,281 +130,950 @@ static int get_offset( const void *a, const void *b )
    return (const char *)b - (const char *)a;
 }
 
+static struct x86_reg get_const( struct translate_sse *p, unsigned id)
+{  /* Return an XMM register holding constant 'id', emitting a load unless a register already holds it. */
+   struct x86_reg reg;
+   unsigned i;
 
+   if(p->const_to_reg[id] >= 0)   /* constant already has a register assigned: reuse it, emit nothing */
+      return x86_make_reg(file_XMM, p->const_to_reg[id]);
 
-static struct x86_reg get_identity( struct translate_sse *p )
-{
-   struct x86_reg reg = x86_make_reg(file_XMM, 6);
-
-   if (!p->loaded_identity) {
-      p->loaded_identity = TRUE;
-      p->identity[0] = 0;
-      p->identity[1] = 0;
-      p->identity[2] = 0;
-      p->identity[3] = 1;
-
-      sse_movups(p->func, reg, 
-		 x86_make_disp(p->machine_EDX, 
-			       get_offset(p, &p->identity[0])));
+   for(i = 2; i < 8; ++i)   /* search XMM2..XMM7 for a register with no constant assigned */
+   {
+      if(p->reg_to_const[i] < 0)
+         break;
    }
 
+   /* TODO: be smarter here */
+   if(i == 8)   /* none free: fall back to XMM7 and replace whatever it holds */
+      --i;
+
+   reg = x86_make_reg(file_XMM, i);
+
+   if(p->reg_to_const[i] >= 0)   /* forget the constant being evicted from this register */
+      p->const_to_reg[p->reg_to_const[i]] = -1;
+
+   p->reg_to_const[i] = id;
+   p->const_to_reg[id] = i;
+   /* NOTE(review): movaps needs a 16-byte aligned operand; presumably the translate_sse object is allocated aligned - confirm */
+   /* TODO: this should happen outside the loop, if possible */
+   sse_movaps(p->func, reg,
+         x86_make_disp(p->machine_EDI,
+               get_offset(p, &p->consts[id][0])));
+
    return reg;
 }
 
-static struct x86_reg get_255( struct translate_sse *p )
+/* Emit code loading 'size' bytes at src into 'data', padding with zeros; returns FALSE without emitting anything for unsupported sizes. */
+static boolean emit_load_sse2( struct translate_sse *p,
+				       struct x86_reg data,
+				       struct x86_reg src,
+				       unsigned size)
 {
-   struct x86_reg reg = x86_make_reg(file_XMM, 7);
-
-   if (!p->loaded_255) {
-      p->loaded_255 = TRUE;
-      p->float_255[0] =
-	 p->float_255[1] =
-	 p->float_255[2] =
-	 p->float_255[3] = 255.0f;
-
-      sse_movups(p->func, reg, 
-		 x86_make_disp(p->machine_EDX, 
-			       get_offset(p, &p->float_255[0])));
+   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);   /* XMM1 is used as scratch for the 6- and 12-byte cases */
+   struct x86_reg tmp = p->tmp_EAX;                      /* general-purpose scratch for sub-dword pieces */
+   switch(size)
+   {
+   case 1:
+      x86_movzx8(p->func, tmp, src);
+      sse2_movd(p->func, data, tmp);
+      break;
+   case 2:
+      x86_movzx16(p->func, tmp, src);
+      sse2_movd(p->func, data, tmp);
+      break;
+   case 3:
+      x86_movzx8(p->func, tmp, x86_make_disp(src, 2));   /* third byte first ... */
+      x86_shl_imm(p->func, tmp, 16);                     /* ... moved up by 16 bits ... */
+      x86_mov16(p->func, tmp, src);                      /* ... then a 16-bit load of the first two bytes */
+      sse2_movd(p->func, data, tmp);
+      break;
+   case 4:
+      sse2_movd(p->func, data, src);
+      break;
+   case 6:
+      sse2_movd(p->func, data, src);                     /* first four bytes */
+      x86_movzx16(p->func, tmp, x86_make_disp(src, 4));  /* remaining two bytes, zero-extended */
+      sse2_movd(p->func, tmpXMM, tmp);
+      sse2_punpckldq(p->func, data, tmpXMM);             /* combine the two dwords in 'data' */
+      break;
+   case 8:
+      sse2_movq(p->func, data, src);
+      break;
+   case 12:
+      sse2_movq(p->func, data, src);                     /* first eight bytes */
+      sse2_movd(p->func, tmpXMM, x86_make_disp(src, 8)); /* remaining four bytes */
+      sse2_punpcklqdq(p->func, data, tmpXMM);            /* combine the two qwords in 'data' */
+      break;
+   case 16:
+      sse2_movdqu(p->func, data, src);
+      break;
+   default:
+      return FALSE;
    }
-
-   return reg;
+   return TRUE;
 }
 
-static struct x86_reg get_inv_255( struct translate_sse *p )
+/* this value can be passed for the out_chans argument */
+#define CHANNELS_0001 5
+
+/* Emit code that loads 'chans' 32-bit float values from arg0 into 'data'
+ * and pads the register with zeroes at least up to out_chans.
+ *
+ * If out_chans is set to CHANNELS_0001, then the fourth
+ * value will be padded with 1. Only pass this value if
+ * chans < 4 or results are undefined.
+ */
+static void emit_load_float32( struct translate_sse *p,
+                                       struct x86_reg data,
+                                       struct x86_reg arg0,
+                                       unsigned out_chans,
+                                       unsigned chans)
 {
-   struct x86_reg reg = x86_make_reg(file_XMM, 5);
-
-   if (!p->loaded_inv_255) {
-      p->loaded_inv_255 = TRUE;
-      p->inv_255[0] =
-	 p->inv_255[1] =
-	 p->inv_255[2] =
-	 p->inv_255[3] = 1.0f / 255.0f;
-
-      sse_movups(p->func, reg, 
-		 x86_make_disp(p->machine_EDX, 
-			       get_offset(p, &p->inv_255[0])));
+   switch(chans)   /* values of chans other than 1..4 emit nothing */
+   {
+   case 1:
+      /* a 0 0 0
+       * a 0 0 1
+       */
+      sse_movss(p->func, data, arg0);
+      if(out_chans == CHANNELS_0001)
+         sse_orps(p->func, data, get_const(p, CONST_IDENTITY) );   /* OR in the {0,0,0,1} constant */
+      break;
+   case 2:
+      /* 0 0 0 1
+       * a b 0 1
+       */
+      if(out_chans == CHANNELS_0001)
+         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) );   /* upper half taken from the constant */
+      else if(out_chans > 2)
+         sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY) );   /* upper half taken from the constant's lower half (0, 0) */
+      sse_movlps(p->func, data, arg0);   /* finally load a, b into the lower half */
+      break;
+   case 3:
+      /* Have to jump through some hoops:
+       *
+       * c 0 0 0
+       * c 0 0 1 if out_chans == CHANNELS_0001
+       * 0 0 c 0/1
+       * a b c 0/1
+       */
+      sse_movss(p->func, data, x86_make_disp(arg0, 8));
+      if(out_chans == CHANNELS_0001)
+         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X,Y,Z,W) );
+      sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) );
+      sse_movlps(p->func, data, arg0);
+      break;
+   case 4:
+      sse_movups(p->func, data, arg0);   /* all four channels come from memory; out_chans is not consulted */
+      break;
    }
-
-   return reg;
 }
 
+/* This function behaves like emit_load_float32, but loads
+ * 64-bit floating point numbers from arg0, converting them
+ * to 32-bit ones. XMM1 is used as scratch for 3 and 4 channels. */
+static void emit_load_float64to32( struct translate_sse *p,
+                                       struct x86_reg data,
+                                       struct x86_reg arg0,
+                                       unsigned out_chans,
+                                       unsigned chans)
+{
+   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
+   switch(chans)   /* values of chans other than 1..4 emit nothing */
+   {
+   case 1:
+      sse2_movsd(p->func, data, arg0);
+      if(out_chans > 1)
+         sse2_cvtpd2ps(p->func, data, data);   /* packed conversion when more than one output channel is needed */
+      else
+         sse2_cvtsd2ss(p->func, data, data);   /* scalar conversion when only one channel is needed */
+      if(out_chans == CHANNELS_0001)
+         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W)  );   /* upper half taken from {0,0,0,1} */
+      break;
+   case 2:
+      sse2_movupd(p->func, data, arg0);
+      sse2_cvtpd2ps(p->func, data, data);
+      if(out_chans == CHANNELS_0001)
+         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) );   /* upper half taken from {0,0,0,1} */
+      else if(out_chans > 2)
+         sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY) );   /* upper half taken from the constant's lower half (0, 0) */
+       break;
+   case 3:
+      sse2_movupd(p->func, data, arg0);                       /* first two doubles */
+      sse2_cvtpd2ps(p->func, data, data);
+      sse2_movsd(p->func, tmpXMM, x86_make_disp(arg0, 16));   /* third double, 16 bytes in */
+      if(out_chans > 3)
+         sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
+      else
+         sse2_cvtsd2ss(p->func, tmpXMM, tmpXMM);
+      sse_movlhps(p->func, data, tmpXMM);                     /* third value goes into the upper half of data */
+      if(out_chans == CHANNELS_0001)
+         sse_orps(p->func, data, get_const(p, CONST_IDENTITY) );   /* OR in the {0,0,0,1} constant */
+      break;
+   case 4:
+      sse2_movupd(p->func, data, arg0);                       /* first two doubles */
+      sse2_cvtpd2ps(p->func, data, data);
+      sse2_movupd(p->func, tmpXMM, x86_make_disp(arg0, 16));  /* last two doubles */
+      sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
+      sse_movlhps(p->func, data, tmpXMM);                     /* join the two converted pairs in data */
+      break;
+   }
+}
 
-static void emit_load_R32G32B32A32( struct translate_sse *p, 			   
-				    struct x86_reg data,
-				    struct x86_reg arg0 )
+static void emit_mov64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src_gpr,  struct x86_reg src_xmm) /* emit one 64-bit move, using the gpr operand pair or the xmm operand pair depending on the target */
 {
-   sse_movups(p->func, data, arg0);
+   if(x86_target(p->func) != X86_32)   /* not a 32-bit target: use the general-purpose operands */
+      x64_mov64(p->func, dst_gpr, src_gpr);
+   else   /* 32-bit target: use the xmm operands */
+   {
+      /* TODO: when/on which CPUs is SSE2 actually better than SSE? */
+      if(x86_target_caps(p->func) & X86_SSE2)
+         sse2_movq(p->func, dst_xmm, src_xmm);
+      else
+         sse_movlps(p->func, dst_xmm, src_xmm);
+   }
 }
 
-static void emit_load_R32G32B32( struct translate_sse *p, 			   
-				 struct x86_reg data,
-				 struct x86_reg arg0 )
+static void emit_load64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src) /* emit a 64-bit load from src; emit_mov64 picks which destination is used */
 {
-   /* Have to jump through some hoops:
-    *
-    * c 0 0 0
-    * c 0 0 1
-    * 0 0 c 1
-    * a b c 1
-    */
-   sse_movss(p->func, data, x86_make_disp(arg0, 8));
-   sse_shufps(p->func, data, get_identity(p), SHUF(X,Y,Z,W) );
-   sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) );
-   sse_movlps(p->func, data, arg0);
+   emit_mov64(p, dst_gpr, dst_xmm, src, src);   /* src serves as the source operand on both paths */
 }
 
-static void emit_load_R32G32( struct translate_sse *p, 
-			   struct x86_reg data,
-			   struct x86_reg arg0 )
+static void emit_store64(struct translate_sse *p, struct x86_reg dst, struct x86_reg src_gpr, struct x86_reg src_xmm) /* emit a 64-bit store to dst; emit_mov64 picks which source is used */
 {
-   /* 0 0 0 1
-    * a b 0 1
-    */
-   sse_movups(p->func, data, get_identity(p) );
-   sse_movlps(p->func, data, arg0);
+   emit_mov64(p, dst, dst, src_gpr, src_xmm);   /* dst serves as the destination operand on both paths */
 }
 
+static void emit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src) /* emit one 128-bit move between dst and src */
+{
+   if(x86_target_caps(p->func) & X86_SSE2)   /* movdqu when SSE2 is reported, movups otherwise */
+      sse2_movdqu(p->func, dst, src);
+   else
+      sse_movups(p->func, dst, src);
+}
 
-static void emit_load_R32( struct translate_sse *p, 
-			   struct x86_reg data,
-			   struct x86_reg arg0 )
+/* Emit an inline copy of 'size' bytes from src to dst; size is known when the code is generated.
+ * TODO: this uses unaligned accesses liberally, which is great on Nehalem,
+ * but may or may not be good on older processors
+ * TODO: may perhaps want to use non-temporal stores here if possible */
+static void emit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_reg src, unsigned size)
 {
-   /* a 0 0 0
-    * a 0 0 1
-    */
-   sse_movss(p->func, data, arg0);
-   sse_orps(p->func, data, get_identity(p) );
+   struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);    /* XMM0 and XMM1 are used as scratch */
+   struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1);
+   struct x86_reg dataGPR = p->tmp_EAX;                   /* general-purpose scratch registers */
+   struct x86_reg dataGPR2 = p->tmp2_EDX;
+   /* NOTE(review): sizes 5 and 7 match no case below, so nothing is emitted for them - confirm callers never pass them */
+   if(size < 8)
+   {
+      switch (size)
+      {
+      case 1:
+         x86_mov8(p->func, dataGPR, src);
+         x86_mov8(p->func, dst, dataGPR);
+         break;
+      case 2:
+         x86_mov16(p->func, dataGPR, src);
+         x86_mov16(p->func, dst, dataGPR);
+         break;
+      case 3:   /* 2 bytes + 1 byte; both loads are issued before the stores */
+         x86_mov16(p->func, dataGPR, src);
+         x86_mov8(p->func, dataGPR2, x86_make_disp(src, 2));
+         x86_mov16(p->func, dst, dataGPR);
+         x86_mov8(p->func, x86_make_disp(dst, 2), dataGPR2);
+         break;
+      case 4:
+         x86_mov(p->func, dataGPR, src);
+         x86_mov(p->func, dst, dataGPR);
+         break;
+      case 6:   /* 4 bytes + 2 bytes; both loads are issued before the stores */
+         x86_mov(p->func, dataGPR, src);
+         x86_mov16(p->func, dataGPR2, x86_make_disp(src, 4));
+         x86_mov(p->func, dst, dataGPR);
+         x86_mov16(p->func, x86_make_disp(dst, 4), dataGPR2);
+         break;
+      }
+   }
+   else if(!(x86_target_caps(p->func) & X86_SSE))   /* no SSE reported: copy 4 bytes at a time through dataGPR */
+   {
+      unsigned i = 0;
+      assert((size & 3) == 0);   /* this path only handles multiples of 4 */
+      for(i = 0; i < size; i += 4)
+      {
+         x86_mov(p->func, dataGPR, x86_make_disp(src, i));
+         x86_mov(p->func, x86_make_disp(dst, i), dataGPR);
+      }
+   }
+   else   /* SSE available: only sizes 8, 12, 16, 24 and 32 are handled */
+   {
+      switch(size)
+      {
+      case 8:
+         emit_load64(p, dataGPR, dataXMM, src);
+         emit_store64(p, dst, dataGPR, dataXMM);
+         break;
+      case 12:   /* 8 bytes + 4 bytes */
+         emit_load64(p, dataGPR2, dataXMM, src);
+         x86_mov(p->func, dataGPR, x86_make_disp(src, 8));
+         emit_store64(p, dst, dataGPR2, dataXMM);
+         x86_mov(p->func, x86_make_disp(dst, 8), dataGPR);
+         break;
+      case 16:
+         emit_mov128(p, dataXMM, src);
+         emit_mov128(p, dst, dataXMM);
+         break;
+      case 24:   /* 16 bytes + 8 bytes */
+         emit_mov128(p, dataXMM, src);
+         emit_load64(p, dataGPR, dataXMM2, x86_make_disp(src, 16));
+         emit_mov128(p, dst, dataXMM);
+         emit_store64(p, x86_make_disp(dst, 16), dataGPR, dataXMM2);
+         break;
+      case 32:   /* 16 bytes + 16 bytes */
+         emit_mov128(p, dataXMM, src);
+         emit_mov128(p, dataXMM2, x86_make_disp(src, 16));
+         emit_mov128(p, dst, dataXMM);
+         emit_mov128(p, x86_make_disp(dst, 16), dataXMM2);
+         break;
+      default:
+         assert(0);
+      }
+   }
 }
 
+static boolean translate_attr_convert( struct translate_sse *p,
+                               const struct translate_element *a,
+                               struct x86_reg src,
+                               struct x86_reg dst)
 
-static void emit_load_R8G8B8A8_UNORM( struct translate_sse *p,
-				       struct x86_reg data,
-				       struct x86_reg src )
 {
+   const struct util_format_description* input_desc = util_format_description(a->input_format);
+   const struct util_format_description* output_desc = util_format_description(a->output_format);
+   unsigned i;
+   boolean id_swizzle = TRUE;
+   unsigned swizzle[4] = {UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE};
+   unsigned needed_chans = 0;
+   unsigned imms[2] = {0, 0x3f800000};
 
-   /* Load and unpack twice:
-    */
-   sse_movss(p->func, data, src);
-   sse2_punpcklbw(p->func, data, get_identity(p));
-   sse2_punpcklbw(p->func, data, get_identity(p));
+   if(a->output_format == PIPE_FORMAT_NONE || a->input_format == PIPE_FORMAT_NONE)
+      return FALSE;
 
-   /* Convert to float:
-    */
-   sse2_cvtdq2ps(p->func, data, data);
+   if(input_desc->channel[0].size & 7)
+      return FALSE;
 
+   if(input_desc->colorspace != output_desc->colorspace)
+      return FALSE;
 
-   /* Scale by 1/255.0
-    */
-   sse_mulps(p->func, data, get_inv_255(p));
-}
+   for(i = 1; i < input_desc->nr_channels; ++i)
+   {
+      if(memcmp(&input_desc->channel[i], &input_desc->channel[0], sizeof(input_desc->channel[0])))
+         return FALSE;
+   }
 
+   for(i = 1; i < output_desc->nr_channels; ++i)
+   {
+      if(memcmp(&output_desc->channel[i], &output_desc->channel[0], sizeof(output_desc->channel[0])))
+         return FALSE;
+   }
 
+   for(i = 0; i < output_desc->nr_channels; ++i)
+   {
+      if(output_desc->swizzle[i] < 4)
+         swizzle[output_desc->swizzle[i]] = input_desc->swizzle[i];
+   }
 
+   if((x86_target_caps(p->func) & X86_SSE) && (0
+         || a->output_format == PIPE_FORMAT_R32_FLOAT
+         || a->output_format == PIPE_FORMAT_R32G32_FLOAT
+         || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT
+         || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT))
+   {
+      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
 
-static void emit_store_R32G32B32A32( struct translate_sse *p, 			   
-				     struct x86_reg dest,
-				     struct x86_reg dataXMM )
-{
-   sse_movups(p->func, dest, dataXMM);
-}
+      for(i = 0; i < output_desc->nr_channels; ++i)
+      {
+         if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels)
+            swizzle[i] = i;
+      }
 
-static void emit_store_R32G32B32( struct translate_sse *p, 
-				  struct x86_reg dest,
-				  struct x86_reg dataXMM )
-{
-   /* Emit two, shuffle, emit one.
-    */
-   sse_movlps(p->func, dest, dataXMM);
-   sse_shufps(p->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
-   sse_movss(p->func, x86_make_disp(dest,8), dataXMM);
-}
+      for(i = 0; i < output_desc->nr_channels; ++i)
+      {
+         if(swizzle[i] < 4)
+            needed_chans = MAX2(needed_chans, swizzle[i] + 1);
+         if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i)
+            id_swizzle = FALSE;
+      }
 
-static void emit_store_R32G32( struct translate_sse *p, 
-			       struct x86_reg dest,
-			       struct x86_reg dataXMM )
-{
-   sse_movlps(p->func, dest, dataXMM);
-}
+      if(needed_chans > 0)
+      {
+         switch(input_desc->channel[0].type)
+         {
+         case UTIL_FORMAT_TYPE_UNSIGNED:
+            if(!(x86_target_caps(p->func) & X86_SSE2))
+               return FALSE;
+            emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);
+
+            /* TODO: add support for SSE4.1 pmovzx */
+            switch(input_desc->channel[0].size)
+            {
+            case 8:
+               /* TODO: this may be inefficient due to the CONST_IDENTITY register being used both as a float and integer register */
+               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
+               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
+               break;
+            case 16:
+               sse2_punpcklwd(p->func, dataXMM, get_const(p, CONST_IDENTITY));
+               break;
+            case 32: /* we lose precision here */
+               sse2_psrld_imm(p->func, dataXMM, 1);
+               break;
+            default:
+               return FALSE;
+            }
+            sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
+            if(input_desc->channel[0].normalized)
+            {
+               struct x86_reg factor;
+               switch(input_desc->channel[0].size)
+               {
+               case 8:
+                  factor = get_const(p, CONST_INV_255);
+                  break;
+               case 16:
+                  factor = get_const(p, CONST_INV_65535);
+                  break;
+               case 32:
+                  factor = get_const(p, CONST_INV_2147483647);
+                  break;
+               default:
+                  assert(0);
+                  factor.disp = 0;
+                  factor.file = 0;
+                  factor.idx = 0;
+                  factor.mod = 0;
+                  break;
+               }
+               sse_mulps(p->func, dataXMM, factor);
+            }
+            else if(input_desc->channel[0].size == 32)
+               sse_addps(p->func, dataXMM, dataXMM); /* compensate for the bit we threw away to fit u32 into s32 */
+            break;
+         case UTIL_FORMAT_TYPE_SIGNED:
+            if(!(x86_target_caps(p->func) & X86_SSE2))
+               return FALSE;
+            emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);
+
+            /* TODO: add support for SSE4.1 pmovsx */
+            switch(input_desc->channel[0].size)
+            {
+            case 8:
+               sse2_punpcklbw(p->func, dataXMM, dataXMM);
+               sse2_punpcklbw(p->func, dataXMM, dataXMM);
+               sse2_psrad_imm(p->func, dataXMM, 24);
+               break;
+            case 16:
+               sse2_punpcklwd(p->func, dataXMM, dataXMM);
+               sse2_psrad_imm(p->func, dataXMM, 16);
+               break;
+            case 32: /* we lose precision here */
+               break;
+            default:
+               return FALSE;
+            }
+            sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
+            if(input_desc->channel[0].normalized)
+            {
+               struct x86_reg factor;
+               switch(input_desc->channel[0].size)
+               {
+               case 8:
+                  factor = get_const(p, CONST_INV_127);
+                  break;
+               case 16:
+                  factor = get_const(p, CONST_INV_32767);
+                  break;
+               case 32:
+                  factor = get_const(p, CONST_INV_2147483647);
+                  break;
+               default:
+                  assert(0);
+                  factor.disp = 0;
+                  factor.file = 0;
+                  factor.idx = 0;
+                  factor.mod = 0;
+                  break;
+               }
+               sse_mulps(p->func, dataXMM, factor);
+            }
+            break;
+
+            break;
+         case UTIL_FORMAT_TYPE_FLOAT:
+            if(input_desc->channel[0].size != 32 && input_desc->channel[0].size != 64)
+               return FALSE;
+            if(swizzle[3] == UTIL_FORMAT_SWIZZLE_1 && input_desc->nr_channels <= 3)
+            {
+               swizzle[3] = UTIL_FORMAT_SWIZZLE_W;
+               needed_chans = CHANNELS_0001;
+            }
+            switch(input_desc->channel[0].size)
+            {
+            case 32:
+               emit_load_float32(p, dataXMM, src, needed_chans, input_desc->nr_channels);
+               break;
+            case 64: /* we lose precision here */
+               if(!(x86_target_caps(p->func) & X86_SSE2))
+                  return FALSE;
+               emit_load_float64to32(p, dataXMM, src, needed_chans, input_desc->nr_channels);
+               break;
+            default:
+               return FALSE;
+            }
+            break;
+         default:
+            return FALSE;
+         }
 
-static void emit_store_R32( struct translate_sse *p, 
-			    struct x86_reg dest,
-			    struct x86_reg dataXMM )
-{
-   sse_movss(p->func, dest, dataXMM);
-}
+         if(!id_swizzle)
+            sse_shufps(p->func, dataXMM, dataXMM, SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3]) );
+      }
 
+      if(output_desc->nr_channels >= 4
+            && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
+            && swizzle[1] < UTIL_FORMAT_SWIZZLE_0
+            && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
+            && swizzle[3] < UTIL_FORMAT_SWIZZLE_0
+            )
+         sse_movups(p->func, dst, dataXMM);
+      else
+      {
+         if(output_desc->nr_channels >= 2
+               && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
+               && swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
+            sse_movlps(p->func, dst, dataXMM);
+         else
+         {
+            if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0)
+               sse_movss(p->func, dst, dataXMM);
+            else
+               x86_mov_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
+
+            if(output_desc->nr_channels >= 2)
+            {
+               if(swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
+               {
+                  sse_shufps(p->func, dataXMM, dataXMM, SHUF(1, 1, 2, 3));
+                  sse_movss(p->func, x86_make_disp(dst, 4), dataXMM);
+               }
+               else
+                  x86_mov_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]);
+            }
+         }
 
+         if(output_desc->nr_channels >= 3)
+         {
+            if(output_desc->nr_channels >= 4
+                  && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
+                  && swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
+               sse_movhps(p->func, x86_make_disp(dst, 8), dataXMM);
+            else
+            {
+               if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0)
+               {
+                  sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 2, 2, 3));
+                  sse_movss(p->func, x86_make_disp(dst, 8), dataXMM);
+               }
+               else
+                  x86_mov_imm(p->func, x86_make_disp(dst, 8), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);
+
+               if(output_desc->nr_channels >= 4)
+               {
+                  if(swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
+                  {
+                     sse_shufps(p->func, dataXMM, dataXMM, SHUF(3, 3, 3, 3));
+                     sse_movss(p->func, x86_make_disp(dst, 12), dataXMM);
+                  }
+                  else
+                     x86_mov_imm(p->func, x86_make_disp(dst, 12), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]);
+               }
+            }
+         }
+      }
+      return TRUE;
+   }
+   else if((x86_target_caps(p->func) & X86_SSE2) && input_desc->channel[0].size == 8 && output_desc->channel[0].size == 16
+         && output_desc->channel[0].normalized == input_desc->channel[0].normalized
+         && (0
+               || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED)
+               || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
+               || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
+               ))
+   {
+      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
+      struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
+      struct x86_reg tmp = p->tmp_EAX;
+      unsigned imms[2] = {0, 1};
+
+      for(i = 0; i < output_desc->nr_channels; ++i)
+      {
+         if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels)
+            swizzle[i] = i;
+      }
 
-static void emit_store_R8G8B8A8_UNORM( struct translate_sse *p,
-				       struct x86_reg dest,
-				       struct x86_reg dataXMM )
-{
-   /* Scale by 255.0
-    */
-   sse_mulps(p->func, dataXMM, get_255(p));
+      for(i = 0; i < output_desc->nr_channels; ++i)
+      {
+         if(swizzle[i] < 4)
+            needed_chans = MAX2(needed_chans, swizzle[i] + 1);
+         if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i)
+            id_swizzle = FALSE;
+      }
 
-   /* Pack and emit:
-    */
-   sse2_cvtps2dq(p->func, dataXMM, dataXMM);
-   sse2_packssdw(p->func, dataXMM, dataXMM);
-   sse2_packuswb(p->func, dataXMM, dataXMM);
-   sse_movss(p->func, dest, dataXMM);
-}
+      if(needed_chans > 0)
+      {
+         emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);
+
+         switch(input_desc->channel[0].type)
+         {
+         case UTIL_FORMAT_TYPE_UNSIGNED:
+            if(input_desc->channel[0].normalized)
+            {
+               sse2_punpcklbw(p->func, dataXMM, dataXMM);
+               if(output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
+        	       sse2_psrlw_imm(p->func, dataXMM, 1);
+            }
+            else
+               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
+            break;
+         case UTIL_FORMAT_TYPE_SIGNED:
+            if(input_desc->channel[0].normalized)
+            {
+               sse2_movq(p->func, tmpXMM, get_const(p, CONST_IDENTITY));
+               sse2_punpcklbw(p->func, tmpXMM, dataXMM);
+               sse2_psllw_imm(p->func, dataXMM, 9);
+               sse2_psrlw_imm(p->func, dataXMM, 8);
+               sse2_por(p->func, tmpXMM, dataXMM);
+               sse2_psrlw_imm(p->func, dataXMM, 7);
+               sse2_por(p->func, tmpXMM, dataXMM);
+               {
+                  struct x86_reg t = dataXMM;
+                  dataXMM = tmpXMM;
+                  tmpXMM = t;
+               }
+            }
+            else
+            {
+               sse2_punpcklbw(p->func, dataXMM, dataXMM);
+               sse2_psraw_imm(p->func, dataXMM, 8);
+            }
+            break;
+         default:
+            assert(0);
+         }
 
+         if(output_desc->channel[0].normalized)
+            imms[1] = (output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) ? 0xffff : 0x7fff; /* "one" as 16-bit UNORM / SNORM; has to fit in 16 bits since it is stored with mov16 or packed as (hi << 16) | lo */
 
+         if(!id_swizzle)
+            sse2_pshuflw(p->func, dataXMM, dataXMM, (swizzle[0] & 3) | ((swizzle[1] & 3) << 2) | ((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6));
+      }
 
+      if(output_desc->nr_channels >= 4
+            && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
+            && swizzle[1] < UTIL_FORMAT_SWIZZLE_0
+            && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
+            && swizzle[3] < UTIL_FORMAT_SWIZZLE_0
+            )
+         sse2_movq(p->func, dst, dataXMM);
+      else
+      {
+         if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0)
+         {
+            if(output_desc->nr_channels >= 2 && swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
+               sse2_movd(p->func, dst, dataXMM);
+            else
+            {
+               sse2_movd(p->func, tmp, dataXMM);
+               x86_mov16(p->func, dst, tmp);
+               if(output_desc->nr_channels >= 2)
+                  x86_mov16_imm(p->func, x86_make_disp(dst, 2), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]);
+            }
+         }
+         else
+         {
+            if(output_desc->nr_channels >= 2 && swizzle[1] >= UTIL_FORMAT_SWIZZLE_0)
+               x86_mov_imm(p->func, dst, (imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
+            else
+            {
+               x86_mov16_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
+               if(output_desc->nr_channels >= 2)
+               {
+                  sse2_movd(p->func, tmp, dataXMM);
+                  x86_shr_imm(p->func, tmp, 16);
+                  x86_mov16(p->func, x86_make_disp(dst, 2), tmp);
+               }
+            }
+         }
 
+         if(output_desc->nr_channels >= 3)
+         {
+            if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0)
+            {
+               if(output_desc->nr_channels >= 4 && swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
+               {
+                  sse2_psrlq_imm(p->func, dataXMM, 32);
+                  sse2_movd(p->func, x86_make_disp(dst, 4), dataXMM);
+               }
+               else
+               {
+                  sse2_psrlq_imm(p->func, dataXMM, 32);
+                  sse2_movd(p->func, tmp, dataXMM);
+                  x86_mov16(p->func, x86_make_disp(dst, 4), tmp);
+                  if(output_desc->nr_channels >= 4)
+                  {
+                     x86_mov16_imm(p->func, x86_make_disp(dst, 6), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]);
+                  }
+               }
+            }
+            else
+            {
+               if(output_desc->nr_channels >= 4 && swizzle[3] >= UTIL_FORMAT_SWIZZLE_0)
+                  x86_mov_imm(p->func, x86_make_disp(dst, 4), (imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);
+               else
+               {
+                  x86_mov16_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);
+
+                  if(output_desc->nr_channels >= 4)
+                  {
+                     sse2_psrlq_imm(p->func, dataXMM, 48);
+                     sse2_movd(p->func, tmp, dataXMM);
+                     x86_mov16(p->func, x86_make_disp(dst, 6), tmp);
+                  }
+               }
+            }
+         }
+      }
+      return TRUE;
+   }
+   else if(!memcmp(&output_desc->channel[0], &input_desc->channel[0], sizeof(output_desc->channel[0])))
+   {
+      struct x86_reg tmp = p->tmp_EAX;
+      unsigned i;
+      if(input_desc->channel[0].size == 8 && input_desc->nr_channels == 4 && output_desc->nr_channels == 4
+                     && swizzle[0] == UTIL_FORMAT_SWIZZLE_W
+                     && swizzle[1] == UTIL_FORMAT_SWIZZLE_Z
+                     && swizzle[2] == UTIL_FORMAT_SWIZZLE_Y
+                     && swizzle[3] == UTIL_FORMAT_SWIZZLE_X)
+      {
+         /* TODO: support movbe */
+         x86_mov(p->func, tmp, src);
+         x86_bswap(p->func, tmp);
+         x86_mov(p->func, dst, tmp);
+         return TRUE;
+      }
 
-/* Extended swizzles?  Maybe later.
- */  
-static void emit_swizzle( struct translate_sse *p,
-			  struct x86_reg dest,
-			  struct x86_reg src,
-			  unsigned char shuffle )
-{
-   sse_shufps(p->func, dest, src, shuffle);
-}
+      for(i = 0; i < output_desc->nr_channels; ++i)
+      {
+         switch(output_desc->channel[0].size)
+         {
+         case 8:
+            if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
+            {
+               unsigned v = 0;
+               if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
+               {
+                  switch(output_desc->channel[0].type)
+                  {
+                  case UTIL_FORMAT_TYPE_UNSIGNED:
+                     v = output_desc->channel[0].normalized ? 0xff : 1;
+                     break;
+                  case UTIL_FORMAT_TYPE_SIGNED:
+                     v = output_desc->channel[0].normalized ? 0x7f : 1;
+                     break;
+                  default:
+                     return FALSE;
+                  }
+               }
+               x86_mov8_imm(p->func, x86_make_disp(dst, i * 1), v);
+            }
+            else
+            {
+               x86_mov8(p->func, tmp, x86_make_disp(src, swizzle[i] * 1));
+               x86_mov8(p->func, x86_make_disp(dst, i * 1), tmp);
+            }
+            break;
+         case 16:
+            if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
+            {
+               unsigned v = 0; /* SWIZZLE_0 stores 0; SWIZZLE_1 picks the type's "one" below */
+               if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
+               {
+                  switch(output_desc->channel[0].type) /* channel[0] is the channel this branch was matched and sized on */
+                  {
+                  case UTIL_FORMAT_TYPE_UNSIGNED:
+                     v = output_desc->channel[0].normalized ? 0xffff : 1;
+                     break;
+                  case UTIL_FORMAT_TYPE_SIGNED:
+                     v = output_desc->channel[0].normalized ? 0x7fff : 1;
+                     break;
+                  case UTIL_FORMAT_TYPE_FLOAT:
+                     v = 0x3c00; /* 1.0 as a half float */
+                     break;
+                  default:
+                     return FALSE;
+                  }
+               }
+               x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), v);
+            }
+            else
+            {
+               /* copy the selected 16-bit source channel through tmp;
+                * constant swizzles never get here, they are handled above */
+               x86_mov16(p->func, tmp, x86_make_disp(src, swizzle[i] * 2));
+               x86_mov16(p->func, x86_make_disp(dst, i * 2), tmp);
+            }
+            break;
+         case 32:
+            if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
+            {
+               unsigned v = 0; /* SWIZZLE_0 stores 0; SWIZZLE_1 picks the type's "one" below */
+               if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
+               {
+                  switch(output_desc->channel[0].type) /* channel[0] is the channel this branch was matched and sized on */
+                  {
+                  case UTIL_FORMAT_TYPE_UNSIGNED:
+                     v = output_desc->channel[0].normalized ? 0xffffffff : 1;
+                     break;
+                  case UTIL_FORMAT_TYPE_SIGNED:
+                     v = output_desc->channel[0].normalized ? 0x7fffffff : 1;
+                     break;
+                  case UTIL_FORMAT_TYPE_FLOAT:
+                     v = 0x3f800000; /* 1.0f as IEEE-754 single */
+                     break;
+                  default:
+                     return FALSE;
+                  }
+               }
+               x86_mov_imm(p->func, x86_make_disp(dst, i * 4), v);
+            }
+            else
+            {
+               x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 4));
+               x86_mov(p->func, x86_make_disp(dst, i * 4), tmp);
+            }
+            break;
+         case 64:
+            if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
+            {
+               unsigned l = 0; /* low dword of the 64-bit constant */
+               unsigned h = 0; /* high dword of the 64-bit constant */
+               if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
+               {
+                  switch(output_desc->channel[0].type) /* channel[0] is the channel this branch was matched and sized on */
+                  {
+                  case UTIL_FORMAT_TYPE_UNSIGNED:
+                     h = output_desc->channel[0].normalized ? 0xffffffff : 0;
+                     l = output_desc->channel[0].normalized ? 0xffffffff : 1;
+                     break;
+                  case UTIL_FORMAT_TYPE_SIGNED:
+                     h = output_desc->channel[0].normalized ? 0x7fffffff : 0;
+                     l = output_desc->channel[0].normalized ? 0xffffffff : 1;
+                     break;
+                  case UTIL_FORMAT_TYPE_FLOAT:
+                     h = 0x3ff00000; /* 1.0 as IEEE-754 double: 0x3ff00000_00000000 */
+                     l = 0;
+                     break;
+                  default:
+                     return FALSE;
+                  }
+               }
+               x86_mov_imm(p->func, x86_make_disp(dst, i * 8), l);
+               x86_mov_imm(p->func, x86_make_disp(dst, i * 8 + 4), h);
+            }
+            else
+            {
+               if(x86_target_caps(p->func) & X86_SSE)
+               {
+                  struct x86_reg tmpXMM = x86_make_reg(file_XMM, 0);
+                  emit_load64(p, tmp, tmpXMM, x86_make_disp(src, swizzle[i] * 8));
+                  emit_store64(p, x86_make_disp(dst, i * 8), tmp, tmpXMM);
+               }
+               else
+               {
+                  x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8)); /* without SSE: two 32-bit moves */
+                  x86_mov(p->func, x86_make_disp(dst, i * 8), tmp);
+                  x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8 + 4));
+                  x86_mov(p->func, x86_make_disp(dst, i * 8 + 4), tmp);
+               }
+            }
+            break;
+         default:
+            return FALSE;
+         }
+      }
+      return TRUE;
+   }
+   /* special case for draw's EMIT_4UB (RGBA) and EMIT_4UB_BGRA */
+   else if((x86_target_caps(p->func) & X86_SSE2) &&
+         a->input_format == PIPE_FORMAT_R32G32B32A32_FLOAT && (0
+               || a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM
+               || a->output_format == PIPE_FORMAT_R8G8B8A8_UNORM
+         ))
+   {
+      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
 
+      /* load */
+      sse_movups(p->func, dataXMM, src);
 
-static boolean translate_attr( struct translate_sse *p,
-			       const struct translate_element *a,
-			       struct x86_reg srcECX,
-			       struct x86_reg dstEAX)
-{
-   struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
+      if (a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM)
+         sse_shufps(p->func, dataXMM, dataXMM, SHUF(2,1,0,3));
 
-   switch (a->input_format) {
-   case PIPE_FORMAT_R32_FLOAT:
-      emit_load_R32(p, dataXMM, srcECX);
-      break;
-   case PIPE_FORMAT_R32G32_FLOAT:
-      emit_load_R32G32(p, dataXMM, srcECX);
-      break;
-   case PIPE_FORMAT_R32G32B32_FLOAT:
-      emit_load_R32G32B32(p, dataXMM, srcECX);
-      break;
-   case PIPE_FORMAT_R32G32B32A32_FLOAT:
-      emit_load_R32G32B32A32(p, dataXMM, srcECX);
-      break;
-   case PIPE_FORMAT_B8G8R8A8_UNORM:
-      emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
-      emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
-      break;
-   case PIPE_FORMAT_R8G8B8A8_UNORM:
-      emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
-      break;
-   default:
-      return FALSE;
-   }
+      /* scale by 255.0 */
+      sse_mulps(p->func, dataXMM, get_const(p, CONST_255));
 
-   switch (a->output_format) {
-   case PIPE_FORMAT_R32_FLOAT:
-      emit_store_R32(p, dstEAX, dataXMM);
-      break;
-   case PIPE_FORMAT_R32G32_FLOAT:
-      emit_store_R32G32(p, dstEAX, dataXMM);
-      break;
-   case PIPE_FORMAT_R32G32B32_FLOAT:
-      emit_store_R32G32B32(p, dstEAX, dataXMM);
-      break;
-   case PIPE_FORMAT_R32G32B32A32_FLOAT:
-      emit_store_R32G32B32A32(p, dstEAX, dataXMM);
-      break;
-   case PIPE_FORMAT_B8G8R8A8_UNORM:
-      emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
-      emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
-      break;
-   case PIPE_FORMAT_R8G8B8A8_UNORM:
-      emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
-      break;
-   default:
-      return FALSE;
+      /* pack and emit */
+      sse2_cvtps2dq(p->func, dataXMM, dataXMM);
+      sse2_packssdw(p->func, dataXMM, dataXMM);
+      sse2_packuswb(p->func, dataXMM, dataXMM);
+      sse2_movd(p->func, dst, dataXMM);
+
+      return TRUE;
    }
 
-   return TRUE;
+   return FALSE;
 }
 
+/* Translate one attribute from src to dst; equal formats are copied as-is. */
+static boolean translate_attr( struct translate_sse *p,
+			       const struct translate_element *a,
+			       struct x86_reg src,
+			       struct x86_reg dst)
+{
+   /* differing formats go through the converter, whose result is returned */
+   if(a->input_format != a->output_format)
+      return translate_attr_convert(p, a, src, dst);
+
+   emit_memcpy(p, dst, src, util_format_get_stride(a->input_format, 1));
+   return TRUE;
+}
 
 static boolean init_inputs( struct translate_sse *p,
-                            boolean linear )
+                            unsigned index_size )
 {
    unsigned i;
-   struct x86_reg instance_id = x86_make_disp(p->machine_EDX,
+   struct x86_reg instance_id = x86_make_disp(p->machine_EDI,
                                               get_offset(p, &p->instance_id));
 
    for (i = 0; i < p->nr_buffer_varients; i++) {
       struct translate_buffer_varient *varient = &p->buffer_varient[i];
       struct translate_buffer *buffer = &p->buffer[varient->buffer_index];
 
-      if (linear || varient->instance_divisor) {
-         struct x86_reg buf_stride   = x86_make_disp(p->machine_EDX,
+      if (!index_size || varient->instance_divisor) {
+         struct x86_reg buf_stride   = x86_make_disp(p->machine_EDI,
                                                      get_offset(p, &buffer->stride));
-         struct x86_reg buf_ptr      = x86_make_disp(p->machine_EDX,
+         struct x86_reg buf_ptr      = x86_make_disp(p->machine_EDI,
                                                      get_offset(p, &varient->ptr));
-         struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDX,
+         struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDI,
                                                      get_offset(p, &buffer->base_ptr));
-         struct x86_reg elt = p->idx_EBX;
+         struct x86_reg elt = p->idx_ESI;
          struct x86_reg tmp_EAX = p->tmp_EAX;
 
          /* Calculate pointer to first attrib:
@@ -406,20 +1085,16 @@ static boolean init_inputs( struct translate_sse *p,
             x86_mov(p->func, tmp_EAX, instance_id);
 
             if (varient->instance_divisor != 1) {
-               struct x86_reg tmp_EDX = p->machine_EDX;
-               struct x86_reg tmp_ECX = p->outbuf_ECX;
+               struct x86_reg tmp_EDX = p->tmp2_EDX;
+               struct x86_reg tmp_ECX = p->src_ECX;
 
                /* TODO: Add x86_shr() to rtasm and use it whenever
                 *       instance divisor is power of two.
                 */
 
-               x86_push(p->func, tmp_EDX);
-               x86_push(p->func, tmp_ECX);
                x86_xor(p->func, tmp_EDX, tmp_EDX);
                x86_mov_reg_imm(p->func, tmp_ECX, varient->instance_divisor);
                x86_div(p->func, tmp_ECX);    /* EAX = EDX:EAX / ECX */
-               x86_pop(p->func, tmp_ECX);
-               x86_pop(p->func, tmp_EDX);
             }
          } else {
             x86_mov(p->func, tmp_EAX, elt);
@@ -430,16 +1105,23 @@ static boolean init_inputs( struct translate_sse *p,
           */
 
          x86_imul(p->func, tmp_EAX, buf_stride);
+         x64_rexw(p->func);
          x86_add(p->func, tmp_EAX, buf_base_ptr);
 
 
          /* In the linear case, keep the buffer pointer instead of the
           * index number.
           */
-         if (linear && p->nr_buffer_varients == 1)
+         if (!index_size && p->nr_buffer_varients == 1)
+         {
+            x64_rexw(p->func);
             x86_mov(p->func, elt, tmp_EAX);
+         }
          else
+         {
+            x64_rexw(p->func);
             x86_mov(p->func, buf_ptr, tmp_EAX);
+         }
       }
    }
 
@@ -448,44 +1130,57 @@ static boolean init_inputs( struct translate_sse *p,
 
 
 static struct x86_reg get_buffer_ptr( struct translate_sse *p,
-                                      boolean linear,
+                                      unsigned index_size,
                                       unsigned var_idx,
                                       struct x86_reg elt )
 {
    if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) {
-      return x86_make_disp(p->machine_EDX,
+      return x86_make_disp(p->machine_EDI,
                            get_offset(p, &p->instance_id));
    }
-   if (linear && p->nr_buffer_varients == 1) {
-      return p->idx_EBX;
+   if (!index_size && p->nr_buffer_varients == 1) {
+      return p->idx_ESI;
    }
-   else if (linear || p->buffer_varient[var_idx].instance_divisor) {
-      struct x86_reg ptr = p->tmp_EAX;
+   else if (!index_size || p->buffer_varient[var_idx].instance_divisor) {
+      struct x86_reg ptr = p->src_ECX;
       struct x86_reg buf_ptr = 
-         x86_make_disp(p->machine_EDX, 
+         x86_make_disp(p->machine_EDI,
                        get_offset(p, &p->buffer_varient[var_idx].ptr));
       
+      x64_rexw(p->func);
       x86_mov(p->func, ptr, buf_ptr);
       return ptr;
    }
    else {
-      struct x86_reg ptr = p->tmp_EAX;
+      struct x86_reg ptr = p->src_ECX;
       const struct translate_buffer_varient *varient = &p->buffer_varient[var_idx];
 
       struct x86_reg buf_stride = 
-         x86_make_disp(p->machine_EDX, 
+         x86_make_disp(p->machine_EDI,
                        get_offset(p, &p->buffer[varient->buffer_index].stride));
 
       struct x86_reg buf_base_ptr = 
-         x86_make_disp(p->machine_EDX, 
+         x86_make_disp(p->machine_EDI,
                        get_offset(p, &p->buffer[varient->buffer_index].base_ptr));
 
 
 
       /* Calculate pointer to current attrib:
        */
-      x86_mov(p->func, ptr, buf_stride);
-      x86_imul(p->func, ptr, elt);
+      switch(index_size)
+      {
+      case 1:
+         x86_movzx8(p->func, ptr, elt);
+         break;
+      case 2:
+         x86_movzx16(p->func, ptr, elt);
+         break;
+      case 4:
+         x86_mov(p->func, ptr, elt);
+         break;
+      }
+      x86_imul(p->func, ptr, buf_stride);
+      x64_rexw(p->func);
       x86_add(p->func, ptr, buf_base_ptr);
       return ptr;
    }
@@ -494,39 +1189,43 @@ static struct x86_reg get_buffer_ptr( struct translate_sse *p,
 
 
 static boolean incr_inputs( struct translate_sse *p, 
-                            boolean linear )
+                            unsigned index_size )
 {
-   if (linear && p->nr_buffer_varients == 1) {
-      struct x86_reg stride = x86_make_disp(p->machine_EDX,
+   if (!index_size && p->nr_buffer_varients == 1) {
+      struct x86_reg stride = x86_make_disp(p->machine_EDI,
                                             get_offset(p, &p->buffer[0].stride));
 
       if (p->buffer_varient[0].instance_divisor == 0) {
-         x86_add(p->func, p->idx_EBX, stride);
-         sse_prefetchnta(p->func, x86_make_disp(p->idx_EBX, 192));
+         x64_rexw(p->func);
+         x86_add(p->func, p->idx_ESI, stride);
+         sse_prefetchnta(p->func, x86_make_disp(p->idx_ESI, 192));
       }
    }
-   else if (linear) {
+   else if (!index_size) {
       unsigned i;
 
       /* Is this worthwhile??
        */
       for (i = 0; i < p->nr_buffer_varients; i++) {
          struct translate_buffer_varient *varient = &p->buffer_varient[i];
-         struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX,
+         struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI,
                                                 get_offset(p, &varient->ptr));
-         struct x86_reg buf_stride = x86_make_disp(p->machine_EDX,
+         struct x86_reg buf_stride = x86_make_disp(p->machine_EDI,
                                                    get_offset(p, &p->buffer[varient->buffer_index].stride));
 
          if (varient->instance_divisor == 0) {
-            x86_mov(p->func, p->tmp_EAX, buf_ptr);
-            x86_add(p->func, p->tmp_EAX, buf_stride);
+            x86_mov(p->func, p->tmp_EAX, buf_stride);
+            x64_rexw(p->func);
+            x86_add(p->func, p->tmp_EAX, buf_ptr);
             if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
+            x64_rexw(p->func);
             x86_mov(p->func, buf_ptr, p->tmp_EAX);
          }
       }
    } 
    else {
-      x86_lea(p->func, p->idx_EBX, x86_make_disp(p->idx_EBX, 4));
+      x64_rexw(p->func);
+      x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size));
    }
    
    return TRUE;
@@ -551,35 +1250,52 @@ static boolean incr_inputs( struct translate_sse *p,
  */
 static boolean build_vertex_emit( struct translate_sse *p,
 				  struct x86_function *func,
-				  boolean linear )
+				  unsigned index_size )
 {
    int fixup, label;
    unsigned j;
 
+   memset(p->reg_to_const, 0xff, sizeof(p->reg_to_const));
+   memset(p->const_to_reg, 0xff, sizeof(p->const_to_reg));
+
    p->tmp_EAX       = x86_make_reg(file_REG32, reg_AX);
-   p->idx_EBX       = x86_make_reg(file_REG32, reg_BX);
-   p->outbuf_ECX    = x86_make_reg(file_REG32, reg_CX);
-   p->machine_EDX   = x86_make_reg(file_REG32, reg_DX);
-   p->count_ESI     = x86_make_reg(file_REG32, reg_SI);
+   p->idx_ESI       = x86_make_reg(file_REG32, reg_SI);
+   p->outbuf_EBX    = x86_make_reg(file_REG32, reg_BX);
+   p->machine_EDI   = x86_make_reg(file_REG32, reg_DI);
+   p->count_EBP     = x86_make_reg(file_REG32, reg_BP);
+   p->tmp2_EDX     = x86_make_reg(file_REG32, reg_DX);
+   p->src_ECX     = x86_make_reg(file_REG32, reg_CX);
 
    p->func = func;
-   p->loaded_inv_255 = FALSE;
-   p->loaded_255 = FALSE;
-   p->loaded_identity = FALSE;
 
    x86_init_func(p->func);
 
-   /* Push a few regs?
-    */
-   x86_push(p->func, p->idx_EBX);
-   x86_push(p->func, p->count_ESI);
+   if(x86_target(p->func) == X86_64_WIN64_ABI)
+   {
+	   /* the ABI guarantees a 16-byte aligned 32-byte "shadow space" above the return address */
+	   sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8), x86_make_reg(file_XMM, 6));
+	   sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24), x86_make_reg(file_XMM, 7));
+   }
 
-   /* Load arguments into regs:
-    */
-   x86_mov(p->func, p->machine_EDX, x86_fn_arg(p->func, 1));
-   x86_mov(p->func, p->idx_EBX, x86_fn_arg(p->func, 2));
-   x86_mov(p->func, p->count_ESI, x86_fn_arg(p->func, 3));
-   x86_mov(p->func, p->outbuf_ECX, x86_fn_arg(p->func, 5));
+   x86_push(p->func, p->outbuf_EBX);
+   x86_push(p->func, p->count_EBP);
+
+/* on non-Win64 x86-64, these are already in the right registers */
+   if(x86_target(p->func) != X86_64_STD_ABI)
+   {
+      x86_push(p->func, p->machine_EDI);
+      x86_push(p->func, p->idx_ESI);
+
+      x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
+      x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
+   }
+
+   x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3));
+
+   if(x86_target(p->func) != X86_32)
+      x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5));
+   else
+      x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5));
 
    /* Load instance ID.
     */
@@ -588,25 +1304,25 @@ static boolean build_vertex_emit( struct translate_sse *p,
               p->tmp_EAX,
               x86_fn_arg(p->func, 4));
       x86_mov(p->func,
-              x86_make_disp(p->machine_EDX, get_offset(p, &p->instance_id)),
+              x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)),
               p->tmp_EAX);
    }
 
    /* Get vertex count, compare to zero
     */
    x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
-   x86_cmp(p->func, p->count_ESI, p->tmp_EAX);
+   x86_cmp(p->func, p->count_EBP, p->tmp_EAX);
    fixup = x86_jcc_forward(p->func, cc_E);
 
    /* always load, needed or not:
     */
-   init_inputs(p, linear);
+   init_inputs(p, index_size);
 
    /* Note address for loop jump
     */
    label = x86_get_label(p->func);
    {
-      struct x86_reg elt = linear ? p->idx_EBX : x86_deref(p->idx_EBX);
+      struct x86_reg elt = !index_size ? p->idx_ESI : x86_deref(p->idx_ESI);
       int last_varient = -1;
       struct x86_reg vb;
 
@@ -618,30 +1334,31 @@ static boolean build_vertex_emit( struct translate_sse *p,
           */
          if (varient != last_varient) {
             last_varient = varient;
-            vb = get_buffer_ptr(p, linear, varient, elt);
+            vb = get_buffer_ptr(p, index_size, varient, elt);
          }
          
          if (!translate_attr( p, a, 
                               x86_make_disp(vb, a->input_offset), 
-                              x86_make_disp(p->outbuf_ECX, a->output_offset)))
+                              x86_make_disp(p->outbuf_EBX, a->output_offset)))
             return FALSE;
       }
 
       /* Next output vertex:
        */
+      x64_rexw(p->func);
       x86_lea(p->func, 
-              p->outbuf_ECX, 
-              x86_make_disp(p->outbuf_ECX, 
+              p->outbuf_EBX,
+              x86_make_disp(p->outbuf_EBX,
                             p->translate.key.output_stride));
 
       /* Incr index
        */ 
-      incr_inputs( p, linear );
+      incr_inputs( p, index_size );
    }
 
    /* decr count, loop if not zero
     */
-   x86_dec(p->func, p->count_ESI);
+   x86_dec(p->func, p->count_EBP);
    x86_jcc(p->func, cc_NZ, label);
 
    /* Exit mmx state?
@@ -656,8 +1373,20 @@ static boolean build_vertex_emit( struct translate_sse *p,
    /* Pop regs and return
     */
    
-   x86_pop(p->func, p->count_ESI);
-   x86_pop(p->func, p->idx_EBX);
+   if(x86_target(p->func) != X86_64_STD_ABI)
+   {
+      x86_pop(p->func, p->idx_ESI);
+      x86_pop(p->func, p->machine_EDI);
+   }
+
+   x86_pop(p->func, p->count_EBP);
+   x86_pop(p->func, p->outbuf_EBX);
+
+   if(x86_target(p->func) == X86_64_WIN64_ABI)
+   {
+	   sse2_movdqa(p->func, x86_make_reg(file_XMM, 6), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8));
+	   sse2_movdqa(p->func, x86_make_reg(file_XMM, 7), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24));
+   }
    x86_ret(p->func);
 
    return TRUE;
@@ -697,37 +1426,7 @@ static void translate_sse_release( struct translate *translate )
    x86_release_func( &p->linear_func );
    x86_release_func( &p->elt_func );
 
-   FREE(p);
-}
-
-static void PIPE_CDECL translate_sse_run_elts( struct translate *translate,
-			      const unsigned *elts,
-			      unsigned count,
-                              unsigned instance_id,
-			      void *output_buffer )
-{
-   struct translate_sse *p = (struct translate_sse *)translate;
-
-   p->gen_run_elts( translate,
-		    elts,
-		    count,
-                    instance_id,
-                    output_buffer);
-}
-
-static void PIPE_CDECL translate_sse_run( struct translate *translate,
-			 unsigned start,
-			 unsigned count,
-                         unsigned instance_id,
-			 void *output_buffer )
-{
-   struct translate_sse *p = (struct translate_sse *)translate;
-
-   p->gen_run( translate,
-	       start,
-	       count,
-               instance_id,
-               output_buffer);
+   os_free_aligned(p);
 }
 
 
@@ -736,18 +1435,19 @@ struct translate *translate_sse2_create( const struct translate_key *key )
    struct translate_sse *p = NULL;
    unsigned i;
 
-   if (!rtasm_cpu_has_sse() || !rtasm_cpu_has_sse2())
+   /* this is misnamed, it actually refers to whether rtasm is enabled or not */
+   if (!rtasm_cpu_has_sse())
       goto fail;
 
-   p = CALLOC_STRUCT( translate_sse );
+   p = os_malloc_aligned(sizeof(struct translate_sse), 16);
    if (p == NULL) 
       goto fail;
+   memset(p, 0, sizeof(*p));
+   memcpy(p->consts, consts, sizeof(consts));
 
    p->translate.key = *key;
    p->translate.release = translate_sse_release;
    p->translate.set_buffer = translate_sse_set_buffer;
-   p->translate.run_elts = translate_sse_run_elts;
-   p->translate.run = translate_sse_run;
 
    for (i = 0; i < key->nr_elements; i++) {
       if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) {
@@ -783,18 +1483,32 @@ struct translate *translate_sse2_create( const struct translate_key *key )
 
    if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers);
 
-   if (!build_vertex_emit(p, &p->linear_func, TRUE))
+   if (!build_vertex_emit(p, &p->linear_func, 0))
+      goto fail;
+
+   if (!build_vertex_emit(p, &p->elt_func, 4))
+      goto fail;
+
+   if (!build_vertex_emit(p, &p->elt16_func, 2))
+      goto fail;
+
+   if (!build_vertex_emit(p, &p->elt8_func, 1))
+      goto fail;
+
+   p->translate.run = (void*)x86_get_func(&p->linear_func);
+   if (p->translate.run == NULL)
       goto fail;
 
-   if (!build_vertex_emit(p, &p->elt_func, FALSE))
+   p->translate.run_elts = (void*)x86_get_func(&p->elt_func);
+   if (p->translate.run_elts == NULL)
       goto fail;
 
-   p->gen_run = (run_func)x86_get_func(&p->linear_func);
-   if (p->gen_run == NULL)
+   p->translate.run_elts16 = (void*)x86_get_func(&p->elt16_func);
+   if (p->translate.run_elts16 == NULL)
       goto fail;
 
-   p->gen_run_elts = (run_elts_func)x86_get_func(&p->elt_func);
-   if (p->gen_run_elts == NULL)
+   p->translate.run_elts8 = (void*)x86_get_func(&p->elt8_func);
+   if (p->translate.run_elts8 == NULL)
       goto fail;
 
    return &p->translate;
diff --git a/src/gallium/auxiliary/util/u_bitmask.h b/src/gallium/auxiliary/util/u_bitmask.h
index 87f1110296..98b85ddecd 100644
--- a/src/gallium/auxiliary/util/u_bitmask.h
+++ b/src/gallium/auxiliary/util/u_bitmask.h
@@ -36,6 +36,9 @@
 #define U_HANDLE_BITMASK_H_
 
 
+#include "pipe/p_compiler.h"
+
+
 #ifdef __cplusplus
 extern "C" {
 #endif
diff --git a/src/gallium/auxiliary/util/u_blit.c b/src/gallium/auxiliary/util/u_blit.c
index 97fa99ec65..dfb142b9e1 100644
--- a/src/gallium/auxiliary/util/u_blit.c
+++ b/src/gallium/auxiliary/util/u_blit.c
@@ -42,6 +42,7 @@
 
 #include "util/u_blit.h"
 #include "util/u_draw_quad.h"
+#include "util/u_format.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
 #include "util/u_sampler.h"
@@ -56,15 +57,18 @@ struct blit_state
    struct cso_context *cso;
 
    struct pipe_blend_state blend;
-   struct pipe_depth_stencil_alpha_state depthstencil;
+   struct pipe_depth_stencil_alpha_state depthstencil_keep;
+   struct pipe_depth_stencil_alpha_state depthstencil_write;
    struct pipe_rasterizer_state rasterizer;
    struct pipe_sampler_state sampler;
    struct pipe_viewport_state viewport;
    struct pipe_clip_state clip;
    struct pipe_vertex_element velem[2];
+   enum pipe_texture_target internal_target;
 
    void *vs;
    void *fs[TGSI_WRITEMASK_XYZW + 1];
+   void *fs_depth;
 
    struct pipe_resource *vbuf;  /**< quad vertices */
    unsigned vbuf_slot;
@@ -95,7 +99,11 @@ util_create_blit(struct pipe_context *pipe, struct cso_context *cso)
    ctx->blend.rt[0].colormask = PIPE_MASK_RGBA;
 
    /* no-op depth/stencil/alpha */
-   memset(&ctx->depthstencil, 0, sizeof(ctx->depthstencil));
+   memset(&ctx->depthstencil_keep, 0, sizeof(ctx->depthstencil_keep));
+   memset(&ctx->depthstencil_write, 0, sizeof(ctx->depthstencil_write));
+   ctx->depthstencil_write.depth.enabled = 1;
+   ctx->depthstencil_write.depth.writemask = 1;
+   ctx->depthstencil_write.depth.func = PIPE_FUNC_ALWAYS;
 
    /* rasterizer */
    memset(&ctx->rasterizer, 0, sizeof(ctx->rasterizer));
@@ -110,7 +118,6 @@ util_create_blit(struct pipe_context *pipe, struct cso_context *cso)
    ctx->sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
    ctx->sampler.min_img_filter = 0; /* set later */
    ctx->sampler.mag_img_filter = 0; /* set later */
-   ctx->sampler.normalized_coords = 1;
 
    /* vertex elements state */
    memset(&ctx->velem[0], 0, sizeof(ctx->velem[0]) * 2);
@@ -145,6 +152,11 @@ util_create_blit(struct pipe_context *pipe, struct cso_context *cso)
       ctx->vertices[i][1][3] = 1.0f; /* q */
    }
 
+   if(pipe->screen->get_param(pipe->screen, PIPE_CAP_NPOT_TEXTURES))
+      ctx->internal_target = PIPE_TEXTURE_2D;
+   else
+      ctx->internal_target = PIPE_TEXTURE_RECT;
+
    return ctx;
 }
 
@@ -164,6 +176,9 @@ util_destroy_blit(struct blit_state *ctx)
       if (ctx->fs[i])
          pipe->delete_fs_state(pipe, ctx->fs[i]);
 
+   if (ctx->fs_depth)
+      pipe->delete_fs_state(pipe, ctx->fs_depth);
+
    pipe_resource_reference(&ctx->vbuf, NULL);
 
    FREE(ctx);
@@ -271,7 +286,7 @@ regions_overlap(int srcX0, int srcY0,
  * \param writemask  controls which channels in the dest surface are sourced
  *                   from the src surface.  Disabled channels are sourced
  *                   from (0,0,0,1).
- * XXX need some control over blitting Z and/or stencil.
+ * XXX need some control over blitting stencil.
  */
 void
 util_blit_pixels_writemask(struct blit_state *ctx,
@@ -294,8 +309,9 @@ util_blit_pixels_writemask(struct blit_state *ctx,
    const int srcW = abs(srcX1 - srcX0);
    const int srcH = abs(srcY1 - srcY0);
    unsigned offset;
-   boolean overlap;
+   boolean overlap, dst_is_depth;
    float s0, t0, s1, t1;
+   boolean normalized;
 
    assert(filter == PIPE_TEX_MIPFILTER_NEAREST ||
           filter == PIPE_TEX_MIPFILTER_LINEAR);
@@ -335,7 +351,6 @@ util_blit_pixels_writemask(struct blit_state *ctx,
       return;
    }
 
-
    /* Create a temporary texture when src and dest alias or when src
     * is anything other than a 2d texture.
     * XXX should just use appropriate shader to access 1d / 3d slice / cube face,
@@ -347,7 +362,8 @@ util_blit_pixels_writemask(struct blit_state *ctx,
        dst->face == srcsub.face &&
        dst->level == srcsub.level &&
        dst->zslice == srcZ0) ||
-       src_tex->target != PIPE_TEXTURE_2D)
+       (src_tex->target != PIPE_TEXTURE_2D &&
+       src_tex->target != PIPE_TEXTURE_RECT))
    {
       struct pipe_resource texTemp;
       struct pipe_resource *tex;
@@ -372,7 +388,7 @@ util_blit_pixels_writemask(struct blit_state *ctx,
 
       /* create temp texture */
       memset(&texTemp, 0, sizeof(texTemp));
-      texTemp.target = PIPE_TEXTURE_2D;
+      texTemp.target = ctx->internal_target;
       texTemp.format = src_tex->format;
       texTemp.last_level = 0;
       texTemp.width0 = srcW;
@@ -392,10 +408,19 @@ util_blit_pixels_writemask(struct blit_state *ctx,
                                  src_tex, srcsub, srcLeft, srcTop, srcZ0, /* src */
                                  srcW, srcH);     /* size */
 
-      s0 = 0.0f; 
-      s1 = 1.0f;
-      t0 = 0.0f;
-      t1 = 1.0f;
+      normalized = tex->target != PIPE_TEXTURE_RECT;
+      if(normalized) {
+         s0 = 0.0f;
+         s1 = 1.0f;
+         t0 = 0.0f;
+         t1 = 1.0f;
+      }
+      else {
+         s0 = 0;
+         s1 = srcW;
+         t0 = 0;
+         t1 = srcH;
+      }
 
       u_sampler_view_default_template(&sv_templ, tex, tex->format);
       sampler_view = pipe->create_sampler_view(pipe, tex, &sv_templ);
@@ -415,20 +440,29 @@ util_blit_pixels_writemask(struct blit_state *ctx,
          return;
       }
 
-      s0 = srcX0 / (float)(u_minify(sampler_view->texture->width0, srcsub.level));
-      s1 = srcX1 / (float)(u_minify(sampler_view->texture->width0, srcsub.level));
-      t0 = srcY0 / (float)(u_minify(sampler_view->texture->height0, srcsub.level));
-      t1 = srcY1 / (float)(u_minify(sampler_view->texture->height0, srcsub.level));
+      s0 = srcX0;
+      s1 = srcX1;
+      t0 = srcY0;
+      t1 = srcY1;
+      normalized = sampler_view->texture->target != PIPE_TEXTURE_RECT;
+      if(normalized)
+      {
+         s0 /= (float)(u_minify(sampler_view->texture->width0, srcsub.level));
+         s1 /= (float)(u_minify(sampler_view->texture->width0, srcsub.level));
+         t0 /= (float)(u_minify(sampler_view->texture->height0, srcsub.level));
+         t1 /= (float)(u_minify(sampler_view->texture->height0, srcsub.level));
+      }
    }
 
+   dst_is_depth = util_format_is_depth_or_stencil(dst->format);
 
-   assert(screen->is_format_supported(screen, sampler_view->format, PIPE_TEXTURE_2D,
+   assert(screen->is_format_supported(screen, sampler_view->format, ctx->internal_target,
                                       sampler_view->texture->nr_samples,
                                       PIPE_BIND_SAMPLER_VIEW, 0));
-   assert(screen->is_format_supported(screen, dst->format, PIPE_TEXTURE_2D,
+   assert(screen->is_format_supported(screen, dst->format, ctx->internal_target,
                                       dst->texture->nr_samples,
-                                      PIPE_BIND_RENDER_TARGET, 0));
-
+                                      dst_is_depth ? PIPE_BIND_DEPTH_STENCIL :
+                                                     PIPE_BIND_RENDER_TARGET, 0));
    /* save state (restored below) */
    cso_save_blend(ctx->cso);
    cso_save_depth_stencil_alpha(ctx->cso);
@@ -444,12 +478,15 @@ util_blit_pixels_writemask(struct blit_state *ctx,
 
    /* set misc state we care about */
    cso_set_blend(ctx->cso, &ctx->blend);
-   cso_set_depth_stencil_alpha(ctx->cso, &ctx->depthstencil);
+   cso_set_depth_stencil_alpha(ctx->cso,
+                               dst_is_depth ? &ctx->depthstencil_write :
+                                              &ctx->depthstencil_keep);
    cso_set_rasterizer(ctx->cso, &ctx->rasterizer);
    cso_set_clip(ctx->cso, &ctx->clip);
    cso_set_vertex_elements(ctx->cso, 2, ctx->velem);
 
    /* sampler */
+   ctx->sampler.normalized_coords = normalized;
    ctx->sampler.min_img_filter = filter;
    ctx->sampler.mag_img_filter = filter;
    /* we've limited this already with the sampler view but you never know... */
@@ -472,22 +509,35 @@ util_blit_pixels_writemask(struct blit_state *ctx,
    /* texture */
    cso_set_fragment_sampler_views(ctx->cso, 1, &sampler_view);
 
-   if (ctx->fs[writemask] == NULL)
-      ctx->fs[writemask] =
-         util_make_fragment_tex_shader_writemask(pipe, TGSI_TEXTURE_2D,
-                                                 TGSI_INTERPOLATE_LINEAR,
-                                                 writemask);
-
    /* shaders */
-   cso_set_fragment_shader_handle(ctx->cso, ctx->fs[writemask]);
+   if (dst_is_depth) {
+      if (ctx->fs_depth == NULL)
+         ctx->fs_depth =
+            util_make_fragment_tex_shader_writedepth(pipe, TGSI_TEXTURE_2D,
+                                                     TGSI_INTERPOLATE_LINEAR);
+
+      cso_set_fragment_shader_handle(ctx->cso, ctx->fs_depth);
+   } else {
+      if (ctx->fs[writemask] == NULL)
+         ctx->fs[writemask] =
+            util_make_fragment_tex_shader_writemask(pipe, TGSI_TEXTURE_2D,
+                                                    TGSI_INTERPOLATE_LINEAR,
+                                                    writemask);
+
+      cso_set_fragment_shader_handle(ctx->cso, ctx->fs[writemask]);
+   }
    cso_set_vertex_shader_handle(ctx->cso, ctx->vs);
 
    /* drawing dest */
    memset(&fb, 0, sizeof(fb));
    fb.width = dst->width;
    fb.height = dst->height;
-   fb.nr_cbufs = 1;
-   fb.cbufs[0] = dst;
+   if (dst_is_depth) {
+      fb.zsbuf = dst;
+   } else {
+      fb.nr_cbufs = 1;
+      fb.cbufs[0] = dst;
+   }
    cso_set_framebuffer(ctx->cso, &fb);
 
    /* draw quad */
@@ -574,6 +624,7 @@ util_blit_pixels_tex(struct blit_state *ctx,
                      int dstX1, int dstY1,
                      float z, uint filter)
 {
+   boolean normalized = src_sampler_view->texture->target != PIPE_TEXTURE_RECT;
    struct pipe_framebuffer_state fb;
    float s0, t0, s1, t1;
    unsigned offset;
@@ -586,10 +637,18 @@ util_blit_pixels_tex(struct blit_state *ctx,
    assert(tex->width0 != 0);
    assert(tex->height0 != 0);
 
-   s0 = srcX0 / (float)tex->width0;
-   s1 = srcX1 / (float)tex->width0;
-   t0 = srcY0 / (float)tex->height0;
-   t1 = srcY1 / (float)tex->height0;
+   s0 = srcX0;
+   s1 = srcX1;
+   t0 = srcY0;
+   t1 = srcY1;
+
+   if(normalized)
+   {
+      s0 /= (float)tex->width0;
+      s1 /= (float)tex->width0;
+      t0 /= (float)tex->height0;
+      t1 /= (float)tex->height0;
+   }
 
    assert(ctx->pipe->screen->is_format_supported(ctx->pipe->screen, dst->format,
                                                  PIPE_TEXTURE_2D,
@@ -611,12 +670,13 @@ util_blit_pixels_tex(struct blit_state *ctx,
 
    /* set misc state we care about */
    cso_set_blend(ctx->cso, &ctx->blend);
-   cso_set_depth_stencil_alpha(ctx->cso, &ctx->depthstencil);
+   cso_set_depth_stencil_alpha(ctx->cso, &ctx->depthstencil_keep);
    cso_set_rasterizer(ctx->cso, &ctx->rasterizer);
    cso_set_clip(ctx->cso, &ctx->clip);
    cso_set_vertex_elements(ctx->cso, 2, ctx->velem);
 
    /* sampler */
+   ctx->sampler.normalized_coords = normalized;
    ctx->sampler.min_img_filter = filter;
    ctx->sampler.mag_img_filter = filter;
    cso_single_sampler(ctx->cso, 0, &ctx->sampler);
diff --git a/src/gallium/auxiliary/util/u_blit.h b/src/gallium/auxiliary/util/u_blit.h
index ef95134f32..b8a0dfce13 100644
--- a/src/gallium/auxiliary/util/u_blit.h
+++ b/src/gallium/auxiliary/util/u_blit.h
@@ -30,18 +30,20 @@
 #define U_BLIT_H
 
 
+#include "pipe/p_compiler.h"
+
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
    
+struct cso_context;
 struct pipe_context;
-struct pipe_surface;
 struct pipe_resource;
-struct cso_context;
-
-
-struct blit_state;
+struct pipe_sampler_view;
+struct pipe_subresource;
+struct pipe_surface;
 
 
 extern struct blit_state *
diff --git a/src/gallium/auxiliary/util/u_blitter.c b/src/gallium/auxiliary/util/u_blitter.c
index b5b86b7214..f93ef26ae7 100644
--- a/src/gallium/auxiliary/util/u_blitter.c
+++ b/src/gallium/auxiliary/util/u_blitter.c
@@ -92,7 +92,7 @@ struct blitter_context_priv
    void *velem_state;
 
    /* Sampler state for clamping to a miplevel. */
-   void *sampler_state[PIPE_MAX_TEXTURE_LEVELS];
+   void *sampler_state[PIPE_MAX_TEXTURE_LEVELS * 2];
 
    /* Rasterizer state. */
    void *rs_state;
@@ -254,6 +254,7 @@ void util_blitter_destroy(struct blitter_context *blitter)
                                           ctx->dsa_write_depth_keep_stencil);
    pipe->delete_depth_stencil_alpha_state(pipe, ctx->dsa_write_depth_stencil);
    pipe->delete_depth_stencil_alpha_state(pipe, ctx->dsa_keep_depth_write_stencil);
+   pipe->delete_depth_stencil_alpha_state(pipe, ctx->dsa_flush_depth_stencil);
 
    pipe->delete_rasterizer_state(pipe, ctx->rs_state);
    pipe->delete_vs_state(pipe, ctx->vs_col);
@@ -271,7 +272,7 @@ void util_blitter_destroy(struct blitter_context *blitter)
       if (ctx->fs_col[i])
          pipe->delete_fs_state(pipe, ctx->fs_col[i]);
 
-   for (i = 0; i < PIPE_MAX_TEXTURE_LEVELS; i++)
+   for (i = 0; i < PIPE_MAX_TEXTURE_LEVELS * 2; i++)
       if (ctx->sampler_state[i])
          pipe->delete_sampler_state(pipe, ctx->sampler_state[i]);
 
@@ -319,7 +320,7 @@ static void blitter_restore_CSOs(struct blitter_context_priv *ctx)
     */
    if (ctx->base.saved_fb_state.nr_cbufs != ~0) {
       pipe->set_framebuffer_state(pipe, &ctx->base.saved_fb_state);
-      util_assign_framebuffer_state(&ctx->base.saved_fb_state, NULL);
+      util_unreference_framebuffer_state(&ctx->base.saved_fb_state);
       ctx->base.saved_fb_state.nr_cbufs = ~0;
    }
 
@@ -417,16 +418,26 @@ static void blitter_set_clear_color(struct blitter_context_priv *ctx,
    }
 }
 
-static void get_normalized_texcoords(struct pipe_resource *src,
+static void get_texcoords(struct pipe_resource *src,
                                      struct pipe_subresource subsrc,
                                      unsigned x1, unsigned y1,
                                      unsigned x2, unsigned y2,
-                                     float out[4])
+                                     boolean normalized, float out[4])
 {
-   out[0] = x1 / (float)u_minify(src->width0,  subsrc.level);
-   out[1] = y1 / (float)u_minify(src->height0, subsrc.level);
-   out[2] = x2 / (float)u_minify(src->width0,  subsrc.level);
-   out[3] = y2 / (float)u_minify(src->height0, subsrc.level);
+   if(normalized) /* divide by u_minify() of the base width/height at subsrc.level */
+   {
+      out[0] = x1 / (float)u_minify(src->width0,  subsrc.level);
+      out[1] = y1 / (float)u_minify(src->height0, subsrc.level);
+      out[2] = x2 / (float)u_minify(src->width0,  subsrc.level);
+      out[3] = y2 / (float)u_minify(src->height0, subsrc.level);
+   }
+   else /* unnormalized: pass x1,y1,x2,y2 through unchanged (src/subsrc unused) */
+   {
+      out[0] = x1;
+      out[1] = y1;
+      out[2] = x2;
+      out[3] = y2;
+   }
 }
 
 static void set_texcoords_in_vertices(const float coord[4],
@@ -454,7 +465,7 @@ static void blitter_set_texcoords_2d(struct blitter_context_priv *ctx,
    unsigned i;
    float coord[4];
 
-   get_normalized_texcoords(src, subsrc, x1, y1, x2, y2, coord);
+   get_texcoords(src, subsrc, x1, y1, x2, y2, TRUE, coord);
    set_texcoords_in_vertices(coord, &ctx->vertices[0][1][0], 8);
 
    for (i = 0; i < 4; i++) {
@@ -489,7 +500,7 @@ static void blitter_set_texcoords_cube(struct blitter_context_priv *ctx,
    float coord[4];
    float st[4][2];
 
-   get_normalized_texcoords(src, subsrc, x1, y1, x2, y2, coord);
+   get_texcoords(src, subsrc, x1, y1, x2, y2, TRUE, coord);
    set_texcoords_in_vertices(coord, &st[0][0], 2);
 
    util_map_texcoords2d_onto_cubemap(subsrc.face,
@@ -523,7 +534,7 @@ static void blitter_draw_quad(struct blitter_context_priv *ctx)
 
 static INLINE
 void **blitter_get_sampler_state(struct blitter_context_priv *ctx,
-                                 int miplevel)
+                                 int miplevel, boolean normalized)
 {
    struct pipe_context *pipe = ctx->base.pipe;
    struct pipe_sampler_state *sampler_state = &ctx->template_sampler_state;
@@ -531,18 +542,19 @@ void **blitter_get_sampler_state(struct blitter_context_priv *ctx,
    assert(miplevel < PIPE_MAX_TEXTURE_LEVELS);
 
    /* Create the sampler state on-demand. */
-   if (!ctx->sampler_state[miplevel]) {
+   if (!ctx->sampler_state[miplevel * 2 + normalized]) {
       sampler_state->lod_bias = miplevel;
       sampler_state->min_lod = miplevel;
       sampler_state->max_lod = miplevel;
+      sampler_state->normalized_coords = normalized;
 
-      ctx->sampler_state[miplevel] = pipe->create_sampler_state(pipe,
+      ctx->sampler_state[miplevel * 2 + normalized] = pipe->create_sampler_state(pipe,
                                                                 sampler_state);
    }
 
    /* Return void** so that it can be passed to bind_fragment_sampler_states
     * directly. */
-   return &ctx->sampler_state[miplevel];
+   return &ctx->sampler_state[miplevel * 2 + normalized];
 }
 
 static INLINE
@@ -568,6 +580,8 @@ pipe_tex_to_tgsi_tex(enum pipe_texture_target pipe_tex_target)
       return TGSI_TEXTURE_1D;
    case PIPE_TEXTURE_2D:
       return TGSI_TEXTURE_2D;
+   case PIPE_TEXTURE_RECT:
+      return TGSI_TEXTURE_RECT;
    case PIPE_TEXTURE_3D:
       return TGSI_TEXTURE_3D;
    case PIPE_TEXTURE_CUBE:
@@ -716,6 +730,7 @@ void util_blitter_copy_region(struct blitter_context *blitter,
    struct pipe_sampler_view viewTempl, *view;
    unsigned bind;
    boolean is_stencil, is_depth;
+   boolean normalized;
 
    /* Give up if textures are not set. */
    assert(dst && src);
@@ -787,6 +802,8 @@ void util_blitter_copy_region(struct blitter_context *blitter,
       fb_state.zsbuf = 0;
    }
 
+   normalized = src->target != PIPE_TEXTURE_RECT;
+
    /* Initialize sampler view. */
    u_sampler_view_default_template(&viewTempl, src, src->format);
    view = pipe->create_sampler_view(pipe, src, &viewTempl);
@@ -795,7 +812,7 @@ void util_blitter_copy_region(struct blitter_context *blitter,
    pipe->bind_rasterizer_state(pipe, ctx->rs_state);
    pipe->bind_vs_state(pipe, ctx->vs_tex);
    pipe->bind_fragment_sampler_states(pipe, 1,
-                                      blitter_get_sampler_state(ctx, subsrc.level));
+                                      blitter_get_sampler_state(ctx, subsrc.level, normalized));
    pipe->bind_vertex_elements_state(pipe, ctx->velem_state);
    pipe->set_fragment_sampler_views(pipe, 1, &view);
    pipe->set_framebuffer_state(pipe, &fb_state);
@@ -806,11 +823,12 @@ void util_blitter_copy_region(struct blitter_context *blitter,
       /* Draw the quad with the draw_rectangle callback. */
       case PIPE_TEXTURE_1D:
       case PIPE_TEXTURE_2D:
+      case PIPE_TEXTURE_RECT:
          {
             /* Set texture coordinates. */
             float coord[4];
-            get_normalized_texcoords(src, subsrc, srcx, srcy,
-                                     srcx+width, srcy+height, coord);
+            get_texcoords(src, subsrc, srcx, srcy,
+                                     srcx+width, srcy+height, normalized, coord);
 
             /* Draw. */
             blitter->draw_rectangle(blitter, dstx, dsty, dstx+width, dsty+height, 0,
diff --git a/src/gallium/auxiliary/util/u_blitter.h b/src/gallium/auxiliary/util/u_blitter.h
index f316587dea..e33d2e283f 100644
--- a/src/gallium/auxiliary/util/u_blitter.h
+++ b/src/gallium/auxiliary/util/u_blitter.h
@@ -27,6 +27,7 @@
 #ifndef U_BLITTER_H
 #define U_BLITTER_H
 
+#include "util/u_framebuffer.h"
 #include "util/u_inlines.h"
 #include "util/u_memory.h"
 
@@ -258,45 +259,12 @@ void util_blitter_save_vertex_shader(struct blitter_context *blitter,
    blitter->saved_vs = vs;
 }
 
-/* XXX This should probably be moved elsewhere. */
-static INLINE
-void util_assign_framebuffer_state(struct pipe_framebuffer_state *dst,
-                                   const struct pipe_framebuffer_state *src)
-{
-   unsigned i;
-
-   if (src) {
-      /* Reference all surfaces. */
-      for (i = 0; i < src->nr_cbufs; i++) {
-         pipe_surface_reference(&dst->cbufs[i], src->cbufs[i]);
-      }
-      for (; i < dst->nr_cbufs; i++) {
-         pipe_surface_reference(&dst->cbufs[i], NULL);
-      }
-
-      pipe_surface_reference(&dst->zsbuf, src->zsbuf);
-
-      dst->nr_cbufs = src->nr_cbufs;
-      dst->width = src->width;
-      dst->height = src->height;
-   } else {
-      /* Set all surfaces to NULL. */
-      for (i = 0; i < dst->nr_cbufs; i++) {
-         pipe_surface_reference(&dst->cbufs[i], NULL);
-      }
-
-      pipe_surface_reference(&dst->zsbuf, NULL);
-
-      dst->nr_cbufs = 0;
-   }
-}
-
 static INLINE
 void util_blitter_save_framebuffer(struct blitter_context *blitter,
                                    const struct pipe_framebuffer_state *state)
 {
    blitter->saved_fb_state.nr_cbufs = 0; /* It's ~0 now, meaning it's unsaved. */
-   util_assign_framebuffer_state(&blitter->saved_fb_state, state);
+   util_copy_framebuffer_state(&blitter->saved_fb_state, state);
 }
 
 static INLINE
diff --git a/src/gallium/auxiliary/util/u_cpu_detect.c b/src/gallium/auxiliary/util/u_cpu_detect.c
index 5056351307..32519b148b 100644
--- a/src/gallium/auxiliary/util/u_cpu_detect.c
+++ b/src/gallium/auxiliary/util/u_cpu_detect.c
@@ -73,7 +73,9 @@
 #endif
 
 
+#ifdef DEBUG
 DEBUG_GET_ONCE_BOOL_OPTION(dump_cpu, "GALLIUM_DUMP_CPU", FALSE)
+#endif
 
 
 struct util_cpu_caps util_cpu_caps;
@@ -83,61 +85,6 @@ static int has_cpuid(void);
 #endif
 
 
-#if defined(PIPE_ARCH_X86)
-
-/* The sigill handlers */
-#if defined(PIPE_OS_LINUX) /*&& defined(_POSIX_SOURCE) && defined(X86_FXSR_MAGIC)*/
-static void
-sigill_handler_sse(int signal, struct sigcontext sc)
-{
-   /* Both the "xorps %%xmm0,%%xmm0" and "divps %xmm0,%%xmm1"
-    * instructions are 3 bytes long.  We must increment the instruction
-    * pointer manually to avoid repeated execution of the offending
-    * instruction.
-    *
-    * If the SIGILL is caused by a divide-by-zero when unmasked
-    * exceptions aren't supported, the SIMD FPU status and control
-    * word will be restored at the end of the test, so we don't need
-    * to worry about doing it here.  Besides, we may not be able to...
-    */
-   sc.eip += 3;
-
-   util_cpu_caps.has_sse=0;
-}
-
-static void
-sigfpe_handler_sse(int signal, struct sigcontext sc)
-{
-   if (sc.fpstate->magic != 0xffff) {
-      /* Our signal context has the extended FPU state, so reset the
-       * divide-by-zero exception mask and clear the divide-by-zero
-       * exception bit.
-       */
-      sc.fpstate->mxcsr |= 0x00000200;
-      sc.fpstate->mxcsr &= 0xfffffffb;
-   } else {
-      /* If we ever get here, we're completely hosed.
-      */
-   }
-}
-#endif /* PIPE_OS_LINUX && _POSIX_SOURCE && X86_FXSR_MAGIC */
-
-#if defined(PIPE_OS_WINDOWS)
-static LONG CALLBACK
-win32_sig_handler_sse(EXCEPTION_POINTERS* ep)
-{
-   if(ep->ExceptionRecord->ExceptionCode==EXCEPTION_ILLEGAL_INSTRUCTION){
-      ep->ContextRecord->Eip +=3;
-      util_cpu_caps.has_sse=0;
-      return EXCEPTION_CONTINUE_EXECUTION;
-   }
-   return EXCEPTION_CONTINUE_SEARCH;
-}
-#endif /* PIPE_OS_WINDOWS */
-
-#endif /* PIPE_ARCH_X86 */
-
-
 #if defined(PIPE_ARCH_PPC) && !defined(PIPE_OS_APPLE)
 static jmp_buf  __lv_powerpc_jmpbuf;
 static volatile sig_atomic_t __lv_powerpc_canjump = 0;
@@ -194,123 +141,8 @@ check_os_altivec_support(void)
 }
 #endif /* PIPE_ARCH_PPC */
 
-/* If we're running on a processor that can do SSE, let's see if we
- * are allowed to or not.  This will catch 2.4.0 or later kernels that
- * haven't been configured for a Pentium III but are running on one,
- * and RedHat patched 2.2 kernels that have broken exception handling
- * support for user space apps that do SSE.
- */
-#if defined(PIPE_ARCH_X86) || defined (PIPE_ARCH_X86_64)
-static void
-check_os_katmai_support(void)
-{
-#if defined(PIPE_ARCH_X86)
-#if defined(PIPE_OS_FREEBSD)
-   int has_sse=0, ret;
-   int len = sizeof (has_sse);
-
-   ret = sysctlbyname("hw.instruction_sse", &has_sse, &len, NULL, 0);
-   if (ret || !has_sse)
-      util_cpu_caps.has_sse=0;
-
-#elif defined(PIPE_OS_NETBSD) || defined(PIPE_OS_OPENBSD)
-   int has_sse, has_sse2, ret, mib[2];
-   int varlen;
-
-   mib[0] = CTL_MACHDEP;
-   mib[1] = CPU_SSE;
-   varlen = sizeof (has_sse);
-
-   ret = sysctl(mib, 2, &has_sse, &varlen, NULL, 0);
-   if (ret < 0 || !has_sse) {
-      util_cpu_caps.has_sse = 0;
-   } else {
-      util_cpu_caps.has_sse = 1;
-   }
-
-   mib[1] = CPU_SSE2;
-   varlen = sizeof (has_sse2);
-   ret = sysctl(mib, 2, &has_sse2, &varlen, NULL, 0);
-   if (ret < 0 || !has_sse2) {
-      util_cpu_caps.has_sse2 = 0;
-   } else {
-      util_cpu_caps.has_sse2 = 1;
-   }
-   util_cpu_caps.has_sse = 0; /* FIXME ?!?!? */
-
-#elif defined(PIPE_OS_WINDOWS)
-   LPTOP_LEVEL_EXCEPTION_FILTER exc_fil;
-   if (util_cpu_caps.has_sse) {
-      exc_fil = SetUnhandledExceptionFilter(win32_sig_handler_sse);
-#if defined(PIPE_CC_GCC)
-      __asm __volatile ("xorps %xmm0, %xmm0");
-#elif defined(PIPE_CC_MSVC)
-      __asm {
-          xorps xmm0, xmm0        /* executing SSE instruction */
-      }
-#else
-#error Unsupported compiler
-#endif
-      SetUnhandledExceptionFilter(exc_fil);
-   }
-#elif defined(PIPE_OS_LINUX)
-   struct sigaction saved_sigill;
-   struct sigaction saved_sigfpe;
-
-   /* Save the original signal handlers.
-   */
-   sigaction(SIGILL, NULL, &saved_sigill);
-   sigaction(SIGFPE, NULL, &saved_sigfpe);
-
-   signal(SIGILL, (void (*)(int))sigill_handler_sse);
-   signal(SIGFPE, (void (*)(int))sigfpe_handler_sse);
-
-   /* Emulate test for OSFXSR in CR4.  The OS will set this bit if it
-    * supports the extended FPU save and restore required for SSE.  If
-    * we execute an SSE instruction on a PIII and get a SIGILL, the OS
-    * doesn't support Streaming SIMD Exceptions, even if the processor
-    * does.
-    */
-   if (util_cpu_caps.has_sse) {
-      __asm __volatile ("xorps %xmm1, %xmm0");
-   }
-
-   /* Emulate test for OSXMMEXCPT in CR4.  The OS will set this bit if
-    * it supports unmasked SIMD FPU exceptions.  If we unmask the
-    * exceptions, do a SIMD divide-by-zero and get a SIGILL, the OS
-    * doesn't support unmasked SIMD FPU exceptions.  If we get a SIGFPE
-    * as expected, we're okay but we need to clean up after it.
-    *
-    * Are we being too stringent in our requirement that the OS support
-    * unmasked exceptions?  Certain RedHat 2.2 kernels enable SSE by
-    * setting CR4.OSFXSR but don't support unmasked exceptions.  Win98
-    * doesn't even support them.  We at least know the user-space SSE
-    * support is good in kernels that do support unmasked exceptions,
-    * and therefore to be safe I'm going to leave this test in here.
-    */
-   if (util_cpu_caps.has_sse) {
-      /* test_os_katmai_exception_support(); */
-   }
-
-   /* Restore the original signal handlers.
-   */
-   sigaction(SIGILL, &saved_sigill, NULL);
-   sigaction(SIGFPE, &saved_sigfpe, NULL);
-
-#else
-   /* We can't use POSIX signal handling to test the availability of
-    * SSE, so we disable it by default.
-    */
-   util_cpu_caps.has_sse = 0;
-#endif /* __linux__ */
-#endif
-
-#if defined(PIPE_ARCH_X86_64)
-   util_cpu_caps.has_sse = 1;
-#endif
-}
-
 
+#if defined(PIPE_ARCH_X86) || defined (PIPE_ARCH_X86_64)
 static int has_cpuid(void)
 {
 #if defined(PIPE_ARCH_X86)
@@ -469,9 +301,6 @@ util_cpu_detect(void)
          util_cpu_caps.cacheline = regs2[2] & 0xFF;
       }
 
-      if (util_cpu_caps.has_sse)
-         check_os_katmai_support();
-
       if (!util_cpu_caps.has_sse) {
          util_cpu_caps.has_sse2 = 0;
          util_cpu_caps.has_sse3 = 0;
diff --git a/src/gallium/auxiliary/util/u_debug_describe.c b/src/gallium/auxiliary/util/u_debug_describe.c
new file mode 100644
index 0000000000..1c90ff3106
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_debug_describe.c
@@ -0,0 +1,81 @@
+/**************************************************************************
+ *
+ * Copyright 2010 Luca Barbieri
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include <pipe/p_state.h>
+#include <util/u_format.h>
+#include <util/u_debug_describe.h>
+#include <util/u_string.h>
+
+void
+debug_describe_reference(char* buf, const struct pipe_reference*ptr)
+{
+   strcpy(buf, "pipe_object"); /* fixed generic label written to buf; ptr is never read */
+}
+
+void
+debug_describe_resource(char* buf, const struct pipe_resource *ptr)
+{
+   switch(ptr->target) /* writes one "pipe_<kind><...>" label into buf, chosen by target */
+   {
+   case PIPE_BUFFER: /* only value shown: util_format_get_stride(format, width0) */
+      util_sprintf(buf, "pipe_buffer<%u>", (unsigned)util_format_get_stride(ptr->format, ptr->width0));
+      break;
+   case PIPE_TEXTURE_1D: /* <width0,format short name,last_level> */
+      util_sprintf(buf, "pipe_texture1d<%u,%s,%u>", ptr->width0, util_format_short_name(ptr->format), ptr->last_level);
+      break;
+   case PIPE_TEXTURE_2D: /* <width0,height0,format short name,last_level> */
+      util_sprintf(buf, "pipe_texture2d<%u,%u,%s,%u>", ptr->width0, ptr->height0, util_format_short_name(ptr->format), ptr->last_level);
+      break;
+   case PIPE_TEXTURE_RECT: /* same as 2D but last_level is not printed */
+      util_sprintf(buf, "pipe_texture_rect<%u,%u,%s>", ptr->width0, ptr->height0, util_format_short_name(ptr->format));
+      break;
+   case PIPE_TEXTURE_CUBE: /* <width0,height0,format short name,last_level> */
+      util_sprintf(buf, "pipe_texture_cube<%u,%u,%s,%u>", ptr->width0, ptr->height0, util_format_short_name(ptr->format), ptr->last_level);
+      break;
+   case PIPE_TEXTURE_3D: /* <width0,height0,depth0,format short name,last_level> */
+      util_sprintf(buf, "pipe_texture3d<%u,%u,%u,%s,%u>", ptr->width0, ptr->height0, ptr->depth0, util_format_short_name(ptr->format), ptr->last_level);
+      break;
+   default: /* target not handled above: print its numeric value */
+      util_sprintf(buf, "pipe_martian_resource<%u>", ptr->target);
+      break;
+   }
+}
+
+void
+debug_describe_surface(char* buf, const struct pipe_surface *ptr)
+{
+   char res[128]; /* scratch for the nested resource label, filled with no size check. NOTE(review): u_debug_describe.h calls 256 bytes necessary - confirm 128 suffices */
+   debug_describe_resource(res, ptr->texture); /* describe the surface's underlying texture first */
+   util_sprintf(buf, "pipe_surface<%s,%u,%u,%u>", res, ptr->face, ptr->level, ptr->zslice);
+}
+
+void
+debug_describe_sampler_view(char* buf, const struct pipe_sampler_view *ptr)
+{
+   char res[128]; /* scratch for the nested resource label, filled with no size check. NOTE(review): u_debug_describe.h calls 256 bytes necessary - confirm 128 suffices */
+   debug_describe_resource(res, ptr->texture); /* describe the view's underlying texture first */
+   util_sprintf(buf, "pipe_sampler_view<%s,%s>", res, util_format_short_name(ptr->format));
+}
diff --git a/src/gallium/auxiliary/util/u_debug_describe.h b/src/gallium/auxiliary/util/u_debug_describe.h
new file mode 100644
index 0000000000..26d1f803bf
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_debug_describe.h
@@ -0,0 +1,49 @@
+/**************************************************************************
+ *
+ * Copyright 2010 Luca Barbieri
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef U_DEBUG_DESCRIBE_H_
+#define U_DEBUG_DESCRIBE_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct pipe_reference;
+struct pipe_resource;
+struct pipe_surface;
+struct pipe_sampler_view;
+
+/* a 256-byte buffer is necessary and sufficient */
+void debug_describe_reference(char* buf, const struct pipe_reference*ptr);
+void debug_describe_resource(char* buf, const struct pipe_resource *ptr);
+void debug_describe_surface(char* buf, const struct pipe_surface *ptr);
+void debug_describe_sampler_view(char* buf, const struct pipe_sampler_view *ptr);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* U_DEBUG_DESCRIBE_H_ */
diff --git a/src/gallium/auxiliary/util/u_debug_refcnt.c b/src/gallium/auxiliary/util/u_debug_refcnt.c
new file mode 100644
index 0000000000..40a26c9c69
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_debug_refcnt.c
@@ -0,0 +1,181 @@
+/**************************************************************************
+ *
+ * Copyright 2010 Luca Barbieri
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#if defined(DEBUG) && (!defined(PIPE_OS_WINDOWS) || defined(PIPE_SUBSYSTEM_WINDOWS_USER))
+
+/* see http://www.mozilla.org/performance/refcnt-balancer.html for what to do with the output
+ * on Linux, use tools/addr2line.sh to postprocess it before anything else
+ **/
+#include <util/u_debug.h>
+#include <util/u_debug_refcnt.h>
+#include <util/u_debug_stack.h>
+#include <util/u_debug_symbol.h>
+#include <util/u_string.h>
+#include <util/u_hash_table.h>
+#include <os/os_thread.h>
+#include <os/os_stream.h>
+
+int debug_refcnt_state; /* 0: not set up yet, > 0: logging to 'stream', < 0: logging disabled */
+
+static struct os_stream* stream; /* log destination; static so this generic name stays out of the global namespace */
+
+/* TODO: maybe move this serial machinery to a stand-alone module and expose it? */
+static pipe_mutex serials_mutex; /* guards serials_hash and serials_last; NOTE(review): never explicitly initialized in this file - confirm zero-init is a valid mutex everywhere */
+static struct util_hash_table* serials_hash; /* object pointer -> serial number, created on first use */
+static unsigned serials_last; /* last serial handed out; 0 is never used as a serial */
+/* Hash callback for the serials table: the key pointer's address converted to unsigned. */
+static unsigned hash_ptr(void* p)
+{
+   return (unsigned)(uintptr_t)p;
+}
+/* Key comparator for the serials table: three-way ordering on the raw addresses. */
+static int compare_ptr(void* a, void* b)
+{
+   if(a < b)
+      return -1;
+   if(a > b)
+      return 1;
+   /* neither below nor above: the same key */
+   return 0;
+}
+/* Stores the serial number for p in *pserial, assigning a new one if needed; returns TRUE if p already had one. */
+static boolean debug_serial(void* p, unsigned* pserial)
+{
+   unsigned serial;
+   boolean found = TRUE;
+   pipe_mutex_lock(serials_mutex);
+   if(!serials_hash) /* the table is created on first use */
+      serials_hash = util_hash_table_create(hash_ptr, compare_ptr);
+   serial = (unsigned)(uintptr_t)util_hash_table_get(serials_hash, p); /* the serial is stored directly in the value pointer */
+   if(!serial) /* zero is treated as "no serial yet"; the code below never assigns 0 */
+   {
+      /* once the counter wraps to 0 it is time to stop logging... (you'll have a 100 GB logfile at least at this point)
+       * TODO: avoid this
+       */
+      serial = ++serials_last;
+      if(!serial)
+      {
+         debug_error("More than 2^32 objects detected, aborting.\n");
+         os_abort();
+      }
+
+      util_hash_table_set(serials_hash, p, (void*)(uintptr_t)serial);
+      found = FALSE;
+   }
+   pipe_mutex_unlock(serials_mutex);
+   *pserial = serial;
+   return found;
+}
+/* Removes p's entry from the serials table; called once the object's refcount has reached zero. */
+static void debug_serial_delete(void* p)
+{
+   pipe_mutex_lock(serials_mutex);
+   util_hash_table_remove(serials_hash, p);
+   pipe_mutex_unlock(serials_mutex);
+}
+
+#define STACK_LEN 64
+/* Prints every non-NULL entry of symbols on its own line, followed by one empty separator line. */
+static void dump_stack(const char* symbols[STACK_LEN])
+{
+   const char** sym;
+   for(sym = symbols; sym != symbols + STACK_LEN; ++sym)
+   {
+      if(*sym)
+         os_stream_printf(stream, "%s\n", *sym);
+   }
+   os_stream_write(stream, "\n", 1);
+}
+/* Logs a reference-count event for p (Create / AddRef / Release / Destroy lines, each with a stack trace) to the file named by GALLIUM_REFCNT_LOG. */
+void debug_reference_slowpath(const struct pipe_reference* p, debug_reference_descriptor get_desc, int change)
+{
+   if(debug_refcnt_state < 0) /* logging is disabled */
+      return;
+
+   if(!debug_refcnt_state) /* first call: try to open the log; NOTE(review): this set-up is not protected by a lock */
+   {
+      const char* filename = debug_get_option("GALLIUM_REFCNT_LOG", NULL);
+      if(filename && filename[0])
+         stream = os_file_stream_create(filename);
+
+      if(stream)
+         debug_refcnt_state = 1;
+      else
+         debug_refcnt_state = -1; /* no usable log file: stay disabled from now on */
+   }
+
+   if(debug_refcnt_state > 0)
+   {
+      struct debug_stack_frame frames[STACK_LEN];
+      const char* symbols[STACK_LEN];
+      char buf[1024];
+
+      unsigned i;
+      unsigned refcnt = p->count; /* the count as it is now; 'change' says how it just moved */
+      unsigned serial;
+      boolean existing = debug_serial((void*)p, &serial);
+
+      debug_backtrace_capture(frames, 1, STACK_LEN);
+      for(i = 0; i < STACK_LEN; ++i) /* resolve each captured frame to a cached name, or 0 for an empty frame */
+      {
+         if(frames[i].function)
+            symbols[i] = debug_symbol_name_cached(frames[i].function);
+         else
+            symbols[i] = 0;
+      }
+
+      get_desc(buf, p); /* object description used as the <...> tag of every log line */
+
+      if(!existing) /* first time this object is seen */
+      {
+         os_stream_printf(stream, "<%s> %p %u Create\n", buf, p, serial);
+         dump_stack(symbols);
+
+         /* this is there to provide a gradual change even if we don't see the initialization */
+         for(i = 1; i <= refcnt - change; ++i) /* one AddRef line per reference held before this change */
+         {
+            os_stream_printf(stream, "<%s> %p %u AddRef %u\n", buf, p, serial, i);
+            dump_stack(symbols);
+         }
+      }
+
+      if(change)
+      {
+         os_stream_printf(stream, "<%s> %p %u %s %u\n", buf, p, serial, change > 0 ? "AddRef" : "Release", refcnt);
+         dump_stack(symbols);
+      }
+
+      if(!refcnt) /* last reference gone: retire the serial and log the destruction */
+      {
+         debug_serial_delete((void*)p);
+         os_stream_printf(stream, "<%s> %p %u Destroy\n", buf, p, serial);
+         dump_stack(symbols);
+      }
+
+      os_stream_flush(stream);
+   }
+}
+#endif
diff --git a/src/gallium/auxiliary/util/u_debug_refcnt.h b/src/gallium/auxiliary/util/u_debug_refcnt.h
new file mode 100644
index 0000000000..bea2d1c478
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_debug_refcnt.h
@@ -0,0 +1,63 @@
+/**************************************************************************
+ *
+ * Copyright 2010 Luca Barbieri
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef U_DEBUG_REFCNT_H_
+#define U_DEBUG_REFCNT_H_
+
+#include <pipe/p_config.h>
+#include <pipe/p_state.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void (*debug_reference_descriptor)(char*, const struct pipe_reference*);
+
+#if defined(DEBUG) && (!defined(PIPE_OS_WINDOWS) || defined(PIPE_SUBSYSTEM_WINDOWS_USER))
+
+extern int debug_refcnt_state;
+
+void debug_reference_slowpath(const struct pipe_reference* p, debug_reference_descriptor get_desc, int change);
+/* Inline front end: forwards to debug_reference_slowpath() unless refcount logging is already known to be disabled. */
+static INLINE void debug_reference(const struct pipe_reference* p, debug_reference_descriptor get_desc, int change)
+{
+   if (debug_refcnt_state >= 0) /* a negative state means logging is disabled */
+      debug_reference_slowpath(p, get_desc, change);
+}
+
+#else
+/* Refcount logging is compiled out in this configuration, so this does nothing. */
+static INLINE void debug_reference(const struct pipe_reference* p, debug_reference_descriptor get_desc, int change)
+{
+}
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* U_DEBUG_REFCNT_H_ */
diff --git a/src/gallium/auxiliary/util/u_debug_symbol.c b/src/gallium/auxiliary/util/u_debug_symbol.c
index 6e250575d6..332952af88 100644
--- a/src/gallium/auxiliary/util/u_debug_symbol.c
+++ b/src/gallium/auxiliary/util/u_debug_symbol.c
@@ -33,9 +33,12 @@
  */
 
 #include "pipe/p_compiler.h"
+#include "os/os_thread.h"
+#include "u_string.h"
 
 #include "u_debug.h"
 #include "u_debug_symbol.h"
+#include "u_hash_table.h"
 
 #if defined(PIPE_SUBSYSTEM_WINDOWS_USER) && defined(PIPE_ARCH_X86)
    
@@ -113,8 +116,8 @@ BOOL WINAPI j_SymGetSymFromAddr(HANDLE hProcess, DWORD Address, PDWORD Displacem
 }
 
 
-static INLINE boolean
-debug_symbol_print_imagehlp(const void *addr)
+static INLINE void
+debug_symbol_name_imagehlp(const void *addr, char* buf, unsigned size)
 {
    HANDLE hProcess;
    BYTE symbolBuffer[1024];
@@ -131,25 +134,95 @@ debug_symbol_print_imagehlp(const void *addr)
       if(j_SymInitialize(hProcess, NULL, TRUE))
          bSymInitialized = TRUE;
    }
-      
+
    if(!j_SymGetSymFromAddr(hProcess, (DWORD)addr, &dwDisplacement, pSymbol))
-      return FALSE;
+      buf[0] = 0;
+   else
+   {
+      strncpy(buf, pSymbol->Name, size);
+      buf[size - 1] = 0;
+   }
+}
+#endif
 
-   debug_printf("\t%s\n", pSymbol->Name);
+#ifdef __GLIBC__
+#include <execinfo.h>
 
-   return TRUE;
-   
+/* This can only provide dynamic symbols, or binary offsets into a file.
+ *
+ * To fix this, post-process the output with tools/addr2line.sh
+ */
+static INLINE void
+debug_symbol_name_glibc(const void *addr, char* buf, unsigned size)
+{
+   char** syms = backtrace_symbols((void**)&addr, 1); /* malloc'ed array holding one string, or NULL on failure */
+   strncpy(buf, syms ? syms[0] : "", size); /* on failure leave buf empty so debug_symbol_name() falls back to "%p" */
+   buf[size - 1] = 0; /* strncpy does not terminate when the source fills the buffer */
+   free(syms); /* free(NULL) is a no-op */
 }
 #endif
 
-
 void
-debug_symbol_print(const void *addr)
+debug_symbol_name(const void *addr, char* buf, unsigned size)
 {
 #if defined(PIPE_SUBSYSTEM_WINDOWS_USER) && defined(PIPE_ARCH_X86)
-   if(debug_symbol_print_imagehlp(addr))
+   debug_symbol_name_imagehlp(addr, buf, size);
+   if(buf[0])
       return;
 #endif
-   
-   debug_printf("\t%p\n", addr);
+
+#ifdef __GLIBC__
+   debug_symbol_name_glibc(addr, buf, size);
+   if(buf[0])
+      return;
+#endif
+
+   util_snprintf(buf, size, "%p", addr);
+   buf[size - 1] = 0;
+}
+/* Prints the name resolved for addr (or the address itself if no name was found) with debug_printf, tab-indented. */
+void
+debug_symbol_print(const void *addr)
+{
+   char name[1024];
+   debug_symbol_name(addr, name, sizeof(name));
+   debug_printf("\t%s\n", name);
+}
+
+static struct util_hash_table* symbols_hash; /* address -> cached name string, created on first use; static: private to this file */
+static pipe_mutex symbols_mutex; /* guards symbols_hash; NOTE(review): no explicit initialization is visible here - confirm zero-init is valid */
+/* Hash callback for the symbol cache: the key pointer's address converted to unsigned. */
+static unsigned hash_ptr(void* p)
+{
+   return (unsigned)(uintptr_t)p;
+}
+/* Key comparator for the symbol cache: three-way ordering on the raw addresses. */
+static int compare_ptr(void* a, void* b)
+{
+   if(a < b)
+      return -1;
+   if(a > b)
+      return 1;
+   /* neither below nor above: the same key */
+   return 0;
+}
+/* Returns the name for addr, resolving it with debug_symbol_name() only the first time and caching a copy afterwards. */
+const char*
+debug_symbol_name_cached(const void *addr)
+{
+   const char* name;
+   pipe_mutex_lock(symbols_mutex);
+   if(!symbols_hash) /* the cache is created on first use */
+      symbols_hash = util_hash_table_create(hash_ptr, compare_ptr);
+   name = util_hash_table_get(symbols_hash, (void*)addr);
+   if(!name) /* nothing cached for this address yet */
+   {
+      char buf[1024];
+      debug_symbol_name(addr, buf, sizeof(buf));
+      name = strdup(buf); /* heap copy kept by the cache; no code in view frees it */
+
+      util_hash_table_set(symbols_hash, (void*)addr, (void*)name);
+   }
+   pipe_mutex_unlock(symbols_mutex);
+   return name;
 }
diff --git a/src/gallium/auxiliary/util/u_debug_symbol.h b/src/gallium/auxiliary/util/u_debug_symbol.h
index 021586987b..b247706c2a 100644
--- a/src/gallium/auxiliary/util/u_debug_symbol.h
+++ b/src/gallium/auxiliary/util/u_debug_symbol.h
@@ -43,8 +43,13 @@ extern "C" {
 
 
 void
-debug_symbol_print(const void *addr);
+debug_symbol_name(const void *addr, char* buf, unsigned size);
+
+const char*
+debug_symbol_name_cached(const void *addr);
 
+void
+debug_symbol_print(const void *addr);
 
 #ifdef	__cplusplus
 }
diff --git a/src/gallium/auxiliary/util/u_dirty_surfaces.h b/src/gallium/auxiliary/util/u_dirty_surfaces.h
index 99f260bf96..fd1bbe5ffd 100644
--- a/src/gallium/auxiliary/util/u_dirty_surfaces.h
+++ b/src/gallium/auxiliary/util/u_dirty_surfaces.h
@@ -1,9 +1,39 @@
+/**************************************************************************
+ *
+ * Copyright 2010 Luca Barbieri
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
 #ifndef U_DIRTY_SURFACES_H_
 #define U_DIRTY_SURFACES_H_
 
+#include "pipe/p_state.h"
+
 #include "util/u_double_list.h"
 #include "util/u_math.h"
 
+struct pipe_context;
+
 typedef void (*util_dirty_surface_flush_t) (struct pipe_context *, struct pipe_surface *);
 
 struct util_dirty_surfaces
diff --git a/src/gallium/auxiliary/util/u_draw.h b/src/gallium/auxiliary/util/u_draw.h
index 2a91ea0f9a..f06d09ef91 100644
--- a/src/gallium/auxiliary/util/u_draw.h
+++ b/src/gallium/auxiliary/util/u_draw.h
@@ -31,6 +31,7 @@
 
 #include "pipe/p_compiler.h"
 #include "pipe/p_context.h"
+#include "pipe/p_state.h"
 
 
 static INLINE void
diff --git a/src/gallium/auxiliary/util/u_dynarray.h b/src/gallium/auxiliary/util/u_dynarray.h
index 9d1c1713a7..980cadf22d 100644
--- a/src/gallium/auxiliary/util/u_dynarray.h
+++ b/src/gallium/auxiliary/util/u_dynarray.h
@@ -106,6 +106,9 @@ util_dynarray_trim(struct util_dynarray *buf)
 #define util_dynarray_pop_ptr(buf, type) (type*)((char*)(buf)->data + ((buf)->size -= sizeof(type)))
 #define util_dynarray_pop(buf, type) *util_dynarray_pop_ptr(buf, type)
 #define util_dynarray_contains(buf, type) ((buf)->size >= sizeof(type))
+#define util_dynarray_element(buf, type, idx) ((type*)(buf)->data + (idx)) /* pointer to element idx, counting in units of type */
+#define util_dynarray_begin(buf) ((buf)->data) /* start of the stored data */
+#define util_dynarray_end(buf) ((void*)util_dynarray_element((buf), char, (buf)->size)) /* data advanced by size chars, i.e. just past the stored data */
 
 #endif /* U_DYNARRAY_H */
 
diff --git a/src/gallium/auxiliary/util/u_gen_mipmap.c b/src/gallium/auxiliary/util/u_gen_mipmap.c
index b7fe2d3003..6a931a9581 100644
--- a/src/gallium/auxiliary/util/u_gen_mipmap.c
+++ b/src/gallium/auxiliary/util/u_gen_mipmap.c
@@ -1255,6 +1255,7 @@ fallback_gen_mipmap(struct gen_mipmap_state *ctx,
       make_1d_mipmap(ctx, pt, face, baseLevel, lastLevel);
       break;
    case PIPE_TEXTURE_2D:
+   case PIPE_TEXTURE_RECT:
    case PIPE_TEXTURE_CUBE:
       make_2d_mipmap(ctx, pt, face, baseLevel, lastLevel);
       break;
diff --git a/src/gallium/auxiliary/util/u_inlines.h b/src/gallium/auxiliary/util/u_inlines.h
index 540305c146..78473bf35a 100644
--- a/src/gallium/auxiliary/util/u_inlines.h
+++ b/src/gallium/auxiliary/util/u_inlines.h
@@ -33,6 +33,8 @@
 #include "pipe/p_state.h"
 #include "pipe/p_screen.h"
 #include "util/u_debug.h"
+#include "util/u_debug_describe.h"
+#include "util/u_debug_refcnt.h"
 #include "util/u_atomic.h"
 #include "util/u_box.h"
 #include "util/u_math.h"
@@ -67,7 +69,9 @@ pipe_is_referenced(struct pipe_reference *reference)
  * \return TRUE if the object's refcount hits zero and should be destroyed.
  */
 static INLINE boolean
-pipe_reference(struct pipe_reference *ptr, struct pipe_reference *reference)
+pipe_reference_described(struct pipe_reference *ptr, 
+                         struct pipe_reference *reference, 
+                         debug_reference_descriptor get_desc)
 {
    boolean destroy = FALSE;
 
@@ -76,6 +80,7 @@ pipe_reference(struct pipe_reference *ptr, struct pipe_reference *reference)
       if (reference) {
          assert(pipe_is_referenced(reference));
          p_atomic_inc(&reference->count);
+         debug_reference(reference, get_desc, 1);
       }
 
       if (ptr) {
@@ -83,41 +88,49 @@ pipe_reference(struct pipe_reference *ptr, struct pipe_reference *reference)
          if (p_atomic_dec_zero(&ptr->count)) {
             destroy = TRUE;
          }
+         debug_reference(ptr, get_desc, -1);
       }
    }
 
    return destroy;
 }
 
+static INLINE boolean
+pipe_reference(struct pipe_reference *ptr, struct pipe_reference *reference)
+{ /* same as pipe_reference_described(), using the generic debug_describe_reference descriptor */
+   return pipe_reference_described(ptr, reference, 
+                                   (debug_reference_descriptor)debug_describe_reference);
+}
 
 static INLINE void
 pipe_surface_reference(struct pipe_surface **ptr, struct pipe_surface *surf)
 {
    struct pipe_surface *old_surf = *ptr;
 
-   if (pipe_reference(&(*ptr)->reference, &surf->reference))
+   if (pipe_reference_described(&(*ptr)->reference, &surf->reference, 
+                                (debug_reference_descriptor)debug_describe_surface))
       old_surf->texture->screen->tex_surface_destroy(old_surf);
    *ptr = surf;
 }
 
-
 static INLINE void
 pipe_resource_reference(struct pipe_resource **ptr, struct pipe_resource *tex)
 {
    struct pipe_resource *old_tex = *ptr;
 
-   if (pipe_reference(&(*ptr)->reference, &tex->reference))
+   if (pipe_reference_described(&(*ptr)->reference, &tex->reference, 
+                                (debug_reference_descriptor)debug_describe_resource))
       old_tex->screen->resource_destroy(old_tex->screen, old_tex);
    *ptr = tex;
 }
 
-
 static INLINE void
 pipe_sampler_view_reference(struct pipe_sampler_view **ptr, struct pipe_sampler_view *view)
 {
    struct pipe_sampler_view *old_view = *ptr;
 
-   if (pipe_reference(&(*ptr)->reference, &view->reference))
+   if (pipe_reference_described(&(*ptr)->reference, &view->reference,
+                                (debug_reference_descriptor)debug_describe_sampler_view))
       old_view->context->sampler_view_destroy(old_view->context, old_view);
    *ptr = view;
 }
diff --git a/src/gallium/auxiliary/util/u_linkage.c b/src/gallium/auxiliary/util/u_linkage.c
new file mode 100644
index 0000000000..2f6f41ba84
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_linkage.c
@@ -0,0 +1,149 @@
+/**************************************************************************
+ *
+ * Copyright 2010 Luca Barbieri
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "util/u_debug.h"
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_scan.h"
+#include "util/u_linkage.h"
+
+/* Adds value to the set and returns TRUE if it was already a member. (we must only record the registers that are actually used, not just declared) */
+static INLINE boolean
+util_semantic_set_test_and_set(struct util_semantic_set *set, unsigned value)
+{
+   unsigned long mask = 1UL << (value % (sizeof(long) * 8)); /* 1UL: shifting an int 1 cannot reach bits 32..63 of a 64-bit long */
+   unsigned long *p = &set->masks[value / (sizeof(long) * 8)]; /* the word that holds this value's bit */
+   unsigned long v = *p & mask;
+   *p |= mask;
+   return !!v;
+}
+/* Fills set with the TGSI_SEMANTIC_GENERIC indices of the registers of the given file (TGSI_FILE_INPUT or TGSI_FILE_OUTPUT) that instructions reference; returns how many distinct indices were found. */
+unsigned
+util_semantic_set_from_program_file(struct util_semantic_set *set, const struct tgsi_token *tokens, enum tgsi_file_type file)
+{
+   struct tgsi_shader_info info;
+   struct tgsi_parse_context parse;
+   unsigned count = 0;
+   ubyte *semantic_name;
+   ubyte *semantic_index;
+
+   /* start from an empty set; this also covers the early return below */
+   memset(set->masks, 0, sizeof(set->masks));
+   tgsi_scan_shader(tokens, &info);
+
+   if(file == TGSI_FILE_INPUT)
+   {
+      semantic_name = info.input_semantic_name;
+      semantic_index = info.input_semantic_index;
+   }
+   else if(file == TGSI_FILE_OUTPUT)
+   {
+      semantic_name = info.output_semantic_name;
+      semantic_index = info.output_semantic_index;
+   }
+   else
+   {
+      assert(0);
+      return 0; /* with asserts compiled out, do not go on to dereference missing semantic tables */
+   }
+
+   tgsi_parse_init(&parse, tokens);
+   /* walk the instructions: only registers that are actually used count, not ones merely declared */
+   while(!tgsi_parse_end_of_tokens(&parse))
+   {
+      tgsi_parse_token(&parse);
+
+      if(parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION)
+      {
+         const struct tgsi_full_instruction *finst = &parse.FullToken.FullInstruction;
+         unsigned i;
+         for(i = 0; i < finst->Instruction.NumDstRegs; ++i)
+         {
+            if(finst->Dst[i].Register.File == file)
+            {
+               unsigned idx = finst->Dst[i].Register.Index;
+               if(semantic_name[idx] == TGSI_SEMANTIC_GENERIC)
+               {
+                  if(!util_semantic_set_test_and_set(set, semantic_index[idx]))
+                     ++count;
+               }
+            }
+         }
+
+         for(i = 0; i < finst->Instruction.NumSrcRegs; ++i)
+         {
+            if(finst->Src[i].Register.File == file)
+            {
+               unsigned idx = finst->Src[i].Register.Index;
+               if(semantic_name[idx] == TGSI_SEMANTIC_GENERIC)
+               {
+                  if(!util_semantic_set_test_and_set(set, semantic_index[idx]))
+                     ++count;
+               }
+            }
+         }
+      }
+   }
+   tgsi_parse_free(&parse);
+
+   return count;
+}
+/* Runs the statement that follows for each i in 0..255 whose bit is set in set->masks; 1UL keeps the shift in unsigned long width. */
+#define UTIL_SEMANTIC_SET_FOR_EACH(i, set) for(i = 0; i < 256; ++i) if((set)->masks[(i) / (sizeof(long) * 8)] & (1UL << ((i) % (sizeof(long) * 8))))
+/* Builds layout (slot -> semantic index) from the members of set; slots not assigned keep 0xff.  NOTE(review): no branch checks the written slot against num_slots - confirm callers guarantee it fits. */
+void
+util_semantic_layout_from_set(unsigned char *layout, const struct util_semantic_set *set, unsigned efficient_slots, unsigned num_slots)
+{
+   int first = -1;
+   int last = -1;
+   unsigned i;
+
+   memset(layout, 0xff, num_slots);
+
+   UTIL_SEMANTIC_SET_FOR_EACH(i, set) /* find the lowest and highest member; both stay -1 for an empty set */
+   {
+      if(first < 0)
+	 first = i;
+      last = i;
+   }
+
+   if(last < efficient_slots) /* every member is below efficient_slots: slot number == semantic index (NOTE(review): int compared with unsigned) */
+   {
+      UTIL_SEMANTIC_SET_FOR_EACH(i, set)
+         layout[i] = i;
+   }
+   else if((last - first) < efficient_slots) /* the used range is narrow enough: shift it down so that 'first' lands in slot 0 */
+   {
+      UTIL_SEMANTIC_SET_FOR_EACH(i, set)
+         layout[i - first] = i;
+   }
+   else /* otherwise pack the members into consecutive slots in ascending order */
+   {
+      unsigned idx = 0;
+      UTIL_SEMANTIC_SET_FOR_EACH(i, set)
+         layout[idx++] = i;
+   }
+}
diff --git a/src/gallium/auxiliary/util/u_linkage.h b/src/gallium/auxiliary/util/u_linkage.h
new file mode 100644
index 0000000000..4720e0ee60
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_linkage.h
@@ -0,0 +1,66 @@
+/**************************************************************************
+ *
+ * Copyright 2010 Luca Barbieri
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef U_LINKAGE_H_
+#define U_LINKAGE_H_
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_shader_tokens.h"
+
+struct util_semantic_set
+{
+   unsigned long masks[256 / 8 / sizeof(unsigned long)]; /* 256 bits in total: one bit per possible value 0..255 */
+};
+
+static INLINE bool /* TRUE if value is a member of the set */
+util_semantic_set_contains(struct util_semantic_set *set, unsigned char value)
+{
+   return !!(set->masks[value / (sizeof(long) * 8)] & (1UL << (value % (sizeof(long) * 8)))); /* word = value / bits-per-long, bit = value % bits-per-long */
+}
+
+unsigned util_semantic_set_from_program_file(struct util_semantic_set *set, const struct tgsi_token *tokens, enum tgsi_file_type file);
+
+/* efficient_slots is the number of slots such that hardware performance is
+ * the same for using that amount, with holes, or fewer slots but with fewer
+ * holes.
+ *
+ * num_slots is the size of the layout array and hardware limit instead.
+ *
+ * efficient_slots == 0 or efficient_slots == num_slots are typical settings.
+ */
+void util_semantic_layout_from_set(unsigned char *layout, const struct util_semantic_set *set, unsigned efficient_slots, unsigned num_slots);
+
+static INLINE void
+util_semantic_table_from_layout(unsigned char *table, unsigned char *layout, unsigned char first_slot_value, unsigned char num_slots)
+{
+   int i;
+   /* NOTE(review): assumes table has 256 entries, one per possible unsigned char semantic index - confirm against callers */
+   memset(table, 0xff, 256); /* sizeof(table) was only the size of the pointer, leaving most of the table uninitialized */
+   for(i = 0; i < num_slots; ++i)
+      table[layout[i]] = first_slot_value + i; /* inverse mapping: semantic index -> slot value */
+}
+
+#endif /* U_LINKAGE_H_ */
diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h
index fe19466436..69a7681494 100644
--- a/src/gallium/auxiliary/util/u_math.h
+++ b/src/gallium/auxiliary/util/u_math.h
@@ -361,16 +361,6 @@ util_is_inf_or_nan(float x)
 
 
 /**
- * Test whether x is a power of two.
- */
-static INLINE boolean
-util_is_pot(unsigned x)
-{
-   return (x & (x - 1)) == 0;
-}
-
-
-/**
  * Find first bit set in word.  Least significant bit is 1.
  * Return 0 if no bits set.
  */
@@ -566,6 +556,9 @@ util_bswap16(uint16_t n)
 #define MIN3( A, B, C ) MIN2( MIN2( A, B ), C )
 #define MAX3( A, B, C ) MAX2( MAX2( A, B ), C )
 
+#define MIN4( A, B, C, D ) MIN2( MIN2( A, B ), MIN2(C, D) ) /* MIN2 applied to each pair, then to the two results */
+#define MAX4( A, B, C, D ) MAX2( MAX2( A, B ), MAX2(C, D) ) /* MAX2 applied to each pair, then to the two results */
+
 
 /**
  * Align a value, only works pot alignemnts.
diff --git a/src/gallium/auxiliary/util/u_pack_color.h b/src/gallium/auxiliary/util/u_pack_color.h
index 5f113f742b..aae8b8bdf1 100644
--- a/src/gallium/auxiliary/util/u_pack_color.h
+++ b/src/gallium/auxiliary/util/u_pack_color.h
@@ -42,12 +42,18 @@
 #include "util/u_math.h"
 
 
-
+/**
+ * Helper union for packing pixel values.
+ * Will often contain values in formats which are too complex to be described
+ * in simple terms, hence might just effectively contain a number of bytes.
+ * Must be big enough to hold data for all formats (currently 256 bits).
+ */
 union util_color {
    ubyte ub;
    ushort us;
    uint ui;
    float f[4];
+   double d[4];
 };
 
 /**
diff --git a/src/gallium/auxiliary/util/u_rect.c b/src/gallium/auxiliary/util/u_rect.c
index 9bbcf1c8c4..56fcfac069 100644
--- a/src/gallium/auxiliary/util/u_rect.c
+++ b/src/gallium/auxiliary/util/u_rect.c
@@ -32,6 +32,7 @@
 
 #include "util/u_format.h"
 #include "util/u_rect.h"
+#include "util/u_pack_color.h"
 
 
 /**
@@ -94,7 +95,7 @@ util_fill_rect(ubyte * dst,
                unsigned dst_y,
                unsigned width,
                unsigned height,
-               uint32_t value)
+               union util_color *uc)
 {
    unsigned i, j;
    unsigned width_size;
@@ -110,40 +111,54 @@ util_fill_rect(ubyte * dst,
    dst_y /= blockheight;
    width = (width + blockwidth - 1)/blockwidth;
    height = (height + blockheight - 1)/blockheight;
-   
+
    dst += dst_x * blocksize;
    dst += dst_y * dst_stride;
    width_size = width * blocksize;
-   
+
    switch (blocksize) {
    case 1:
       if(dst_stride == width_size)
-	 memset(dst, (ubyte) value, height * width_size);
+         memset(dst, uc->ub, height * width_size);
       else {
-	 for (i = 0; i < height; i++) {
-	    memset(dst, (ubyte) value, width_size);
-	    dst += dst_stride;
-	 }
+         for (i = 0; i < height; i++) {
+            memset(dst, uc->ub, width_size);
+            dst += dst_stride;
+         }
       }
       break;
    case 2:
       for (i = 0; i < height; i++) {
-	 uint16_t *row = (uint16_t *)dst;
-	 for (j = 0; j < width; j++)
-	    *row++ = (uint16_t) value;
-	 dst += dst_stride;
+         uint16_t *row = (uint16_t *)dst;
+         for (j = 0; j < width; j++)
+            *row++ = uc->us;
+         dst += dst_stride;
       }
       break;
    case 4:
       for (i = 0; i < height; i++) {
-	 uint32_t *row = (uint32_t *)dst;
-	 for (j = 0; j < width; j++)
-	    *row++ = value;
-	 dst += dst_stride;
+         uint32_t *row = (uint32_t *)dst;
+         for (j = 0; j < width; j++)
+            *row++ = uc->ui;
+         dst += dst_stride;
+      }
+      break;
+   case 8:
+   case 12:
+   case 16:
+   case 24:
+   case 32:
+      for (i = 0; i < height; i++) {
+         ubyte *row = dst;
+         for (j = 0; j < width; j++) {
+            memcpy(row, uc, blocksize);
+            row += blocksize;
+         }
+         dst += dst_stride;
       }
       break;
    default:
-	 assert(0);
-	 break;
+      assert(0);
+      break;
    }
 }
diff --git a/src/gallium/auxiliary/util/u_rect.h b/src/gallium/auxiliary/util/u_rect.h
index 40d57e662d..4cb90d3c31 100644
--- a/src/gallium/auxiliary/util/u_rect.h
+++ b/src/gallium/auxiliary/util/u_rect.h
@@ -26,17 +26,67 @@
  **************************************************************************/
 
 
-/**
- * Pipe copy/fill rect helpers.
+#ifndef U_RECT_H
+#define U_RECT_H
+
+#include "pipe/p_compiler.h"
+
+struct u_rect {
+   int x0, x1;
+   int y0, y1;
+};
+
+/* Do two rectangles intersect?
  */
+static INLINE boolean
+u_rect_test_intersection(const struct u_rect *a,
+                         const struct u_rect *b)
+{
+   return (!(a->x1 < b->x0 ||
+             b->x1 < a->x0 ||
+             a->y1 < b->y0 ||
+             b->y1 < a->y0));
+}
 
+/* Find the intersection of two rectangles known to intersect.
+ */
+static INLINE void
+u_rect_find_intersection(const struct u_rect *a,
+                         struct u_rect *b)
+{
+   /* Caller should verify intersection exists before calling.
+    */
+   if (b->x0 < a->x0) b->x0 = a->x0;
+   if (b->x1 > a->x1) b->x1 = a->x1;
+   if (b->y0 < a->y0) b->y0 = a->y0;
+   if (b->y1 > a->y1) b->y1 = a->y1;
+}
 
-#ifndef U_RECT_H
-#define U_RECT_H
 
+static INLINE void
+u_rect_possible_intersection(const struct u_rect *a,
+                             struct u_rect *b)
+{
+   if (u_rect_test_intersection(a,b)) {
+      u_rect_find_intersection(a,b);
+   }
+   else {
+      b->x0 = b->x1 = b->y0 = b->y1 = 0;
+   }
+}
 
 #include "pipe/p_format.h"
+#include "util/u_pack_color.h"
+
+
+
+/**********************************************************************
+ * Pipe copy/fill rect helpers.
+ */
 
+/* These really should move to a different file:
+ */
+#include "pipe/p_format.h"
 
 extern void
 util_copy_rect(ubyte * dst, enum pipe_format format,
@@ -47,7 +97,7 @@ util_copy_rect(ubyte * dst, enum pipe_format format,
 extern void
 util_fill_rect(ubyte * dst, enum pipe_format format,
                unsigned dst_stride, unsigned dst_x, unsigned dst_y,
-               unsigned width, unsigned height, uint32_t value);
+               unsigned width, unsigned height, union util_color *uc);
 
 
 #endif /* U_RECT_H */
diff --git a/src/gallium/auxiliary/util/u_simple_shaders.c b/src/gallium/auxiliary/util/u_simple_shaders.c
index 5b682f496c..58ef68377f 100644
--- a/src/gallium/auxiliary/util/u_simple_shaders.c
+++ b/src/gallium/auxiliary/util/u_simple_shaders.c
@@ -37,6 +37,7 @@
 
 #include "pipe/p_context.h"
 #include "pipe/p_shader_tokens.h"
+#include "pipe/p_state.h"
 #include "util/u_simple_shaders.h"
 #include "util/u_debug.h"
 #include "tgsi/tgsi_ureg.h"
diff --git a/src/gallium/auxiliary/util/u_split_prim.h b/src/gallium/auxiliary/util/u_split_prim.h
index 206e1ec311..7f80fc1270 100644
--- a/src/gallium/auxiliary/util/u_split_prim.h
+++ b/src/gallium/auxiliary/util/u_split_prim.h
@@ -1,5 +1,12 @@
 /* Originally written by Ben Skeggs for the nv50 driver*/
-#include <pipe/p_defines.h>
+
+#ifndef U_SPLIT_PRIM_H
+#define U_SPLIT_PRIM_H
+
+#include "pipe/p_defines.h"
+#include "pipe/p_compiler.h"
+
+#include "util/u_debug.h"
 
 struct util_split_prim {
    void *priv;
@@ -48,7 +55,7 @@ util_split_prim_next(struct util_split_prim *s, unsigned max_verts)
       }
    }
 
-   if (s->p_start + s->close_first + max_verts >= s->p_end) {
+   if ((s->p_end - s->p_start) + s->close_first <= max_verts) {
       s->emit(s->priv, s->p_start, s->p_end - s->p_start);
       if (s->close_first)
          s->emit(s->priv, s->start, 1);
@@ -103,3 +110,5 @@ util_split_prim_next(struct util_split_prim *s, unsigned max_verts)
    s->p_start += (max_verts - repeat);
    return FALSE;
 }
+
+#endif /* U_SPLIT_PRIM_H */
diff --git a/src/gallium/auxiliary/util/u_staging.c b/src/gallium/auxiliary/util/u_staging.c
index 607c31f5ee..c5d68f8df8 100644
--- a/src/gallium/auxiliary/util/u_staging.c
+++ b/src/gallium/auxiliary/util/u_staging.c
@@ -1,3 +1,29 @@
+/**************************************************************************
+ *
+ * Copyright 2010 Luca Barbieri
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
 #include "util/u_staging.h"
 #include "pipe/p_context.h"
 #include "util/u_memory.h"
@@ -8,7 +34,7 @@ util_staging_resource_template(struct pipe_resource *pt, unsigned width, unsigne
 {
    memset(template, 0, sizeof(struct pipe_resource));
    if(pt->target != PIPE_BUFFER && depth <= 1)
-      template->target = PIPE_TEXTURE_2D;
+      template->target = PIPE_TEXTURE_RECT;
    else
       template->target = pt->target;
    template->format = pt->format;
@@ -23,20 +49,16 @@ util_staging_resource_template(struct pipe_resource *pt, unsigned width, unsigne
 }
 
 struct util_staging_transfer *
-util_staging_transfer_new(struct pipe_context *pipe,
+util_staging_transfer_init(struct pipe_context *pipe,
            struct pipe_resource *pt,
            struct pipe_subresource sr,
            unsigned usage,
            const struct pipe_box *box,
-           bool direct)
+           bool direct, struct util_staging_transfer *tx)
 {
    struct pipe_screen *pscreen = pipe->screen;
-   struct util_staging_transfer *tx;
-   struct pipe_resource staging_resource_template;
 
-   tx = CALLOC_STRUCT(util_staging_transfer);
-   if (!tx)
-      return NULL;
+   struct pipe_resource staging_resource_template;
 
    pipe_resource_reference(&tx->base.resource, pt);
    tx->base.sr = sr;
diff --git a/src/gallium/auxiliary/util/u_staging.h b/src/gallium/auxiliary/util/u_staging.h
index 602faa2971..1aab78cc88 100644
--- a/src/gallium/auxiliary/util/u_staging.h
+++ b/src/gallium/auxiliary/util/u_staging.h
@@ -1,3 +1,29 @@
+/**************************************************************************
+ *
+ * Copyright 2010 Luca Barbieri
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
 /* Direct3D 10/11 has no concept of transfers. Applications instead
  * create resources with a STAGING or DYNAMIC usage, copy between them
  * and the real resource and use Map to map the STAGING/DYNAMIC resource.
@@ -21,15 +47,15 @@ struct util_staging_transfer {
 };
 
 /* user must be stride, slice_stride and offset */
-/* pt->usage == PIPE_USAGE_DYNAMIC should be a good value to pass for direct */
-/* staging resource is currently created with PIPE_USAGE_DYNAMIC */
+/* pt->usage == PIPE_USAGE_DYNAMIC || pt->usage == PIPE_USAGE_STAGING should be a good value to pass for direct */
+/* staging resource is currently created with PIPE_USAGE_STAGING */
 struct util_staging_transfer *
-util_staging_transfer_new(struct pipe_context *pipe,
+util_staging_transfer_init(struct pipe_context *pipe,
            struct pipe_resource *pt,
            struct pipe_subresource sr,
            unsigned usage,
            const struct pipe_box *box,
-           bool direct);
+           bool direct, struct util_staging_transfer *tx);
 
 void
 util_staging_transfer_destroy(struct pipe_context *pipe, struct pipe_transfer *ptx);
diff --git a/src/gallium/auxiliary/util/u_surface.c b/src/gallium/auxiliary/util/u_surface.c
index cab7691c70..af99163b2e 100644
--- a/src/gallium/auxiliary/util/u_surface.c
+++ b/src/gallium/auxiliary/util/u_surface.c
@@ -216,7 +216,7 @@ util_clear_render_target(struct pipe_context *pipe,
    assert(dst->texture);
    if (!dst->texture)
       return;
-   util_pack_color(rgba, dst->texture->format, &uc);
+
    dst_trans = pipe_get_transfer(pipe,
 				 dst->texture,
 				 dst->face,
@@ -232,46 +232,10 @@ util_clear_render_target(struct pipe_context *pipe,
    if (dst_map) {
       assert(dst_trans->stride > 0);
 
-      switch (util_format_get_blocksize(dst->texture->format)) {
-      case 1:
-      case 2:
-      case 4:
-         util_pack_color(rgba, dst->texture->format, &uc);
-         util_fill_rect(dst_map, dst->texture->format,
-                        dst_trans->stride,
-                        0, 0, width, height, uc.ui);
-         break;
-      case 8:
-      {
-	 /* expand the 4-byte clear value to an 8-byte value */
-	 /* should probably not convert back from ubyte but not
-	    sure what this code really achieved since it doesn't even
-	    check for format type... */
-	 ushort *row = (ushort *) dst_map;
-	 ushort val0 = UBYTE_TO_USHORT((uc.ui >>  0) & 0xff);
-	 ushort val1 = UBYTE_TO_USHORT((uc.ui >>  8) & 0xff);
-	 ushort val2 = UBYTE_TO_USHORT((uc.ui >> 16) & 0xff);
-	 ushort val3 = UBYTE_TO_USHORT((uc.ui >> 24) & 0xff);
-	 unsigned i, j;
-	 val0 = (val0 << 8) | val0;
-	 val1 = (val1 << 8) | val1;
-	 val2 = (val2 << 8) | val2;
-	 val3 = (val3 << 8) | val3;
-	 for (i = 0; i < height; i++) {
-	    for (j = 0; j < width; j++) {
-	       row[j*4+0] = val0;
-	       row[j*4+1] = val1;
-	       row[j*4+2] = val2;
-	       row[j*4+3] = val3;
-	    }
-	    row += dst_trans->stride/2;
-	 }
-      }
-      break;
-      default:
-         assert(0);
-         break;
-      }
+      util_pack_color(rgba, dst->texture->format, &uc);
+      util_fill_rect(dst_map, dst->texture->format,
+                     dst_trans->stride,
+                     0, 0, width, height, &uc);
    }
 
    pipe->transfer_unmap(pipe, dst_trans);
diff --git a/src/gallium/auxiliary/util/u_surfaces.c b/src/gallium/auxiliary/util/u_surfaces.c
index 7733ad24d0..404e121995 100644
--- a/src/gallium/auxiliary/util/u_surfaces.c
+++ b/src/gallium/auxiliary/util/u_surfaces.c
@@ -1,3 +1,29 @@
+/**************************************************************************
+ *
+ * Copyright 2010 Luca Barbieri
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
 #include "u_surfaces.h"
 #include "util/u_hash_table.h"
 #include "util/u_inlines.h"
diff --git a/src/gallium/auxiliary/util/u_surfaces.h b/src/gallium/auxiliary/util/u_surfaces.h
index af978c7057..17d8a5d3a5 100644
--- a/src/gallium/auxiliary/util/u_surfaces.h
+++ b/src/gallium/auxiliary/util/u_surfaces.h
@@ -1,3 +1,29 @@
+/**************************************************************************
+ *
+ * Copyright 2010 Luca Barbieri
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
 #ifndef U_SURFACES_H_
 #define U_SURFACES_H_
 
@@ -22,7 +48,7 @@ struct pipe_surface *util_surfaces_do_get(struct util_surfaces *us, unsigned sur
 static INLINE struct pipe_surface *
 util_surfaces_get(struct util_surfaces *us, unsigned surface_struct_size, struct pipe_screen *pscreen, struct pipe_resource *pt, unsigned face, unsigned level, unsigned zslice, unsigned flags)
 {
-   if(likely(pt->target == PIPE_TEXTURE_2D && us->u.array))
+   if(likely((pt->target == PIPE_TEXTURE_2D || pt->target == PIPE_TEXTURE_RECT) && us->u.array))
    {
       struct pipe_surface *ps = us->u.array[level];
       if(ps)
@@ -52,7 +78,7 @@ void util_surfaces_do_detach(struct util_surfaces *us, struct pipe_surface *ps);
 static INLINE void
 util_surfaces_detach(struct util_surfaces *us, struct pipe_surface *ps)
 {
-   if(likely(ps->texture->target == PIPE_TEXTURE_2D))
+   if(likely(ps->texture->target == PIPE_TEXTURE_2D || ps->texture->target == PIPE_TEXTURE_RECT))
    {
       us->u.array[ps->level] = 0;
       return;
diff --git a/src/gallium/auxiliary/util/u_tile.h b/src/gallium/auxiliary/util/u_tile.h
index 986eee0743..558351d0ce 100644
--- a/src/gallium/auxiliary/util/u_tile.h
+++ b/src/gallium/auxiliary/util/u_tile.h
@@ -29,7 +29,10 @@
 #define P_TILE_H
 
 #include "pipe/p_compiler.h"
+#include "pipe/p_format.h"
+#include "pipe/p_state.h"
 
+struct pipe_context;
 struct pipe_transfer;
 
 /**
diff --git a/src/gallium/auxiliary/util/u_transfer.h b/src/gallium/auxiliary/util/u_transfer.h
index eb07945d15..e3a38730f2 100644
--- a/src/gallium/auxiliary/util/u_transfer.h
+++ b/src/gallium/auxiliary/util/u_transfer.h
@@ -8,6 +8,7 @@
 #include "pipe/p_state.h"
 
 struct pipe_context;
+struct winsys_handle;
 
 boolean u_default_resource_get_handle(struct pipe_screen *screen,
 				      struct pipe_resource *resource,
diff --git a/src/gallium/auxiliary/util/u_upload_mgr.h b/src/gallium/auxiliary/util/u_upload_mgr.h
index a124924fc8..de016df02e 100644
--- a/src/gallium/auxiliary/util/u_upload_mgr.h
+++ b/src/gallium/auxiliary/util/u_upload_mgr.h
@@ -32,11 +32,8 @@
 #ifndef U_UPLOAD_MGR_H
 #define U_UPLOAD_MGR_H
 
-#include "pipe/p_defines.h"
-
-struct pipe_screen;
+struct pipe_context;
 struct pipe_resource;
-struct u_upload_mgr;
 
 
 struct u_upload_mgr *u_upload_create( struct pipe_context *pipe,
diff --git a/src/gallium/docs/source/context.rst b/src/gallium/docs/source/context.rst
index f241411a00..8250c30f2a 100644
--- a/src/gallium/docs/source/context.rst
+++ b/src/gallium/docs/source/context.rst
@@ -63,7 +63,9 @@ objects. They all follow simple, one-method binding calls, e.g.
 * ``set_scissor_state`` sets the bounds for the scissor test, which culls
   pixels before blending to render targets. If the :ref:`Rasterizer` does
   not have the scissor test enabled, then the scissor bounds never need to
-  be set since they will not be used.
+  be set since they will not be used.  Note that scissor xmin and ymin are
+  inclusive, but xmax and ymax are exclusive.  The inclusive ranges in x
+  and y would be [xmin..xmax-1] and [ymin..ymax-1].
 * ``set_viewport_state``
 
 
diff --git a/src/gallium/docs/source/debugging.rst b/src/gallium/docs/source/debugging.rst
index 42bda5aee9..e081cbf74e 100644
--- a/src/gallium/docs/source/debugging.rst
+++ b/src/gallium/docs/source/debugging.rst
@@ -21,6 +21,10 @@ This option controls if the debug variables should be printed to stderr. This
 is probably the most useful variable, since it allows you to find which
 variables a driver uses.
 
+.. envvar:: GALLIUM_GALAHAD <bool> (false)
+
+Controls if the :ref:`galahad` sanity checker module should be used.
+
 .. envvar:: GALLIUM_RBUG <bool> (false)
 
 Controls if the :ref:`rbug` should be used.
diff --git a/src/gallium/docs/source/distro.rst b/src/gallium/docs/source/distro.rst
index 70d75b51e6..08c8eab890 100644
--- a/src/gallium/docs/source/distro.rst
+++ b/src/gallium/docs/source/distro.rst
@@ -79,6 +79,15 @@ Rbug
 
 Wrapper driver. :ref:`rbug` driver used with stand alone rbug-gui.
 
+.. _galahad:
+
+Galahad
+^^^^^^^
+
+Wrapper driver. Sanity checker for the internal gallium state. Normally
+a driver shouldn't have to sanity check the input it gets from a state
+tracker. Any wrong state received should be perceived as a state tracker bug.
+
 State Trackers
 --------------
 
diff --git a/src/gallium/docs/source/index.rst b/src/gallium/docs/source/index.rst
index 6c19842dac..2a73e3ab59 100644
--- a/src/gallium/docs/source/index.rst
+++ b/src/gallium/docs/source/index.rst
@@ -15,6 +15,7 @@ Contents:
    debugging
    tgsi
    screen
+   resources
    context
    cso
    distro
diff --git a/src/gallium/docs/source/resources.rst b/src/gallium/docs/source/resources.rst
new file mode 100644
index 0000000000..c8a5766821
--- /dev/null
+++ b/src/gallium/docs/source/resources.rst
@@ -0,0 +1,195 @@
+Resources and derived objects
+=============================
+
+Resources represent objects that hold data: textures and buffers.
+
+They are mostly modelled after the resources in Direct3D 10/11, but with a
+different transfer/update mechanism, and more features for OpenGL support.
+
+Resources can be used in several ways, and it is required to specify all planned uses through an appropriate set of bind flags.
+
+TODO: write much more on resources
+
+Transfers
+---------
+
+Transfers are the mechanism used to access resources with the CPU.
+
+OpenGL: OpenGL supports mapping buffers and has inline transfer functions for both buffers and textures
+
+D3D11: D3D11 lacks transfers, but has special resource types that are mappable to the CPU address space
+
+TODO: write much more on transfers
+
+Resource targets
+----------------
+
+Resource targets determine the type of a resource.
+
+Note that drivers may not actually have the restrictions listed regarding
+coordinate normalization and wrap modes, and in fact efficient OpenCL
+support will probably require drivers that don't have any of them, which
+will probably be advertised with an appropriate cap.
+
+TODO: document all targets. Note that both 3D and cube have restrictions
+that depend on the hardware generation.
+
+TODO: can buffers have a non-R8 format?
+
+PIPE_BUFFER
+^^^^^^^^^^^
+
+Buffer resource: can be used as a vertex, index, or constant buffer (appropriate bind flags must be requested).
+
+They can be bound to stream output if supported.
+TODO: what about the restrictions lifted by the several later GL transform feedback extensions? How does one advertise that in Gallium?
+
+They can also be bound to a shader stage as usual.
+TODO: are all drivers supposed to support this? how does this work exactly? are there size limits?
+
+They can also be bound to the framebuffer as usual.
+TODO: are all drivers supposed to support this? how does this work exactly? are there size limits?
+TODO: is there any chance of supporting GL pixel buffer object acceleration with this?
+
+- depth0 must be 1
+- last_level must be 0
+- TODO: what about normalization?
+- TODO: wrap modes/other sampling state?
+- TODO: are arbitrary formats supported? in which cases?
+
+OpenGL: vertex buffers in GL 1.5 or GL_ARB_vertex_buffer_object
+
+- Binding to stream out requires GL 3.0 or GL_NV_transform_feedback
+- Binding as constant buffers requires GL 3.1 or GL_ARB_uniform_buffer_object
+- Binding to a sampling stage requires GL 3.1 or GL_ARB_texture_buffer_object
+- TODO: can they be bound to an FBO?
+
+D3D11: buffer resources
+- Binding to a render target requires D3D_FEATURE_LEVEL_10_0
+
+PIPE_TEXTURE_1D
+^^^^^^^^^^^^^^^
+1D surface accessed with normalized coordinates.
+
+UNIMPLEMENTED: 1D texture arrays not supported
+
+- If PIPE_CAP_NPOT_TEXTURES is not supported,
+  width must be a power of two
+- height0 must be 1
+- depth0 must be 1
+- Mipmaps can be used
+- Must use normalized coordinates
+
+OpenGL: GL_TEXTURE_1D in GL 1.0
+
+- PIPE_CAP_NPOT_TEXTURES is equivalent to GL 2.0 or GL_ARB_texture_non_power_of_two
+
+D3D11: 1D textures in D3D_FEATURE_LEVEL_10_0
+
+PIPE_TEXTURE_RECT
+^^^^^^^^^^^^^^^^^
+2D surface with OpenGL GL_TEXTURE_RECTANGLE semantics.
+
+- depth0 must be 1
+- last_level must be 0
+- Must use unnormalized coordinates
+- Must use a clamp wrap mode
+
+OpenGL: GL_TEXTURE_RECTANGLE in GL 3.1 or GL_ARB_texture_rectangle or GL_NV_texture_rectangle
+
+OpenCL: can create OpenCL images based on this, that can then be sampled arbitrarily
+
+D3D11: not supported (only PIPE_TEXTURE_2D with normalized coordinates is supported)
+
+PIPE_TEXTURE_2D
+^^^^^^^^^^^^^^^
+2D surface accessed with normalized coordinates.
+
+UNIMPLEMENTED: 2D texture arrays not supported
+
+- If PIPE_CAP_NPOT_TEXTURES is not supported,
+  width and height must be powers of two
+- depth0 must be 1
+- Mipmaps can be used
+- Must use normalized coordinates
+- No special restrictions on wrap modes
+
+OpenGL: GL_TEXTURE_2D in GL 1.0
+
+- PIPE_CAP_NPOT_TEXTURES is equivalent to GL 2.0 or GL_ARB_texture_non_power_of_two
+
+OpenCL: can create OpenCL images based on this, that can then be sampled arbitrarily
+
+D3D11: 2D textures
+
+- PIPE_CAP_NPOT_TEXTURES is equivalent to D3D_FEATURE_LEVEL_9_3
+
+PIPE_TEXTURE_3D
+^^^^^^^^^^^^^^^
+
+3-dimensional array of texels.
+Mipmap dimensions are reduced in all 3 coordinates.
+
+- If PIPE_CAP_NPOT_TEXTURES is not supported,
+  width, height and depth must be powers of two
+- Must use normalized coordinates
+
+OpenGL: GL_TEXTURE_3D in GL 1.2 or GL_EXT_texture3D
+
+- PIPE_CAP_NPOT_TEXTURES is equivalent to GL 2.0 or GL_ARB_texture_non_power_of_two
+
+D3D11: 3D textures
+
+- PIPE_CAP_NPOT_TEXTURES is equivalent to D3D_FEATURE_LEVEL_10_0
+
+PIPE_TEXTURE_CUBE
+^^^^^^^^^^^^^^^^^
+
+Cube maps consist of 6 2D faces.
+The 6 surfaces form an imaginary cube, and sampling happens by mapping an
+input 3-vector to the point of the cube surface in that direction.
+
+Sampling may be optionally seamless, resulting in filtering taking samples
+from multiple surfaces near to the edge.
+UNIMPLEMENTED: seamless cube map sampling not supported
+
+UNIMPLEMENTED: cube map arrays not supported
+
+- Width and height must be equal
+- If PIPE_CAP_NPOT_TEXTURES is not supported,
+  width and height must be powers of two
+- Must use normalized coordinates
+
+OpenGL: GL_TEXTURE_CUBE_MAP in GL 1.3 or EXT_texture_cube_map
+
+- PIPE_CAP_NPOT_TEXTURES is equivalent to GL 2.0 or GL_ARB_texture_non_power_of_two
+- Seamless cube maps require GL 3.2 or GL_ARB_seamless_cube_map or GL_AMD_seamless_cubemap_per_texture
+- Cube map arrays require GL 4.0 or GL_ARB_texture_cube_map_array
+
+D3D11: 2D array textures with the D3D11_RESOURCE_MISC_TEXTURECUBE flag
+
+- PIPE_CAP_NPOT_TEXTURES is equivalent to D3D_FEATURE_LEVEL_10_0
+- Cube map arrays require D3D_FEATURE_LEVEL_10_1
+- TODO: are (non)seamless cube maps supported in D3D11? how?
+
+Surfaces
+--------
+
+Surfaces are views of a resource that can be bound as a framebuffer to serve as the render target or depth buffer.
+
+TODO: write much more on surfaces
+
+OpenGL: FBOs are collections of surfaces in GL 3.0 or GL_ARB_framebuffer_object
+
+D3D11: render target views and depth/stencil views
+
+Sampler views
+-------------
+
+Sampler views are views of a resource that can be bound to a pipeline stage to be sampled from shaders.
+
+TODO: write much more on sampler views
+
+OpenGL: texture objects are actually a sampler view and a resource in a single unit
+
+D3D11: shader resource views
diff --git a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
index 4adef5b8c0..a367fa3fe1 100644
--- a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
+++ b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
@@ -78,20 +78,13 @@ cell_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
       draw_set_mapped_vertex_buffer(draw, i, buf);
    }
    /* Map index buffer, if present */
-   if (info->indexed && cell->index_buffer.buffer) {
+   if (info->indexed && cell->index_buffer.buffer)
       mapped_indices = cell_resource(cell->index_buffer.buffer)->data;
-      mapped_indices += cell->index_buffer.offset;
-   }
 
-   draw_set_mapped_element_buffer_range(draw, (mapped_indices) ?
-                                        lp->index_buffer.index_size : 0,
-                                        info->index_bias,
-                                        info->min_index,
-                                        info->max_index,
-                                        mapped_indices);
+   draw_set_mapped_index_buffer(draw, mapped_indices);
 
    /* draw! */
-   draw_arrays(draw, info->mode, info->start, info->count);
+   draw_vbo(draw, info);
 
    /*
     * unmap vertex/index buffers - will cause draw module to flush
@@ -100,7 +93,7 @@ cell_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
       draw_set_mapped_vertex_buffer(draw, i, NULL);
    }
    if (mapped_indices) {
-      draw_set_mapped_element_buffer(draw, 0, 0, NULL);
+      draw_set_mapped_index_buffer(draw, NULL);
    }
 
    /*
diff --git a/src/gallium/drivers/cell/ppu/cell_state_vertex.c b/src/gallium/drivers/cell/ppu/cell_state_vertex.c
index 4e3701cd0a..a065d68b5a 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_vertex.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_vertex.c
@@ -102,7 +102,7 @@ cell_set_index_buffer(struct pipe_context *pipe,
    else
       memset(&cell->index_buffer, 0, sizeof(cell->index_buffer));
 
-   /* TODO make this more like a state */
+   draw_set_index_buffer(cell->draw, ib);
 }
 
 
diff --git a/src/gallium/drivers/galahad/glhd_context.c b/src/gallium/drivers/galahad/glhd_context.c
index fe14a287ef..383c448926 100644
--- a/src/gallium/drivers/galahad/glhd_context.c
+++ b/src/gallium/drivers/galahad/glhd_context.c
@@ -185,6 +185,12 @@ galahad_bind_fragment_sampler_states(struct pipe_context *_pipe,
    struct galahad_context *glhd_pipe = galahad_context(_pipe);
    struct pipe_context *pipe = glhd_pipe->pipe;
 
+   if (num_samplers > PIPE_MAX_SAMPLERS) {
+      glhd_error("%u fragment samplers requested, "
+         "but only %u are permitted by API",
+         num_samplers, PIPE_MAX_SAMPLERS);
+   }
+
    pipe->bind_fragment_sampler_states(pipe,
                                       num_samplers,
                                       samplers);
@@ -198,6 +204,12 @@ galahad_bind_vertex_sampler_states(struct pipe_context *_pipe,
    struct galahad_context *glhd_pipe = galahad_context(_pipe);
    struct pipe_context *pipe = glhd_pipe->pipe;
 
+   if (num_samplers > PIPE_MAX_VERTEX_SAMPLERS) {
+      glhd_error("%u vertex samplers requested, "
+         "but only %u are permitted by API",
+         num_samplers, PIPE_MAX_VERTEX_SAMPLERS);
+   }
+
    pipe->bind_vertex_sampler_states(pipe,
                                     num_samplers,
                                     samplers);
@@ -447,6 +459,19 @@ galahad_set_constant_buffer(struct pipe_context *_pipe,
    struct pipe_resource *unwrapped_resource;
    struct pipe_resource *resource = NULL;
 
+   if (shader >= PIPE_SHADER_TYPES) {
+      glhd_error("Unknown shader type %u", shader);
+   }
+
+   if (index &&
+      index >=
+         pipe->screen->get_param(pipe->screen, PIPE_CAP_MAX_CONST_BUFFERS)) {
+      glhd_error("Access to constant buffer %u requested, "
+         "but only %d are supported",
+         index,
+         pipe->screen->get_param(pipe->screen, PIPE_CAP_MAX_CONST_BUFFERS));
+   }
+
    /* XXX hmm? unwrap the input state */
    if (_resource) {
       unwrapped_resource = galahad_resource_unwrap(_resource);
@@ -972,5 +997,7 @@ galahad_context_create(struct pipe_screen *_screen, struct pipe_context *pipe)
 
    glhd_pipe->pipe = pipe;
 
+   glhd_warn("Created context %p", glhd_pipe);
+
    return &glhd_pipe->base;
 }
diff --git a/src/gallium/drivers/galahad/glhd_screen.c b/src/gallium/drivers/galahad/glhd_screen.c
index 4117485702..75e4c2d82e 100644
--- a/src/gallium/drivers/galahad/glhd_screen.c
+++ b/src/gallium/drivers/galahad/glhd_screen.c
@@ -30,6 +30,7 @@
 #include "pipe/p_screen.h"
 #include "pipe/p_state.h"
 #include "util/u_memory.h"
+#include "util/u_math.h"
 
 #include "glhd_public.h"
 #include "glhd_screen.h"
@@ -134,6 +135,33 @@ galahad_screen_resource_create(struct pipe_screen *_screen,
    struct pipe_screen *screen = glhd_screen->screen;
    struct pipe_resource *result;
 
+   if (templat->target >= PIPE_MAX_TEXTURE_TYPES)
+      glhd_warn("Received bogus resource target %d", templat->target);
+
+   if(templat->target != PIPE_TEXTURE_RECT && templat->target != PIPE_BUFFER && !screen->get_param(screen, PIPE_CAP_NPOT_TEXTURES))
+   {
+      if(!util_is_power_of_two(templat->width0) || !util_is_power_of_two(templat->height0))
+         glhd_warn("Requested NPOT (%ux%u) non-rectangle texture without NPOT support", templat->width0, templat->height0);
+   }
+
+   if(templat->target == PIPE_TEXTURE_RECT && templat->last_level)
+      glhd_warn("Rectangle textures cannot have mipmaps, but last_level = %u", templat->last_level);
+
+   if(templat->target == PIPE_BUFFER && templat->last_level)
+      glhd_warn("Buffers cannot have mipmaps, but last_level = %u", templat->last_level);
+
+   if(templat->target != PIPE_TEXTURE_3D && templat->depth0 != 1)
+      glhd_warn("Only 3D textures can have depth != 1, but received target %u and depth %u", templat->target, templat->depth0);
+
+   if(templat->target == PIPE_TEXTURE_1D && templat->height0 != 1)
+     glhd_warn("1D textures must have height 1 but got asked for height %u", templat->height0);
+
+   if(templat->target == PIPE_BUFFER && templat->height0 != 1)
+     glhd_warn("Buffers must have height 1 but got asked for height %u", templat->height0);
+
+   if(templat->target == PIPE_TEXTURE_CUBE && templat->width0 != templat->height0)
+      glhd_warn("Cube maps must be square, but got asked for %ux%u", templat->width0, templat->height0);
+
    result = screen->resource_create(screen,
                                     templat);
 
@@ -330,5 +358,7 @@ galahad_screen_create(struct pipe_screen *screen)
 
    glhd_screen->screen = screen;
 
+   glhd_warn("Created screen %p", glhd_screen);
+
    return &glhd_screen->base;
 }
diff --git a/src/gallium/drivers/i915/i915_context.c b/src/gallium/drivers/i915/i915_context.c
index 2beb9e3091..847dd6dd47 100644
--- a/src/gallium/drivers/i915/i915_context.c
+++ b/src/gallium/drivers/i915/i915_context.c
@@ -66,18 +66,9 @@ i915_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
    /*
     * Map index buffer, if present
     */
-   if (info->indexed && i915->index_buffer.buffer) {
-      char *indices = (char *) i915_buffer(i915->index_buffer.buffer)->data;
-      mapped_indices = (void *) (indices + i915->index_buffer.offset);
-   }
-
-   draw_set_mapped_element_buffer_range(draw, (mapped_indices) ?
-                                        i915->index_buffer.index_size : 0,
-                                        info->index_bias,
-                                        info->min_index,
-                                        info->max_index,
-                                        mapped_indices);
-
+   if (info->indexed && i915->index_buffer.buffer)
+      mapped_indices = i915_buffer(i915->index_buffer.buffer)->data;
+   draw_set_mapped_index_buffer(draw, mapped_indices);
 
    draw_set_mapped_constant_buffer(draw, PIPE_SHADER_VERTEX, 0,
                                    i915->current.constants[PIPE_SHADER_VERTEX],
@@ -87,7 +78,7 @@ i915_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
    /*
     * Do the drawing
     */
-   draw_arrays(i915->draw, info->mode, info->start, info->count);
+   draw_vbo(i915->draw, info);
 
    /*
     * unmap vertex/index buffers
@@ -96,9 +87,8 @@ i915_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
       draw_set_mapped_vertex_buffer(draw, i, NULL);
    }
 
-   if (mapped_indices) {
-      draw_set_mapped_element_buffer(draw, 0, 0, NULL);
-   }
+   if (mapped_indices)
+      draw_set_mapped_index_buffer(draw, NULL);
 }
 
 
diff --git a/src/gallium/drivers/i915/i915_resource_texture.c b/src/gallium/drivers/i915/i915_resource_texture.c
index 752ddaae7b..c5c6179b16 100644
--- a/src/gallium/drivers/i915/i915_resource_texture.c
+++ b/src/gallium/drivers/i915/i915_resource_texture.c
@@ -360,6 +360,7 @@ i915_texture_layout(struct i915_texture * tex)
    switch (pt->target) {
    case PIPE_TEXTURE_1D:
    case PIPE_TEXTURE_2D:
+   case PIPE_TEXTURE_RECT:
       if (!i9x5_special_layout(tex))
          i915_texture_layout_2d(tex);
       break;
@@ -605,6 +606,7 @@ i945_texture_layout(struct i915_texture * tex)
    switch (pt->target) {
    case PIPE_TEXTURE_1D:
    case PIPE_TEXTURE_2D:
+   case PIPE_TEXTURE_RECT:
       if (!i9x5_special_layout(tex))
          i945_texture_layout_2d(tex);
       break;
@@ -829,7 +831,8 @@ i915_texture_from_handle(struct pipe_screen * screen,
    buffer = iws->buffer_from_handle(iws, whandle, &stride);
 
    /* Only supports one type */
-   if (template->target != PIPE_TEXTURE_2D ||
+   if ((template->target != PIPE_TEXTURE_2D &&
+       template->target != PIPE_TEXTURE_RECT) ||
        template->last_level != 0 ||
        template->depth0 != 1) {
       return NULL;
diff --git a/src/gallium/drivers/i915/i915_state.c b/src/gallium/drivers/i915/i915_state.c
index 385c3b2d2d..bbfcff6bc4 100644
--- a/src/gallium/drivers/i915/i915_state.c
+++ b/src/gallium/drivers/i915/i915_state.c
@@ -294,8 +294,6 @@ static void i915_bind_sampler_states(struct pipe_context *pipe,
    struct i915_context *i915 = i915_context(pipe);
    unsigned i;
 
-   assert(num <= PIPE_MAX_SAMPLERS);
-
    /* Check for no-op */
    if (num == i915->num_samplers &&
        !memcmp(i915->sampler, sampler, num * sizeof(void *)))
@@ -529,9 +527,6 @@ static void i915_set_constant_buffer(struct pipe_context *pipe,
    struct i915_context *i915 = i915_context(pipe);
    draw_flush(i915->draw);
 
-   assert(shader < PIPE_SHADER_TYPES);
-   assert(index == 0);
-
    /* Make a copy of shader constants.
     * During fragment program translation we may add additional
     * constants to the array.
@@ -822,7 +817,8 @@ static void i915_set_index_buffer(struct pipe_context *pipe,
    else
       memset(&i915->index_buffer, 0, sizeof(i915->index_buffer));
 
-   /* TODO make this more like a state */
+   /* pass-through to draw module */
+   draw_set_index_buffer(i915->draw, ib);
 }
 
 static void
diff --git a/src/gallium/drivers/i965/brw_batchbuffer.c b/src/gallium/drivers/i965/brw_batchbuffer.c
index 8b3f46f2c1..e80067f3b1 100644
--- a/src/gallium/drivers/i965/brw_batchbuffer.c
+++ b/src/gallium/drivers/i965/brw_batchbuffer.c
@@ -162,7 +162,7 @@ brw_batchbuffer_emit_reloc(struct brw_batchbuffer *batch,
 
    if (batch->ptr - batch->map > batch->buf->size) {
       debug_printf("bad relocation ptr %p map %p offset %li size %i\n",
-		   batch->ptr, batch->map, batch->ptr - batch->map, batch->buf->size);
+		   batch->ptr, batch->map, (long) (batch->ptr - batch->map), batch->buf->size);
 
       return PIPE_ERROR_OUT_OF_MEMORY;
    }
diff --git a/src/gallium/drivers/i965/brw_resource_texture.c b/src/gallium/drivers/i965/brw_resource_texture.c
index ffd0f38672..3860d18a7a 100644
--- a/src/gallium/drivers/i965/brw_resource_texture.c
+++ b/src/gallium/drivers/i965/brw_resource_texture.c
@@ -66,6 +66,7 @@ static GLuint translate_tex_target( unsigned target )
       return BRW_SURFACE_1D;
 
    case PIPE_TEXTURE_2D: 
+   case PIPE_TEXTURE_RECT:
       return BRW_SURFACE_2D;
 
    case PIPE_TEXTURE_3D: 
@@ -498,7 +499,8 @@ brw_texture_from_handle(struct pipe_screen *screen,
    unsigned pitch;
    GLuint format;
 
-   if (template->target != PIPE_TEXTURE_2D ||
+   if ((template->target != PIPE_TEXTURE_2D
+         && template->target != PIPE_TEXTURE_RECT)  ||
        template->last_level != 0 ||
        template->depth0 != 1)
       return NULL;
diff --git a/src/gallium/drivers/i965/brw_wm_debug.c b/src/gallium/drivers/i965/brw_wm_debug.c
index e2767264e7..1b2aa93bef 100644
--- a/src/gallium/drivers/i965/brw_wm_debug.c
+++ b/src/gallium/drivers/i965/brw_wm_debug.c
@@ -101,16 +101,16 @@ void brw_wm_print_value( struct brw_wm_compile *c,
       debug_printf("undef");
    else if( value - c->vreg >= 0 &&
 	    value - c->vreg < BRW_WM_MAX_VREG)
-      debug_printf("r%d", value - c->vreg);
+      debug_printf("r%ld", (long) (value - c->vreg));
    else if (value - c->creg >= 0 &&
 	    value - c->creg < BRW_WM_MAX_PARAM)
-      debug_printf("c%d", value - c->creg);
+      debug_printf("c%ld", (long) (value - c->creg));
    else if (value - c->payload.input_interp >= 0 &&
 	    value - c->payload.input_interp < PIPE_MAX_SHADER_INPUTS)
-      debug_printf("i%d", value - c->payload.input_interp);
+      debug_printf("i%ld", (long) (value - c->payload.input_interp));
    else if (value - c->payload.depth >= 0 &&
 	    value - c->payload.depth < PIPE_MAX_SHADER_INPUTS)
-      debug_printf("d%d", value - c->payload.depth);
+      debug_printf("d%ld", (long) (value - c->payload.depth));
    else 
       debug_printf("?");
 }
diff --git a/src/gallium/drivers/llvmpipe/Makefile b/src/gallium/drivers/llvmpipe/Makefile
index 2892b62920..dec874623e 100644
--- a/src/gallium/drivers/llvmpipe/Makefile
+++ b/src/gallium/drivers/llvmpipe/Makefile
@@ -27,6 +27,8 @@ C_SOURCES = \
 	lp_scene_queue.c \
 	lp_screen.c \
 	lp_setup.c \
+	lp_setup_coef.c \
+	lp_setup_coef_intrin.c \
 	lp_setup_line.c \
 	lp_setup_point.c \
 	lp_setup_tri.c \
diff --git a/src/gallium/drivers/llvmpipe/SConscript b/src/gallium/drivers/llvmpipe/SConscript
index 5583fca38e..8d57db72cf 100644
--- a/src/gallium/drivers/llvmpipe/SConscript
+++ b/src/gallium/drivers/llvmpipe/SConscript
@@ -63,6 +63,8 @@ llvmpipe = env.ConvenienceLibrary(
 		'lp_setup_line.c',
 		'lp_setup_point.c',
 		'lp_setup_tri.c',
+		'lp_setup_coef.c',
+		'lp_setup_coef_intrin.c',
 		'lp_setup_vbuf.c',
 		'lp_state_blend.c',
 		'lp_state_clip.c',
diff --git a/src/gallium/drivers/llvmpipe/lp_context.c b/src/gallium/drivers/llvmpipe/lp_context.c
index 7543bd7b2b..39f2c6085e 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.c
+++ b/src/gallium/drivers/llvmpipe/lp_context.c
@@ -85,6 +85,14 @@ static void llvmpipe_destroy( struct pipe_context *pipe )
    align_free( llvmpipe );
 }
 
+/* pipe_context::flush hook: forwards to llvmpipe_flush(), passing this function's name as the reason. */
+static void
+do_flush( struct pipe_context *pipe,
+          unsigned flags,
+          struct pipe_fence_handle **fence)
+{
+   llvmpipe_flush(pipe, flags, fence, __FUNCTION__);
+}
 
 struct pipe_context *
 llvmpipe_create_context( struct pipe_screen *screen, void *priv )
@@ -109,7 +117,7 @@ llvmpipe_create_context( struct pipe_screen *screen, void *priv )
    llvmpipe->pipe.destroy = llvmpipe_destroy;
    llvmpipe->pipe.set_framebuffer_state = llvmpipe_set_framebuffer_state;
    llvmpipe->pipe.clear = llvmpipe_clear;
-   llvmpipe->pipe.flush = llvmpipe_flush;
+   llvmpipe->pipe.flush = do_flush;
 
    llvmpipe_init_blend_funcs(llvmpipe);
    llvmpipe_init_clip_funcs(llvmpipe);
@@ -147,9 +155,13 @@ llvmpipe_create_context( struct pipe_screen *screen, void *priv )
    draw_install_aapoint_stage(llvmpipe->draw, &llvmpipe->pipe);
    draw_install_pstipple_stage(llvmpipe->draw, &llvmpipe->pipe);
 
-   /* convert points and lines into triangles: */
-   draw_wide_point_threshold(llvmpipe->draw, 0.0);
-   draw_wide_line_threshold(llvmpipe->draw, 0.0);
+   /* convert points and lines into triangles: 
+    * (otherwise, draw points and lines natively)
+    */
+   draw_wide_point_sprites(llvmpipe->draw, FALSE);
+   draw_enable_point_sprites(llvmpipe->draw, FALSE);
+   draw_wide_point_threshold(llvmpipe->draw, 10000.0);
+   draw_wide_line_threshold(llvmpipe->draw, 10000.0);
 
 #if USE_DRAW_STAGE_PSTIPPLE
    /* Do polygon stipple w/ texture map + frag prog? */
diff --git a/src/gallium/drivers/llvmpipe/lp_context.h b/src/gallium/drivers/llvmpipe/lp_context.h
index 50f9091c3c..34fa20e204 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_context.h
@@ -101,6 +101,9 @@ struct llvmpipe_context {
    
    /** Vertex format */
    struct vertex_info vertex_info;
+   
+   /** Which vertex shader output slot contains point size */
+   int psize_slot;
 
    /** Fragment shader input interpolation info */
    unsigned num_inputs;
diff --git a/src/gallium/drivers/llvmpipe/lp_debug.h b/src/gallium/drivers/llvmpipe/lp_debug.h
index 92fb2b3ee5..a928ee38be 100644
--- a/src/gallium/drivers/llvmpipe/lp_debug.h
+++ b/src/gallium/drivers/llvmpipe/lp_debug.h
@@ -46,6 +46,8 @@ st_print_current(void);
 #define DEBUG_SHOW_TILES    0x200
 #define DEBUG_SHOW_SUBTILES 0x400
 #define DEBUG_COUNTERS      0x800
+#define DEBUG_SCENE         0x1000
+#define DEBUG_FENCE         0x2000
 
 
 #ifdef DEBUG
diff --git a/src/gallium/drivers/llvmpipe/lp_draw_arrays.c b/src/gallium/drivers/llvmpipe/lp_draw_arrays.c
index e73b431cb4..3af5c8d5c5 100644
--- a/src/gallium/drivers/llvmpipe/lp_draw_arrays.c
+++ b/src/gallium/drivers/llvmpipe/lp_draw_arrays.c
@@ -68,25 +68,17 @@ llvmpipe_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
    }
 
    /* Map index buffer, if present */
-   if (info->indexed && lp->index_buffer.buffer) {
-      char *indices = (char *) llvmpipe_resource_data(lp->index_buffer.buffer);
-      mapped_indices = (void *) (indices + lp->index_buffer.offset);
-   }
+   if (info->indexed && lp->index_buffer.buffer)
+      mapped_indices = llvmpipe_resource_data(lp->index_buffer.buffer);
 
-   draw_set_mapped_element_buffer_range(draw, (mapped_indices) ?
-                                        lp->index_buffer.index_size : 0,
-                                        info->index_bias,
-                                        info->min_index,
-                                        info->max_index,
-                                        mapped_indices);
+   draw_set_mapped_index_buffer(draw, mapped_indices);
 
    llvmpipe_prepare_vertex_sampling(lp,
                                     lp->num_vertex_sampler_views,
                                     lp->vertex_sampler_views);
 
    /* draw! */
-   draw_arrays_instanced(draw, info->mode, info->start, info->count,
-         info->start_instance, info->instance_count);
+   draw_vbo(draw, info);
 
    /*
     * unmap vertex/index buffers
@@ -95,7 +87,7 @@ llvmpipe_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
       draw_set_mapped_vertex_buffer(draw, i, NULL);
    }
    if (mapped_indices) {
-      draw_set_mapped_element_buffer(draw, 0, 0, NULL);
+      draw_set_mapped_index_buffer(draw, NULL);
    }
    llvmpipe_cleanup_vertex_sampling(lp);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_fence.c b/src/gallium/drivers/llvmpipe/lp_fence.c
index f9805e5d68..3a55e76bc3 100644
--- a/src/gallium/drivers/llvmpipe/lp_fence.c
+++ b/src/gallium/drivers/llvmpipe/lp_fence.c
@@ -44,6 +44,7 @@
 struct lp_fence *
 lp_fence_create(unsigned rank)
 {
+   static int fence_id;
    struct lp_fence *fence = CALLOC_STRUCT(lp_fence);
 
    pipe_reference_init(&fence->reference, 1);
@@ -51,8 +52,12 @@ lp_fence_create(unsigned rank)
    pipe_mutex_init(fence->mutex);
    pipe_condvar_init(fence->signalled);
 
+   fence->id = fence_id++;
    fence->rank = rank;
 
+   if (LP_DEBUG & DEBUG_FENCE)
+      debug_printf("%s %u\n", __FUNCTION__, fence->id);   /* id is unsigned */
+
    return fence;
 }
 
@@ -61,6 +66,9 @@ lp_fence_create(unsigned rank)
 void
 lp_fence_destroy(struct lp_fence *fence)
 {
+   if (LP_DEBUG & DEBUG_FENCE)
+      debug_printf("%s %u\n", __FUNCTION__, fence->id);   /* id is unsigned */
+
    pipe_mutex_destroy(fence->mutex);
    pipe_condvar_destroy(fence->signalled);
    FREE(fence);
@@ -68,82 +76,49 @@ lp_fence_destroy(struct lp_fence *fence)
 
 
 /**
- * For reference counting.
- * This is a Gallium API function.
- */
-static void
-llvmpipe_fence_reference(struct pipe_screen *screen,
-                         struct pipe_fence_handle **ptr,
-                         struct pipe_fence_handle *fence)
-{
-   struct lp_fence **old = (struct lp_fence **) ptr;
-   struct lp_fence *f = (struct lp_fence *) fence;
-
-   lp_fence_reference(old, f);
-}
-
-
-/**
- * Has the fence been executed/finished?
- * This is a Gallium API function.
- */
-static int
-llvmpipe_fence_signalled(struct pipe_screen *screen,
-                         struct pipe_fence_handle *fence,
-                         unsigned flag)
-{
-   struct lp_fence *f = (struct lp_fence *) fence;
-
-   return f->count == f->rank;
-}
-
-
-/**
- * Wait for the fence to finish.
- * This is a Gallium API function.
- */
-static int
-llvmpipe_fence_finish(struct pipe_screen *screen,
-                      struct pipe_fence_handle *fence_handle,
-                      unsigned flag)
-{
-   struct lp_fence *fence = (struct lp_fence *) fence_handle;
-
-   pipe_mutex_lock(fence->mutex);
-   while (fence->count < fence->rank) {
-      pipe_condvar_wait(fence->signalled, fence->mutex);
-   }
-   pipe_mutex_unlock(fence->mutex);
-
-   return 0;
-}
-
-
-/**
  * Called by the rendering threads to increment the fence counter.
  * When the counter == the rank, the fence is finished.
  */
 void
 lp_fence_signal(struct lp_fence *fence)
 {
+   if (LP_DEBUG & DEBUG_FENCE)
+      debug_printf("%s %u\n", __FUNCTION__, fence->id);
+
    pipe_mutex_lock(fence->mutex);
 
    fence->count++;
    assert(fence->count <= fence->rank);
 
-   LP_DBG(DEBUG_RAST, "%s count=%u rank=%u\n", __FUNCTION__,
-          fence->count, fence->rank);
+   if (LP_DEBUG & DEBUG_FENCE)
+      debug_printf("%s count=%u rank=%u\n", __FUNCTION__,
+                   fence->count, fence->rank);
 
-   pipe_condvar_signal(fence->signalled);
+   /* Wake every waiter blocked on the condition variable (the mutex is
+    * still held here); more than one thread may be in lp_fence_wait(). */
+   pipe_condvar_broadcast(fence->signalled);
 
    pipe_mutex_unlock(fence->mutex);
 }
 
+/** Non-blocking check: TRUE when count equals rank (no lock is taken). */
+boolean lp_fence_signalled(struct lp_fence *f)
+{
+   return f->rank == f->count;
+}
 
 void
-llvmpipe_init_screen_fence_funcs(struct pipe_screen *screen)
+lp_fence_wait(struct lp_fence *f)
 {
-   screen->fence_reference = llvmpipe_fence_reference;
-   screen->fence_signalled = llvmpipe_fence_signalled;
-   screen->fence_finish = llvmpipe_fence_finish;
+   if (LP_DEBUG & DEBUG_FENCE)
+      debug_printf("%s %u\n", __FUNCTION__, f->id);
+   /* Sleep on the condvar until count has reached rank. */
+   pipe_mutex_lock(f->mutex);
+   assert(f->issued);
+   while (f->count < f->rank) {
+      pipe_condvar_wait(f->signalled, f->mutex);
+   }
+   pipe_mutex_unlock(f->mutex);
 }
+
+
diff --git a/src/gallium/drivers/llvmpipe/lp_fence.h b/src/gallium/drivers/llvmpipe/lp_fence.h
index 13358fb99f..3c59118780 100644
--- a/src/gallium/drivers/llvmpipe/lp_fence.h
+++ b/src/gallium/drivers/llvmpipe/lp_fence.h
@@ -41,10 +41,12 @@ struct pipe_screen;
 struct lp_fence
 {
    struct pipe_reference reference;
+   unsigned id;              /**< set from a running counter in lp_fence_create(); printed in DEBUG_FENCE traces */
 
    pipe_mutex mutex;
    pipe_condvar signalled;
 
+   boolean issued;           /**< asserted by lp_fence_wait(); returned by lp_fence_issued() */
    unsigned rank;
    unsigned count;
 };
@@ -57,6 +59,11 @@ lp_fence_create(unsigned rank);
 void
 lp_fence_signal(struct lp_fence *fence);
 
+boolean
+lp_fence_signalled(struct lp_fence *fence);
+
+void
+lp_fence_wait(struct lp_fence *fence);
 
 void
 llvmpipe_init_screen_fence_funcs(struct pipe_screen *screen);
@@ -78,5 +85,11 @@ lp_fence_reference(struct lp_fence **ptr,
    *ptr = f;
 }
 
+/** Return the fence's 'issued' flag. */
+static INLINE boolean
+lp_fence_issued(const struct lp_fence *fence)
+{
+   return fence->issued;
+}
 
 #endif /* LP_FENCE_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_flush.c b/src/gallium/drivers/llvmpipe/lp_flush.c
index 845292f4ab..e2c723b7a8 100644
--- a/src/gallium/drivers/llvmpipe/lp_flush.c
+++ b/src/gallium/drivers/llvmpipe/lp_flush.c
@@ -31,6 +31,7 @@
 
 
 #include "pipe/p_defines.h"
+#include "pipe/p_screen.h"
 #include "util/u_string.h"
 #include "draw/draw_context.h"
 #include "lp_flush.h"
@@ -45,14 +46,15 @@
 void
 llvmpipe_flush( struct pipe_context *pipe,
                 unsigned flags,
-                struct pipe_fence_handle **fence )
+                struct pipe_fence_handle **fence,
+                const char *reason)
 {
    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
 
    draw_flush(llvmpipe->draw);
 
    /* ask the setup module to flush */
-   lp_setup_flush(llvmpipe->setup, flags, fence);
+   lp_setup_flush(llvmpipe->setup, flags, fence, reason);
 
    /* Enable to dump BMPs of the color/depth buffers each frame */
    if (0) {
@@ -76,6 +78,17 @@ llvmpipe_flush( struct pipe_context *pipe,
    }
 }
 
+/* Flush like llvmpipe_flush(), then wait for the returned fence (if any). */
+void
+llvmpipe_finish(struct pipe_context *pipe, const char *reason)
+{
+   struct pipe_fence_handle *fence = NULL;
+   llvmpipe_flush(pipe, 0, &fence, reason);
+   if (!fence)
+      return;
+   pipe->screen->fence_finish(pipe->screen, fence, 0);
+   pipe->screen->fence_reference(pipe->screen, &fence, NULL);
+}
 
 /**
  * Flush context if necessary.
@@ -93,7 +106,8 @@ llvmpipe_flush_resource(struct pipe_context *pipe,
                         unsigned flush_flags,
                         boolean read_only,
                         boolean cpu_access,
-                        boolean do_not_block)
+                        boolean do_not_block,
+                        const char *reason)
 {
    unsigned referenced;
 
@@ -106,31 +120,16 @@ llvmpipe_flush_resource(struct pipe_context *pipe,
          /*
           * Flush and wait.
           */
-
-         struct pipe_fence_handle *fence = NULL;
-
          if (do_not_block)
             return FALSE;
 
-         /*
-          * Do the unswizzling in parallel.
-          *
-          * XXX: Don't abuse the PIPE_FLUSH_FRAME flag for this.
-          */
-         flush_flags |= PIPE_FLUSH_FRAME;
-
-         llvmpipe_flush(pipe, flush_flags, &fence);
-
-         if (fence) {
-            pipe->screen->fence_finish(pipe->screen, fence, 0);
-            pipe->screen->fence_reference(pipe->screen, &fence, NULL);
-         }
+         llvmpipe_finish(pipe, reason);
       } else {
          /*
           * Just flush.
           */
 
-         llvmpipe_flush(pipe, flush_flags, NULL);
+         llvmpipe_flush(pipe, flush_flags, NULL, reason);
       }
    }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_flush.h b/src/gallium/drivers/llvmpipe/lp_flush.h
index 7b605681a9..bb538b2bd8 100644
--- a/src/gallium/drivers/llvmpipe/lp_flush.h
+++ b/src/gallium/drivers/llvmpipe/lp_flush.h
@@ -34,8 +34,14 @@ struct pipe_context;
 struct pipe_fence_handle;
 
 void
-llvmpipe_flush(struct pipe_context *pipe, unsigned flags,
-               struct pipe_fence_handle **fence);
+llvmpipe_flush(struct pipe_context *pipe,
+               unsigned flags,
+               struct pipe_fence_handle **fence,
+               const char *reason);
+
+void
+llvmpipe_finish( struct pipe_context *pipe,
+                 const char *reason );
 
 boolean
 llvmpipe_flush_resource(struct pipe_context *pipe,
@@ -45,6 +51,7 @@ llvmpipe_flush_resource(struct pipe_context *pipe,
                         unsigned flush_flags,
                         boolean read_only,
                         boolean cpu_access,
-                        boolean do_not_block);
+                        boolean do_not_block,
+                        const char *reason);
 
 #endif
diff --git a/src/gallium/drivers/llvmpipe/lp_perf.c b/src/gallium/drivers/llvmpipe/lp_perf.c
index 083e7e30a5..e22532f25c 100644
--- a/src/gallium/drivers/llvmpipe/lp_perf.c
+++ b/src/gallium/drivers/llvmpipe/lp_perf.c
@@ -46,7 +46,7 @@ lp_print_counters(void)
 {
    if (LP_DEBUG & DEBUG_COUNTERS) {
       unsigned total_64, total_16, total_4;
-      float p1, p2, p3, p4;
+      float p1, p2, p3, p5, p6;
 
       debug_printf("llvmpipe: nr_triangles:                 %9u\n", lp_count.nr_tris);
       debug_printf("llvmpipe: nr_culled_triangles:          %9u\n", lp_count.nr_culled_tris);
@@ -58,11 +58,15 @@ lp_print_counters(void)
       p1 = 100.0 * (float) lp_count.nr_empty_64 / (float) total_64;
       p2 = 100.0 * (float) lp_count.nr_fully_covered_64 / (float) total_64;
       p3 = 100.0 * (float) lp_count.nr_partially_covered_64 / (float) total_64;
-      p4 = 100.0 * (float) lp_count.nr_shade_opaque_64 / (float) total_64;
+      p5 = 100.0 * (float) lp_count.nr_shade_opaque_64 / (float) total_64;
+      p6 = 100.0 * (float) lp_count.nr_shade_64 / (float) total_64;
 
       debug_printf("llvmpipe: nr_64x64:                     %9u\n", total_64);
       debug_printf("llvmpipe:   nr_fully_covered_64x64:     %9u (%3.0f%% of %u)\n", lp_count.nr_fully_covered_64, p2, total_64);
-      debug_printf("llvmpipe:     nr_shade_opaque_64x64:    %9u (%3.0f%% of %u)\n", lp_count.nr_shade_opaque_64, p4, total_64);
+      debug_printf("llvmpipe:     nr_shade_opaque_64x64:    %9u (%3.0f%% of %u)\n", lp_count.nr_shade_opaque_64, p5, total_64);
+      debug_printf("llvmpipe:        nr_pure_shade_opaque:  %9u (%3.0f%% of %u)\n", lp_count.nr_pure_shade_opaque_64, 0.0, lp_count.nr_shade_opaque_64);
+      debug_printf("llvmpipe:     nr_shade_64x64:           %9u (%3.0f%% of %u)\n", lp_count.nr_shade_64, p6, total_64);
+      debug_printf("llvmpipe:        nr_pure_shade:         %9u (%3.0f%% of %u)\n", lp_count.nr_pure_shade_64, 0.0, lp_count.nr_shade_64);
       debug_printf("llvmpipe:   nr_partially_covered_64x64: %9u (%3.0f%% of %u)\n", lp_count.nr_partially_covered_64, p3, total_64);
       debug_printf("llvmpipe:   nr_empty_64x64:             %9u (%3.0f%% of %u)\n", lp_count.nr_empty_64, p1, total_64);
 
@@ -79,12 +83,17 @@ lp_print_counters(void)
       debug_printf("llvmpipe:   nr_partially_covered_16x16: %9u (%3.0f%% of %u)\n", lp_count.nr_partially_covered_16, p3, total_16);
       debug_printf("llvmpipe:   nr_empty_16x16:             %9u (%3.0f%% of %u)\n", lp_count.nr_empty_16, p1, total_16);
 
-      total_4 = (lp_count.nr_empty_4 + lp_count.nr_non_empty_4);
+      total_4 = (lp_count.nr_empty_4 +
+                 lp_count.nr_fully_covered_4 +
+                 lp_count.nr_partially_covered_4);
 
       p1 = 100.0 * (float) lp_count.nr_empty_4 / (float) total_4;
-      p2 = 100.0 * (float) lp_count.nr_non_empty_4 / (float) total_4;
+      p2 = 100.0 * (float) lp_count.nr_fully_covered_4 / (float) total_4;
+      p3 = 100.0 * (float) lp_count.nr_partially_covered_4 / (float) total_4;
 
-      debug_printf("llvmpipe: nr_4x4:                       %9u\n", total_4);
+      debug_printf("llvmpipe: nr_tri_4x4:                   %9u\n", total_4);
+      debug_printf("llvmpipe:   nr_fully_covered_4x4:       %9u (%3.0f%% of %u)\n", lp_count.nr_fully_covered_4, p2, total_4);
+      debug_printf("llvmpipe:   nr_partially_covered_4x4:   %9u (%3.0f%% of %u)\n", lp_count.nr_partially_covered_4, p3, total_4);
       debug_printf("llvmpipe:   nr_empty_4x4:               %9u (%3.0f%% of %u)\n", lp_count.nr_empty_4, p1, total_4);
       debug_printf("llvmpipe:   nr_non_empty_4x4:           %9u (%3.0f%% of %u)\n", lp_count.nr_non_empty_4, p2, total_4);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_perf.h b/src/gallium/drivers/llvmpipe/lp_perf.h
index 4774f64550..c28652fc30 100644
--- a/src/gallium/drivers/llvmpipe/lp_perf.h
+++ b/src/gallium/drivers/llvmpipe/lp_perf.h
@@ -44,11 +44,16 @@ struct lp_counters
    unsigned nr_empty_64;
    unsigned nr_fully_covered_64;
    unsigned nr_partially_covered_64;
+   unsigned nr_pure_shade_opaque_64;
+   unsigned nr_pure_shade_64;
+   unsigned nr_shade_64;
    unsigned nr_shade_opaque_64;
    unsigned nr_empty_16;
    unsigned nr_fully_covered_16;
    unsigned nr_partially_covered_16;
    unsigned nr_empty_4;
+   unsigned nr_fully_covered_4;
+   unsigned nr_partially_covered_4;
    unsigned nr_non_empty_4;
    unsigned nr_llvm_compiles;
    int64_t llvm_compile_time;  /**< total, in microseconds */
@@ -66,9 +71,11 @@ extern struct lp_counters lp_count;
 #ifdef DEBUG
 #define LP_COUNT(counter) lp_count.counter++
 #define LP_COUNT_ADD(counter, incr)  lp_count.counter += (incr)
+#define LP_COUNT_GET(counter) (lp_count.counter)
 #else
 #define LP_COUNT(counter)
 #define LP_COUNT_ADD(counter, incr) (void) incr
+#define LP_COUNT_GET(counter) 0
 #endif
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_query.c b/src/gallium/drivers/llvmpipe/lp_query.c
index 02eeaf6487..67fd797af2 100644
--- a/src/gallium/drivers/llvmpipe/lp_query.c
+++ b/src/gallium/drivers/llvmpipe/lp_query.c
@@ -35,9 +35,8 @@
 #include "util/u_memory.h"
 #include "lp_context.h"
 #include "lp_flush.h"
+#include "lp_fence.h"
 #include "lp_query.h"
-#include "lp_rast.h"
-#include "lp_rast_priv.h"
 #include "lp_state.h"
 
 
@@ -69,12 +68,7 @@ llvmpipe_destroy_query(struct pipe_context *pipe, struct pipe_query *q)
    struct llvmpipe_query *pq = llvmpipe_query(q);
    /* query might still be in process if we never waited for the result */
    if (!pq->done) {
-     struct pipe_fence_handle *fence = NULL;
-     llvmpipe_flush(pipe, 0, &fence);
-     if (fence) {
-         pipe->screen->fence_finish(pipe->screen, fence, 0);
-         pipe->screen->fence_reference(pipe->screen, &fence, NULL);
-      }
+      llvmpipe_finish(pipe, __FUNCTION__);
    }
 
    pipe_mutex_destroy(pq->mutex);
@@ -93,16 +87,11 @@ llvmpipe_get_query_result(struct pipe_context *pipe,
 
    if (!pq->done) {
       if (wait) {
-         struct pipe_fence_handle *fence = NULL;
-         llvmpipe_flush(pipe, 0, &fence);
-         if (fence) {
-            pipe->screen->fence_finish(pipe->screen, fence, 0);
-            pipe->screen->fence_reference(pipe->screen, &fence, NULL);
-         }
+         llvmpipe_finish(pipe, __FUNCTION__);
       }
       /* this is a bit inconsequent but should be ok */
       else {
-         llvmpipe_flush(pipe, 0, NULL);
+         llvmpipe_flush(pipe, 0, NULL, __FUNCTION__);
       }
    }
 
@@ -125,12 +114,7 @@ llvmpipe_begin_query(struct pipe_context *pipe, struct pipe_query *q)
     * frame of rendering.
     */
    if (pq->binned) {
-      struct pipe_fence_handle *fence;
-      llvmpipe_flush(pipe, 0, &fence);
-      if (fence) {
-         pipe->screen->fence_finish(pipe->screen, fence, 0);
-         pipe->screen->fence_reference(pipe->screen, &fence, NULL);
-      }
+      llvmpipe_finish(pipe, __FUNCTION__);
    }
 
    lp_setup_begin_query(llvmpipe->setup, pq);
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c
index 3215d0f652..b1c306bbe9 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast.c
@@ -316,43 +316,6 @@ lp_rast_clear_zstencil(struct lp_rasterizer_task *task,
 }
 
 
-/**
- * Load tile color from the framebuffer surface.
- * This is a bin command called during bin processing.
- */
-#if 0
-void
-lp_rast_load_color(struct lp_rasterizer_task *task,
-                   const union lp_rast_cmd_arg arg)
-{
-   struct lp_rasterizer *rast = task->rast;
-   unsigned buf;
-   enum lp_texture_usage usage;
-
-   LP_DBG(DEBUG_RAST, "%s at %u, %u\n", __FUNCTION__, x, y);
-
-   if (scene->has_color_clear)
-      usage = LP_TEX_USAGE_WRITE_ALL;
-   else
-      usage = LP_TEX_USAGE_READ_WRITE;
-
-   /* Get pointers to color tile(s).
-    * This will convert linear data to tiled if needed.
-    */
-   for (buf = 0; buf < rast->state.nr_cbufs; buf++) {
-      struct pipe_surface *cbuf = rast->curr_scene->fb.cbufs[buf];
-      struct llvmpipe_texture *lpt;
-      assert(cbuf);
-      lpt = llvmpipe_texture(cbuf->texture);
-      task->color_tiles[buf] = llvmpipe_get_texture_tile(lpt,
-                                                         cbuf->face + cbuf->zslice,
-                                                         cbuf->level,
-                                                         usage,
-                                                         task->x, task->y);
-      assert(task->color_tiles[buf]);
-   }
-}
-#endif
 
 
 /**
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h
index 44319a0ad6..b4564ef33b 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast.h
@@ -120,7 +120,7 @@ struct lp_rast_triangle {
    float v[3][2];
 #endif
 
-   struct lp_rast_plane plane[7]; /* NOTE: may allocate fewer planes */
+   struct lp_rast_plane plane[8]; /* NOTE: may allocate fewer planes */
 };
 
 
@@ -236,6 +236,8 @@ void lp_rast_triangle_6( struct lp_rasterizer_task *,
                          const union lp_rast_cmd_arg );
 void lp_rast_triangle_7( struct lp_rasterizer_task *, 
                          const union lp_rast_cmd_arg );
+void lp_rast_triangle_8( struct lp_rasterizer_task *, 
+                         const union lp_rast_cmd_arg );
 
 void lp_rast_shade_tile( struct lp_rasterizer_task *,
                          const union lp_rast_cmd_arg );
@@ -256,5 +258,9 @@ void lp_rast_begin_query(struct lp_rasterizer_task *,
 void lp_rast_end_query(struct lp_rasterizer_task *,
                        const union lp_rast_cmd_arg );
 
+void
+lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
+                      const union lp_rast_cmd_arg arg);
+
 
 #endif
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
index 980c18c024..dbaa8e023a 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
@@ -67,7 +67,7 @@ block_full_16(struct lp_rasterizer_task *task,
 	 block_full_4(task, tri, x + ix, y + iy);
 }
 
-
+#if !defined(PIPE_ARCH_SSE)
 static INLINE unsigned
 build_mask(int c, int dcdx, int dcdy)
 {
@@ -98,6 +98,7 @@ build_mask(int c, int dcdx, int dcdy)
    return mask;
 }
 
+
 static INLINE unsigned
 build_mask_linear(int c, int dcdx, int dcdy)
 {
@@ -129,6 +130,137 @@ build_mask_linear(int c, int dcdx, int dcdy)
 }
 
 
+static INLINE void
+build_masks(int c, 
+	    int cdiff,
+	    int dcdx,
+	    int dcdy,
+	    unsigned *outmask,
+	    unsigned *partmask)
+{
+   *outmask |= build_mask_linear(c, dcdx, dcdy);
+   *partmask |= build_mask_linear(c + cdiff, dcdx, dcdy);
+}
+
+#else
+#include <emmintrin.h>
+#include "util/u_sse.h"
+
+
+static INLINE void
+build_masks(int c, 
+	    int cdiff,
+	    int dcdx,
+	    int dcdy,
+	    unsigned *outmask,
+	    unsigned *partmask)
+{
+   __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
+   __m128i xdcdy = _mm_set1_epi32(dcdy);
+
+   /* Get values across the quad
+    */
+   __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
+   __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
+   __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);
+
+   {
+      __m128i cstep01, cstep23, result;
+
+      cstep01 = _mm_packs_epi32(cstep0, cstep1);
+      cstep23 = _mm_packs_epi32(cstep2, cstep3);
+      result = _mm_packs_epi16(cstep01, cstep23);
+
+      *outmask |= _mm_movemask_epi8(result);
+   }
+
+
+   {
+      __m128i cio4 = _mm_set1_epi32(cdiff);
+      __m128i cstep01, cstep23, result;
+
+      cstep0 = _mm_add_epi32(cstep0, cio4);
+      cstep1 = _mm_add_epi32(cstep1, cio4);
+      cstep2 = _mm_add_epi32(cstep2, cio4);
+      cstep3 = _mm_add_epi32(cstep3, cio4);
+
+      cstep01 = _mm_packs_epi32(cstep0, cstep1);
+      cstep23 = _mm_packs_epi32(cstep2, cstep3);
+      result = _mm_packs_epi16(cstep01, cstep23);
+
+      *partmask |= _mm_movemask_epi8(result);
+   }
+}
+
+
+static INLINE unsigned
+build_mask_linear(int c, int dcdx, int dcdy)
+{
+   __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
+   __m128i xdcdy = _mm_set1_epi32(dcdy);
+
+   /* Get values across the quad
+    */
+   __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
+   __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
+   __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);
+
+   /* pack pairs of results into epi16
+    */
+   __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
+   __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);
+
+   /* pack into epi8, preserving sign bits
+    */
+   __m128i result = _mm_packs_epi16(cstep01, cstep23);
+
+   /* extract sign bits to create mask
+    */
+   return _mm_movemask_epi8(result);
+}
+
+static INLINE unsigned
+build_mask(int c, int dcdx, int dcdy)
+{
+   __m128i step = _mm_setr_epi32(0, dcdx, dcdy, dcdx + dcdy);
+   __m128i c0 = _mm_set1_epi32(c);
+
+   /* Get values across the quad
+    */
+   __m128i cstep0 = _mm_add_epi32(c0, step);
+
+   /* Scale up step for moving between quads.
+    */
+   __m128i step4 = _mm_add_epi32(step, step);
+
+   /* Get values for the remaining quads:
+    */
+   __m128i cstep1 = _mm_add_epi32(cstep0, 
+				  _mm_shuffle_epi32(step4, _MM_SHUFFLE(1,1,1,1)));
+   __m128i cstep2 = _mm_add_epi32(cstep0,
+				  _mm_shuffle_epi32(step4, _MM_SHUFFLE(2,2,2,2)));
+   __m128i cstep3 = _mm_add_epi32(cstep2,
+				  _mm_shuffle_epi32(step4, _MM_SHUFFLE(1,1,1,1)));
+
+   /* pack pairs of results into epi16
+    */
+   __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
+   __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);
+
+   /* pack into epi8, preserving sign bits
+    */
+   __m128i result = _mm_packs_epi16(cstep01, cstep23);
+
+   /* extract sign bits to create mask
+    */
+   return _mm_movemask_epi8(result);
+}
+
+#endif
+
+
+
+
 #define TAG(x) x##_1
 #define NR_PLANES 1
 #include "lp_rast_tri_tmp.h"
@@ -157,3 +289,92 @@ build_mask_linear(int c, int dcdx, int dcdy)
 #define NR_PLANES 7
 #include "lp_rast_tri_tmp.h"
 
+#define TAG(x) x##_8
+#define NR_PLANES 8
+#include "lp_rast_tri_tmp.h"
+
+
+/* Special case for 3 plane triangle which is contained entirely
+ * within a 16x16 block.
+ */
+void
+lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
+                      const union lp_rast_cmd_arg arg)
+{
+   const struct lp_rast_triangle *tri = arg.triangle.tri;
+   const struct lp_rast_plane *plane = tri->plane;
+   unsigned mask = arg.triangle.plane_mask;
+   const int x = task->x + (mask & 0xf) * 16;
+   const int y = task->y + (mask >> 4) * 16;
+   unsigned outmask, inmask, partmask, partial_mask;
+   unsigned j;
+   int c[3];
+
+   outmask = 0;                 /* outside one or more trivial reject planes */
+   partmask = 0;                /* outside one or more trivial accept planes */
+
+   for (j = 0; j < 3; j++) {
+      c[j] = plane[j].c + plane[j].dcdy * y - plane[j].dcdx * x;
+
+      {
+	 const int dcdx = -plane[j].dcdx * 4;
+	 const int dcdy = plane[j].dcdy * 4;
+	 const int cox = plane[j].eo * 4;
+	 const int cio = plane[j].ei * 4 - 1;
+
+	 build_masks(c[j] + cox,
+		     cio - cox,
+		     dcdx, dcdy, 
+		     &outmask,   /* sign bits from c[i][0..15] + cox */
+		     &partmask); /* sign bits from c[i][0..15] + cio */
+      }
+   }
+
+   if (outmask == 0xffff)
+      return;
+
+   /* Mask of sub-blocks which are inside all trivial accept planes:
+    */
+   inmask = ~partmask & 0xffff;
+
+   /* Mask of sub-blocks which are inside all trivial reject planes,
+    * but outside at least one trivial accept plane:
+    */
+   partial_mask = partmask & ~outmask;
+
+   assert((partial_mask & inmask) == 0);
+
+   /* Iterate over partials:
+    */
+   while (partial_mask) {
+      int i = ffs(partial_mask) - 1;
+      int ix = (i & 3) * 4;
+      int iy = (i >> 2) * 4;
+      int px = x + ix;
+      int py = y + iy; 
+      int cx[3];
+
+      partial_mask &= ~(1 << i);
+
+      for (j = 0; j < 3; j++)
+         cx[j] = (c[j] 
+		  - plane[j].dcdx * ix
+		  + plane[j].dcdy * iy);
+
+      do_block_4_3(task, tri, plane, px, py, cx);
+   }
+
+   /* Iterate over fulls: 
+    */
+   while (inmask) {
+      int i = ffs(inmask) - 1;
+      int ix = (i & 3) * 4;
+      int iy = (i >> 2) * 4;
+      int px = x + ix;
+      int py = y + iy; 
+
+      inmask &= ~(1 << i);
+
+      block_full_4(task, tri, px, py);
+   }
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
index 43f72d8ca8..99a0bae45d 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
@@ -32,7 +32,7 @@
 
 
 /**
- * Prototype for a 7 plane rasterizer function.  Will codegenerate
+ * Prototype for an 8 plane rasterizer function.  Will codegenerate
  * several of these.
  *
  * XXX: Varients for more/fewer planes.
@@ -81,11 +81,14 @@ TAG(do_block_16)(struct lp_rasterizer_task *task,
    for (j = 0; j < NR_PLANES; j++) {
       const int dcdx = -plane[j].dcdx * 4;
       const int dcdy = plane[j].dcdy * 4;
-      const int cox = c[j] + plane[j].eo * 4;
-      const int cio = c[j] + plane[j].ei * 4 - 1;
-
-      outmask |= build_mask_linear(cox, dcdx, dcdy);
-      partmask |= build_mask_linear(cio, dcdx, dcdy);
+      const int cox = plane[j].eo * 4;
+      const int cio = plane[j].ei * 4 - 1;
+
+      build_masks(c[j] + cox,
+		  cio - cox,
+		  dcdx, dcdy, 
+		  &outmask,   /* sign bits from c[i][0..15] + cox */
+		  &partmask); /* sign bits from c[i][0..15] + cio */
    }
 
    if (outmask == 0xffff)
@@ -102,6 +105,8 @@ TAG(do_block_16)(struct lp_rasterizer_task *task,
 
    assert((partial_mask & inmask) == 0);
 
+   LP_COUNT_ADD(nr_empty_4, util_bitcount(0xffff & ~(partial_mask | inmask)));
+
    /* Iterate over partials:
     */
    while (partial_mask) {
@@ -114,6 +119,8 @@ TAG(do_block_16)(struct lp_rasterizer_task *task,
 
       partial_mask &= ~(1 << i);
 
+      LP_COUNT(nr_partially_covered_4);
+
       for (j = 0; j < NR_PLANES; j++)
          cx[j] = (c[j] 
 		  - plane[j].dcdx * ix
@@ -133,6 +140,7 @@ TAG(do_block_16)(struct lp_rasterizer_task *task,
 
       inmask &= ~(1 << i);
 
+      LP_COUNT(nr_fully_covered_4);
       block_full_4(task, tri, px, py);
    }
 }
@@ -166,11 +174,14 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
       {
 	 const int dcdx = -plane[j].dcdx * 16;
 	 const int dcdy = plane[j].dcdy * 16;
-	 const int cox = c[j] + plane[j].eo * 16;
-	 const int cio = c[j] + plane[j].ei * 16 - 1;
-
-	 outmask |= build_mask_linear(cox, dcdx, dcdy);
-	 partmask |= build_mask_linear(cio, dcdx, dcdy);
+	 const int cox = plane[j].eo * 16;
+	 const int cio = plane[j].ei * 16 - 1;
+
+	 build_masks(c[j] + cox,
+		     cio - cox,
+		     dcdx, dcdy, 
+		     &outmask,   /* sign bits from c[i][0..15] + cox */
+		     &partmask); /* sign bits from c[i][0..15] + cio */
       }
 
       j++;
@@ -190,6 +201,8 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
 
    assert((partial_mask & inmask) == 0);
 
+   LP_COUNT_ADD(nr_empty_16, util_bitcount(0xffff & ~(partial_mask | inmask)));
+
    /* Iterate over partials:
     */
    while (partial_mask) {
diff --git a/src/gallium/drivers/llvmpipe/lp_scene.c b/src/gallium/drivers/llvmpipe/lp_scene.c
index f88a759fe7..15a09b7100 100644
--- a/src/gallium/drivers/llvmpipe/lp_scene.c
+++ b/src/gallium/drivers/llvmpipe/lp_scene.c
@@ -163,12 +163,15 @@ lp_scene_reset(struct lp_scene *scene )
 
    /* Free all but last binner command lists:
     */
-   for (i = 0; i < TILES_X; i++) {
-      for (j = 0; j < TILES_Y; j++) {
+   for (i = 0; i < scene->tiles_x; i++) {
+      for (j = 0; j < scene->tiles_y; j++) {
          lp_scene_bin_reset(scene, i, j);
       }
    }
 
+   /* If there are any bins which weren't cleared by the loop above,
+    * they will be caught (on debug builds at least) by this assert:
+    */
    assert(lp_scene_is_empty(scene));
 
    /* Free all but last binned data block:
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index 167cb2ee2e..1e65a91fc6 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -61,6 +61,8 @@ static const struct debug_named_value lp_debug_flags[] = {
    { "show_tiles",    DEBUG_SHOW_TILES, NULL },
    { "show_subtiles", DEBUG_SHOW_SUBTILES, NULL },
    { "counters", DEBUG_COUNTERS, NULL },
+   { "scene", DEBUG_SCENE, NULL },
+   { "fence", DEBUG_FENCE, NULL },
    DEBUG_NAMED_VALUE_END
 };
 #endif
@@ -87,7 +89,14 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
       return PIPE_MAX_SAMPLERS;
    case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS:
-      return PIPE_MAX_VERTEX_SAMPLERS;
+      /* At this time, the draw module and llvmpipe driver only
+       * support vertex shader texture lookups when LLVM is enabled in
+       * the draw module.
+       */
+      if (debug_get_bool_option("DRAW_USE_LLVM", TRUE))
+         return PIPE_MAX_VERTEX_SAMPLERS;
+      else
+         return 0;
    case PIPE_CAP_MAX_COMBINED_SAMPLERS:
       return PIPE_MAX_SAMPLERS + PIPE_MAX_VERTEX_SAMPLERS;
    case PIPE_CAP_NPOT_TEXTURES:
@@ -230,6 +239,7 @@ llvmpipe_is_format_supported( struct pipe_screen *_screen,
    assert(target == PIPE_BUFFER ||
           target == PIPE_TEXTURE_1D ||
           target == PIPE_TEXTURE_2D ||
+          target == PIPE_TEXTURE_RECT ||
           target == PIPE_TEXTURE_3D ||
           target == PIPE_TEXTURE_CUBE);
 
@@ -314,6 +324,51 @@ llvmpipe_destroy_screen( struct pipe_screen *_screen )
 
 
 
+
+/**
+ * Fence reference counting.
+ */
+static void
+llvmpipe_fence_reference(struct pipe_screen *screen,
+                         struct pipe_fence_handle **ptr,
+                         struct pipe_fence_handle *fence)
+{
+   struct lp_fence **old = (struct lp_fence **) ptr;
+   struct lp_fence *f = (struct lp_fence *) fence;
+
+   lp_fence_reference(old, f);
+}
+
+
+/**
+ * Has the fence been executed/finished?
+ */
+static int
+llvmpipe_fence_signalled(struct pipe_screen *screen,
+                         struct pipe_fence_handle *fence,
+                         unsigned flag)
+{
+   struct lp_fence *f = (struct lp_fence *) fence;
+   return lp_fence_signalled(f);
+}
+
+
+/**
+ * Wait for the fence to finish.
+ */
+static int
+llvmpipe_fence_finish(struct pipe_screen *screen,
+                      struct pipe_fence_handle *fence_handle,
+                      unsigned flag)
+{
+   struct lp_fence *f = (struct lp_fence *) fence_handle;
+
+   lp_fence_wait(f);
+   return 0;
+}
+
+
+
 /**
  * Create a new pipe_screen object
  * Note: we're not presently subclassing pipe_screen (no llvmpipe_screen).
@@ -351,9 +406,11 @@ llvmpipe_create_screen(struct sw_winsys *winsys)
 
    screen->base.context_create = llvmpipe_create_context;
    screen->base.flush_frontbuffer = llvmpipe_flush_frontbuffer;
+   screen->base.fence_reference = llvmpipe_fence_reference;
+   screen->base.fence_signalled = llvmpipe_fence_signalled;
+   screen->base.fence_finish = llvmpipe_fence_finish;
 
    llvmpipe_init_screen_resource_funcs(&screen->base);
-   llvmpipe_init_screen_fence_funcs(&screen->base);
 
    lp_jit_screen_init(screen);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
index 556e571585..3da9097154 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -275,9 +275,10 @@ set_scene_state( struct lp_setup_context *setup,
 void
 lp_setup_flush( struct lp_setup_context *setup,
                 unsigned flags,
-                struct pipe_fence_handle **fence)
+                struct pipe_fence_handle **fence,
+                const char *reason)
 {
-   LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
+   LP_DBG(DEBUG_SETUP, "%s %s\n", __FUNCTION__, reason);
 
    if (setup->scene) {
       if (fence) {
@@ -287,6 +288,8 @@ lp_setup_flush( struct lp_setup_context *setup,
          *fence = lp_setup_fence( setup );
       }
 
+      if (setup->scene->fence)
+         setup->scene->fence->issued = TRUE;
    }
 
    set_scene_state( setup, SETUP_FLUSHED );
@@ -312,6 +315,11 @@ lp_setup_bind_framebuffer( struct lp_setup_context *setup,
     * scene.
     */
    util_copy_framebuffer_state(&setup->fb, fb);
+   setup->framebuffer.x0 = 0;
+   setup->framebuffer.y0 = 0;
+   setup->framebuffer.x1 = fb->width-1;
+   setup->framebuffer.y1 = fb->height-1;
+   setup->dirty |= LP_SETUP_NEW_SCISSOR;
 }
 
 
@@ -469,11 +477,35 @@ lp_setup_set_triangle_state( struct lp_setup_context *setup,
    setup->ccw_is_frontface = ccw_is_frontface;
    setup->cullmode = cull_mode;
    setup->triangle = first_triangle;
-   setup->scissor_test = scissor;
    setup->pixel_offset = gl_rasterization_rules ? 0.5f : 0.0f;
+
+   if (setup->scissor_test != scissor) {
+      setup->dirty |= LP_SETUP_NEW_SCISSOR;
+      setup->scissor_test = scissor;
+   }
 }
 
+void 
+lp_setup_set_line_state( struct lp_setup_context *setup,
+			 float line_width)
+{
+   LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
 
+   setup->line_width = line_width;
+}
+
+void 
+lp_setup_set_point_state( struct lp_setup_context *setup,
+                          float point_size,                          
+                          boolean point_size_per_vertex,
+                          uint sprite)
+{
+   LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
+
+   setup->point_size = point_size;
+   setup->sprite = sprite;
+   setup->point_size_per_vertex = point_size_per_vertex;
+}
 
 void
 lp_setup_set_fs_inputs( struct lp_setup_context *setup,
@@ -559,10 +591,11 @@ lp_setup_set_scissor( struct lp_setup_context *setup,
 
    assert(scissor);
 
-   if (memcmp(&setup->scissor.current, scissor, sizeof(*scissor)) != 0) {
-      setup->scissor.current = *scissor; /* struct copy */
-      setup->dirty |= LP_SETUP_NEW_SCISSOR;
-   }
+   setup->scissor.x0 = scissor->minx;
+   setup->scissor.x1 = scissor->maxx-1;
+   setup->scissor.y0 = scissor->miny;
+   setup->scissor.y1 = scissor->maxy-1;
+   setup->dirty |= LP_SETUP_NEW_SCISSOR;
 }
 
 
@@ -713,6 +746,12 @@ lp_setup_update_state( struct lp_setup_context *setup )
     */
    {
       struct llvmpipe_context *lp = llvmpipe_context(scene->pipe);
+
+      /* Will probably need to move this somewhere else, just need  
+       * to know about vertex shader point size attribute.
+       */
+      setup->psize = lp->psize_slot;
+
       if (lp->dirty) {
          llvmpipe_update_derived(lp);
       }
@@ -806,6 +845,14 @@ lp_setup_update_state( struct lp_setup_context *setup )
       }
    }
 
+   if (setup->dirty & LP_SETUP_NEW_SCISSOR) {
+      setup->draw_region = setup->framebuffer;
+      if (setup->scissor_test) {
+         u_rect_possible_intersection(&setup->scissor,
+                                      &setup->draw_region);
+      }
+   }
+                                      
    setup->dirty = 0;
 
    assert(setup->fs.stored);
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.h b/src/gallium/drivers/llvmpipe/lp_setup.h
index 73b1c85325..821ebb1087 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.h
+++ b/src/gallium/drivers/llvmpipe/lp_setup.h
@@ -85,7 +85,8 @@ lp_setup_fence( struct lp_setup_context *setup );
 void
 lp_setup_flush( struct lp_setup_context *setup,
                 unsigned flags,
-                struct pipe_fence_handle **fence);
+                struct pipe_fence_handle **fence,
+                const char *reason);
 
 
 void
@@ -99,6 +100,16 @@ lp_setup_set_triangle_state( struct lp_setup_context *setup,
                              boolean scissor,
                              boolean gl_rasterization_rules );
 
+void 
+lp_setup_set_line_state( struct lp_setup_context *setup,
+                         float line_width);
+
+void 
+lp_setup_set_point_state( struct lp_setup_context *setup,
+                          float point_size,                          
+                          boolean point_size_per_vertex,
+                          uint sprite);
+
 void
 lp_setup_set_fs_inputs( struct lp_setup_context *setup,
                         const struct lp_shader_input *interp,
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_coef.c b/src/gallium/drivers/llvmpipe/lp_setup_coef.c
new file mode 100644
index 0000000000..95e3e8fffe
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_setup_coef.c
@@ -0,0 +1,258 @@
+/**************************************************************************
+ *
+ * Copyright 2010, VMware.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/*
+ * Interpolation coefficient (a0, dadx, dady) setup for triangles
+ */
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "lp_perf.h"
+#include "lp_setup_context.h"
+#include "lp_setup_coef.h"
+#include "lp_rast.h"
+#include "lp_state_fs.h"
+
+#if !defined(PIPE_ARCH_SSE)
+
+/**
+ * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
+ */
+static void constant_coef( struct lp_rast_shader_inputs *inputs,
+                           unsigned slot,
+			   const float value,
+                           unsigned i )
+{
+   inputs->a0[slot][i] = value;
+   inputs->dadx[slot][i] = 0.0f;
+   inputs->dady[slot][i] = 0.0f;
+}
+
+
+
+static void linear_coef( struct lp_rast_shader_inputs *inputs,
+                         const struct lp_tri_info *info,
+                         unsigned slot,
+                         unsigned vert_attr,
+                         unsigned i)
+{
+   float a0 = info->v0[vert_attr][i];
+   float a1 = info->v1[vert_attr][i];
+   float a2 = info->v2[vert_attr][i];
+
+   float da01 = a0 - a1;
+   float da20 = a2 - a0;
+   float dadx = (da01 * info->dy20_ooa - info->dy01_ooa * da20);
+   float dady = (da20 * info->dx01_ooa - info->dx20_ooa * da01);
+
+   inputs->dadx[slot][i] = dadx;
+   inputs->dady[slot][i] = dady;
+
+   /* calculate a0 as the value which would be sampled for the
+    * fragment at (0,0), taking into account that we want to sample at
+    * pixel centers, in other words (0.5, 0.5).
+    *
+    * this is neat but unfortunately not a good way to do things for
+    * triangles with very large values of dadx or dady as it will
+    * result in the subtraction and re-addition from a0 of a very
+    * large number, which means we'll end up losing a lot of the
+    * fractional bits and precision from a0.  the way to fix this is
+    * to define a0 as the sample at a pixel center somewhere near vmin
+    * instead - i'll switch to this later.
+    */
+   inputs->a0[slot][i] = a0 - (dadx * info->x0_center +
+				   dady * info->y0_center);
+}
+
+
+/**
+ * Compute a0, dadx and dady for a perspective-corrected interpolant,
+ * for a triangle.
+ * We basically multiply the vertex value by 1/w before computing
+ * the plane coefficients (a0, dadx, dady).
+ * Later, when we compute the value at a particular fragment position we'll
+ * divide the interpolated value by the interpolated W at that fragment.
+ */
+static void perspective_coef( struct lp_rast_shader_inputs *inputs,
+                              const struct lp_tri_info *info,
+                              unsigned slot,
+			      unsigned vert_attr,
+                              unsigned i)
+{
+   /* premultiply by 1/w  (v[0][3] is always 1/w):
+    */
+   float a0 = info->v0[vert_attr][i] * info->v0[0][3];
+   float a1 = info->v1[vert_attr][i] * info->v1[0][3];
+   float a2 = info->v2[vert_attr][i] * info->v2[0][3];
+   float da01 = a0 - a1;
+   float da20 = a2 - a0;
+   float dadx = da01 * info->dy20_ooa - info->dy01_ooa * da20;
+   float dady = da20 * info->dx01_ooa - info->dx20_ooa * da01;
+
+   inputs->dadx[slot][i] = dadx;
+   inputs->dady[slot][i] = dady;
+   inputs->a0[slot][i] = a0 - (dadx * info->x0_center +
+				   dady * info->y0_center);
+}
+
+
+/**
+ * Special coefficient setup for gl_FragCoord.
+ * X and Y are trivial
+ * Z and W are linearly interpolated from vertex attribute 0 (the position).
+ * We could do a bit less work if we'd examine gl_FragCoord's swizzle mask.
+ */
+static void
+setup_fragcoord_coef(struct lp_rast_shader_inputs *inputs,
+                     const struct lp_tri_info *info,
+                     unsigned slot,
+                     unsigned usage_mask)
+{
+   /*X*/
+   if (usage_mask & TGSI_WRITEMASK_X) {
+      inputs->a0[slot][0] = 0.0;
+      inputs->dadx[slot][0] = 1.0;
+      inputs->dady[slot][0] = 0.0;
+   }
+
+   /*Y*/
+   if (usage_mask & TGSI_WRITEMASK_Y) {
+      inputs->a0[slot][1] = 0.0;
+      inputs->dadx[slot][1] = 0.0;
+      inputs->dady[slot][1] = 1.0;
+   }
+
+   /*Z*/
+   if (usage_mask & TGSI_WRITEMASK_Z) {
+      linear_coef(inputs, info, slot, 0, 2);
+   }
+
+   /*W*/
+   if (usage_mask & TGSI_WRITEMASK_W) {
+      linear_coef(inputs, info, slot, 0, 3);
+   }
+}
+
+
+/**
+ * Setup the fragment input attribute with the front-facing value.
+ * \param frontface  is the triangle front facing?
+ */
+static void setup_facing_coef( struct lp_rast_shader_inputs *inputs,
+                               unsigned slot,
+                               boolean frontface,
+                               unsigned usage_mask)
+{
+   /* convert TRUE to 1.0 and FALSE to -1.0 */
+   if (usage_mask & TGSI_WRITEMASK_X)
+      constant_coef( inputs, slot, 2.0f * frontface - 1.0f, 0 );
+
+   if (usage_mask & TGSI_WRITEMASK_Y)
+      constant_coef( inputs, slot, 0.0f, 1 ); /* wasted */
+
+   if (usage_mask & TGSI_WRITEMASK_Z)
+      constant_coef( inputs, slot, 0.0f, 2 ); /* wasted */
+
+   if (usage_mask & TGSI_WRITEMASK_W)
+      constant_coef( inputs, slot, 0.0f, 3 ); /* wasted */
+}
+
+
+/**
+ * Compute the tri->coef[] array dadx, dady, a0 values.
+ */
+void lp_setup_tri_coef( struct lp_setup_context *setup,
+			struct lp_rast_shader_inputs *inputs,
+			const struct lp_tri_info *info)
+{
+   unsigned fragcoord_usage_mask = TGSI_WRITEMASK_XYZ;
+   unsigned slot;
+   unsigned i;
+
+   /* setup interpolation for all the remaining attributes:
+    */
+   for (slot = 0; slot < setup->fs.nr_inputs; slot++) {
+      unsigned vert_attr = setup->fs.input[slot].src_index;
+      unsigned usage_mask = setup->fs.input[slot].usage_mask;
+
+      switch (setup->fs.input[slot].interp) {
+      case LP_INTERP_CONSTANT:
+         if (setup->flatshade_first) {
+            for (i = 0; i < NUM_CHANNELS; i++)
+               if (usage_mask & (1 << i))
+                  constant_coef(inputs, slot+1, info->v0[vert_attr][i], i);
+         }
+         else {
+            for (i = 0; i < NUM_CHANNELS; i++)
+               if (usage_mask & (1 << i))
+                  constant_coef(inputs, slot+1, info->v2[vert_attr][i], i);
+         }
+         break;
+
+      case LP_INTERP_LINEAR:
+         for (i = 0; i < NUM_CHANNELS; i++)
+            if (usage_mask & (1 << i))
+               linear_coef(inputs, info, slot+1, vert_attr, i);
+         break;
+
+      case LP_INTERP_PERSPECTIVE:
+         for (i = 0; i < NUM_CHANNELS; i++)
+            if (usage_mask & (1 << i))
+               perspective_coef(inputs, info, slot+1, vert_attr, i);
+         fragcoord_usage_mask |= TGSI_WRITEMASK_W;
+         break;
+
+      case LP_INTERP_POSITION:
+         /*
+          * The generated pixel interpolators will pick up the coeffs from
+          * slot 0, so all we need to do is ensure that the usage mask
+          * covers all usages.
+          */
+         fragcoord_usage_mask |= usage_mask;
+         break;
+
+      case LP_INTERP_FACING:
+         setup_facing_coef(inputs, slot+1, info->frontfacing, usage_mask);
+         break;
+
+      default:
+         assert(0);
+      }
+   }
+
+   /* The internal position input is in slot zero:
+    */
+   setup_fragcoord_coef(inputs, info, 0, fragcoord_usage_mask);
+}
+
+#else
+extern void lp_setup_coef_dummy(void);
+void lp_setup_coef_dummy(void)
+{
+}
+
+#endif
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_coef.h b/src/gallium/drivers/llvmpipe/lp_setup_coef.h
new file mode 100644
index 0000000000..d68b39c603
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_setup_coef.h
@@ -0,0 +1,61 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * The setup code is concerned with point/line/triangle setup and
+ * putting commands/data into the bins.
+ */
+
+
+#ifndef LP_SETUP_COEF_H
+#define LP_SETUP_COEF_H
+
+
+struct lp_tri_info {
+
+   float x0_center;
+   float y0_center;
+
+   /* turn these into an aligned float[4] */
+   float dy01_ooa;
+   float dy20_ooa;
+   float dx01_ooa;
+   float dx20_ooa;
+
+   const float (*v0)[4];
+   const float (*v1)[4];
+   const float (*v2)[4];
+
+   boolean frontfacing;		/* remove eventually */
+};
+
+void lp_setup_tri_coef( struct lp_setup_context *setup,
+			struct lp_rast_shader_inputs *inputs,
+			const struct lp_tri_info *info);
+
+#endif
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_coef_intrin.c b/src/gallium/drivers/llvmpipe/lp_setup_coef_intrin.c
new file mode 100644
index 0000000000..73fb70599c
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_setup_coef_intrin.c
@@ -0,0 +1,207 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/*
+ * Triangle attribute coefficient (a0, dadx, dady) setup, SSE intrinsics version
+ */
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "lp_perf.h"
+#include "lp_setup_context.h"
+#include "lp_setup_coef.h"
+#include "lp_rast.h"
+
+#if defined(PIPE_ARCH_SSE)
+#include <emmintrin.h>
+
+/* Flat attribute: a0[slot] = the four floats at attr, dadx[slot] = dady[slot] = 0. */
+static void constant_coef4( struct lp_rast_shader_inputs *inputs,
+			    const struct lp_tri_info *info,
+			    unsigned slot,
+			    const float *attr)
+{
+   *(__m128 *)inputs->a0[slot]   = *(__m128 *)attr;   /* NOTE(review): __m128 access assumes 16-byte alignment of both sides - confirm */
+   *(__m128 *)inputs->dadx[slot] = _mm_set1_ps(0.0);
+   *(__m128 *)inputs->dady[slot] = _mm_set1_ps(0.0);
+}
+
+
+
+/**
+ * Setup the fragment input attribute with the front-facing value.
+ * \param info  info->frontfacing selects 1.0 (front) or -1.0 for channel 0
+ */
+static void setup_facing_coef( struct lp_rast_shader_inputs *inputs,
+			       const struct lp_tri_info *info,
+			       unsigned slot )
+{
+   /* XXX: just pass frontface directly to the shader, don't bother
+    * treating it as an input.
+    */
+   __m128 a0 = _mm_setr_ps(info->frontfacing ? 1.0 : -1.0,
+			   0, 0, 0);
+
+   *(__m128 *)inputs->a0[slot]   = a0;
+   *(__m128 *)inputs->dadx[slot] = _mm_set1_ps(0.0);   /* constant across the triangle: no gradients */
+   *(__m128 *)inputs->dady[slot] = _mm_set1_ps(0.0);
+}
+
+
+/* Compute a0/dadx/dady of 'slot' for four channels at once from the values a0, a1, a2 at v0, v1, v2. */
+static void calc_coef4( struct lp_rast_shader_inputs *inputs,
+			const struct lp_tri_info *info,
+			unsigned slot,
+			__m128 a0,
+			__m128 a1,
+			__m128 a2)
+{
+   __m128 da01          = _mm_sub_ps(a0, a1);
+   __m128 da20          = _mm_sub_ps(a2, a0);
+
+   __m128 da01_dy20_ooa = _mm_mul_ps(da01, _mm_set1_ps(info->dy20_ooa));
+   __m128 da20_dy01_ooa = _mm_mul_ps(da20, _mm_set1_ps(info->dy01_ooa));   
+   __m128 dadx          = _mm_sub_ps(da01_dy20_ooa, da20_dy01_ooa);
+
+   __m128 da01_dx20_ooa = _mm_mul_ps(da01, _mm_set1_ps(info->dx20_ooa));
+   __m128 da20_dx01_ooa = _mm_mul_ps(da20, _mm_set1_ps(info->dx01_ooa));
+   __m128 dady          = _mm_sub_ps(da20_dx01_ooa, da01_dx20_ooa);
+
+   __m128 dadx_x0       = _mm_mul_ps(dadx, _mm_set1_ps(info->x0_center));
+   __m128 dady_y0       = _mm_mul_ps(dady, _mm_set1_ps(info->y0_center));
+   __m128 attr_v0       = _mm_add_ps(dadx_x0, dady_y0);
+   __m128 attr_0        = _mm_sub_ps(a0, attr_v0);   /* so attr_0 + dadx*x0_center + dady*y0_center == a0 */
+
+   *(__m128 *)inputs->a0[slot]   = attr_0;
+   *(__m128 *)inputs->dadx[slot] = dadx;
+   *(__m128 *)inputs->dady[slot] = dady;
+}
+
+/* Linear interpolation: pass the raw vertex values of vert_attr straight to calc_coef4(). */
+static void linear_coef( struct lp_rast_shader_inputs *inputs,
+                         const struct lp_tri_info *info,
+                         unsigned slot,
+                         unsigned vert_attr)
+{
+   __m128 a0 = *(const __m128 *)info->v0[vert_attr];
+   __m128 a1 = *(const __m128 *)info->v1[vert_attr];
+   __m128 a2 = *(const __m128 *)info->v2[vert_attr];
+
+   calc_coef4(inputs, info, slot, a0, a1, a2);
+}
+
+
+
+/**
+ * Compute a0, dadx and dady for a perspective-corrected interpolant,
+ * for a triangle (all four channels of vert_attr at once).
+ * We basically multiply the vertex value by 1/w before computing
+ * the plane coefficients (a0, dadx, dady).
+ * Later, when we compute the value at a particular fragment position we'll
+ * divide the interpolated value by the interpolated W at that fragment.
+ */
+static void perspective_coef( struct lp_rast_shader_inputs *inputs,
+                              const struct lp_tri_info *info,
+                              unsigned slot,
+			      unsigned vert_attr)
+{
+   /* premultiply by 1/w  (v[0][3] is always 1/w):
+    */
+   __m128 a0 = *(const __m128 *)info->v0[vert_attr];
+   __m128 a1 = *(const __m128 *)info->v1[vert_attr];
+   __m128 a2 = *(const __m128 *)info->v2[vert_attr];
+
+   __m128 a0_oow = _mm_mul_ps(a0, _mm_set1_ps(info->v0[0][3]));   /* each vertex uses its own 1/w */
+   __m128 a1_oow = _mm_mul_ps(a1, _mm_set1_ps(info->v1[0][3]));
+   __m128 a2_oow = _mm_mul_ps(a2, _mm_set1_ps(info->v2[0][3]));
+
+   calc_coef4(inputs, info, slot, a0_oow, a1_oow, a2_oow);
+}
+
+
+
+
+
+/**
+ * Compute inputs->a0/dadx/dady: position in slot 0, FS input N in slot N+1.
+ */
+void lp_setup_tri_coef( struct lp_setup_context *setup,
+			struct lp_rast_shader_inputs *inputs,
+			const struct lp_tri_info *info)
+{
+   unsigned slot;
+
+   /* The internal position input is in slot zero:
+    */
+   linear_coef(inputs, info, 0, 0);
+
+   /* setup interpolation for all the remaining attributes:
+    */
+   for (slot = 0; slot < setup->fs.nr_inputs; slot++) {
+      unsigned vert_attr = setup->fs.input[slot].src_index;
+
+      switch (setup->fs.input[slot].interp) {
+      case LP_INTERP_CONSTANT:
+         if (setup->flatshade_first) {
+	    constant_coef4(inputs, info, slot+1, info->v0[vert_attr]);   /* flat value comes from the first vertex */
+         }
+         else {
+	    constant_coef4(inputs, info, slot+1, info->v2[vert_attr]);   /* flat value comes from the last vertex */
+         }
+         break;
+
+      case LP_INTERP_LINEAR:
+	 linear_coef(inputs, info, slot+1, vert_attr);
+         break;
+
+      case LP_INTERP_PERSPECTIVE:
+	 perspective_coef(inputs, info, slot+1, vert_attr);
+         break;
+
+      case LP_INTERP_POSITION:
+         /*
+          * The generated pixel interpolators will pick up the coeffs from
+          * slot 0.
+          */
+         break;
+
+      case LP_INTERP_FACING:
+         setup_facing_coef(inputs, info, slot+1);
+         break;
+
+      default:
+         assert(0);   /* unknown interpolation mode */
+      }
+   }
+}
+
+#else /* !PIPE_ARCH_SSE */
+extern void lp_setup_coef_dummy(void);
+void lp_setup_coef_dummy(void)   /* empty placeholder; presumably keeps this file non-empty without SSE */
+{
+}
+#endif /* PIPE_ARCH_SSE */
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_context.h b/src/gallium/drivers/llvmpipe/lp_setup_context.h
index a0606f5034..877a492c6d 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_setup_context.h
@@ -41,6 +41,7 @@
 #include "lp_scene.h"
 
 #include "draw/draw_vbuf.h"
+#include "util/u_rect.h"
 
 #define LP_SETUP_NEW_FS          0x01
 #define LP_SETUP_NEW_CONSTANTS   0x02
@@ -73,6 +74,7 @@ struct lp_setup_context
    uint prim;
    uint vertex_size;
    uint nr_vertices;
+   uint sprite;
    uint vertex_buffer_size;
    void *vertex_buffer;
 
@@ -88,10 +90,17 @@ struct lp_setup_context
    boolean flatshade_first;
    boolean ccw_is_frontface;
    boolean scissor_test;
+   boolean point_size_per_vertex;
    unsigned cullmode;
    float pixel_offset;
+   float line_width;
+   float point_size;
+   float psize;
 
    struct pipe_framebuffer_state fb;
+   struct u_rect framebuffer;
+   struct u_rect scissor;
+   struct u_rect draw_region;   /* intersection of fb & scissor */
 
    struct {
       unsigned flags;
@@ -127,9 +136,6 @@ struct lp_setup_context
       uint8_t *stored;
    } blend_color;
 
-   struct {
-      struct pipe_scissor_state current;
-   } scissor;
 
    unsigned dirty;   /**< bitmask of LP_SETUP_NEW_x bits */
 
@@ -158,4 +164,29 @@ void lp_setup_update_state( struct lp_setup_context *setup );
 
 void lp_setup_destroy( struct lp_setup_context *setup );
 
+void
+lp_setup_print_triangle(struct lp_setup_context *setup,
+                        const float (*v0)[4],
+                        const float (*v1)[4],
+                        const float (*v2)[4]);
+
+void
+lp_setup_print_vertex(struct lp_setup_context *setup,
+                      const char *name,
+                      const float (*v)[4]);
+
+
+struct lp_rast_triangle *
+lp_setup_alloc_triangle(struct lp_scene *scene,
+                        unsigned nr_inputs,
+                        unsigned nr_planes,
+                        unsigned *tri_size);
+
+void
+lp_setup_bin_triangle( struct lp_setup_context *setup,
+                       struct lp_rast_triangle *tri,
+                       const struct u_rect *bbox,
+                       int nr_planes );
+
 #endif
+
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_line.c b/src/gallium/drivers/llvmpipe/lp_setup_line.c
index be41c44e6f..ce2da55cf4 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_line.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c
@@ -29,19 +29,671 @@
  * Binning code for lines
  */
 
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "lp_perf.h"
 #include "lp_setup_context.h"
+#include "lp_rast.h"
+#include "lp_state_fs.h"
 
-static void line_nop( struct lp_setup_context *setup,
-                      const float (*v0)[4],
-                      const float (*v1)[4] )
+#define NUM_CHANNELS 4
+
+struct lp_line_info {
+   /* Per-line values consumed by the coefficient helpers below. */
+   float dx;            /* v1.x - v2.x */
+   float dy;            /* v1.y - v2.y */
+   float oneoverarea;   /* 1 / (dx*dx + dy*dy), despite the name */
+
+   const float (*v1)[4];   /* endpoint attribute arrays, indexed [attr][channel] */
+   const float (*v2)[4];
+};
+
+
+/**
+ * Store a constant value (GL_FLAT shading) in channel i of slot; dadx = dady = 0.
+ */
+static void constant_coef( struct lp_setup_context *setup,
+                           struct lp_rast_triangle *tri,
+                           unsigned slot,
+                           const float value,
+                           unsigned i )
+{
+   tri->inputs.a0[slot][i] = value;
+   tri->inputs.dadx[slot][i] = 0.0f;
+   tri->inputs.dady[slot][i] = 0.0f;
+}
+
+
+/**
+ * Compute a0, dadx and dady for a linearly interpolated coefficient,
+ * for a line (channel i of vertex attribute vert_attr).
+ */
+static void linear_coef( struct lp_setup_context *setup,
+                         struct lp_rast_triangle *tri,
+                         struct lp_line_info *info,
+                         unsigned slot,
+                         unsigned vert_attr,
+                         unsigned i)
+{
+   float a1 = info->v1[vert_attr][i]; 
+   float a2 = info->v2[vert_attr][i];
+      
+   float da21 = a1 - a2;   /* v1 minus v2, same sense as info->dx and info->dy */
+   float dadx = da21 * info->dx * info->oneoverarea;
+   float dady = da21 * info->dy * info->oneoverarea;
+
+   tri->inputs.dadx[slot][i] = dadx;
+   tri->inputs.dady[slot][i] = dady;  
+   
+   /* back out v1's (pixel_offset-adjusted) position so a0 is the plane constant */
+   tri->inputs.a0[slot][i] = (a1 -
+                              (dadx * (info->v1[0][0] - setup->pixel_offset) +
+                               dady * (info->v1[0][1] - setup->pixel_offset)));
+}
+
+
+/**
+ * Compute a0, dadx and dady for a perspective-corrected interpolant,
+ * for a line (channel i of vertex attribute vert_attr).
+ * We basically multiply the vertex value by 1/w before computing
+ * the plane coefficients (a0, dadx, dady).
+ * Later, when we compute the value at a particular fragment position we'll
+ * divide the interpolated value by the interpolated W at that fragment.
+ */
+static void perspective_coef( struct lp_setup_context *setup,
+                              struct lp_rast_triangle *tri,
+                              struct lp_line_info *info,
+                              unsigned slot,
+                              unsigned vert_attr,
+                              unsigned i)
+{
+   /* premultiply by 1/w  (v[0][3] is always 1/w):
+    */
+   float a1 = info->v1[vert_attr][i] * info->v1[0][3];
+   float a2 = info->v2[vert_attr][i] * info->v2[0][3];
+
+   float da21 = a1 - a2;   /* v1 minus v2, same sense as info->dx and info->dy */
+   float dadx = da21 * info->dx * info->oneoverarea;
+   float dady = da21 * info->dy * info->oneoverarea;
+
+   tri->inputs.dadx[slot][i] = dadx;
+   tri->inputs.dady[slot][i] = dady;
+   
+   tri->inputs.a0[slot][i] = (a1 -
+                              (dadx * (info->v1[0][0] - setup->pixel_offset) +
+                               dady * (info->v1[0][1] - setup->pixel_offset)));
+}
+
+static void
+setup_fragcoord_coef( struct lp_setup_context *setup,
+                      struct lp_rast_triangle *tri,
+                      struct lp_line_info *info,
+                      unsigned slot,
+                      unsigned usage_mask)
+{
+   /*X: a0 + dadx*x + dady*y reduces to x */
+   if (usage_mask & TGSI_WRITEMASK_X) {
+      tri->inputs.a0[slot][0] = 0.0;
+      tri->inputs.dadx[slot][0] = 1.0;
+      tri->inputs.dady[slot][0] = 0.0;
+   }
+
+   /*Y: a0 + dadx*x + dady*y reduces to y */
+   if (usage_mask & TGSI_WRITEMASK_Y) {
+      tri->inputs.a0[slot][1] = 0.0;
+      tri->inputs.dadx[slot][1] = 0.0;
+      tri->inputs.dady[slot][1] = 1.0;
+   }
+
+   /*Z: linear in channel 2 of vertex attribute 0 (the position) */
+   if (usage_mask & TGSI_WRITEMASK_Z) {
+      linear_coef(setup, tri, info, slot, 0, 2);
+   }
+
+   /*W: linear in channel 3 of vertex attribute 0 (the position) */
+   if (usage_mask & TGSI_WRITEMASK_W) {
+      linear_coef(setup, tri, info, slot, 0, 3);
+   }
+}
+
+/**
+ * Compute the tri->inputs a0, dadx, dady values for a line.
+ */
+static void setup_line_coefficients( struct lp_setup_context *setup,
+                                     struct lp_rast_triangle *tri,
+                                     struct lp_line_info *info)
+{
+   unsigned fragcoord_usage_mask = TGSI_WRITEMASK_XYZ;
+   unsigned slot;
+
+   /* setup interpolation for every fragment shader input (stored at slot+1):
+    */
+   for (slot = 0; slot < setup->fs.nr_inputs; slot++) {
+      unsigned vert_attr = setup->fs.input[slot].src_index;
+      unsigned usage_mask = setup->fs.input[slot].usage_mask;
+      unsigned i;
+           
+      switch (setup->fs.input[slot].interp) {
+      case LP_INTERP_CONSTANT:
+         if (setup->flatshade_first) {
+            for (i = 0; i < NUM_CHANNELS; i++)
+               if (usage_mask & (1 << i))
+                  constant_coef(setup, tri, slot+1, info->v1[vert_attr][i], i);
+         }
+         else {
+            for (i = 0; i < NUM_CHANNELS; i++)
+               if (usage_mask & (1 << i))
+                  constant_coef(setup, tri, slot+1, info->v2[vert_attr][i], i);
+         }
+         break;
+
+      case LP_INTERP_LINEAR:
+         for (i = 0; i < NUM_CHANNELS; i++)
+            if (usage_mask & (1 << i))
+               linear_coef(setup, tri, info, slot+1, vert_attr, i);
+         break;
+
+      case LP_INTERP_PERSPECTIVE:
+         for (i = 0; i < NUM_CHANNELS; i++)
+            if (usage_mask & (1 << i))
+               perspective_coef(setup, tri, info, slot+1, vert_attr, i);
+         fragcoord_usage_mask |= TGSI_WRITEMASK_W;   /* the later divide by interpolated W needs W in slot 0 */
+         break;
+
+      case LP_INTERP_POSITION:
+         /*
+          * The generated pixel interpolators will pick up the coeffs from
+          * slot 0, so we need to ensure that its usage mask covers all
+          * usages.
+          */
+         fragcoord_usage_mask |= usage_mask;
+         break;
+
+      default:   /* NOTE(review): LP_INTERP_FACING is not handled here, unlike lp_setup_tri_coef() - it lands on this assert */
+         assert(0);
+      }
+   }
+
+   /* The internal position input is in slot zero:
+    */
+   setup_fragcoord_coef(setup, tri, info, 0,
+                        fragcoord_usage_mask);
+}
+
+
+/* Convert a float coordinate to fixed point: util_iround(FIXED_ONE * a). */
+static INLINE int subpixel_snap( float a )
+{
+   return util_iround(FIXED_ONE * a);
+}
+
+
+/**
+ * Print line vertex attribs (for debug): 1 + fs.nr_inputs rows per endpoint.
+ */
+static void
+print_line(struct lp_setup_context *setup,
+           const float (*v1)[4],
+           const float (*v2)[4])
+{
+   uint i;
+
+   debug_printf("llvmpipe line\n");
+   for (i = 0; i < 1 + setup->fs.nr_inputs; i++) {
+      debug_printf("  v1[%d]:  %f %f %f %f\n", i,
+                   v1[i][0], v1[i][1], v1[i][2], v1[i][3]);
+   }
+   for (i = 0; i < 1 + setup->fs.nr_inputs; i++) {
+      debug_printf("  v2[%d]:  %f %f %f %f\n", i,
+                   v2[i][0], v2[i][1], v2[i][2], v2[i][3]);
+   }
+}
+
+/* TRUE when x >= 0 (zero counts as non-negative), FALSE otherwise. */
+static INLINE boolean sign(float x){
+   return x >= 0;  
+}  
+
+
+/* Fractional part, f - floorf(f).  Used on positive floats only:
+ */
+static INLINE float fracf(float f)
 {
+   return f - floorf(f);
 }
 
 
-void 
-lp_setup_choose_line( struct lp_setup_context *setup )
+
+static void
+lp_setup_line( struct lp_setup_context *setup,
+               const float (*v1)[4],
+               const float (*v2)[4])
 {
-   setup->line = line_nop;
+   struct lp_scene *scene = lp_setup_get_current_scene(setup);
+   struct lp_rast_triangle *line;
+   struct lp_line_info info;
+   float width = MAX2(1.0, setup->line_width);
+   struct u_rect bbox;
+   unsigned tri_bytes;
+   int x[4]; 
+   int y[4];
+   int i;
+   int nr_planes = 4;
+   
+   /* linewidth should be interpreted as integer */
+   int fixed_width = util_iround(width) * FIXED_ONE;
+
+   float x_offset=0;
+   float y_offset=0;
+   float x_offset_end=0;
+   float y_offset_end=0;
+      
+   float x1diff;
+   float y1diff;
+   float x2diff;
+   float y2diff;
+   float dx, dy;
+
+   boolean draw_start;
+   boolean draw_end;
+   boolean will_draw_start;
+   boolean will_draw_end;
+
+   if (0)
+      print_line(setup, v1, v2);
+
+   if (setup->scissor_test) {
+      nr_planes = 8;
+   }
+   else {
+      nr_planes = 4;
+   }
+
+
+   dx = v1[0][0] - v2[0][0];
+   dy = v1[0][1] - v2[0][1];
+  
+   /* X-MAJOR LINE */
+   if (fabsf(dx) >= fabsf(dy)) {
+      float dydx = dy / dx;
+
+      x1diff = v1[0][0] - (float) floor(v1[0][0]) - 0.5;
+      y1diff = v1[0][1] - (float) floor(v1[0][1]) - 0.5;
+      x2diff = v2[0][0] - (float) floor(v2[0][0]) - 0.5;
+      y2diff = v2[0][1] - (float) floor(v2[0][1]) - 0.5;
+
+      if (y2diff==-0.5 && dy<0){
+         y2diff = 0.5;
+      }
+      
+      /* 
+       * Diamond exit rule test for starting point 
+       */    
+      if (fabsf(x1diff) + fabsf(y1diff) < 0.5) {
+         draw_start = TRUE;
+      }
+      else if (sign(x1diff) == sign(-dx)) {
+         draw_start = FALSE;
+      }
+      else if (sign(-y1diff) != sign(dy)) {
+         draw_start = TRUE;
+      }
+      else {
+         /* do intersection test */
+         float yintersect = fracf(v1[0][1]) + x1diff * dydx;
+         draw_start = (yintersect < 1.0 && yintersect > 0.0);
+      }
+
+
+      /* 
+       * Diamond exit rule test for ending point 
+       */    
+      if (fabsf(x2diff) + fabsf(y2diff) < 0.5) {
+         draw_end = FALSE;
+      }
+      else if (sign(x2diff) != sign(-dx)) {
+         draw_end = FALSE;
+      }
+      else if (sign(-y2diff) == sign(dy)) {
+         draw_end = TRUE;
+      }
+      else {
+         /* do intersection test */
+         float yintersect = fracf(v2[0][1]) + x2diff * dydx;
+         draw_end = (yintersect < 1.0 && yintersect > 0.0);
+      }
+
+      /* Are we already drawing start/end?
+       */
+      will_draw_start = sign(-x1diff) != sign(dx);
+      will_draw_end = (sign(x2diff) == sign(-dx)) || x2diff==0;
+
+      if (dx < 0) {
+         /* if v2 is to the right of v1, swap pointers */
+         const float (*temp)[4] = v1;
+         v1 = v2;
+         v2 = temp;
+         dx = -dx;
+         dy = -dy;
+         /* Endpoints were swapped: the start fix-up now goes in *_offset_end */
+         if (will_draw_start != draw_start) {
+            x_offset_end = - x1diff - 0.5;
+            y_offset_end = x_offset_end * dydx;
+
+         }
+         if (will_draw_end != draw_end) {
+            x_offset = - x2diff - 0.5;
+            y_offset = x_offset * dydx;
+         }
+
+      }
+      else{
+         /* Otherwise shift planes appropriately */
+         if (will_draw_start != draw_start) {
+            x_offset = - x1diff + 0.5;
+            y_offset = x_offset * dydx;
+         }
+         if (will_draw_end != draw_end) {
+            x_offset_end = - x2diff + 0.5;
+            y_offset_end = x_offset_end * dydx;
+         }
+      }
+  
+      /* x/y positions in fixed point */
+      x[0] = subpixel_snap(v1[0][0] + x_offset     - setup->pixel_offset);
+      x[1] = subpixel_snap(v2[0][0] + x_offset_end - setup->pixel_offset);
+      x[2] = subpixel_snap(v2[0][0] + x_offset_end - setup->pixel_offset);
+      x[3] = subpixel_snap(v1[0][0] + x_offset     - setup->pixel_offset);
+      
+      y[0] = subpixel_snap(v1[0][1] + y_offset     - setup->pixel_offset) - fixed_width/2;
+      y[1] = subpixel_snap(v2[0][1] + y_offset_end - setup->pixel_offset) - fixed_width/2;
+      y[2] = subpixel_snap(v2[0][1] + y_offset_end - setup->pixel_offset) + fixed_width/2;
+      y[3] = subpixel_snap(v1[0][1] + y_offset     - setup->pixel_offset) + fixed_width/2;
+      
+   }
+   else {
+      const float dxdy = dx / dy;
+
+      /* Y-MAJOR LINE */      
+      x1diff = v1[0][0] - (float) floor(v1[0][0]) - 0.5;
+      y1diff = v1[0][1] - (float) floor(v1[0][1]) - 0.5;
+      x2diff = v2[0][0] - (float) floor(v2[0][0]) - 0.5;
+      y2diff = v2[0][1] - (float) floor(v2[0][1]) - 0.5;
+
+      if (x2diff==-0.5 && dx<0) {
+         x2diff = 0.5;
+      }
+
+      /* 
+       * Diamond exit rule test for starting point 
+       */    
+      if (fabsf(x1diff) + fabsf(y1diff) < 0.5) {
+         draw_start = TRUE;
+      }
+      else if (sign(-y1diff) == sign(dy)) {
+         draw_start = FALSE;
+      }
+      else if (sign(x1diff) != sign(-dx)) {
+         draw_start = TRUE;
+      }
+      else {
+         /* do intersection test */
+         float xintersect = fracf(v1[0][0]) + y1diff * dxdy;
+         draw_start = (xintersect < 1.0 && xintersect > 0.0);
+      }
+
+      /* 
+       * Diamond exit rule test for ending point 
+       */    
+      if (fabsf(x2diff) + fabsf(y2diff) < 0.5) {
+         draw_end = FALSE;
+      }
+      else if (sign(-y2diff) != sign(dy) ) {
+         draw_end = FALSE;
+      }
+      else if (sign(x2diff) == sign(-dx) ) {
+         draw_end = TRUE;
+      }
+      else {
+         /* do intersection test */
+         float xintersect = fracf(v2[0][0]) + y2diff * dxdy;
+         draw_end = (xintersect < 1.0 && xintersect > 0.0);
+      }
+
+      /* Are we already drawing start/end?
+       */
+      will_draw_start = sign(y1diff) == sign(dy);
+      will_draw_end = (sign(-y2diff) == sign(dy)) || y2diff==0;
+
+      if (dy > 0) {
+         /* if v2 is on top of v1, swap pointers */
+         const float (*temp)[4] = v1;
+         v1 = v2;
+         v2 = temp; 
+         dx = -dx;
+         dy = -dy;
+
+         /* Endpoints were swapped: the start fix-up now goes in *_offset_end */
+         if (will_draw_start != draw_start) {
+            y_offset_end = - y1diff + 0.5;
+            x_offset_end = y_offset_end * dxdy;
+         }
+         if (will_draw_end != draw_end) {
+            y_offset = - y2diff + 0.5;
+            x_offset = y_offset * dxdy;
+         }
+      }
+      else {
+         /* Otherwise shift planes appropriately */
+         if (will_draw_start != draw_start) {
+            y_offset = - y1diff - 0.5;
+            x_offset = y_offset * dxdy;
+                     
+         }
+         if (will_draw_end != draw_end) {
+            y_offset_end = - y2diff - 0.5;
+            x_offset_end = y_offset_end * dxdy;
+         }
+      }
+ 
+      /* x/y positions in fixed point */
+      x[0] = subpixel_snap(v1[0][0] + x_offset     - setup->pixel_offset) - fixed_width/2;
+      x[1] = subpixel_snap(v2[0][0] + x_offset_end - setup->pixel_offset) - fixed_width/2;
+      x[2] = subpixel_snap(v2[0][0] + x_offset_end - setup->pixel_offset) + fixed_width/2;
+      x[3] = subpixel_snap(v1[0][0] + x_offset     - setup->pixel_offset) + fixed_width/2;
+     
+      y[0] = subpixel_snap(v1[0][1] + y_offset     - setup->pixel_offset); 
+      y[1] = subpixel_snap(v2[0][1] + y_offset_end - setup->pixel_offset);
+      y[2] = subpixel_snap(v2[0][1] + y_offset_end - setup->pixel_offset);
+      y[3] = subpixel_snap(v1[0][1] + y_offset     - setup->pixel_offset);
+   }
+
+
+
+   LP_COUNT(nr_tris);
+
+ 
+   /* Bounding rectangle (in pixels) */
+   {
+      /* Yes this is necessary to accurately calculate bounding boxes
+       * with the two fill-conventions we support.  GL (normally) ends
+       * up needing a bottom-left fill convention, which requires
+       * slightly different rounding.
+       */
+      int adj = (setup->pixel_offset != 0) ? 1 : 0;
+
+      bbox.x0 = (MIN4(x[0], x[1], x[2], x[3]) + (FIXED_ONE-1)) >> FIXED_ORDER;
+      bbox.x1 = (MAX4(x[0], x[1], x[2], x[3]) + (FIXED_ONE-1)) >> FIXED_ORDER;
+      bbox.y0 = (MIN4(y[0], y[1], y[2], y[3]) + (FIXED_ONE-1) + adj) >> FIXED_ORDER;
+      bbox.y1 = (MAX4(y[0], y[1], y[2], y[3]) + (FIXED_ONE-1) + adj) >> FIXED_ORDER;
+
+      /* Inclusive coordinates:
+       */
+      bbox.x1--;
+      bbox.y1--;
+   }
+
+   if (bbox.x1 < bbox.x0 ||
+       bbox.y1 < bbox.y0) {
+      if (0) debug_printf("empty bounding box\n");
+      LP_COUNT(nr_culled_tris);
+      return;
+   }
+
+   if (!u_rect_test_intersection(&setup->draw_region, &bbox)) {
+      if (0) debug_printf("offscreen\n");
+      LP_COUNT(nr_culled_tris);
+      return;
+   }
+
+   u_rect_find_intersection(&setup->draw_region, &bbox);
+
+   line = lp_setup_alloc_triangle(scene,
+                                  setup->fs.nr_inputs,
+                                  nr_planes,
+                                  &tri_bytes);
+   if (!line)
+      return;
+
+#ifdef DEBUG
+   line->v[0][0] = v1[0][0];
+   line->v[1][0] = v2[0][0];   
+   line->v[0][1] = v1[0][1];
+   line->v[1][1] = v2[0][1];
+#endif
+
+   /* calculate the deltas */
+   line->plane[0].dcdy = x[0] - x[1];
+   line->plane[1].dcdy = x[1] - x[2];
+   line->plane[2].dcdy = x[2] - x[3];
+   line->plane[3].dcdy = x[3] - x[0];
+
+   line->plane[0].dcdx = y[0] - y[1];
+   line->plane[1].dcdx = y[1] - y[2];
+   line->plane[2].dcdx = y[2] - y[3];
+   line->plane[3].dcdx = y[3] - y[0];
+
+
+   info.oneoverarea = 1.0f / (dx * dx  + dy * dy);    
+   info.dx = dx;
+   info.dy = dy;
+   info.v1 = v1;
+   info.v2 = v2;
+
+   /* Setup parameter interpolants:
+    */
+   setup_line_coefficients( setup, line, &info); 
+
+   line->inputs.facing = 1.0F;
+   line->inputs.state = setup->fs.stored;
+
+   for (i = 0; i < 4; i++) {
+      struct lp_rast_plane *plane = &line->plane[i];
+
+      /* half-edge constants, will be iterated over the whole render
+       * target.
+       */
+      plane->c = plane->dcdx * x[i] - plane->dcdy * y[i];
+
+      
+      /* correct for top-left vs. bottom-left fill convention.  
+       *
+       * note that we're overloading gl_rasterization_rules to mean
+       * both (0.5,0.5) pixel centers *and* bottom-left filling
+       * convention.
+       *
+       * GL actually has a top-left filling convention, but GL's
+       * notion of "top" differs from gallium's...
+       *
+       * Also, sometimes (in FBO cases) GL will render upside down
+       * to its usual method, in which case it will probably want
+       * to use the opposite, top-left convention.
+       */         
+      if (plane->dcdx < 0) {
+         /* both fill conventions want this - adjust for left edges */
+         plane->c++;            
+      }
+      else if (plane->dcdx == 0) {
+         if (setup->pixel_offset == 0) {
+            /* correct for top-left fill convention:
+             */
+            if (plane->dcdy > 0) plane->c++;
+         }
+         else {
+            /* correct for bottom-left fill convention:
+             */
+            if (plane->dcdy < 0) plane->c++;
+         }
+      }
+
+      plane->dcdx *= FIXED_ONE;
+      plane->dcdy *= FIXED_ONE;
+
+      /* find trivial reject offsets for each edge for a single-pixel
+       * sized block.  These will be scaled up at each recursive level to
+       * match the active blocksize.  Scaling in this way works best if
+       * the blocks are square.
+       */
+      plane->eo = 0;
+      if (plane->dcdx < 0) plane->eo -= plane->dcdx;
+      if (plane->dcdy > 0) plane->eo += plane->dcdy;
+
+      /* Calculate trivial accept offsets from the above.
+       */
+      plane->ei = plane->dcdy - plane->dcdx - plane->eo;
+   }
+
+
+   /* 
+    * When rasterizing scissored tris, use the intersection of the
+    * triangle bounding box and the scissor rect to generate the
+    * scissor planes.
+    *
+    * This permits us to cut off the triangle "tails" that are present
+    * in the intermediate recursive levels caused when two of the
+    * triangles edges don't diverge quickly enough to trivially reject
+    * exterior blocks from the triangle.
+    *
+    * It's not really clear if it's worth worrying about these tails,
+    * but since we generate the planes for each scissored tri, it's
+    * free to trim them in this case.
+    * 
+    * Note that otherwise, the scissor planes only vary in 'C' value,
+    * and even then only on state-changes.  Could alternatively store
+    * these planes elsewhere.
+    */
+   if (nr_planes == 8) {
+      line->plane[4].dcdx = -1;
+      line->plane[4].dcdy = 0;
+      line->plane[4].c = 1-bbox.x0;
+      line->plane[4].ei = 0;
+      line->plane[4].eo = 1;
+
+      line->plane[5].dcdx = 1;
+      line->plane[5].dcdy = 0;
+      line->plane[5].c = bbox.x1+1;
+      line->plane[5].ei = -1;
+      line->plane[5].eo = 0;
+
+      line->plane[6].dcdx = 0;
+      line->plane[6].dcdy = 1;
+      line->plane[6].c = 1-bbox.y0;
+      line->plane[6].ei = 0;
+      line->plane[6].eo = 1;
+
+      line->plane[7].dcdx = 0;
+      line->plane[7].dcdy = -1;
+      line->plane[7].c = bbox.y1+1;
+      line->plane[7].ei = -1;
+      line->plane[7].eo = 0;
+   }
+
+   lp_setup_bin_triangle(setup, line, &bbox, nr_planes);
+}
+   
+/* Point setup->line at lp_setup_line, the line binning routine above. */
+void lp_setup_choose_line( struct lp_setup_context *setup ) 
+{ 
+   setup->line = lp_setup_line;
 }
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_point.c b/src/gallium/drivers/llvmpipe/lp_setup_point.c
index 9f69e6c5ce..6ae318d328 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_point.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_point.c
@@ -1,6 +1,6 @@
 /**************************************************************************
  *
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright 2010, VMware Inc.
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -18,7 +18,7 @@
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -30,17 +30,299 @@
  */
 
 #include "lp_setup_context.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "lp_perf.h"
+#include "lp_setup_context.h"
+#include "lp_rast.h"
+#include "lp_state_fs.h"
+#include "tgsi/tgsi_scan.h"
+
+#define NUM_CHANNELS 4
+
+struct point_info {
+   /* x,y deltas; lp_setup_point() fills these with 0 or the fixed-point point width */
+   int dy01, dy12;
+   int dx01, dx12;
+
+   const float (*v0)[4];   /* the point's vertex, indexed as v0[attribute][channel] */
+};   
+
+
+/**
+ * Constant coefficient (as for GL_FLAT shading): a0 = value, dadx = dady = 0.
+ */
+static void constant_coef( struct lp_setup_context *setup,
+                           struct lp_rast_triangle *point,
+                           unsigned slot,
+                           const float value,
+                           unsigned i )
+{
+   point->inputs.a0[slot][i] = value;
+   point->inputs.dadx[slot][i] = 0.0f;
+   point->inputs.dady[slot][i] = 0.0f;
+}
+
+static void perspective_coef( struct lp_setup_context *setup,
+                              struct lp_rast_triangle *point,
+                              const struct point_info *info,
+                              unsigned slot,
+                              unsigned vert_attr,   /* not referenced in this function */
+                              unsigned i)
+{
+   if (i == 0) {   /* channel 0: gradient along x only */
+      float dadx = FIXED_ONE / (float)info->dx12;  /* dx12 is set to the fixed-point point width by lp_setup_point */
+      float dady =  0.0f;
+      point->inputs.dadx[slot][i] = dadx;
+      point->inputs.dady[slot][i] = dady;
+      point->inputs.a0[slot][i] = (0.5 -   /* a0 + dadx*x + dady*y is 0.5 at the offset-corrected vertex position */
+                                  (dadx * ((float)info->v0[0][0] - setup->pixel_offset) +
+                                   dady * ((float)info->v0[0][1] - setup->pixel_offset)));
+   }
+
+   else if (i == 1) {   /* channel 1: same construction, gradient along y only */
+      float dadx =  0.0f; 
+      float dady =  FIXED_ONE / (float)info->dx12;
+   
+      point->inputs.dadx[slot][i] = dadx;
+      point->inputs.dady[slot][i] = dady;
+      point->inputs.a0[slot][i] = (0.5 -
+                                  (dadx * ((float)info->v0[0][0] - setup->pixel_offset) +
+                                   dady * ((float)info->v0[0][1] - setup->pixel_offset)));
+   }
+
+   else if (i == 2) {   /* channel 2: constant 0 */
+      point->inputs.a0[slot][i] = 0.0f;
+      point->inputs.dadx[slot][i] = 0.0f;
+      point->inputs.dady[slot][i] = 0.0f;
+   }
+      
+   else if (i == 3) {   /* channel 3: constant 1 */
+      point->inputs.a0[slot][i] = 1.0f;
+      point->inputs.dadx[slot][i] = 0.0f;
+      point->inputs.dady[slot][i] = 0.0f;
+   }
+
+}
+
+
+/**
+ * Special coefficient setup for gl_FragCoord.
+ * X and Y are trivial: a0 = 0 with a unit gradient along the matching axis.
+ * Z and W are constant over the point, taken from the vertex position v0[0].
+ * Only the channels selected in usage_mask are written.
+ */
+static void
+setup_point_fragcoord_coef(struct lp_setup_context *setup,
+                           struct lp_rast_triangle *point,
+                           const struct point_info *info,
+                           unsigned slot,
+                           unsigned usage_mask)
+{
+   /*X*/
+   if (usage_mask & TGSI_WRITEMASK_X) {
+      point->inputs.a0[slot][0] = 0.0;
+      point->inputs.dadx[slot][0] = 1.0;
+      point->inputs.dady[slot][0] = 0.0;
+   }
+
+   /*Y*/
+   if (usage_mask & TGSI_WRITEMASK_Y) {
+      point->inputs.a0[slot][1] = 0.0;
+      point->inputs.dadx[slot][1] = 0.0;
+      point->inputs.dady[slot][1] = 1.0;
+   }
+
+   /*Z*/
+   if (usage_mask & TGSI_WRITEMASK_Z) {
+      constant_coef(setup, point, slot, info->v0[0][2], 2);
+   }
+
+   /*W*/
+   if (usage_mask & TGSI_WRITEMASK_W) {
+      constant_coef(setup, point, slot, info->v0[0][3], 3);
+   }
+}
+
+/**
+ * Compute the point->inputs a0, dadx, dady values for every fragment shader input.
+ */
+static void   
+setup_point_coefficients( struct lp_setup_context *setup,
+                          struct lp_rast_triangle *point,
+                          const struct point_info *info)
+{
+   unsigned fragcoord_usage_mask = TGSI_WRITEMASK_XYZ;
+   unsigned slot;
+
+   /* Set up coefficients for each FS input; input 'slot' is stored at index slot+1:
+    */
+   for (slot = 0; slot < setup->fs.nr_inputs; slot++) {
+      unsigned vert_attr = setup->fs.input[slot].src_index;
+      unsigned usage_mask = setup->fs.input[slot].usage_mask;
+      unsigned i;
+      
+      switch (setup->fs.input[slot].interp) {
+      case LP_INTERP_POSITION:
+         /*
+          * The generated pixel interpolators will pick up the coeffs from
+          * slot 0, so we need to ensure that the usage mask covers all
+          * usages.
+          */
+         fragcoord_usage_mask |= usage_mask;
+         break;
+
+      case LP_INTERP_PERSPECTIVE:
+         /* Point sprite texcoords: GENERIC inputs whose index bit is set in setup->sprite */
+         if (setup->fs.current.variant->shader->info.input_semantic_name[slot] 
+             == TGSI_SEMANTIC_GENERIC) 
+         {
+            int index = setup->fs.current.variant->shader->info.input_semantic_index[slot];
+            
+            if (setup->sprite & (1 << index)) {
+               for (i = 0; i < NUM_CHANNELS; i++)
+                  if (usage_mask & (1 << i))
+                     perspective_coef(setup, point, info, slot+1, vert_attr, i);
+               fragcoord_usage_mask |= TGSI_WRITEMASK_W;
+               break;                     
+            }
+         }
+
+         /* Otherwise fall through: every other input is constant over the point */
+      default:
+         for (i = 0; i < NUM_CHANNELS; i++) {
+            if (usage_mask & (1 << i))
+               constant_coef(setup, point, slot+1, info->v0[vert_attr][i], i);
+         }
+      }
+   }
 
-static void point_nop( struct lp_setup_context *setup,
-                       const float (*v0)[4] )
+   /* The internal position input is in slot zero:
+    */
+   setup_point_fragcoord_coef(setup, point, info, 0,
+                              fragcoord_usage_mask);
+}
+
+static INLINE int
+subpixel_snap(float a)
 {
+   return util_iround(FIXED_ONE * a);   /* scale by FIXED_ONE, then convert to int via util_iround */
+}
+
+
+static void lp_setup_point( struct lp_setup_context *setup,
+                            const float (*v0)[4] )
+{
+   /* Point size: per-vertex attribute setup->psize if enabled, else setup->point_size */
+   const int sizeAttr = setup->psize;
+   const float size
+      = (setup->point_size_per_vertex && sizeAttr > 0) ? v0[sizeAttr][0]
+      : setup->point_size;
+   
+   /* Point size as a fixed point integer: snapped to a whole number of
+    * pixels, with a minimum width of one pixel for very small points.
+    */
+   int fixed_width = MAX2(FIXED_ONE,
+                          (subpixel_snap(size) + FIXED_ONE/2 - 1) & ~(FIXED_ONE-1));
+   /* x/y of the square's minimum corner, in fixed point */
+   const int x0 = subpixel_snap(v0[0][0] - setup->pixel_offset) - fixed_width/2;
+   const int y0 = subpixel_snap(v0[0][1] - setup->pixel_offset) - fixed_width/2;
+     
+   struct lp_scene *scene = lp_setup_get_current_scene(setup);
+   struct lp_rast_triangle *point;
+   unsigned bytes;
+   struct u_rect bbox;
+   unsigned nr_planes = 4;
+   struct point_info info;
+
+
+   /* Bounding rectangle (in pixels) */
+   {
+      /* Yes this is necessary to accurately calculate bounding boxes
+       * with the two fill-conventions we support.  GL (normally) ends
+       * up needing a bottom-left fill convention, which requires
+       * slightly different rounding.
+       */
+      int adj = (setup->pixel_offset != 0) ? 1 : 0;
+      /* top-left and bottom-left differ only vertically, so adj goes on y (as in do_triangle_ccw) */
+      bbox.x0 = (x0 + (FIXED_ONE-1)) >> FIXED_ORDER;
+      bbox.x1 = (x0 + fixed_width + (FIXED_ONE-1)) >> FIXED_ORDER;
+      bbox.y0 = (y0 + (FIXED_ONE-1) + adj) >> FIXED_ORDER;
+      bbox.y1 = (y0 + fixed_width + (FIXED_ONE-1) + adj) >> FIXED_ORDER;
+
+      /* Inclusive coordinates:
+       */
+      bbox.x1--;
+      bbox.y1--;
+   }
+   
+   if (!u_rect_test_intersection(&setup->draw_region, &bbox)) {
+      if (0) debug_printf("offscreen\n");
+      LP_COUNT(nr_culled_tris);
+      return;
+   }
+
+   u_rect_find_intersection(&setup->draw_region, &bbox);
+
+   point = lp_setup_alloc_triangle(scene,
+                                   setup->fs.nr_inputs,
+                                   nr_planes,
+                                   &bytes);
+   if (!point)
+      return;
+
+#ifdef DEBUG
+   point->v[0][0] = v0[0][0];
+   point->v[0][1] = v0[0][1];
+#endif
+
+   info.v0 = v0;
+   info.dx01 = 0;
+   info.dx12 = fixed_width;
+   info.dy01 = fixed_width;
+   info.dy12 = 0;
+   
+   /* Setup parameter interpolants:
+    */
+   setup_point_coefficients(setup, point, &info);
+
+   point->inputs.facing = 1.0F;
+   point->inputs.state = setup->fs.stored;
+
+   {
+      point->plane[0].dcdx = -1;
+      point->plane[0].dcdy = 0;
+      point->plane[0].c = 1-bbox.x0;
+      point->plane[0].ei = 0;
+      point->plane[0].eo = 1;
+
+      point->plane[1].dcdx = 1;
+      point->plane[1].dcdy = 0;
+      point->plane[1].c = bbox.x1+1;
+      point->plane[1].ei = -1;
+      point->plane[1].eo = 0;
+
+      point->plane[2].dcdx = 0;
+      point->plane[2].dcdy = 1;
+      point->plane[2].c = 1-bbox.y0;
+      point->plane[2].ei = 0;
+      point->plane[2].eo = 1;
+
+      point->plane[3].dcdx = 0;
+      point->plane[3].dcdy = -1;
+      point->plane[3].c = bbox.y1+1;
+      point->plane[3].ei = -1;
+      point->plane[3].eo = 0;
+   }
+
+   lp_setup_bin_triangle(setup, point, &bbox, nr_planes);
 }
 
 
 void 
 lp_setup_choose_point( struct lp_setup_context *setup )
 {
-   setup->point = point_nop;
+   setup->point = lp_setup_point;   /* select lp_setup_point as the handler stored in setup->point */
 }
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index 393533ebee..0180d95090 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -31,35 +31,15 @@
 
 #include "util/u_math.h"
 #include "util/u_memory.h"
+#include "util/u_rect.h"
 #include "lp_perf.h"
 #include "lp_setup_context.h"
+#include "lp_setup_coef.h"
 #include "lp_rast.h"
 #include "lp_state_fs.h"
 
 #define NUM_CHANNELS 4
 
-struct tri_info {
-
-   float pixel_offset;
-
-   /* fixed point vertex coordinates */
-   int x[3];
-   int y[3];
-
-   /* float x,y deltas - all from the original coordinates
-    */
-   float dy01, dy20;
-   float dx01, dx20;
-   float oneoverarea;
-
-   const float (*v0)[4];
-   const float (*v1)[4];
-   const float (*v2)[4];
-
-   boolean frontfacing;
-};
-
-
 
    
 static INLINE int
@@ -76,247 +56,6 @@ fixed_to_float(int a)
 
 
 
-/**
- * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
- */
-static void constant_coef( struct lp_rast_triangle *tri,
-                           unsigned slot,
-			   const float value,
-                           unsigned i )
-{
-   tri->inputs.a0[slot][i] = value;
-   tri->inputs.dadx[slot][i] = 0.0f;
-   tri->inputs.dady[slot][i] = 0.0f;
-}
-
-
-
-static void linear_coef( struct lp_rast_triangle *tri,
-                         const struct tri_info *info,
-                         unsigned slot,
-                         unsigned vert_attr,
-                         unsigned i)
-{
-   float a0 = info->v0[vert_attr][i];
-   float a1 = info->v1[vert_attr][i];
-   float a2 = info->v2[vert_attr][i];
-
-   float da01 = a0 - a1;
-   float da20 = a2 - a0;
-   float dadx = (da01 * info->dy20 - info->dy01 * da20) * info->oneoverarea;
-   float dady = (da20 * info->dx01 - info->dx20 * da01) * info->oneoverarea;
-
-   tri->inputs.dadx[slot][i] = dadx;
-   tri->inputs.dady[slot][i] = dady;
-
-   /* calculate a0 as the value which would be sampled for the
-    * fragment at (0,0), taking into account that we want to sample at
-    * pixel centers, in other words (0.5, 0.5).
-    *
-    * this is neat but unfortunately not a good way to do things for
-    * triangles with very large values of dadx or dady as it will
-    * result in the subtraction and re-addition from a0 of a very
-    * large number, which means we'll end up loosing a lot of the
-    * fractional bits and precision from a0.  the way to fix this is
-    * to define a0 as the sample at a pixel center somewhere near vmin
-    * instead - i'll switch to this later.
-    */
-   tri->inputs.a0[slot][i] = (a0 -
-                              (dadx * (info->v0[0][0] - info->pixel_offset) +
-                               dady * (info->v0[0][1] - info->pixel_offset)));
-}
-
-
-/**
- * Compute a0, dadx and dady for a perspective-corrected interpolant,
- * for a triangle.
- * We basically multiply the vertex value by 1/w before computing
- * the plane coefficients (a0, dadx, dady).
- * Later, when we compute the value at a particular fragment position we'll
- * divide the interpolated value by the interpolated W at that fragment.
- */
-static void perspective_coef( struct lp_rast_triangle *tri,
-                              const struct tri_info *info,
-                              unsigned slot,
-			      unsigned vert_attr,
-                              unsigned i)
-{
-   /* premultiply by 1/w  (v[0][3] is always 1/w):
-    */
-   float a0 = info->v0[vert_attr][i] * info->v0[0][3];
-   float a1 = info->v1[vert_attr][i] * info->v1[0][3];
-   float a2 = info->v2[vert_attr][i] * info->v2[0][3];
-   float da01 = a0 - a1;
-   float da20 = a2 - a0;
-   float dadx = (da01 * info->dy20 - info->dy01 * da20) * info->oneoverarea;
-   float dady = (da20 * info->dx01 - info->dx20 * da01) * info->oneoverarea;
-
-   tri->inputs.dadx[slot][i] = dadx;
-   tri->inputs.dady[slot][i] = dady;
-   tri->inputs.a0[slot][i] = (a0 -
-                              (dadx * (info->v0[0][0] - info->pixel_offset) +
-                               dady * (info->v0[0][1] - info->pixel_offset)));
-}
-
-
-/**
- * Special coefficient setup for gl_FragCoord.
- * X and Y are trivial
- * Z and W are copied from position_coef which should have already been computed.
- * We could do a bit less work if we'd examine gl_FragCoord's swizzle mask.
- */
-static void
-setup_fragcoord_coef(struct lp_rast_triangle *tri,
-                     const struct tri_info *info,
-                     unsigned slot,
-                     unsigned usage_mask)
-{
-   /*X*/
-   if (usage_mask & TGSI_WRITEMASK_X) {
-      tri->inputs.a0[slot][0] = 0.0;
-      tri->inputs.dadx[slot][0] = 1.0;
-      tri->inputs.dady[slot][0] = 0.0;
-   }
-
-   /*Y*/
-   if (usage_mask & TGSI_WRITEMASK_Y) {
-      tri->inputs.a0[slot][1] = 0.0;
-      tri->inputs.dadx[slot][1] = 0.0;
-      tri->inputs.dady[slot][1] = 1.0;
-   }
-
-   /*Z*/
-   if (usage_mask & TGSI_WRITEMASK_Z) {
-      linear_coef(tri, info, slot, 0, 2);
-   }
-
-   /*W*/
-   if (usage_mask & TGSI_WRITEMASK_W) {
-      linear_coef(tri, info, slot, 0, 3);
-   }
-}
-
-
-/**
- * Setup the fragment input attribute with the front-facing value.
- * \param frontface  is the triangle front facing?
- */
-static void setup_facing_coef( struct lp_rast_triangle *tri,
-                               unsigned slot,
-                               boolean frontface,
-                               unsigned usage_mask)
-{
-   /* convert TRUE to 1.0 and FALSE to -1.0 */
-   if (usage_mask & TGSI_WRITEMASK_X)
-      constant_coef( tri, slot, 2.0f * frontface - 1.0f, 0 );
-
-   if (usage_mask & TGSI_WRITEMASK_Y)
-      constant_coef( tri, slot, 0.0f, 1 ); /* wasted */
-
-   if (usage_mask & TGSI_WRITEMASK_Z)
-      constant_coef( tri, slot, 0.0f, 2 ); /* wasted */
-
-   if (usage_mask & TGSI_WRITEMASK_W)
-      constant_coef( tri, slot, 0.0f, 3 ); /* wasted */
-}
-
-
-/**
- * Compute the tri->coef[] array dadx, dady, a0 values.
- */
-static void setup_tri_coefficients( struct lp_setup_context *setup,
-				    struct lp_rast_triangle *tri,
-                                    const struct tri_info *info)
-{
-   unsigned fragcoord_usage_mask = TGSI_WRITEMASK_XYZ;
-   unsigned slot;
-   unsigned i;
-
-   /* setup interpolation for all the remaining attributes:
-    */
-   for (slot = 0; slot < setup->fs.nr_inputs; slot++) {
-      unsigned vert_attr = setup->fs.input[slot].src_index;
-      unsigned usage_mask = setup->fs.input[slot].usage_mask;
-
-      switch (setup->fs.input[slot].interp) {
-      case LP_INTERP_CONSTANT:
-         if (setup->flatshade_first) {
-            for (i = 0; i < NUM_CHANNELS; i++)
-               if (usage_mask & (1 << i))
-                  constant_coef(tri, slot+1, info->v0[vert_attr][i], i);
-         }
-         else {
-            for (i = 0; i < NUM_CHANNELS; i++)
-               if (usage_mask & (1 << i))
-                  constant_coef(tri, slot+1, info->v2[vert_attr][i], i);
-         }
-         break;
-
-      case LP_INTERP_LINEAR:
-         for (i = 0; i < NUM_CHANNELS; i++)
-            if (usage_mask & (1 << i))
-               linear_coef(tri, info, slot+1, vert_attr, i);
-         break;
-
-      case LP_INTERP_PERSPECTIVE:
-         for (i = 0; i < NUM_CHANNELS; i++)
-            if (usage_mask & (1 << i))
-               perspective_coef(tri, info, slot+1, vert_attr, i);
-         fragcoord_usage_mask |= TGSI_WRITEMASK_W;
-         break;
-
-      case LP_INTERP_POSITION:
-         /*
-          * The generated pixel interpolators will pick up the coeffs from
-          * slot 0, so all need to ensure that the usage mask is covers all
-          * usages.
-          */
-         fragcoord_usage_mask |= usage_mask;
-         break;
-
-      case LP_INTERP_FACING:
-         setup_facing_coef(tri, slot+1, info->frontfacing, usage_mask);
-         break;
-
-      default:
-         assert(0);
-      }
-   }
-
-   /* The internal position input is in slot zero:
-    */
-   setup_fragcoord_coef(tri, info, 0, fragcoord_usage_mask);
-
-   if (0) {
-      for (i = 0; i < NUM_CHANNELS; i++) {
-         float a0   = tri->inputs.a0  [0][i];
-         float dadx = tri->inputs.dadx[0][i];
-         float dady = tri->inputs.dady[0][i];
-
-         debug_printf("POS.%c: a0 = %f, dadx = %f, dady = %f\n",
-                      "xyzw"[i],
-                      a0, dadx, dady);
-      }
-
-      for (slot = 0; slot < setup->fs.nr_inputs; slot++) {
-         unsigned usage_mask = setup->fs.input[slot].usage_mask;
-         for (i = 0; i < NUM_CHANNELS; i++) {
-            if (usage_mask & (1 << i)) {
-               float a0   = tri->inputs.a0  [1 + slot][i];
-               float dadx = tri->inputs.dadx[1 + slot][i];
-               float dady = tri->inputs.dady[1 + slot][i];
-
-               debug_printf("IN[%u].%c: a0 = %f, dadx = %f, dady = %f\n",
-                            slot,
-                            "xyzw"[i],
-                            a0, dadx, dady);
-            }
-         }
-      }
-   }
-}
-
-
 
 
 
@@ -329,11 +68,11 @@ static void setup_tri_coefficients( struct lp_setup_context *setup,
  * \param nr_inputs  number of fragment shader inputs
  * \return pointer to triangle space
  */
-static INLINE struct lp_rast_triangle *
-alloc_triangle(struct lp_scene *scene,
-               unsigned nr_inputs,
-               unsigned nr_planes,
-               unsigned *tri_size)
+struct lp_rast_triangle *
+lp_setup_alloc_triangle(struct lp_scene *scene,
+                        unsigned nr_inputs,
+                        unsigned nr_planes,
+                        unsigned *tri_size)
 {
    unsigned input_array_sz = NUM_CHANNELS * (nr_inputs + 1) * sizeof(float);
    struct lp_rast_triangle *tri;
@@ -357,35 +96,71 @@ alloc_triangle(struct lp_scene *scene,
    return tri;
 }
 
+void
+lp_setup_print_vertex(struct lp_setup_context *setup,
+                      const char *name,
+                      const float (*v)[4])
+{
+   int i, j;
+   /* Attribute 0 is printed first, labelled "wpos" */
+   debug_printf("   wpos (%s[0]) xyzw %f %f %f %f\n",
+                name,
+                v[0][0], v[0][1], v[0][2], v[0][3]);
+
+   for (i = 0; i < setup->fs.nr_inputs; i++) {
+      const float *in = v[setup->fs.input[i].src_index];
+      /* Label: input index, source attribute index, and which of xyzw are used */
+      debug_printf("  in[%d] (%s[%d]) %s%s%s%s ",
+                   i, 
+                   name, setup->fs.input[i].src_index,
+                   (setup->fs.input[i].usage_mask & 0x1) ? "x" : " ",
+                   (setup->fs.input[i].usage_mask & 0x2) ? "y" : " ",
+                   (setup->fs.input[i].usage_mask & 0x4) ? "z" : " ",
+                   (setup->fs.input[i].usage_mask & 0x8) ? "w" : " ");
+      /* Values: only the channels present in usage_mask are printed */
+      for (j = 0; j < 4; j++)
+         if (setup->fs.input[i].usage_mask & (1<<j))
+            debug_printf("%.5f ", in[j]);
+
+      debug_printf("\n");
+   }
+}
+
 
 /**
  * Print triangle vertex attribs (for debug).
  */
-static void
-print_triangle(struct lp_setup_context *setup,
-               const float (*v1)[4],
-               const float (*v2)[4],
-               const float (*v3)[4])
+void
+lp_setup_print_triangle(struct lp_setup_context *setup,
+                        const float (*v0)[4],
+                        const float (*v1)[4],
+                        const float (*v2)[4])
 {
-   uint i;
+   debug_printf("triangle\n");
 
-   debug_printf("llvmpipe triangle\n");
-   for (i = 0; i < 1 + setup->fs.nr_inputs; i++) {
-      debug_printf("  v1[%d]:  %f %f %f %f\n", i,
-                   v1[i][0], v1[i][1], v1[i][2], v1[i][3]);
-   }
-   for (i = 0; i < 1 + setup->fs.nr_inputs; i++) {
-      debug_printf("  v2[%d]:  %f %f %f %f\n", i,
-                   v2[i][0], v2[i][1], v2[i][2], v2[i][3]);
-   }
-   for (i = 0; i < 1 + setup->fs.nr_inputs; i++) {
-      debug_printf("  v3[%d]:  %f %f %f %f\n", i,
-                   v3[i][0], v3[i][1], v3[i][2], v3[i][3]);
+   {
+      const float ex = v0[0][0] - v2[0][0];
+      const float ey = v0[0][1] - v2[0][1];
+      const float fx = v1[0][0] - v2[0][0];
+      const float fy = v1[0][1] - v2[0][1];
+      /* e = v0 - v2 and f = v1 - v2, using the x,y of attribute 0 */
+      /* det = cross(e,f).z; its sign selects the ccw / cw / zero area label */
+      const float det = ex * fy - ey * fx;
+      if (det < 0.0f) 
+         debug_printf("   - ccw\n");
+      else if (det > 0.0f)
+         debug_printf("   - cw\n");
+      else
+         debug_printf("   - zero area\n");
    }
+   /* Then dump the attributes of each of the three vertices */
+   lp_setup_print_vertex(setup, "v0", v0);
+   lp_setup_print_vertex(setup, "v1", v1);
+   lp_setup_print_vertex(setup, "v2", v2);
 }
 
 
-lp_rast_cmd lp_rast_tri_tab[8] = {
+lp_rast_cmd lp_rast_tri_tab[9] = {
    NULL,               /* should be impossible */
    lp_rast_triangle_1,
    lp_rast_triangle_2,
@@ -393,7 +168,8 @@ lp_rast_cmd lp_rast_tri_tab[8] = {
    lp_rast_triangle_4,
    lp_rast_triangle_5,
    lp_rast_triangle_6,
-   lp_rast_triangle_7
+   lp_rast_triangle_7,
+   lp_rast_triangle_8
 };
 
 /**
@@ -403,25 +179,27 @@ lp_rast_cmd lp_rast_tri_tab[8] = {
  */
 static void
 do_triangle_ccw(struct lp_setup_context *setup,
+		const float (*v0)[4],
 		const float (*v1)[4],
 		const float (*v2)[4],
-		const float (*v3)[4],
 		boolean frontfacing )
 {
-
    struct lp_scene *scene = lp_setup_get_current_scene(setup);
-   struct lp_fragment_shader_variant *variant = setup->fs.current.variant;
    struct lp_rast_triangle *tri;
-   struct tri_info info;
+   int x[3];
+   int y[3];
+   float dy01, dy20;
+   float dx01, dx20;
+   float oneoverarea;
+   struct lp_tri_info info;
    int area;
-   int minx, maxx, miny, maxy;
-   int ix0, ix1, iy0, iy1;
+   struct u_rect bbox;
    unsigned tri_bytes;
    int i;
    int nr_planes = 3;
       
    if (0)
-      print_triangle(setup, v1, v2, v3);
+      lp_setup_print_triangle(setup, v0, v1, v2);
 
    if (setup->scissor_test) {
       nr_planes = 7;
@@ -430,38 +208,73 @@ do_triangle_ccw(struct lp_setup_context *setup,
       nr_planes = 3;
    }
 
+   /* x/y positions in fixed point */
+   x[0] = subpixel_snap(v0[0][0] - setup->pixel_offset);
+   x[1] = subpixel_snap(v1[0][0] - setup->pixel_offset);
+   x[2] = subpixel_snap(v2[0][0] - setup->pixel_offset);
+   y[0] = subpixel_snap(v0[0][1] - setup->pixel_offset);
+   y[1] = subpixel_snap(v1[0][1] - setup->pixel_offset);
+   y[2] = subpixel_snap(v2[0][1] - setup->pixel_offset);
+
+
+   /* Bounding rectangle (in pixels) */
+   {
+      /* Yes this is necessary to accurately calculate bounding boxes
+       * with the two fill-conventions we support.  GL (normally) ends
+       * up needing a bottom-left fill convention, which requires
+       * slightly different rounding.
+       */
+      int adj = (setup->pixel_offset != 0) ? 1 : 0;
+
+      bbox.x0 = (MIN3(x[0], x[1], x[2]) + (FIXED_ONE-1)) >> FIXED_ORDER;
+      bbox.x1 = (MAX3(x[0], x[1], x[2]) + (FIXED_ONE-1)) >> FIXED_ORDER;
+      bbox.y0 = (MIN3(y[0], y[1], y[2]) + (FIXED_ONE-1) + adj) >> FIXED_ORDER;
+      bbox.y1 = (MAX3(y[0], y[1], y[2]) + (FIXED_ONE-1) + adj) >> FIXED_ORDER;
+
+      /* Inclusive coordinates:
+       */
+      bbox.x1--;
+      bbox.y1--;
+   }
+
+   if (bbox.x1 < bbox.x0 ||
+       bbox.y1 < bbox.y0) {
+      if (0) debug_printf("empty bounding box\n");
+      LP_COUNT(nr_culled_tris);
+      return;
+   }
+
+   if (!u_rect_test_intersection(&setup->draw_region, &bbox)) {
+      if (0) debug_printf("offscreen\n");
+      LP_COUNT(nr_culled_tris);
+      return;
+   }
+
+   u_rect_find_intersection(&setup->draw_region, &bbox);
 
-   tri = alloc_triangle(scene,
-                        setup->fs.nr_inputs,
-                        nr_planes,
-                        &tri_bytes);
+   tri = lp_setup_alloc_triangle(scene,
+                                 setup->fs.nr_inputs,
+                                 nr_planes,
+                                 &tri_bytes);
    if (!tri)
       return;
 
 #ifdef DEBUG
-   tri->v[0][0] = v1[0][0];
-   tri->v[1][0] = v2[0][0];
-   tri->v[2][0] = v3[0][0];
-   tri->v[0][1] = v1[0][1];
-   tri->v[1][1] = v2[0][1];
-   tri->v[2][1] = v3[0][1];
+   tri->v[0][0] = v0[0][0];
+   tri->v[1][0] = v1[0][0];
+   tri->v[2][0] = v2[0][0];
+   tri->v[0][1] = v0[0][1];
+   tri->v[1][1] = v1[0][1];
+   tri->v[2][1] = v2[0][1];
 #endif
 
-   /* x/y positions in fixed point */
-   info.x[0] = subpixel_snap(v1[0][0] - setup->pixel_offset);
-   info.x[1] = subpixel_snap(v2[0][0] - setup->pixel_offset);
-   info.x[2] = subpixel_snap(v3[0][0] - setup->pixel_offset);
-   info.y[0] = subpixel_snap(v1[0][1] - setup->pixel_offset);
-   info.y[1] = subpixel_snap(v2[0][1] - setup->pixel_offset);
-   info.y[2] = subpixel_snap(v3[0][1] - setup->pixel_offset);
-
-   tri->plane[0].dcdy = info.x[0] - info.x[1];
-   tri->plane[1].dcdy = info.x[1] - info.x[2];
-   tri->plane[2].dcdy = info.x[2] - info.x[0];
+   tri->plane[0].dcdy = x[0] - x[1];
+   tri->plane[1].dcdy = x[1] - x[2];
+   tri->plane[2].dcdy = x[2] - x[0];
 
-   tri->plane[0].dcdx = info.y[0] - info.y[1];
-   tri->plane[1].dcdx = info.y[1] - info.y[2];
-   tri->plane[2].dcdx = info.y[2] - info.y[0];
+   tri->plane[0].dcdx = y[0] - y[1];
+   tri->plane[1].dcdx = y[1] - y[2];
+   tri->plane[2].dcdx = y[2] - y[0];
 
    area = (tri->plane[0].dcdy * tri->plane[2].dcdx -
            tri->plane[2].dcdy * tri->plane[0].dcdx);
@@ -478,57 +291,29 @@ do_triangle_ccw(struct lp_setup_context *setup,
       return;
    }
 
-   /* Bounding rectangle (in pixels) */
-   {
-      /* Yes this is necessary to accurately calculate bounding boxes
-       * with the two fill-conventions we support.  GL (normally) ends
-       * up needing a bottom-left fill convention, which requires
-       * slightly different rounding.
-       */
-      int adj = (setup->pixel_offset != 0) ? 1 : 0;
-
-      minx = (MIN3(info.x[0], info.x[1], info.x[2]) + (FIXED_ONE-1)) >> FIXED_ORDER;
-      maxx = (MAX3(info.x[0], info.x[1], info.x[2]) + (FIXED_ONE-1)) >> FIXED_ORDER;
-      miny = (MIN3(info.y[0], info.y[1], info.y[2]) + (FIXED_ONE-1) + adj) >> FIXED_ORDER;
-      maxy = (MAX3(info.y[0], info.y[1], info.y[2]) + (FIXED_ONE-1) + adj) >> FIXED_ORDER;
-   }
-
-   if (setup->scissor_test) {
-      minx = MAX2(minx, setup->scissor.current.minx);
-      maxx = MIN2(maxx, setup->scissor.current.maxx);
-      miny = MAX2(miny, setup->scissor.current.miny);
-      maxy = MIN2(maxy, setup->scissor.current.maxy);
-   }
-   else {
-      minx = MAX2(minx, 0);
-      miny = MAX2(miny, 0);
-      maxx = MIN2(maxx, scene->fb.width);
-      maxy = MIN2(maxy, scene->fb.height);
-   }
-
-
-   if (miny >= maxy || minx >= maxx) {
-      lp_scene_putback_data( scene, tri_bytes );
-      LP_COUNT(nr_culled_tris);
-      return;
-   }
 
    /* 
     */
-   info.pixel_offset = setup->pixel_offset;
-   info.v0 = v1;
-   info.v1 = v2;
-   info.v2 = v3;
-   info.dx01 = info.v0[0][0] - info.v1[0][0];
-   info.dx20 = info.v2[0][0] - info.v0[0][0];
-   info.dy01 = info.v0[0][1] - info.v1[0][1];
-   info.dy20 = info.v2[0][1] - info.v0[0][1];
-   info.oneoverarea = 1.0f / (info.dx01 * info.dy20 - info.dx20 * info.dy01);
+   dx01 = v0[0][0] - v1[0][0];
+   dy01 = v0[0][1] - v1[0][1];
+   dx20 = v2[0][0] - v0[0][0];
+   dy20 = v2[0][1] - v0[0][1];
+   oneoverarea = 1.0f / (dx01 * dy20 - dx20 * dy01);
+
+   info.v0 = v0;
+   info.v1 = v1;
+   info.v2 = v2;
    info.frontfacing = frontfacing;
+   info.x0_center = v0[0][0] - setup->pixel_offset;
+   info.y0_center = v0[0][1] - setup->pixel_offset;
+   info.dx01_ooa  = dx01 * oneoverarea;
+   info.dx20_ooa  = dx20 * oneoverarea;
+   info.dy01_ooa  = dy01 * oneoverarea;
+   info.dy20_ooa  = dy20 * oneoverarea;
 
    /* Setup parameter interpolants:
     */
-   setup_tri_coefficients( setup, tri, &info );
+   lp_setup_tri_coef( setup, &tri->inputs, &info );
 
    tri->inputs.facing = frontfacing ? 1.0F : -1.0F;
    tri->inputs.state = setup->fs.stored;
@@ -541,7 +326,7 @@ do_triangle_ccw(struct lp_setup_context *setup,
       /* half-edge constants, will be interated over the whole render
        * target.
        */
-      plane->c = plane->dcdx * info.x[i] - plane->dcdy * info.y[i];
+      plane->c = plane->dcdx * x[i] - plane->dcdy * y[i];
 
       /* correct for top-left vs. bottom-left fill convention.  
        *
@@ -612,29 +397,43 @@ do_triangle_ccw(struct lp_setup_context *setup,
    if (nr_planes == 7) {
       tri->plane[3].dcdx = -1;
       tri->plane[3].dcdy = 0;
-      tri->plane[3].c = 1-minx;
+      tri->plane[3].c = 1-bbox.x0;
       tri->plane[3].ei = 0;
       tri->plane[3].eo = 1;
 
       tri->plane[4].dcdx = 1;
       tri->plane[4].dcdy = 0;
-      tri->plane[4].c = maxx;
+      tri->plane[4].c = bbox.x1+1;
       tri->plane[4].ei = -1;
       tri->plane[4].eo = 0;
 
       tri->plane[5].dcdx = 0;
       tri->plane[5].dcdy = 1;
-      tri->plane[5].c = 1-miny;
+      tri->plane[5].c = 1-bbox.y0;
       tri->plane[5].ei = 0;
       tri->plane[5].eo = 1;
 
       tri->plane[6].dcdx = 0;
       tri->plane[6].dcdy = -1;
-      tri->plane[6].c = maxy;
+      tri->plane[6].c = bbox.y1+1;
       tri->plane[6].ei = -1;
       tri->plane[6].eo = 0;
    }
 
+   lp_setup_bin_triangle( setup, tri, &bbox, nr_planes );
+}
+
+
+void
+lp_setup_bin_triangle( struct lp_setup_context *setup,
+                       struct lp_rast_triangle *tri,
+                       const struct u_rect *bbox,
+                       int nr_planes )
+{
+   struct lp_scene *scene = setup->scene;
+   struct lp_fragment_shader_variant *variant = setup->fs.current.variant;
+   int ix0, ix1, iy0, iy1;
+   int i;
 
    /*
     * All fields of 'tri' are now set.  The remaining code here is
@@ -643,10 +442,30 @@ do_triangle_ccw(struct lp_setup_context *setup,
 
    /* Convert to tile coordinates, and inclusive ranges:
     */
-   ix0 = minx / TILE_SIZE;
-   iy0 = miny / TILE_SIZE;
-   ix1 = (maxx-1) / TILE_SIZE;
-   iy1 = (maxy-1) / TILE_SIZE;
+   if (nr_planes == 3) {
+      int ix0 = bbox->x0 / 16;
+      int iy0 = bbox->y0 / 16;
+      int ix1 = bbox->x1 / 16;
+      int iy1 = bbox->y1 / 16;
+      
+      if (iy0 == iy1 && ix0 == ix1)
+      {
+
+	 /* Triangle is contained in a single 16x16 block:
+	  */
+	 int mask = (ix0 & 3) | ((iy0 & 3) << 4);
+
+	 lp_scene_bin_command( scene, ix0/4, iy0/4,
+			       lp_rast_triangle_3_16,
+			       lp_rast_arg_triangle(tri, mask) );
+	 return;
+      }
+   }
+
+   ix0 = bbox->x0 / TILE_SIZE;
+   iy0 = bbox->y0 / TILE_SIZE;
+   ix1 = bbox->x1 / TILE_SIZE;
+   iy1 = bbox->y1 / TILE_SIZE;
 
    /*
     * Clamp to framebuffer size
@@ -799,9 +618,10 @@ static void triangle_both( struct lp_setup_context *setup,
    const float fy = v1[0][1] - v2[0][1];
 
    /* det = cross(e,f).z */
-   if (ex * fy - ey * fx < 0.0f) 
+   const float det = ex * fy - ey * fx;
+   if (det < 0.0f) 
       triangle_ccw( setup, v0, v1, v2 );
-   else
+   else if (det > 0.0f)
       triangle_cw( setup, v0, v1, v2 );
 }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state_derived.c b/src/gallium/drivers/llvmpipe/lp_state_derived.c
index 77bec4640b..edd723f65f 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_derived.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_derived.c
@@ -74,6 +74,15 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
       vs_index = draw_find_shader_output(llvmpipe->draw,
                                          lpfs->info.input_semantic_name[i],
                                          lpfs->info.input_semantic_index[i]);
+      if (vs_index < 0) {
+         /*
+          * This can happen with sprite coordinates - the vertex
+          * shader doesn't need to provide an output as we generate
+          * them internally.  However, let's keep pretending that there
+          * is something there to not confuse other code.
+          */
+         vs_index = 0;
+      }
 
       /* This can be pre-computed, except for flatshade:
        */
@@ -125,6 +134,17 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
       inputs[i].src_index = vinfo->num_attribs;
       draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, vs_index);
    }
+
+   /* Figure out if we need pointsize as well.
+    */
+   vs_index = draw_find_shader_output(llvmpipe->draw,
+                                      TGSI_SEMANTIC_PSIZE, 0);
+
+   if (vs_index > 0) {
+      llvmpipe->psize_slot = vinfo->num_attribs;
+      draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
+   }
+
    llvmpipe->num_inputs = lpfs->info.num_inputs;
 
    draw_compute_vertex_size(vinfo);
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index dbca49a2ef..33c1a49efe 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -808,7 +808,7 @@ generate_variant(struct llvmpipe_context *lp,
    variant->list_item_local.base = variant;
    variant->no = shader->variants_created++;
 
-   memcpy(&variant->key, key, sizeof *key);
+   memcpy(&variant->key, key, shader->variant_key_size);
 
    if (gallivm_debug & GALLIVM_DEBUG_IR) {
       debug_printf("llvmpipe: Creating fragment shader #%u variant #%u:\n", 
@@ -840,6 +840,7 @@ llvmpipe_create_fs_state(struct pipe_context *pipe,
                          const struct pipe_shader_state *templ)
 {
    struct lp_fragment_shader *shader;
+   int nr_samplers;
 
    shader = CALLOC_STRUCT(lp_fragment_shader);
    if (!shader)
@@ -854,6 +855,11 @@ llvmpipe_create_fs_state(struct pipe_context *pipe,
    /* we need to keep a local copy of the tokens */
    shader->base.tokens = tgsi_dup_tokens(templ->tokens);
 
+   nr_samplers = shader->info.file_max[TGSI_FILE_SAMPLER] + 1;
+
+   shader->variant_key_size = Offset(struct lp_fragment_shader_variant_key,
+				     sampler[nr_samplers]);
+
    if (LP_DEBUG & DEBUG_TGSI) {
       unsigned attrib;
       debug_printf("llvmpipe: Create fragment shader #%u %p:\n", shader->no, (void *) shader);
@@ -921,7 +927,6 @@ static void
 llvmpipe_delete_fs_state(struct pipe_context *pipe, void *fs)
 {
    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
-   struct pipe_fence_handle *fence = NULL;
    struct lp_fragment_shader *shader = fs;
    struct lp_fs_variant_list_item *li;
 
@@ -934,12 +939,7 @@ llvmpipe_delete_fs_state(struct pipe_context *pipe, void *fs)
     * Flushing alone might not sufficient we need to wait on it too.
     */
 
-   llvmpipe_flush(pipe, 0, &fence);
-
-   if (fence) {
-      pipe->screen->fence_finish(pipe->screen, fence, 0);
-      pipe->screen->fence_reference(pipe->screen, &fence, NULL);
-   }
+   llvmpipe_finish(pipe, __FUNCTION__);
 
    li = first_elem(&shader->variants);
    while(!at_end(&shader->variants, li)) {
@@ -1027,7 +1027,7 @@ make_variant_key(struct llvmpipe_context *lp,
 {
    unsigned i;
 
-   memset(key, 0, sizeof *key);
+   memset(key, 0, shader->variant_key_size);
 
    if (lp->framebuffer.zsbuf) {
       if (lp->depth_stencil->depth.enabled) {
@@ -1097,9 +1097,17 @@ make_variant_key(struct llvmpipe_context *lp,
       }
    }
 
-   for(i = 0; i < PIPE_MAX_SAMPLERS; ++i)
-      if(shader->info.file_mask[TGSI_FILE_SAMPLER] & (1 << i))
-         lp_sampler_static_state(&key->sampler[i], lp->fragment_sampler_views[i], lp->sampler[i]);
+   /* This value will be the same for all the variants of a given shader:
+    */
+   key->nr_samplers = shader->info.file_max[TGSI_FILE_SAMPLER] + 1;
+
+   for(i = 0; i < key->nr_samplers; ++i) {
+      if(shader->info.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) {
+         lp_sampler_static_state(&key->sampler[i],
+				 lp->fragment_sampler_views[i],
+				 lp->sampler[i]);
+      }
+   }
 }
 
 /**
@@ -1118,7 +1126,7 @@ llvmpipe_update_fs(struct llvmpipe_context *lp)
 
    li = first_elem(&shader->variants);
    while(!at_end(&shader->variants, li)) {
-      if(memcmp(&li->base->key, &key, sizeof key) == 0) {
+      if(memcmp(&li->base->key, &key, shader->variant_key_size) == 0) {
          variant = li->base;
          break;
       }
@@ -1134,19 +1142,14 @@ llvmpipe_update_fs(struct llvmpipe_context *lp)
       unsigned i;
       if (lp->nr_fs_variants >= LP_MAX_SHADER_VARIANTS) {
          struct pipe_context *pipe = &lp->pipe;
-         struct pipe_fence_handle *fence = NULL;
 
          /*
           * XXX: we need to flush the context until we have some sort of reference
           * counting in fragment shaders as they may still be binned
           * Flushing alone might not be sufficient we need to wait on it too.
           */
-         llvmpipe_flush(pipe, 0, &fence);
+         llvmpipe_finish(pipe, __FUNCTION__);
 
-         if (fence) {
-            pipe->screen->fence_finish(pipe->screen, fence, 0);
-            pipe->screen->fence_reference(pipe->screen, &fence, NULL);
-         }
          for (i = 0; i < LP_MAX_SHADER_VARIANTS / 4; i++) {
             struct lp_fs_variant_list_item *item = last_elem(&lp->fs_variants_list);
             remove_shader_variant(lp, item->base);
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.h b/src/gallium/drivers/llvmpipe/lp_state_fs.h
index 37900fc544..33c480010d 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.h
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.h
@@ -53,13 +53,10 @@ struct lp_fragment_shader_variant_key
    struct pipe_blend_state blend;
    enum pipe_format zsbuf_format;
    unsigned nr_cbufs:8;
+   unsigned nr_samplers:8;	/* actually derivable from just the shader */
    unsigned flatshade:1;
    unsigned occlusion_count:1;
 
-   struct {
-      ubyte colormask;
-   } cbuf_blend[PIPE_MAX_COLOR_BUFS];
-
    struct lp_sampler_static_state sampler[PIPE_MAX_SAMPLERS];
 };
 
@@ -97,6 +94,7 @@ struct lp_fragment_shader
    struct lp_fs_variant_list_item variants;
 
    /* For debugging/profiling purposes */
+   unsigned variant_key_size;
    unsigned no;
    unsigned variants_created;
    unsigned variants_cached;
diff --git a/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c b/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c
index afd3e0b21c..0bad7320f3 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c
@@ -73,7 +73,13 @@ llvmpipe_bind_rasterizer_state(struct pipe_context *pipe, void *handle)
                    llvmpipe->rasterizer->gl_rasterization_rules);
       lp_setup_set_flatshade_first( llvmpipe->setup,
                    llvmpipe->rasterizer->flatshade_first);
-   }
+      lp_setup_set_line_state( llvmpipe->setup,
+                   llvmpipe->rasterizer->line_width);
+      lp_setup_set_point_state( llvmpipe->setup,
+                   llvmpipe->rasterizer->point_size,
+                   llvmpipe->rasterizer->point_size_per_vertex,
+                   llvmpipe->rasterizer->sprite_coord_enable);
+       }
 
    llvmpipe->dirty |= LP_NEW_RASTERIZER;
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_state_vertex.c b/src/gallium/drivers/llvmpipe/lp_state_vertex.c
index d86e66b4fb..fb29423dd3 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_vertex.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_vertex.c
@@ -100,7 +100,7 @@ llvmpipe_set_index_buffer(struct pipe_context *pipe,
    else
       memset(&llvmpipe->index_buffer, 0, sizeof(llvmpipe->index_buffer));
 
-   /* TODO make this more like a state */
+   draw_set_index_buffer(llvmpipe->draw, ib);
 }
 
 void
diff --git a/src/gallium/drivers/llvmpipe/lp_surface.c b/src/gallium/drivers/llvmpipe/lp_surface.c
index f761e82850..63ddc669c2 100644
--- a/src/gallium/drivers/llvmpipe/lp_surface.c
+++ b/src/gallium/drivers/llvmpipe/lp_surface.c
@@ -68,14 +68,16 @@ lp_resource_copy(struct pipe_context *pipe,
                            0, /* flush_flags */
                            FALSE, /* read_only */
                            TRUE, /* cpu_access */
-                           FALSE); /* do_not_block */
+                           FALSE,
+                           "blit dst"); /* do_not_block */
 
    llvmpipe_flush_resource(pipe,
                            src, subsrc.face, subsrc.level,
                            0, /* flush_flags */
                            TRUE, /* read_only */
                            TRUE, /* cpu_access */
-                           FALSE); /* do_not_block */
+                           FALSE,
+                           "blit src"); /* do_not_block */
 
    /*
    printf("surface copy from %u to %u: %u,%u to %u,%u %u x %u\n",
diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c b/src/gallium/drivers/llvmpipe/lp_texture.c
index 25112c10a6..5832ea2744 100644
--- a/src/gallium/drivers/llvmpipe/lp_texture.c
+++ b/src/gallium/drivers/llvmpipe/lp_texture.c
@@ -67,6 +67,7 @@ resource_is_texture(const struct pipe_resource *resource)
       return FALSE;
    case PIPE_TEXTURE_1D:
    case PIPE_TEXTURE_2D:
+   case PIPE_TEXTURE_RECT:
    case PIPE_TEXTURE_3D:
    case PIPE_TEXTURE_CUBE:
       return TRUE;
@@ -583,7 +584,8 @@ llvmpipe_get_transfer(struct pipe_context *pipe,
                                    0, /* flush_flags */
                                    read_only,
                                    TRUE, /* cpu_access */
-                                   do_not_block)) {
+                                   do_not_block,
+                                   "transfer dest")) {
          /*
           * It would have blocked, but state tracker requested no to.
           */
diff --git a/src/gallium/drivers/nouveau/nouveau_class.h b/src/gallium/drivers/nouveau/nouveau_class.h
index f44979e562..d9f35b4c4b 100644
--- a/src/gallium/drivers/nouveau/nouveau_class.h
+++ b/src/gallium/drivers/nouveau/nouveau_class.h
@@ -6189,6 +6189,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define   NV34TCL_FP_REG_CONTROL_UNK1_MASK						0xffff0000
 #define   NV34TCL_FP_REG_CONTROL_UNK0_SHIFT						0
 #define   NV34TCL_FP_REG_CONTROL_UNK0_MASK						0x0000ffff
+#define  NV34TCL_FLATSHADE_FIRST							0x00001454
+#define  NV34TCL_EDGEFLAG_ENABLE							0x0000145c
 #define  NV34TCL_VP_CLIP_PLANES_ENABLE							0x00001478
 #define   NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE0						(1 <<  1)
 #define   NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE1						(1 <<  5)
@@ -6222,10 +6224,13 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define  NV34TCL_VTXFMT__SIZE								0x00000010
 #define   NV34TCL_VTXFMT_TYPE_SHIFT							0
 #define   NV34TCL_VTXFMT_TYPE_MASK							0x0000000f
-#define    NV34TCL_VTXFMT_TYPE_FLOAT							0x00000002
-#define    NV34TCL_VTXFMT_TYPE_HALF							0x00000003
-#define    NV34TCL_VTXFMT_TYPE_UBYTE							0x00000004
-#define    NV34TCL_VTXFMT_TYPE_USHORT							0x00000005
+#define    NV34TCL_VTXFMT_TYPE_16_SNORM							0x00000001
+#define    NV34TCL_VTXFMT_TYPE_32_FLOAT							0x00000002
+#define    NV34TCL_VTXFMT_TYPE_16_FLOAT							0x00000003
+#define    NV34TCL_VTXFMT_TYPE_8_UNORM							0x00000004
+#define    NV34TCL_VTXFMT_TYPE_16_SSCALED							0x00000005
+#define    NV34TCL_VTXFMT_TYPE_11_11_10_SNORM							0x00000006
+#define    NV34TCL_VTXFMT_TYPE_8_USCALED							0x00000007
 #define   NV34TCL_VTXFMT_SIZE_SHIFT							4
 #define   NV34TCL_VTXFMT_SIZE_MASK							0x000000f0
 #define   NV34TCL_VTXFMT_STRIDE_SHIFT							8
@@ -6368,6 +6373,10 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define    NV34TCL_TX_FORMAT_FORMAT_R8G8B8_RECT						0x00001e00
 #define    NV34TCL_TX_FORMAT_FORMAT_A8L8_RECT						0x00002000
 #define    NV34TCL_TX_FORMAT_FORMAT_DSDT8						0x00002800
+#define    NV34TCL_TX_FORMAT_FORMAT_Z24							0x2a00
+#define    NV34TCL_TX_FORMAT_FORMAT_Z24_RECT						0x2b00 /* XXX: guess! */
+#define    NV34TCL_TX_FORMAT_FORMAT_Z16							0x2c00
+#define    NV34TCL_TX_FORMAT_FORMAT_Z16_RECT						0x2d00 /* XXX: guess! */
 #define    NV34TCL_TX_FORMAT_FORMAT_HILO16						0x00003300
 #define    NV34TCL_TX_FORMAT_FORMAT_HILO16_RECT						0x00003600
 #define    NV34TCL_TX_FORMAT_FORMAT_HILO8						0x00004400
diff --git a/src/gallium/drivers/nouveau/nouveau_screen.c b/src/gallium/drivers/nouveau/nouveau_screen.c
index 513e5e02bc..ebb21a6e5a 100644
--- a/src/gallium/drivers/nouveau/nouveau_screen.c
+++ b/src/gallium/drivers/nouveau/nouveau_screen.c
@@ -258,6 +258,7 @@ nouveau_screen_fini(struct nouveau_screen *screen)
 {
 	struct pipe_winsys *ws = screen->base.winsys;
 	nouveau_channel_free(&screen->channel);
-	ws->destroy(ws);
+	if (ws)
+		ws->destroy(ws);
 }
 
diff --git a/src/gallium/drivers/nouveau/nouveau_util.h b/src/gallium/drivers/nouveau/nouveau_util.h
deleted file mode 100644
index b165f7a611..0000000000
--- a/src/gallium/drivers/nouveau/nouveau_util.h
+++ /dev/null
@@ -1,91 +0,0 @@
-#ifndef __NOUVEAU_UTIL_H__
-#define __NOUVEAU_UTIL_H__
-
-/* Determine how many vertices can be pushed into the command stream.
- * Where the remaining space isn't large enough to represent all verices,
- * split the buffer at primitive boundaries.
- *
- * Returns a count of vertices that can be rendered, and an index to
- * restart drawing at after a flush.
- */
-static INLINE unsigned
-nouveau_vbuf_split(unsigned remaining, unsigned overhead, unsigned vpp,
-		   unsigned mode, unsigned start, unsigned count,
-		   unsigned *restart)
-{
-	int max, adj = 0;
-
-	max  = remaining - overhead;
-	if (max < 0)
-		return 0;
-
-	max *= vpp;
-	if (max >= count)
-		return count;
-
-	switch (mode) {
-	case PIPE_PRIM_POINTS:
-		break;
-	case PIPE_PRIM_LINES:
-		max = max & 1;
-		break;
-	case PIPE_PRIM_TRIANGLES:
-		max = max - (max % 3);
-		break;
-	case PIPE_PRIM_QUADS:
-		max = max & ~3;
-		break;
-	case PIPE_PRIM_LINE_LOOP:
-	case PIPE_PRIM_LINE_STRIP:
-		if (max < 2)
-			max = 0;
-		adj = 1;
-		break;
-	case PIPE_PRIM_POLYGON:
-	case PIPE_PRIM_TRIANGLE_STRIP:
-	case PIPE_PRIM_TRIANGLE_FAN:
-		if (max < 3)
-			max = 0;
-		adj = 2;
-		break;
-	case PIPE_PRIM_QUAD_STRIP:
-		if (max < 4)
-			max = 0;
-		adj = 3;
-		break;
-	default:
-		assert(0);
-	}
-
-	*restart = start + max - adj;
-	return max;
-}
-
-/* Integer base-2 logarithm, rounded towards zero. */
-static INLINE unsigned log2i(unsigned i)
-{
-	unsigned r = 0;
-
-	if (i & 0xffff0000) {
-		i >>= 16;
-		r += 16;
-	}
-	if (i & 0x0000ff00) {
-		i >>= 8;
-		r += 8;
-	}
-	if (i & 0x000000f0) {
-		i >>= 4;
-		r += 4;
-	}
-	if (i & 0x0000000c) {
-		i >>= 2;
-		r += 2;
-	}
-	if (i & 0x00000002) {
-		r += 1;
-	}
-	return r;
-}
-
-#endif
diff --git a/src/gallium/drivers/nv50/nv50_miptree.c b/src/gallium/drivers/nv50/nv50_miptree.c
index 12b5ad106c..dd0e8fd41b 100644
--- a/src/gallium/drivers/nv50/nv50_miptree.c
+++ b/src/gallium/drivers/nv50/nv50_miptree.c
@@ -238,7 +238,8 @@ nv50_miptree_from_handle(struct pipe_screen *pscreen,
 	unsigned stride;
 
 	/* Only supports 2D, non-mipmapped textures for the moment */
-	if (template->target != PIPE_TEXTURE_2D ||
+	if ((template->target != PIPE_TEXTURE_2D &&
+	      template->target != PIPE_TEXTURE_RECT) ||
 	    template->last_level != 0 ||
 	    template->depth0 != 1)
 		return NULL;
diff --git a/src/gallium/drivers/nv50/nv50_push.c b/src/gallium/drivers/nv50/nv50_push.c
index 0091927a98..380f69406a 100644
--- a/src/gallium/drivers/nv50/nv50_push.c
+++ b/src/gallium/drivers/nv50/nv50_push.c
@@ -108,8 +108,9 @@ emit_vertex(struct push_context *ctx, unsigned n)
    int i;
 
    if (ctx->edgeflag_attr < 16) {
-      float *edgeflag = (uint8_t *)ctx->attr[ctx->edgeflag_attr].map +
-                        ctx->attr[ctx->edgeflag_attr].stride * n;
+      float *edgeflag = (float *)
+         ((uint8_t *)ctx->attr[ctx->edgeflag_attr].map +
+          ctx->attr[ctx->edgeflag_attr].stride * n);
 
       if (*edgeflag != ctx->edgeflag) {
          BEGIN_RING(chan, tesla, NV50TCL_EDGEFLAG_ENABLE, 1);
diff --git a/src/gallium/drivers/nv50/nv50_tex.c b/src/gallium/drivers/nv50/nv50_tex.c
index 5535818370..658324ec5b 100644
--- a/src/gallium/drivers/nv50/nv50_tex.c
+++ b/src/gallium/drivers/nv50/nv50_tex.c
@@ -83,6 +83,9 @@ nv50_tex_construct(struct nv50_sampler_view *view)
 	case PIPE_TEXTURE_2D:
 		tic[2] |= NV50TIC_0_2_TARGET_2D;
 		break;
+	case PIPE_TEXTURE_RECT:
+		tic[2] |= NV50TIC_0_2_TARGET_RECT;
+		break;
 	case PIPE_TEXTURE_3D:
 		tic[2] |= NV50TIC_0_2_TARGET_3D;
 		break;
diff --git a/src/gallium/drivers/nvfx/Makefile b/src/gallium/drivers/nvfx/Makefile
index c1d57ca396..6cbbad699e 100644
--- a/src/gallium/drivers/nvfx/Makefile
+++ b/src/gallium/drivers/nvfx/Makefile
@@ -4,7 +4,7 @@ include $(TOP)/configs/current
 LIBNAME = nvfx
 
 C_SOURCES = \
-	nv04_surface_2d.c \
+	nv04_2d.c \
 	nvfx_buffer.c \
 	nvfx_context.c \
 	nvfx_clear.c \
@@ -14,6 +14,7 @@ C_SOURCES = \
 	nv30_fragtex.c \
 	nv40_fragtex.c \
 	nvfx_miptree.c \
+	nvfx_push.c \
 	nvfx_query.c \
 	nvfx_resource.c \
 	nvfx_screen.c \
diff --git a/src/gallium/drivers/nvfx/SConscript b/src/gallium/drivers/nvfx/SConscript
index 02d931b10e..80e3ef2257 100644
--- a/src/gallium/drivers/nvfx/SConscript
+++ b/src/gallium/drivers/nvfx/SConscript
@@ -9,7 +9,7 @@ env.PrependUnique(delete_existing=1, CPPPATH = [
 nvfx = env.ConvenienceLibrary(
     target = 'nvfx',
     source = [
-        'nv04_surface_2d.c',
+        'nv04_2d.c',
         'nvfx_buffer.c',
         'nvfx_context.c',
         'nvfx_clear.c',
@@ -19,6 +19,7 @@ nvfx = env.ConvenienceLibrary(
         'nv30_fragtex.c',
         'nv40_fragtex.c',
         'nvfx_miptree.c',
+        'nvfx_push.c',
         'nvfx_query.c',
         'nvfx_resource.c',
         'nvfx_screen.c',
diff --git a/src/gallium/drivers/nvfx/nv04_2d.c b/src/gallium/drivers/nvfx/nv04_2d.c
new file mode 100644
index 0000000000..c05312219b
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nv04_2d.c
@@ -0,0 +1,1341 @@
+/**************************************************************************
+ *
+ * Copyright 2009 Ben Skeggs
+ * Copyright 2009 Younes Manton
+ * Copyright 2010 Luca Barbieri
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+/* this code has no Mesa or Gallium dependency and can be reused in the classic Mesa driver or DDX */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <nouveau/nouveau_class.h>
+#include <nouveau/nouveau_device.h>
+#include <nouveau/nouveau_pushbuf.h>
+#include <nouveau/nouveau_channel.h>
+#include <nouveau/nouveau_bo.h>
+#include <nouveau/nouveau_notifier.h>
+#include <nouveau/nouveau_grobj.h>
+#include "nv04_2d.h"
+
+/* Local branch-hint and min/max helpers, defined here to avoid depending on Mesa/Gallium */
+#ifdef __GNUC__
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+#else /* no __builtin_expect available: the hints reduce to a plain truth test */
+#define likely(x) !!(x)
+#define unlikely(x) !!(x)
+#endif
+
+#define MIN2( A, B )   ( (A)<(B) ? (A) : (B) ) /* NB: the selected argument is evaluated twice */
+#define MAX2( A, B )   ( (A)>(B) ? (A) : (B) ) /* NB: the selected argument is evaluated twice */
+
+struct nv04_2d_context /* holds one notifier handle and six grobj handles */
+{
+	struct nouveau_notifier *ntfy;
+	struct nouveau_grobj *surf2d;
+	struct nouveau_grobj *swzsurf;
+	struct nouveau_grobj *m2mf;
+	struct nouveau_grobj *rect;
+	struct nouveau_grobj *sifm;
+	struct nouveau_grobj *blit;
+};
+
+static inline int align(int value, int alignment) /* alignment must be a power of two */
+{
+   const int mask = alignment - 1; /* the low bits that get cleared */
+   return (value + mask) & ~mask;  /* round up to a multiple of alignment */
+}
+
+static inline int util_is_pot(unsigned x) /* nonzero when x has at most one bit set */
+{
+   const unsigned lowest_bit_cleared = x & (x - 1);
+   return !lowest_bit_cleared; /* x == 0 therefore also counts as a power of two */
+}
+
+/* Integer base-2 logarithm, rounded towards zero.
+ *
+ * Binary search for the highest set bit: for each step of 16, 8, 4, 2
+ * and 1 bits, if anything is set above the low `step` bits of i, those
+ * low bits are shifted out and `step` is credited to the result.
+ *
+ * The step sequence covers 32 bits of input.  i == 0 yields 0, the same
+ * as i == 1.
+ *
+ * Examples: log2i(2) == 1, log2i(3) == 1, log2i(4096) == 12.
+ */
+static inline unsigned log2i(unsigned i)
+{
+	unsigned result = 0;
+
+	for(unsigned step = 16; step; step >>= 1)
+	{
+		if(i >> step)
+		{
+			i >>= step;
+			result += step;
+		}
+	}
+
+	return result;
+}
+
+//#define NV04_REGION_DEBUG
+
+// Yes, we really want to inline everything, since all the functions are used only once
+#if defined(__GNUC__) && defined(DEBUG)
+#define inline __attribute__((always_inline)) inline
+#endif
+
+/*
+ * Swizzled index of texel (x, y) inside a square tile.
+ *
+ * The low 12 bits of x and y are interleaved: bit k of x lands at bit
+ * 2k of the result and bit k of y at bit 2k + 1, so x fills the even
+ * positions and y the odd ones.
+ *
+ * Bits of x or y above bit 11 do not contribute to the result.
+ *
+ * Examples:
+ *   (1, 0) -> 1    (0, 1) -> 2
+ *   (3, 0) -> 5    (0, 3) -> 10
+ *   (0xfff, 0) -> 0x555555    (0, 0xfff) -> 0xaaaaaa
+ */
+static inline unsigned
+nv04_swizzle_bits_square(unsigned x, unsigned y)
+{
+	unsigned swizzled = 0;
+
+	for(unsigned bit = 0; bit < 12; ++bit)
+	{
+		const unsigned xbit = (x >> bit) & 1;
+		const unsigned ybit = (y >> bit) & 1;
+
+		swizzled |= xbit << (2 * bit);
+		swizzled |= ybit << (2 * bit + 1);
+	}
+
+	return swizzled;
+}
+
+/* rectangular swizzled textures are laid out as a linear run of swizzled square tiles of side MIN2(w, h) */
+static inline unsigned
+nv04_swizzle_bits_2d(unsigned x, unsigned y, unsigned w, unsigned h)
+{
+	unsigned side, in_tile;
+
+	if(h <= 1)
+		return x; /* a single row needs no swizzling */
+
+	side = MIN2(w, h);
+	in_tile = side - 1; /* selects the coordinate bits that address inside one tile */
+	return (((x | y) & ~in_tile) * side) | nv04_swizzle_bits_square(x & in_tile, y & in_tile);
+}
+
+// general 3D texture case
+/* Swizzled index of texel (x, y, z) in a w x h x d surface.  d <= 1 is
+ * handed to nv04_swizzle_bits_2d().  Otherwise each round appends the
+ * next bit of x, then y, then z, skipping any axis whose halved size has
+ * already run down to zero, until all three sizes are exhausted.
+ */
+static inline unsigned
+nv04_swizzle_bits(unsigned x, unsigned y, unsigned z, unsigned w, unsigned h, unsigned d)
+{
+	unsigned v = 0;
+	int shift = 0;
+
+	if(d <= 1)
+		return nv04_swizzle_bits_2d(x, y, w, h);
+
+	// TODO: autogenerate code for all possible texture sizes (13 * 13 * 13 with dims <= 4096) and do a single indirect call
+	w >>= 1;
+	h >>= 1;
+	d >>= 1;
+
+	while(w | h | d)
+	{
+		if(likely(w))
+		{
+			v |= (x & 1) << shift++;
+			x >>= 1;
+			w >>= 1;
+		}
+
+		if(likely(h))
+		{
+			v |= (y & 1) << shift++;
+			y >>= 1;
+			h >>= 1;
+		}
+
+		if(likely(d))
+		{
+			v |= (z & 1) << shift++;
+			z >>= 1;
+			d >>= 1;
+		}
+	}
+
+	return v;
+}
+
+unsigned /* byte offset, relative to rgn->offset, of the region's origin pixel; w and h are not used */
+nv04_region_begin(struct nv04_region* rgn, unsigned w, unsigned h)
+{
+	if(!rgn->pitch) /* pitch == 0 marks a swizzled region */
+		return nv04_swizzle_bits(rgn->x, rgn->y, rgn->z, rgn->w, rgn->h, rgn->d) << rgn->bpps;
+
+	return rgn->pitch * rgn->y + (rgn->x << rgn->bpps);
+}
+
+unsigned /* byte offset, relative to rgn->offset, just past the bottom-right pixel of the w x h rectangle at (rgn->x, rgn->y) */
+nv04_region_end(struct nv04_region* rgn, unsigned w, unsigned h)
+{
+	if(!rgn->pitch) /* pitch == 0 marks a swizzled region */
+		return (nv04_swizzle_bits(rgn->x + w - 1, rgn->y + h - 1, rgn->z, rgn->w, rgn->h, rgn->d) + 1) << rgn->bpps;
+
+	return rgn->pitch * (rgn->y + h - 1) + ((rgn->x + w) << rgn->bpps);
+}
+
+// *pitch = -1 -> use 3D swizzling for (x, y), *pitch = 0 -> use 2D swizzling, other *pitch -> use linear calculations
+// returns 2 if pixel order is 3D-swizzled and 1 if subrect is 2D-swizzled
+/* *pitch == -1 ret = 0 -> 3D swizzled subrect
+ * *pitch == 0 ret = 0 -> 2D swizzled subrect
+ * *pitch > 0 ret = 0 -> linear subrect
+ * *pitch > 0 ret = 1 -> linear subrect, but with swizzled 3D data inside
+ */
+
+static inline void /* debug dump to stderr: bo handle and offset, then the linear pitch or swizzled size, then the origin */
+nv04_region_print(struct nv04_region* rgn)
+{
+	fprintf(stderr, "<%i[%i]> ", rgn->bo->handle, rgn->offset);
+	if(!rgn->pitch)
+		fprintf(stderr, "swz %ix%ix%i", rgn->w, rgn->h, rgn->d);
+	else
+		fprintf(stderr, "lin %i", rgn->pitch);
+	fprintf(stderr, " (%i, %i, %i)", rgn->x, rgn->y, rgn->z);
+}
+
+static inline void /* debug check that a w x h rectangle of rgn fits inside its buffer object */
+nv04_region_assert(struct nv04_region* rgn, unsigned w, unsigned h)
+{
+	unsigned end = rgn->offset + nv04_region_end(rgn, w, h);
+
+	assert(rgn->offset <= (int)rgn->bo->size);
+	assert(end <= rgn->bo->size);
+	(void) end; /* keeps 'end' referenced when assert() compiles to nothing */
+	if(!rgn->pitch) { /* swizzled regions must have power-of-two width and height */
+		assert(util_is_pot(rgn->w));
+		assert(util_is_pot(rgn->h));
+	}
+}
+
+/* determine if region can be linearized or fake-linearized
+ *
+ * Returns nonzero when a w x h rectangle at (rgn->x, rgn->y) can be
+ * treated as one contiguous run:
+ *  - linear region: its pitch equals the rectangle's row size;
+ *  - swizzled region: the rectangle is the whole 2D surface, or it is
+ *    power-of-two sized, aligned to its own size, not part of a 3D
+ *    surface, and has one of the shapes accepted at the end.
+ */
+static inline int
+nv04_region_is_contiguous(struct nv04_region* rgn, int w, int h)
+{
+	int tile_side;
+	int rect_side;
+
+	if(rgn->pitch)
+		return rgn->pitch == w << rgn->bpps;
+
+	// redundant, but this is the fast path for the common case
+	if(w == rgn->w && h == rgn->h && rgn->d <= 1)
+		return 1;
+
+	if((w & (w - 1)) || (h & (h - 1))                 // must be POT
+	   || (rgn->x & (w - 1)) || (rgn->y & (h - 1))    // must be aligned
+	   || rgn->d > 1)                                 // no 3D surfaces
+		return 0;
+
+	tile_side = MIN2(rgn->w, rgn->h);
+	rect_side = MIN2(w, h);
+
+	/* same short side as the surface, square, or twice as wide as tall */
+	return (rect_side == tile_side) || (w == h) || (w == 2 * h);
+}
+
+// adjusts *w and *h in place: double the width (halving the height) until the pitch reaches 1 << align or the height becomes odd, then halve it again while the pitch exceeds 16384 bytes and the width is even
+static inline void
+nv04_region_contiguous_shape(struct nv04_region* rgn, int* w, int* h, int align)
+{
+	while(!(*h & 1) && (*w << rgn->bpps) < (1 << align)) // 'align' is a log2 value here and shadows the align() helper
+	{
+		*w <<= 1;
+		*h >>= 1;
+	}
+
+	while((*w << rgn->bpps) > 16384 && !(*w & 1)) // pull an overly wide pitch back under the 16384-byte cap
+	{
+		*w >>= 1;
+		*h <<= 1;
+	}
+
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "\tCONTIGUOUS %ix%i\n", *w, *h);
+#endif
+}
+
+static inline void /* rewrites rgn in place as a linear region with a pitch of w pixels; h is not used */
+nv04_region_linearize_contiguous(struct nv04_region* rgn, unsigned w, unsigned h)
+{
+	int pos;
+	if(rgn->pitch)
+	{
+		rgn->offset += rgn->y * rgn->pitch + (rgn->x << rgn->bpps); /* fold the origin into the offset */
+		rgn->x = 0;
+		rgn->y = 0;
+	}
+	else
+	{
+		rgn->offset += (rgn->w * rgn->h * rgn->z) << rgn->bpps; /* advance by z slices of w * h pixels */
+		pos = nv04_swizzle_bits(rgn->x, rgn->y, rgn->z, rgn->w, rgn->h, rgn->d);
+		rgn->x = pos & (w - 1); /* column of the swizzled index in the w-wide view; the mask assumes w is a power of two */
+		rgn->y = pos / w; /* row of the swizzled index in the w-wide view */
+	}
+	rgn->pitch = w << rgn->bpps;
+
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "\tLINEARIZE ");
+	nv04_region_print(rgn);
+	fprintf(stderr, "\n");
+#endif
+}
+
+	/* preserve the offset! */
+	/*
+	rgn->pitch = util_format_get_stride(rgn->format, w);
+	int pos = nv04_swizzle_bits(rgn->x, rgn->y, rgn->z, rgn->w, rgn->h, rgn->d);
+	rgn->x = pos & (w - 1);
+	rgn->y = pos & ~(w - 1);
+	*/
+
+	/*
+	rgn->offset +=
+	rgn->pitch = util_format_get_stride(rgn->format, w);
+	rgn->x = 0;
+	rgn->y = 0;
+	*/
+
+/* This code will get used for, and always succeed on:
+ * - 4x2 1bpp swizzled texture mipmap levels
+ * - linear regions created by linearization
+ *
+ * This code will get used for, and MAY work for:
+ * - misaligned texture blanket
+ * - linear surfaces created without wide_pitch (in this case, it will only work if we are lucky)
+ *
+ * The general case requires splitting the region in 2.
+ */
+static inline int /* returns 0 once rgn->offset is a multiple of 1 << shift, -1 if the region cannot be adjusted */
+nv04_region_do_align_offset(struct nv04_region* rgn, unsigned w, unsigned h, int shift)
+{
+	if(rgn->pitch > 0)
+	{
+		int delta;
+
+		assert(!(rgn->offset & ((1 << rgn->bpps) - 1))); // fatal!
+		delta = rgn->offset & ((1 << shift) - 1); /* bytes by which offset overshoots the previous 1 << shift boundary */
+
+		if(h <= 1) /* single row: move delta into x and recompute the pitch */
+		{
+			rgn->x += delta >> rgn->bpps;
+			rgn->offset -= delta;
+			rgn->pitch = align((rgn->x + w) << rgn->bpps, 1 << shift);
+		}
+		else /* several rows: re-express offset + x as (x, y) under the existing pitch */
+		{
+			int newxo = (rgn->x << rgn->bpps) + delta;
+			int dy = newxo / rgn->pitch;
+			newxo -= dy * rgn->pitch;
+			if((newxo + (w << rgn->bpps)) > rgn->pitch)
+			{
+				// TODO: split the region into two rectangles (!) if *really* necessary, unless the hardware actually supports "wrapping" rectangles
+				// this does not happen if the surface is pitch-aligned, which it should always be
+				assert(0);
+				return -1;
+			}
+			rgn->x = newxo >> rgn->bpps;
+			rgn->y += dy;
+		}
+	}
+	else
+	{
+		int size;
+		int min;
+		int v;
+
+		// we don't care about the alignment of 3D surfaces since the 2D engine can't use them -- NOTE(review): the test below is d < 0, which does not select 3D surfaces (d > 1); confirm the intended condition
+		if(rgn->d < 0)
+			return -1;
+
+		min = MIN2(rgn->w, rgn->h);
+		size = min * min << rgn->bpps; /* bytes in one min x min square tile */
+
+		// this is unfixable, and should not be happening
+		if(rgn->offset & (size - 1))
+			return -1;
+
+		v = (rgn->offset & ((1 << shift) - 1)) / size; /* whole tiles between the aligned offset and the current one */
+		rgn->offset -= v * size;
+
+		if(rgn->h == min) /* height is the short side: shift x right by v tiles */
+		{
+			unsigned w; /* shadows the parameter w */
+			rgn->x += rgn->h * v;
+			w = rgn->w + rgn->h * v;
+
+			while(rgn->w < w) /* grow the surface width by doubling until it covers the shift */
+				rgn->w += rgn->w;
+		}
+		else /* width is the short side: same adjustment along y */
+		{
+			unsigned h; /* shadows the parameter h */
+			rgn->y += rgn->w * v;
+			h = rgn->h + rgn->w * v;
+
+			while(rgn->h < h)
+				rgn->h += rgn->h;
+		}
+	}
+
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "\tALIGNED ");
+	nv04_region_print(rgn);
+	fprintf(stderr, "\n");
+#endif
+	return 0;
+}
+
+/* Make both rgn->pitch and rgn->offset multiples of 1 << shift.
+ * Returns 0 on success, or -1 with the region left unchanged.  A
+ * misaligned pitch can only be repaired for single-row rectangles
+ * (h == 1), where nv04_region_do_align_offset() recomputes it as well.
+ */
+static inline int
+nv04_region_align(struct nv04_region* rgn, unsigned w, unsigned h, int shift)
+{
+	const int mask = (1 << shift) - 1;
+	const int bad_pitch = (rgn->pitch & mask) != 0;
+	const int bad_offset = (rgn->offset & mask) != 0;
+
+	if(bad_pitch && h != 1)
+		return -1;
+
+	if(!bad_pitch && !bad_offset)
+		return 0;
+
+	/* either the offset is off, or the pitch is and there is a single row */
+	return nv04_region_do_align_offset(rgn, w, h, shift) ? -1 : 0;
+}
+
+/* Copy a w x h pixel rectangle from src to dst on the CPU, mapping both
+ * buffer objects for the duration of the call.  A region is linear when its
+ * pitch is non-zero and swizzled when it is zero.  Linear copies inside one
+ * buffer pick the row order so overlapping rows are read before being overwritten.
+ * this contains 22 different copy loops after preprocessing. unfortunately, it's necessary */
+void
+nv04_region_copy_cpu(struct nv04_region* dst, struct nv04_region* src, int w, int h)
+{
+	uint8_t* mdst;
+	uint8_t* msrc;
+	int size;
+
+	if(dst->bo != src->bo)
+	{
+		nouveau_bo_map(dst->bo, NOUVEAU_BO_WR);
+		nouveau_bo_map(src->bo, NOUVEAU_BO_RD);
+	}
+	else
+		nouveau_bo_map(dst->bo, NOUVEAU_BO_WR | NOUVEAU_BO_RD);
+
+	mdst = (uint8_t*)dst->bo->map + dst->offset;
+	msrc = (uint8_t*)src->bo->map + src->offset;
+
+	size = w << dst->bpps;
+
+	nv04_region_assert(dst, w, h);
+	nv04_region_assert(src, w, h);
+
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "\tRGN_COPY_CPU [%i, %i: %i] ", w, h, dst->bpps);
+	for(int i = 0; i < 2; ++i)
+	{
+		nv04_region_print(i ? src : dst);
+		fprintf(stderr, i ? "\n" : " <- ");
+	}
+#endif
+
+	// overlap is handled for linear copies within one bo; swizzled copies only get the 'dir' heuristic further down
+	if(src->pitch && dst->pitch)
+	{
+		mdst += dst->y * dst->pitch + (dst->x << dst->bpps);
+		msrc += src->y * src->pitch + (src->x << src->bpps);
+		if(dst->bo != src->bo)
+			goto simple;
+		else if(mdst < msrc)
+		{
+			if(mdst + size <= msrc)
+			{
+simple:
+				for(int iy = 0; iy < h; ++iy)
+				{
+					assert(mdst + size <= (uint8_t*)dst->bo->map + dst->bo->size);
+					assert(msrc + size <= (uint8_t*)src->bo->map + src->bo->size);
+					memcpy(mdst, msrc, size);
+					msrc += src->pitch; mdst += dst->pitch;
+				}
+			}
+			else
+			{
+				for(int iy = 0; iy < h; ++iy)
+				{
+					assert(mdst + size <= (uint8_t*)dst->bo->map + dst->bo->size);
+					assert(msrc + size <= (uint8_t*)src->bo->map + src->bo->size);
+					memmove(mdst, msrc, size);
+					msrc += src->pitch; mdst += dst->pitch;
+				}
+			}
+		}
+		else
+		{
+			/* copy backwards so we don't destroy data we have to read yet */
+			if(msrc + size <= mdst)
+			{
+				for(int iy = h - 1; iy >= 0; --iy)
+				{
+					/* address row iy explicitly so the walk really runs bottom-up */
+					assert(mdst + iy * dst->pitch + size <= (uint8_t*)dst->bo->map + dst->bo->size);
+					assert(msrc + iy * src->pitch + size <= (uint8_t*)src->bo->map + src->bo->size);
+					memcpy(mdst + iy * dst->pitch, msrc + iy * src->pitch, size);
+				}
+			}
+			else
+			{
+				for(int iy = h - 1; iy >= 0; --iy)
+				{
+					/* rows themselves overlap here, hence memmove; still bottom-up */
+					assert(mdst + iy * dst->pitch + size <= (uint8_t*)dst->bo->map + dst->bo->size);
+					assert(msrc + iy * src->pitch + size <= (uint8_t*)src->bo->map + src->bo->size);
+					memmove(mdst + iy * dst->pitch, msrc + iy * src->pitch, size);
+				}
+			}
+		}
+	}
+	else
+	{
+		int* dswx = NULL;
+		int* dswy = NULL;
+		int* sswx = NULL;
+		int* sswy = NULL;
+		int dir;
+
+		if(!dst->pitch)
+		{
+			dswx = alloca(w * sizeof(int));
+			for(int ix = 0; ix < w; ++ix) // we are adding, so z cannot be contributed by both
+				dswx[ix] = nv04_swizzle_bits(dst->x + ix, 0, 0, dst->w, dst->h, dst->d);
+			dswy = alloca(h * sizeof(int));
+			for(int iy = 0; iy < h; ++iy)
+				dswy[iy] = nv04_swizzle_bits(0, dst->y + iy, dst->z, dst->w, dst->h, dst->d);
+		}
+
+		if(!src->pitch)
+		{
+			sswx = alloca(w * sizeof(int));
+			for(int ix = 0; ix < w; ++ix)
+				sswx[ix] = nv04_swizzle_bits(src->x + ix, 0, 0, src->w, src->h, src->d);
+			sswy = alloca(h * sizeof(int));
+			for(int iy = 0; iy < h; ++iy)
+				sswy[iy] = nv04_swizzle_bits(0, src->y + iy, src->z, src->w, src->h, src->d);
+		}
+
+		dir = 1;
+		/* do backwards copies for overlapping swizzled surfaces */
+		if(dst->pitch == src->pitch && dst->offset == src->offset)
+		{
+			if(dst->y > src->y || (dst->y == src->y && dst->x > src->x))
+				dir = -1;
+		}
+
+#define SWIZZLED_COPY_LOOPS
+		if(dir == 1)
+		{
+			int dir = 1;
+#define LOOP_Y for(int iy = 0; iy < h; ++iy)
+#define LOOP_X for(int ix = 0; ix < w; ++ix)
+#include "nv04_2d_loops.h"
+#undef LOOP_X
+#undef LOOP_Y
+		}
+		else
+		{
+			int dir = -1;
+#define LOOP_Y for(int iy = h - 1; iy >= 0; --iy)
+#define LOOP_X for(int ix = w - 1; ix >= 0; --ix)
+#include "nv04_2d_loops.h"
+#undef LOOP_X
+#undef LOOP_Y
+		}
+#undef SWIZZLED_COPY_LOOP /* NOTE(review): the macro defined above is SWIZZLED_COPY_LOOPS, so this #undef has no effect - confirm which spelling is intended */
+	}
+
+	if(src->bo != dst->bo)
+		nouveau_bo_unmap(src->bo);
+	nouveau_bo_unmap(dst->bo);
+}
+
+/* Fill a w x h rectangle of dst with value on the CPU: dst->bo is mapped for writing, filled (linear or swizzled layout), then unmapped.
+ * TODO: if the destination is swizzled, we are doing random writes, which causes write combining to fail;
+ * the alternative is to read, modify and copy back, which may or may not be faster.
+ * Loading 3D textures is a common case that hits this and could probably benefit from the temporary. */
+void
+nv04_region_fill_cpu(struct nv04_region* dst, int w, int h, unsigned value)
+{
+	uint8_t* mdst = (nouveau_bo_map(dst->bo, NOUVEAU_BO_WR), (uint8_t*)dst->bo->map + dst->offset);
+
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "\tRGN_FILL_CPU ");
+	nv04_region_print(dst);
+	fprintf(stderr, "\n");
+#endif
+
+	nv04_region_assert(dst, w, h);
+
+	if(dst->pitch)
+	{
+		unsigned size = w << dst->bpps; /* bytes per filled row */
+
+#define FILL(T) do { \
+			for(int iy = 0; iy < h; ++iy) \
+			{ \
+				assert((char*)((T*)mdst + w) <= (char*)dst->bo->map + dst->bo->size); \
+				for(int ix = 0; ix < w; ++ix) \
+					((T*)mdst)[ix] = (T)value; \
+				mdst += dst->pitch; \
+			} \
+		} while(0)
+
+		mdst += dst->y * dst->pitch + (dst->x << dst->bpps);
+
+		if(dst->bpps == 0)
+		{
+ms:
+			assert(mdst + size * h <= (uint8_t*)dst->bo->map + dst->bo->size);
+			if(size == dst->pitch) /* rows are back to back: one memset covers them all */
+				memset(mdst, (uint8_t)value, size * h);
+			else
+			{
+				for(int iy = 0; iy < h; ++iy)
+				{
+					assert(mdst + size <= (uint8_t*)dst->bo->map + dst->bo->size);
+					memset(mdst, (uint8_t)value, size);
+					mdst += dst->pitch;
+				}
+			}
+		}
+		else if(dst->bpps == 1)
+		{
+			if(!((uint8_t)value ^ (uint8_t)(value >> 8))) /* both bytes equal: memset is enough */
+				goto ms;
+
+			FILL(uint16_t);
+		}
+		else if(dst->bpps == 2)
+		{
+			if(value == (uint8_t)value * 0x1010101) /* all four bytes equal: memset is enough */
+				goto ms;
+			FILL(uint32_t);
+		}
+		else
+			assert(0);
+#undef FILL
+	}
+	else
+	{
+		int* dswx;
+		int* dswy;
+		/* the per-column (dswx) and per-row (dswy) offsets are added below, so z must be contributed by only one of them */
+		dswx = alloca(w * sizeof(int));
+		for(int ix = 0; ix < w; ++ix)
+			dswx[ix] = nv04_swizzle_bits(dst->x + ix, 0, 0, dst->w, dst->h, dst->d);
+		dswy = alloca(h * sizeof(int));
+		for(int iy = 0; iy < h; ++iy)
+			dswy[iy] = nv04_swizzle_bits(0, dst->y + iy, dst->z, dst->w, dst->h, dst->d);
+
+#define FILL(T) do { \
+			T tvalue = (T)value; \
+			for(int iy = 0; iy < h; ++iy) \
+			{ \
+				T* pdst = (T*)mdst + dswy[iy]; \
+				for(int ix = 0; ix < w; ++ix) \
+				{ \
+					assert((uint8_t*)&pdst[dswx[ix] + 1] <= (uint8_t*)dst->bo->map + dst->bo->size); \
+					pdst[dswx[ix]] = tvalue; \
+				} \
+			} \
+		} while(0)
+
+		if(dst->bpps == 0)
+			FILL(uint8_t);
+		else if(dst->bpps == 1)
+			FILL(uint16_t);
+		else if(dst->bpps == 2)
+			FILL(uint32_t);
+		else
+			assert(0 && "unhandled bpp");
+#undef FILL
+	}
+
+	nouveau_bo_unmap(dst->bo);
+}
+/* Copy a w x h rectangle from linear src into swizzled dst via the swizzled-surface and SIFM objects, in chunks of at most 1024x1024 texels. */
+static void
+nv04_region_copy_swizzle(struct nv04_2d_context *ctx,
+			  struct nv04_region* dst,
+			  struct nv04_region* src,
+			  int w, int h, int cs2d_format, int sifm_format)
+{
+	struct nouveau_channel *chan = ctx->swzsurf->channel;
+	struct nouveau_grobj *swzsurf = ctx->swzsurf;
+	struct nouveau_grobj *sifm = ctx->sifm;
+	/* Max width & height may not be the same on all HW, but must be POT */
+	unsigned max_shift = 10;
+	unsigned cw = 1 << max_shift;
+	unsigned ch = 1 << max_shift;
+	unsigned sx = dst->x >> max_shift; /* first chunk column touched by the rectangle */
+	unsigned sy = dst->y >> max_shift; /* first chunk row touched by the rectangle */
+	unsigned ex = (dst->x + w - 1) >> max_shift; /* last chunk column (inclusive) */
+	unsigned ey = (dst->y + h - 1) >> max_shift; /* last chunk row (inclusive) */
+	unsigned chunks = (ex - sx + 1) * (ey - sy + 1);
+	unsigned chunk_size;
+	if(dst->w < cw)
+		cw = dst->w;
+	if(dst->h < ch)
+		ch = dst->h;
+	chunk_size = cw * ch << dst->bpps; /* bytes covered by one chunk */
+
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "\tRGN_COPY_SWIZZLE [%i, %i: %i] ", w, h, dst->bpps);
+	for(int i = 0; i < 2; ++i)
+	{
+		nv04_region_print(i ? src : dst);
+		fprintf(stderr, i ? "\n" : " <- ");
+	}
+#endif
+
+	nv04_region_assert(dst, w, h);
+	nv04_region_assert(src, w, h);
+	/* presumably ring words and relocations: 8 words / 2 relocs of setup below, then 17 words / 2 relocs per chunk */
+	MARK_RING (chan, 8 + chunks * 17, 2 + chunks * 2);
+
+	BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_DMA_IMAGE, 1);
+	OUT_RELOCo(chan, dst->bo,
+			NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+
+	BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_FORMAT, 1);
+	OUT_RING  (chan, cs2d_format |
+			 log2i(cw) << NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_U_SHIFT |
+			 log2i(ch) << NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_V_SHIFT);
+
+	BEGIN_RING(chan, sifm, NV03_SCALED_IMAGE_FROM_MEMORY_DMA_IMAGE, 1);
+	OUT_RELOCo(chan, src->bo,
+			 NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
+	BEGIN_RING(chan, sifm, NV04_SCALED_IMAGE_FROM_MEMORY_SURFACE, 1);
+	OUT_RING  (chan, swzsurf->handle);
+
+	assert(!(dst->offset & 63));
+	/* walk the destination chunk by chunk; (rx, ry, rw, rh) is the part of the rectangle that falls inside the current chunk */
+	for (int cy = sy; cy <= ey; ++cy) {
+	  int ry = MAX2(0, (int)(dst->y - ch * cy));
+	  int rh = MIN2((int)ch, (int)(dst->y - ch * cy + h)) - ry;
+	  for (int cx = sx; cx <= ex; ++cx) {
+	    int rx = MAX2(0, (int)(dst->x - cw * cx));
+	    int rw = MIN2((int)cw, (int)(dst->x - cw * cx + w)) - rx;
+	    unsigned dst_offset;
+	    unsigned src_offset;
+
+	    BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_OFFSET, 1);
+
+	    dst_offset = dst->offset + (nv04_swizzle_bits_2d(cx * cw, cy * ch, dst->w, dst->h) << dst->bpps);
+	    assert(dst_offset <= dst->bo->size);
+	    assert(dst_offset + chunk_size <= dst->bo->size);
+	    OUT_RELOCl(chan, dst->bo, dst_offset,
+			    NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+
+	    BEGIN_RING(chan, sifm, NV05_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION, 9);
+	    OUT_RING  (chan, NV05_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION_TRUNCATE);
+	    OUT_RING  (chan, sifm_format);
+	    OUT_RING  (chan, NV03_SCALED_IMAGE_FROM_MEMORY_OPERATION_SRCCOPY);
+	    OUT_RING  (chan, rx | (ry << NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT_Y_SHIFT));
+	    OUT_RING  (chan, rh << NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE_H_SHIFT | rw);
+	    OUT_RING  (chan, rx | (ry << NV03_SCALED_IMAGE_FROM_MEMORY_OUT_POINT_Y_SHIFT));
+	    OUT_RING  (chan, rh << NV03_SCALED_IMAGE_FROM_MEMORY_OUT_SIZE_H_SHIFT | rw);
+	    OUT_RING  (chan, 1 << 20);
+	    OUT_RING  (chan, 1 << 20);
+
+	    BEGIN_RING(chan, sifm, NV03_SCALED_IMAGE_FROM_MEMORY_SIZE, 4);
+	    OUT_RING  (chan, rh << NV03_SCALED_IMAGE_FROM_MEMORY_SIZE_H_SHIFT | align(rw, 8));
+	    OUT_RING  (chan, src->pitch |
+			     NV03_SCALED_IMAGE_FROM_MEMORY_FORMAT_ORIGIN_CENTER |
+			     NV03_SCALED_IMAGE_FROM_MEMORY_FORMAT_FILTER_POINT_SAMPLE);
+	    src_offset = src->offset + (cy * ch + ry + src->y - dst->y) * src->pitch + ((cx * cw + rx + src->x - dst->x) << src->bpps);
+	    assert(src_offset <= src->bo->size);
+	    assert(src_offset + (src->pitch * (rh - 1)) + (rw << src->bpps) <= src->bo->size);
+	    OUT_RELOCl(chan, src->bo, src_offset,
+			     NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
+	    OUT_RING  (chan, 0);
+	  }
+	}
+}
+/* Reserve ring space for `commands` M2MF transfers and emit srcbo/dstbo as the two values of the DMA_BUFFER_IN method. */
+static inline void
+nv04_copy_m2mf_begin(struct nv04_2d_context *ctx, struct nouveau_bo* dstbo, struct nouveau_bo* srcbo, unsigned commands)
+{
+	struct nouveau_channel *chan = ctx->m2mf->channel;
+	struct nouveau_grobj *m2mf = ctx->m2mf;
+	MARK_RING (chan, 3 + commands * 9, 2 + commands * 2); /* 3 words here; each nv04_copy_m2mf_body emits 9 words and 2 relocs */
+	BEGIN_RING(chan, m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_DMA_BUFFER_IN, 2);
+	OUT_RELOCo(chan, srcbo,
+		   NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
+	OUT_RELOCo(chan, dstbo,
+		   NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+}
+/* Queue one M2MF transfer of `lines` rows of `size` bytes each, then advance *pdstoff and *psrcoff by pitch * lines. */
+static inline void
+nv04_copy_m2mf_body(struct nv04_2d_context *ctx, struct nouveau_bo* dstbo, int* pdstoff, unsigned dstpitch, struct nouveau_bo* srcbo, int* psrcoff, unsigned srcpitch, unsigned size, unsigned lines)
+{
+	struct nouveau_channel *chan = ctx->m2mf->channel;
+	struct nouveau_grobj *m2mf = ctx->m2mf;
+
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "\t\t\tCOPY_M2MF_BODY [%i, %i] <%i[%u]> lin %u <- <%i[%u]> lin %u\n", size, lines, dstbo->handle, *pdstoff, dstpitch, srcbo->handle, *psrcoff, srcpitch);
+#endif
+	/* eight values starting at OFFSET_IN: source offset, destination offset, both pitches, row size, row count, 0x0101, 0 */
+	BEGIN_RING(chan, m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN, 8);
+	OUT_RELOCl(chan, srcbo, *psrcoff,
+		   NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+	OUT_RELOCl(chan, dstbo, *pdstoff,
+		   NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_WR);
+	OUT_RING  (chan, srcpitch);
+	OUT_RING  (chan, dstpitch);
+	OUT_RING  (chan, size);
+	OUT_RING  (chan, lines);
+	OUT_RING  (chan, 0x0101);
+	OUT_RING  (chan, 0);
+	/* leave the caller's offsets pointing just past the rows queued above */
+	*psrcoff += srcpitch * lines;
+	*pdstoff += dstpitch * lines;
+}
+/* Copy h rows of `size` bytes between two pitched buffers with M2MF, splitting the work so no command exceeds the pitch or line limits. */
+static void
+nv04_copy_m2mf(struct nv04_2d_context *ctx,
+		struct nouveau_bo* dstbo, int dstoff, unsigned dstpitch,
+		struct nouveau_bo* srcbo, int srcoff, unsigned srcpitch,
+		unsigned size, unsigned h)
+{
+	const unsigned pitch_limit = 32767;
+	const unsigned line_limit = 2047;
+
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "\t\tCOPY_M2MF [%i, %i] <%i[%i]> lin %u <- <%i[%i]> lin %u\n", size, h, dstbo->handle, dstoff, dstpitch, srcbo->handle, srcoff, srcpitch);
+#endif
+
+	if(srcpitch > pitch_limit || dstpitch > pitch_limit)
+	{
+		/* a pitch is too large: copy each row on its own, as pitch_limit-byte pseudo-lines plus a shorter tail */
+		unsigned row_lines = size / pitch_limit;
+		unsigned tail_bytes = size % pitch_limit;
+		unsigned batches = row_lines / line_limit;
+		unsigned tail_lines = row_lines % line_limit;
+		unsigned per_row = batches + (tail_lines ? 1 : 0) + (tail_bytes ? 1 : 0);
+
+		nv04_copy_m2mf_begin(ctx, dstbo, srcbo, h * per_row);
+
+		for(unsigned row = 0; row < h; ++row)
+		{
+			for(unsigned b = 0; b < batches; ++b)
+				nv04_copy_m2mf_body(ctx, dstbo, &dstoff, pitch_limit, srcbo, &srcoff, pitch_limit, pitch_limit, line_limit);
+			if(tail_lines)
+				nv04_copy_m2mf_body(ctx, dstbo, &dstoff, pitch_limit, srcbo, &srcoff, pitch_limit, pitch_limit, tail_lines);
+			if(tail_bytes)
+				nv04_copy_m2mf_body(ctx, dstbo, &dstoff, tail_bytes, srcbo, &srcoff, tail_bytes, tail_bytes, 1);
+
+			/* the bodies advanced the offsets by `size`; skip the rest of the pitch to reach the next row */
+			srcoff += srcpitch - size;
+			dstoff += dstpitch - size;
+		}
+	}
+	else
+	{
+		/* both pitches fit: copy whole rows, at most line_limit of them per command */
+		unsigned batches = h / line_limit;
+		unsigned tail_lines = h % line_limit;
+
+		nv04_copy_m2mf_begin(ctx, dstbo, srcbo, batches + (tail_lines ? 1 : 0));
+
+		for(unsigned b = 0; b < batches; ++b)
+			nv04_copy_m2mf_body(ctx, dstbo, &dstoff, dstpitch, srcbo, &srcoff, srcpitch, size, line_limit);
+
+		if(tail_lines)
+			nv04_copy_m2mf_body(ctx, dstbo, &dstoff, dstpitch, srcbo, &srcoff, srcpitch, size, tail_lines);
+	}
+}
+/* Copy `size` bytes from srcbo + srcoff to dstbo + dstoff with M2MF, expressed as a single row whose pitch equals its size. */
+void
+nv04_memcpy(struct nv04_2d_context *ctx, struct nouveau_bo* dstbo, int dstoff, struct nouveau_bo* srcbo, int srcoff, unsigned size)
+{
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "\tMEMCPY [%i] <%i[%i]> <- <%i[%i]>\n", size, dstbo->handle, dstoff, srcbo->handle, srcoff);
+#endif
+
+	nv04_copy_m2mf(ctx, dstbo, dstoff, size, srcbo, srcoff, size, size, 1);
+}
+/* Copy a w x h rectangle between two linear regions with M2MF; both regions must have a non-zero pitch. */
+static void
+nv04_region_copy_m2mf(struct nv04_2d_context *ctx, struct nv04_region *dst, struct nv04_region *src, int w, int h)
+{
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "\tRGN_COPY_M2MF [%i, %i: %i] ", w, h, dst->bpps);
+	for(int i = 0; i < 2; ++i)
+	{
+		nv04_region_print(i ? src : dst);
+		fprintf(stderr, i ? "\n" : " <- ");
+	}
+#endif
+
+	nv04_region_assert(dst, w, h);
+	nv04_region_assert(src, w, h);
+	assert(src->pitch);
+	assert(dst->pitch);
+	/* fold each x/y origin into a byte offset and copy w << bpps bytes per row */
+	nv04_copy_m2mf(ctx,
+			dst->bo, dst->offset + dst->y * dst->pitch + (dst->x << dst->bpps), dst->pitch,
+			src->bo, src->offset + src->y * src->pitch + (src->x << src->bpps), src->pitch,
+			w << src->bpps, h);
+}
+/* Copy a w x h rectangle between two linear regions with the blit object; both pitches must be non-zero multiples of 64. */
+static inline void
+nv04_region_copy_blit(struct nv04_2d_context *ctx, struct nv04_region* dst, struct nv04_region* src, int w, int h, int format)
+{
+	struct nouveau_channel *chan = ctx->surf2d->channel;
+	struct nouveau_grobj *surf2d = ctx->surf2d;
+	struct nouveau_grobj *blit = ctx->blit;
+
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "\tRGN_COPY_BLIT [%i, %i: %i] ", w, h, dst->bpps);
+	for(int i = 0; i < 2; ++i)
+	{
+		nv04_region_print(i ? src : dst);
+		fprintf(stderr, i ? "\n" : " <- ");
+	}
+#endif
+
+	assert(!(src->pitch & 63) && src->pitch);
+	assert(!(dst->pitch & 63) && dst->pitch);
+	nv04_region_assert(dst, w, h);
+	nv04_region_assert(src, w, h);
+	/* describe both surfaces to the 2D surface object: buffers, format, pitches (destination in the high half) and offsets */
+	MARK_RING (chan, 12, 4);
+	BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2);
+	OUT_RELOCo(chan, src->bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
+	OUT_RELOCo(chan, dst->bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_FORMAT, 4);
+	OUT_RING  (chan, format);
+	OUT_RING  (chan, (dst->pitch << 16) | src->pitch);
+	OUT_RELOCl(chan, src->bo, src->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
+	OUT_RELOCl(chan, dst->bo, dst->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	/* blit parameters: source point, destination point, size, each packed with y (or h) in the high 16 bits */
+	BEGIN_RING(chan, blit, 0x0300, 3);
+	OUT_RING  (chan, (src->y << 16) | src->x);
+	OUT_RING  (chan, (dst->y << 16) | dst->x);
+	OUT_RING  (chan, ( h << 16) |  w);
+}
+
+/* THEOREM: a non-linearizable swizzled destination is always 64-byte aligned, except for 4x2 mipmap levels of swizzled 1bpp surfaces
+ * HYPOTHESIS:
+ * 1. The first mipmap level is 64-byte-aligned
+ * PROOF:
+ * 1. Thus, all mipmap levels whose parent is 64 bytes or more in size are 64-byte aligned too.
+ * 2. At 1bpp, the smallest levels with a <= 32-byte parent are either Nx1 or 1xN or of size <= 8, thus 4x2, 2x2 or 2x4
+ * 3. Nx1, 1xN, 2x4 and 2x2 have all their subrects linearizable. 4x2 does not.
+ * 4. At 2/4bpp or more, the smallest levels with a 32-byte parent are 1xN, Nx1 or 2x2
+ *
+ * However, nv04_region_align handles that.
+ */
+
+// Copy a w x h rectangle from src to dst, picking swizzle upload, blit or M2MF: 0 -> done, 1 -> do with 3D engine or CPU, -1 -> do with CPU
+// dst and src may be modified, and the possibly modified version should be passed to nv04_region_cpu if necessary
+int
+nv04_region_copy_2d(struct nv04_2d_context *ctx, struct nv04_region* dst, struct nv04_region* src,
+		int w, int h, int cs2d_format, int sifm_format, int dst_to_gpu, int src_on_gpu)
+{
+	assert(src->bpps == dst->bpps);
+
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "RGN_COPY%s [%i, %i: %i] ", (cs2d_format >= 0) ? "_2D" : "_NO2D", w, h, dst->bpps);
+	for(int i = 0; i < 2; ++i)
+	{
+		int gpu = i ? src_on_gpu : dst_to_gpu;
+		nv04_region_print(i ? src : dst);
+		fprintf(stderr, " %s", gpu ? "gpu" : "cpu");
+		fprintf(stderr, i ? "\n" : " <- ");
+	}
+#endif
+
+	// if they are contiguous and either both swizzled or both linear, reshape
+	if(!dst->pitch == !src->pitch
+		&& nv04_region_is_contiguous(dst, w, h)
+		&& nv04_region_is_contiguous(src, w, h))
+	{
+		nv04_region_contiguous_shape(dst, &w, &h, 6);
+		nv04_region_linearize_contiguous(dst, w, h);
+		nv04_region_linearize_contiguous(src, w, h);
+	}
+
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "\tOPT ");
+	for(int i = 0; i < 2; ++i)
+	{
+		nv04_region_print(i ? src : dst);
+		fprintf(stderr, i ? "\n" : " <- ");
+	}
+#endif
+
+	/* if the destination is not for GPU _and_ source is on CPU, use CPU */
+	/* if the destination is not for GPU _or_ source is on CPU, use CPU only if we think it's faster than the GPU */
+	/* TODO: benchmark to find out in which cases exactly we should prefer the CPU */
+	 if((!dst_to_gpu && !src_on_gpu)
+		|| (!dst->pitch && dst->d > 1)
+		/* 3D swizzled destination are unwritable by the GPU, and 2D swizzled ones are readable only by the 3D engine */
+	 )
+		 return -1;
+	/* there is no known way to read 2D/3D-swizzled surfaces with the 2D engine
+	 * ask the caller to use the 3D engine
+	 * If a format cannot be sampled from the 3D engine there is no point in making it swizzled, so we must not do so
+	 */
+	 else if(!src->pitch)
+	 {
+#ifdef NV04_REGION_DEBUG
+		fprintf(stderr, "\tCOPY_ENG3D\n");
+#endif
+		 return 1;
+	 }
+	/* Setup transfer to swizzle the texture to vram if needed */
+	else
+	{
+		if (!dst->pitch)
+		{
+			if(cs2d_format < 0 || sifm_format < 0 || !dst_to_gpu)
+			{
+#ifdef NV04_REGION_DEBUG
+				fprintf(stderr, "\tCOPY_ENG3D\n");
+#endif
+				return 1;
+			}
+			else
+			{
+				int misaligned = nv04_region_align(dst, w, h, 6); /* kept out of assert(): if it adjusts dst it must also run with NDEBUG */
+				assert(!misaligned);
+				(void)misaligned; /* unused once assert() compiles away */
+				nv04_region_copy_swizzle(ctx, dst, src, w, h, cs2d_format, sifm_format);
+				return 0;
+			}
+		}
+		else
+		{
+			/* NV_CONTEXT_SURFACES_2D has buffer alignment restrictions, fallback
+			 * to NV_MEMORY_TO_MEMORY_FORMAT in this case.
+			 * TODO: is this also true for the source? possibly not
+			 */
+			if ((cs2d_format < 0)
+				|| !dst_to_gpu
+				|| nv04_region_align(src, w, h, 6)
+				|| nv04_region_align(dst, w, h, 6)
+				)
+				nv04_region_copy_m2mf(ctx, dst, src, w, h);
+			else
+				nv04_region_copy_blit(ctx, dst, src, w, h, cs2d_format);
+
+			return 0;
+		}
+	}
+}
+/* Fill a w x h rectangle of a linear region with value using the GDI rectangle object; dst->pitch must be a non-zero multiple of 64. */
+static inline void
+nv04_region_fill_gdirect(struct nv04_2d_context *ctx, struct nv04_region* dst, int w, int h, unsigned value)
+{
+	struct nouveau_channel *chan = ctx->surf2d->channel;
+	struct nouveau_grobj *surf2d = ctx->surf2d;
+	struct nouveau_grobj *rect = ctx->rect;
+	int cs2d_format, gdirect_format;
+
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "\tFILL_GDIRECT\n");
+#endif
+
+	assert(!(dst->pitch & 63) && dst->pitch);
+	nv04_region_assert(dst, w, h);
+	/* pick the rectangle colour format and the surface format from the pixel size shift */
+	if(dst->bpps == 0)
+	{
+		gdirect_format = NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A8R8G8B8;
+		cs2d_format = NV04_CONTEXT_SURFACES_2D_FORMAT_Y8;
+	}
+	else if(dst->bpps == 1)
+	{
+		gdirect_format = NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A16R5G6B5;
+		cs2d_format = NV04_CONTEXT_SURFACES_2D_FORMAT_Y16;
+	}
+	else if(dst->bpps == 2)
+	{
+		gdirect_format = NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A8R8G8B8;
+		cs2d_format = NV04_CONTEXT_SURFACES_2D_FORMAT_Y32;
+	}
+	else
+	{
+		assert(0);
+		gdirect_format = 0;
+		cs2d_format = 0;
+	}
+
+	MARK_RING (chan, 15, 4);
+	BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2);
+	OUT_RELOCo(chan, dst->bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); /* dst is emitted for both surface slots */
+	OUT_RELOCo(chan, dst->bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_FORMAT, 4);
+	OUT_RING  (chan, cs2d_format);
+	OUT_RING  (chan, (dst->pitch << 16) | dst->pitch);
+	OUT_RELOCl(chan, dst->bo, dst->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	OUT_RELOCl(chan, dst->bo, dst->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	/* colour format, fill value, then the rectangle as a point (x high, y low) and a size (w high, h low) */
+	BEGIN_RING(chan, rect, NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT, 1);
+	OUT_RING  (chan, gdirect_format);
+	BEGIN_RING(chan, rect, NV04_GDI_RECTANGLE_TEXT_COLOR1_A, 1);
+	OUT_RING  (chan, value);
+	BEGIN_RING(chan, rect, NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT(0), 2);
+	OUT_RING  (chan, (dst->x << 16) | dst->y);
+	OUT_RING  (chan, ( w << 16) |  h);
+}
+/* Fill a w x h rectangle of dst with value via the 2D engine: returns 0 when done (or empty), 1 for a swizzled dst (debug tag FILL_ENG3D), -1 otherwise. */
+int
+nv04_region_fill_2d(struct nv04_2d_context *ctx, struct nv04_region *dst,
+		  int w, int h, unsigned value)
+{
+	if(w == 0 || h == 0)
+		return 0;
+
+#ifdef NV04_REGION_DEBUG
+	fprintf(stderr, "FILL [%i, %i: %i] ", w, h, dst->bpps);
+	nv04_region_print(dst);
+	fprintf(stderr, " <- 0x%x\n", value);
+#endif
+
+	if(nv04_region_is_contiguous(dst, w, h))
+	{
+		nv04_region_contiguous_shape(dst, &w, &h, 6);
+		nv04_region_linearize_contiguous(dst, w, h);
+	}
+
+	// TODO: maybe do intermediate copies for some cases instead of using the 3D engine/CPU
+	/* GdiRect doesn't work together with swzsurf, so the 3D engine, or an intermediate copy, is the only option here */
+	if(dst->pitch == 0)
+	{
+#ifdef NV04_REGION_DEBUG
+		fprintf(stderr, "\tFILL_ENG3D\n");
+#endif
+		return 1;
+	}
+
+	/* a linear destination that nv04_region_align rejects is not handled here */
+	if(nv04_region_align(dst, w, h, 6) != 0)
+		return -1;
+
+	nv04_region_fill_gdirect(ctx, dst, w, h, value);
+	return 0;
+}
+
+/* Destroy a 2D context: free its notifier and each of its six graphics objects, then the context structure itself. */
+void
+nv04_2d_context_takedown(struct nv04_2d_context *ctx)
+{
+	nouveau_notifier_free(&ctx->ntfy);
+	nouveau_grobj_free(&ctx->m2mf);
+	nouveau_grobj_free(&ctx->surf2d);
+	nouveau_grobj_free(&ctx->swzsurf);
+	nouveau_grobj_free(&ctx->rect);
+	nouveau_grobj_free(&ctx->blit);
+	nouveau_grobj_free(&ctx->sifm);
+	/* ctx comes from calloc() in nv04_2d_context_init */
+	free(ctx);
+}
+/* Create a 2D context on chan (notifier, M2MF, 2D surfaces, blit, GDI rectangle, swizzled surface, SIFM); returns NULL after takedown on failure. */
+struct nv04_2d_context *
+nv04_2d_context_init(struct nouveau_channel* chan)
+{
+	struct nv04_2d_context *ctx = calloc(1, sizeof(struct nv04_2d_context));
+	unsigned handle = 0x88000000, class;
+	int ret;
+
+	if (!ctx)
+		return NULL;
+
+	ret = nouveau_notifier_alloc(chan, handle++, 1, &ctx->ntfy);
+	if (ret) {
+		nv04_2d_context_takedown(ctx);
+		return NULL;
+	}
+	/* memory-to-memory format object */
+	ret = nouveau_grobj_alloc(chan, handle++, 0x0039, &ctx->m2mf);
+	if (ret) {
+		nv04_2d_context_takedown(ctx);
+		return NULL;
+	}
+
+	BEGIN_RING(chan, ctx->m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_DMA_NOTIFY, 1);
+	OUT_RING  (chan, ctx->ntfy->handle);
+	/* 2D surfaces object: class depends on the chipset generation */
+	if (chan->device->chipset < 0x10)
+		class = NV04_CONTEXT_SURFACES_2D;
+	else
+		class = NV10_CONTEXT_SURFACES_2D;
+
+	ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->surf2d);
+	if (ret) {
+		nv04_2d_context_takedown(ctx);
+		return NULL;
+	}
+
+	BEGIN_RING(chan, ctx->surf2d,
+			 NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2);
+	OUT_RING  (chan, chan->vram->handle);
+	OUT_RING  (chan, chan->vram->handle);
+	/* image blit object, drawing onto surf2d */
+	if (chan->device->chipset < 0x10)
+		class = NV04_IMAGE_BLIT;
+	else
+		class = NV12_IMAGE_BLIT;
+
+	ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->blit);
+	if (ret) {
+		nv04_2d_context_takedown(ctx);
+		return NULL;
+	}
+
+	BEGIN_RING(chan, ctx->blit, NV01_IMAGE_BLIT_DMA_NOTIFY, 1);
+	OUT_RING  (chan, ctx->ntfy->handle);
+	BEGIN_RING(chan, ctx->blit, NV04_IMAGE_BLIT_SURFACE, 1);
+	OUT_RING  (chan, ctx->surf2d->handle);
+	BEGIN_RING(chan, ctx->blit, NV01_IMAGE_BLIT_OPERATION, 1);
+	OUT_RING  (chan, NV01_IMAGE_BLIT_OPERATION_SRCCOPY);
+	/* GDI rectangle object, drawing onto surf2d */
+	ret = nouveau_grobj_alloc(chan, handle++, NV04_GDI_RECTANGLE_TEXT,
+				  &ctx->rect);
+	if (ret) {
+		nv04_2d_context_takedown(ctx);
+		return NULL;
+	}
+
+	BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_DMA_NOTIFY, 1);
+	OUT_RING  (chan, ctx->ntfy->handle);
+	BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_SURFACE, 1);
+	OUT_RING  (chan, ctx->surf2d->handle);
+	BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_OPERATION, 1);
+	OUT_RING  (chan, NV04_GDI_RECTANGLE_TEXT_OPERATION_SRCCOPY);
+	BEGIN_RING(chan, ctx->rect,
+			 NV04_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT, 1);
+	OUT_RING  (chan, NV04_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT_LE);
+	/* swizzled surface object: class chosen from the chipset family */
+	switch (chan->device->chipset & 0xf0) {
+	case 0x00:
+	case 0x10:
+		class = NV04_SWIZZLED_SURFACE;
+		break;
+	case 0x20:
+		class = NV20_SWIZZLED_SURFACE;
+		break;
+	case 0x30:
+		class = NV30_SWIZZLED_SURFACE;
+		break;
+	case 0x40:
+	case 0x60:
+		class = NV40_SWIZZLED_SURFACE;
+		break;
+	default:
+		assert(0); /* Famous last words: this really can't happen.. */
+		nv04_2d_context_takedown(ctx); /* with NDEBUG, fail cleanly instead of reusing the blit class still held in `class` */
+		return NULL;
+	}
+
+	ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->swzsurf);
+	if (ret) {
+		nv04_2d_context_takedown(ctx);
+		return NULL;
+	}
+
+	/* all the Gallium MARK_RING calculations assume no autobinding, so do that now */
+	if(ctx->swzsurf->bound == NOUVEAU_GROBJ_UNBOUND)
+		nouveau_grobj_autobind(ctx->swzsurf);
+	/* scaled image from memory (SIFM) object; unknown families fall back to the NV04 class */
+	switch (chan->device->chipset & 0xf0) {
+	case 0x10:
+	case 0x20:
+		class = NV10_SCALED_IMAGE_FROM_MEMORY;
+		break;
+	case 0x30:
+		class = NV30_SCALED_IMAGE_FROM_MEMORY;
+		break;
+	case 0x40:
+	case 0x60:
+		class = NV40_SCALED_IMAGE_FROM_MEMORY;
+		break;
+	default:
+		class = NV04_SCALED_IMAGE_FROM_MEMORY;
+		break;
+	}
+
+	ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->sifm);
+	if (ret) {
+		nv04_2d_context_takedown(ctx);
+		return NULL;
+	}
+
+	/* all the Gallium MARK_RING calculations assume no autobinding, so do that now */
+	if(ctx->sifm->bound == NOUVEAU_GROBJ_UNBOUND)
+		nouveau_grobj_autobind(ctx->sifm);
+
+	return ctx;
+}
diff --git a/src/gallium/drivers/nvfx/nv04_2d.h b/src/gallium/drivers/nvfx/nv04_2d.h
new file mode 100644
index 0000000000..e638b8c874
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nv04_2d.h
@@ -0,0 +1,87 @@
+/**************************************************************************
+ *
+ * Copyright 2009 Ben Skeggs
+ * Copyright 2009 Younes Manton
+ * Copyright 2010 Luca Barbieri
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+/* this code has no Mesa or Gallium dependency and can be reused in the classic Mesa driver or DDX */
+
+#ifndef __NV04_2D_H__
+#define __NV04_2D_H__
+
+struct nv04_2d_context;
+struct nouveau_channel;
+struct nouveau_bo;
+
+// Rectangle origin inside a surface stored in a buffer object. NOTE: all functions taking this as a parameter will CLOBBER it (except for ->bo)
+struct nv04_region {
+	struct nouveau_bo* bo; // buffer object holding the surface
+	int offset; // byte offset of the surface from the start of bo
+	unsigned pitch; // 0 -> swizzled, otherwise bytes per row
+	unsigned bpps; // bpp shift (0, 1, 2; 3, 4 for fp/compressed)
+	unsigned x, y, z; // origin of the rectangle, in pixels (x is shifted by bpps to get bytes)
+	unsigned w, h, d; // dimensions handed to the swizzle helpers (presumably those of the whole surface)
+};
+
+void
+nv04_memcpy(struct nv04_2d_context *ctx,
+		struct nouveau_bo* dstbo, int dstoff,
+		struct nouveau_bo* srcbo, int srcoff,
+		unsigned size);
+
+unsigned
+nv04_region_begin(struct nv04_region* rgn, unsigned w, unsigned h);
+
+unsigned
+nv04_region_end(struct nv04_region* rgn, unsigned w, unsigned h);
+
+void
+nv04_2d_context_takedown(struct nv04_2d_context *pctx);
+
+struct nv04_2d_context *
+nv04_2d_context_init(struct nouveau_channel* chan);
+
+void
+nv04_region_copy_cpu(struct nv04_region* dst, struct nv04_region* src, int w, int h);
+
+void
+nv04_region_fill_cpu(struct nv04_region* dst, int w, int h, unsigned value);
+
+int
+nv04_region_copy_2d(struct nv04_2d_context *ctx,
+		struct nv04_region* dst, struct nv04_region* src,
+		int w, int h,
+		int cs2d_format, int sifm_format,
+		int dst_to_gpu, int src_on_gpu);
+
+int
+nv04_region_fill_2d(struct nv04_2d_context *ctx,
+		struct nv04_region *dst,
+                int w, int h,
+                unsigned value);
+
+#endif
diff --git a/src/gallium/drivers/nvfx/nv04_2d_loops.h b/src/gallium/drivers/nvfx/nv04_2d_loops.h
new file mode 100644
index 0000000000..3a6787c071
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nv04_2d_loops.h
@@ -0,0 +1,70 @@
+#ifndef T /* outer inclusion: select the element type T from dst->bpps and include this file again once per type */
+{
+	if(dst->bpps == 0)
+#define T uint8_t
+#include "nv04_2d_loops.h"
+#undef T
+	else if(dst->bpps == 1)
+#define T uint16_t
+#include "nv04_2d_loops.h"
+#undef T
+	else if(dst->bpps == 2)
+#define T uint32_t
+#include "nv04_2d_loops.h"
+#undef T
+	else
+		assert(0);
+}
+#else /* inner inclusion: T is defined, emit the copy loops for that element type */
+#ifdef SWIZZLED_COPY_LOOPS /* expects dst, src, mdst, msrc, dswx/dswy, sswx/sswy, h, dir, LOOP_X and LOOP_Y from the including code */
+{
+	if(!dst->pitch) /* swizzled destination */
+	{
+		if(!src->pitch) /* swizzled source: both sides go through their offset tables */
+		{
+			LOOP_Y
+			{
+				T* pdst = (T*)mdst + dswy[iy];
+				T* psrc = (T*)msrc + sswy[iy];
+				LOOP_X
+				{
+					assert((char*)&psrc[sswx[ix] + 1] <= ((char*)src->bo->map + src->bo->size));
+					assert((char*)&pdst[dswx[ix] + 1] <= ((char*)dst->bo->map + dst->bo->size));
+					pdst[dswx[ix]] = psrc[sswx[ix]];
+				}
+			}
+		}
+		else /* linear source: start on the first or last source row depending on dir */
+		{
+			T* psrc = (T*)(msrc + ((dir > 0) ? src->y : (src->y + h - 1)) * src->pitch) + src->x;
+			LOOP_Y
+			{
+				T* pdst = (T*)mdst + dswy[iy];
+				LOOP_X
+				{
+					assert((char*)&psrc[ix + 1] <= ((char*)src->bo->map + src->bo->size));
+					assert((char*)&pdst[dswx[ix] + 1] <= ((char*)dst->bo->map + dst->bo->size));
+					pdst[dswx[ix]] = psrc[ix];
+				}
+				psrc = (T*)((char*)psrc + dir * src->pitch); /* step one row in the copy direction */
+			}
+		}
+	}
+	else /* linear destination; the source is indexed through its offset tables */
+	{
+		T* pdst = (T*)(mdst + ((dir > 0) ? dst->y : (dst->y + h - 1)) * dst->pitch) + dst->x;
+		LOOP_Y
+		{
+			T* psrc = (T*)msrc + sswy[iy];
+			LOOP_X
+			{
+				assert((char*)&psrc[sswx[ix] + 1] <= ((char*)src->bo->map + src->bo->size));
+				assert((char*)&pdst[ix + 1] <= ((char*)dst->bo->map + dst->bo->size));
+				pdst[ix] = psrc[sswx[ix]];
+			}
+			pdst = (T*)((char*)pdst + dir * dst->pitch); /* step one row in the copy direction */
+		}
+	}
+}
+#endif /* SWIZZLED_COPY_LOOPS */
+#endif /* T */
diff --git a/src/gallium/drivers/nvfx/nv04_surface_2d.c b/src/gallium/drivers/nvfx/nv04_surface_2d.c
deleted file mode 100644
index 7acbb505df..0000000000
--- a/src/gallium/drivers/nvfx/nv04_surface_2d.c
+++ /dev/null
@@ -1,532 +0,0 @@
-#include "pipe/p_context.h"
-#include "pipe/p_format.h"
-#include "util/u_format.h"
-#include "util/u_math.h"
-#include "util/u_memory.h"
-
-#include "nouveau/nouveau_winsys.h"
-#include "nouveau/nouveau_util.h"
-#include "nouveau/nouveau_screen.h"
-#include "nv04_surface_2d.h"
-
-static INLINE int
-nv04_surface_format(enum pipe_format format)
-{
-	switch (format) {
-	case PIPE_FORMAT_A8_UNORM:
-	case PIPE_FORMAT_L8_UNORM:
-	case PIPE_FORMAT_I8_UNORM:
-		return NV04_CONTEXT_SURFACES_2D_FORMAT_Y8;
-	case PIPE_FORMAT_R16_SNORM:
-	case PIPE_FORMAT_B5G6R5_UNORM:
-	case PIPE_FORMAT_Z16_UNORM:
-	case PIPE_FORMAT_L8A8_UNORM:
-		return NV04_CONTEXT_SURFACES_2D_FORMAT_R5G6B5;
-	case PIPE_FORMAT_B8G8R8X8_UNORM:
-	case PIPE_FORMAT_B8G8R8A8_UNORM:
-		return NV04_CONTEXT_SURFACES_2D_FORMAT_A8R8G8B8;
-	case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
-	case PIPE_FORMAT_X8Z24_UNORM:
-		return NV04_CONTEXT_SURFACES_2D_FORMAT_Y32;
-	default:
-		return -1;
-	}
-}
-
-static INLINE int
-nv04_rect_format(enum pipe_format format)
-{
-	switch (format) {
-	case PIPE_FORMAT_A8_UNORM:
-		return NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A8R8G8B8;
-	case PIPE_FORMAT_B5G6R5_UNORM:
-	case PIPE_FORMAT_L8A8_UNORM:
-	case PIPE_FORMAT_Z16_UNORM:
-		return NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A16R5G6B5;
-	case PIPE_FORMAT_B8G8R8X8_UNORM:
-	case PIPE_FORMAT_B8G8R8A8_UNORM:
-	case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
-	case PIPE_FORMAT_X8Z24_UNORM:
-		return NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A8R8G8B8;
-	default:
-		return -1;
-	}
-}
-
-static INLINE int
-nv04_scaled_image_format(enum pipe_format format)
-{
-	switch (format) {
-	case PIPE_FORMAT_A8_UNORM:
-	case PIPE_FORMAT_L8_UNORM:
-	case PIPE_FORMAT_I8_UNORM:
-		return NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_Y8;
-	case PIPE_FORMAT_B5G5R5A1_UNORM:
-		return NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_A1R5G5B5;
-	case PIPE_FORMAT_B8G8R8A8_UNORM:
-		return NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_A8R8G8B8;
-	case PIPE_FORMAT_B8G8R8X8_UNORM:
-		return NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_X8R8G8B8;
-	case PIPE_FORMAT_B5G6R5_UNORM:
-	case PIPE_FORMAT_R16_SNORM:
-	case PIPE_FORMAT_L8A8_UNORM:
-		return NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_R5G6B5;
-	default:
-		return -1;
-	}
-}
-
-static INLINE unsigned
-nv04_swizzle_bits_square(unsigned x, unsigned y)
-{
-	unsigned u = (x & 0x001) << 0 |
-	             (x & 0x002) << 1 |
-	             (x & 0x004) << 2 |
-	             (x & 0x008) << 3 |
-	             (x & 0x010) << 4 |
-	             (x & 0x020) << 5 |
-	             (x & 0x040) << 6 |
-	             (x & 0x080) << 7 |
-	             (x & 0x100) << 8 |
-	             (x & 0x200) << 9 |
-	             (x & 0x400) << 10 |
-	             (x & 0x800) << 11;
-
-	unsigned v = (y & 0x001) << 1 |
-	             (y & 0x002) << 2 |
-	             (y & 0x004) << 3 |
-	             (y & 0x008) << 4 |
-	             (y & 0x010) << 5 |
-	             (y & 0x020) << 6 |
-	             (y & 0x040) << 7 |
-	             (y & 0x080) << 8 |
-	             (y & 0x100) << 9 |
-	             (y & 0x200) << 10 |
-	             (y & 0x400) << 11 |
-	             (y & 0x800) << 12;
-	return v | u;
-}
-
-/* rectangular swizzled textures are linear concatenations of swizzled square tiles */
-static INLINE unsigned
-nv04_swizzle_bits(unsigned x, unsigned y, unsigned w, unsigned h)
-{
-	unsigned s = MIN2(w, h);
-	unsigned m = s - 1;
-	return (((x | y) & ~m) * s) | nv04_swizzle_bits_square(x & m, y & m);
-}
-
-static int
-nv04_surface_copy_swizzle(struct nv04_surface_2d *ctx,
-			  struct pipe_surface *dst, int dx, int dy,
-			  struct pipe_surface *src, int sx, int sy,
-			  int w, int h)
-{
-	struct nouveau_channel *chan = ctx->swzsurf->channel;
-	struct nouveau_grobj *swzsurf = ctx->swzsurf;
-	struct nouveau_grobj *sifm = ctx->sifm;
-	struct nouveau_bo *src_bo = ctx->buf(src);
-	struct nouveau_bo *dst_bo = ctx->buf(dst);
-	const unsigned src_pitch = ((struct nv04_surface *)src)->pitch;
-        /* Max width & height may not be the same on all HW, but must be POT */
-	const unsigned max_w = 1024;
-	const unsigned max_h = 1024;
-	unsigned sub_w = w > max_w ? max_w : w;
-	unsigned sub_h = h > max_h ? max_h : h;
-	unsigned x;
-	unsigned y;
-
-        /* Swizzled surfaces must be POT  */
-	assert(util_is_pot(dst->width) && util_is_pot(dst->height));
-
-        /* If area is too large to copy in one shot we must copy it in POT chunks to meet alignment requirements */
-	assert(sub_w == w || util_is_pot(sub_w));
-	assert(sub_h == h || util_is_pot(sub_h));
-
-	MARK_RING (chan, 8 + ((w+sub_w)/sub_w)*((h+sub_h)/sub_h)*17, 2 +
-			 ((w+sub_w)/sub_w)*((h+sub_h)/sub_h)*2);
-
-	BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_DMA_IMAGE, 1);
-	OUT_RELOCo(chan, dst_bo,
-	                 NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-
-	BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_FORMAT, 1);
-	OUT_RING  (chan, nv04_surface_format(dst->format) |
-	                 log2i(dst->width) << NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_U_SHIFT |
-	                 log2i(dst->height) << NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_V_SHIFT);
-
-	BEGIN_RING(chan, sifm, NV03_SCALED_IMAGE_FROM_MEMORY_DMA_IMAGE, 1);
-	OUT_RELOCo(chan, src_bo,
-	                 NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
-	BEGIN_RING(chan, sifm, NV04_SCALED_IMAGE_FROM_MEMORY_SURFACE, 1);
-	OUT_RING  (chan, swzsurf->handle);
-
-	for (y = 0; y < h; y += sub_h) {
-	  sub_h = MIN2(sub_h, h - y);
-
-	  for (x = 0; x < w; x += sub_w) {
-	    sub_w = MIN2(sub_w, w - x);
-
-	    assert(!(dst->offset & 63));
-
-	    BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_OFFSET, 1);
-	    OUT_RELOCl(chan, dst_bo, dst->offset,
-                             NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-
-	    BEGIN_RING(chan, sifm, NV05_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION, 9);
-	    OUT_RING  (chan, NV05_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION_TRUNCATE);
-	    OUT_RING  (chan, nv04_scaled_image_format(src->format));
-	    OUT_RING  (chan, NV03_SCALED_IMAGE_FROM_MEMORY_OPERATION_SRCCOPY);
-	    OUT_RING  (chan, (x + dx) | ((y + dy) << NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT_Y_SHIFT));
-	    OUT_RING  (chan, sub_h << NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE_H_SHIFT | sub_w);
-	    OUT_RING  (chan, (x + dx) | ((y + dy) << NV03_SCALED_IMAGE_FROM_MEMORY_OUT_POINT_Y_SHIFT));
-	    OUT_RING  (chan, sub_h << NV03_SCALED_IMAGE_FROM_MEMORY_OUT_SIZE_H_SHIFT | sub_w);
-	    OUT_RING  (chan, 1 << 20);
-	    OUT_RING  (chan, 1 << 20);
-
-	    BEGIN_RING(chan, sifm, NV03_SCALED_IMAGE_FROM_MEMORY_SIZE, 4);
-	    OUT_RING  (chan, sub_h << NV03_SCALED_IMAGE_FROM_MEMORY_SIZE_H_SHIFT | sub_w);
-	    OUT_RING  (chan, src_pitch |
-			     NV03_SCALED_IMAGE_FROM_MEMORY_FORMAT_ORIGIN_CENTER |
-			     NV03_SCALED_IMAGE_FROM_MEMORY_FORMAT_FILTER_POINT_SAMPLE);
-	    OUT_RELOCl(chan, src_bo, src->offset + (sy+y) * src_pitch + (sx+x) * util_format_get_blocksize(src->texture->format),
-                             NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
-	    OUT_RING  (chan, 0);
-	  }
-	}
-
-	return 0;
-}
-
-static int
-nv04_surface_copy_m2mf(struct nv04_surface_2d *ctx,
-		       struct pipe_surface *dst, int dx, int dy,
-		       struct pipe_surface *src, int sx, int sy, int w, int h)
-{
-	struct nouveau_channel *chan = ctx->m2mf->channel;
-	struct nouveau_grobj *m2mf = ctx->m2mf;
-	struct nouveau_bo *src_bo = ctx->buf(src);
-	struct nouveau_bo *dst_bo = ctx->buf(dst);
-	unsigned src_pitch = ((struct nv04_surface *)src)->pitch;
-	unsigned dst_pitch = ((struct nv04_surface *)dst)->pitch;
-	unsigned dst_offset = dst->offset + dy * dst_pitch +
-	                      dx * util_format_get_blocksize(dst->texture->format);
-	unsigned src_offset = src->offset + sy * src_pitch +
-	                      sx * util_format_get_blocksize(src->texture->format);
-
-	MARK_RING (chan, 3 + ((h / 2047) + 1) * 9, 2 + ((h / 2047) + 1) * 2);
-	BEGIN_RING(chan, m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_DMA_BUFFER_IN, 2);
-	OUT_RELOCo(chan, src_bo,
-		   NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
-	OUT_RELOCo(chan, dst_bo,
-		   NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-
-	while (h) {
-		int count = (h > 2047) ? 2047 : h;
-
-		BEGIN_RING(chan, m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN, 8);
-		OUT_RELOCl(chan, src_bo, src_offset,
-			   NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
-		OUT_RELOCl(chan, dst_bo, dst_offset,
-			   NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_WR);
-		OUT_RING  (chan, src_pitch);
-		OUT_RING  (chan, dst_pitch);
-		OUT_RING  (chan, w * util_format_get_blocksize(src->texture->format));
-		OUT_RING  (chan, count);
-		OUT_RING  (chan, 0x0101);
-		OUT_RING  (chan, 0);
-
-		h -= count;
-		src_offset += src_pitch * count;
-		dst_offset += dst_pitch * count;
-	}
-
-	return 0;
-}
-
-static int
-nv04_surface_copy_blit(struct nv04_surface_2d *ctx, struct pipe_surface *dst,
-		       int dx, int dy, struct pipe_surface *src, int sx, int sy,
-		       int w, int h)
-{
-	struct nouveau_channel *chan = ctx->surf2d->channel;
-	struct nouveau_grobj *surf2d = ctx->surf2d;
-	struct nouveau_grobj *blit = ctx->blit;
-	struct nouveau_bo *src_bo = ctx->buf(src);
-	struct nouveau_bo *dst_bo = ctx->buf(dst);
-	unsigned src_pitch = ((struct nv04_surface *)src)->pitch;
-	unsigned dst_pitch = ((struct nv04_surface *)dst)->pitch;
-	int format;
-
-	format = nv04_surface_format(dst->format);
-	if (format < 0)
-		return 1;
-
-	MARK_RING (chan, 12, 4);
-	BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2);
-	OUT_RELOCo(chan, src_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
-	OUT_RELOCo(chan, dst_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-	BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_FORMAT, 4);
-	OUT_RING  (chan, format);
-	OUT_RING  (chan, (dst_pitch << 16) | src_pitch);
-	OUT_RELOCl(chan, src_bo, src->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
-	OUT_RELOCl(chan, dst_bo, dst->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-
-	BEGIN_RING(chan, blit, 0x0300, 3);
-	OUT_RING  (chan, (sy << 16) | sx);
-	OUT_RING  (chan, (dy << 16) | dx);
-	OUT_RING  (chan, ( h << 16) |  w);
-
-	return 0;
-}
-
-static void
-nv04_surface_copy(struct nv04_surface_2d *ctx, struct pipe_surface *dst,
-		  int dx, int dy, struct pipe_surface *src, int sx, int sy,
-		  int w, int h)
-{
-	int src_linear = src->texture->flags & NVFX_RESOURCE_FLAG_LINEAR;
-	int dst_linear = dst->texture->flags & NVFX_RESOURCE_FLAG_LINEAR;
-
-	assert(src->format == dst->format);
-
-	/* Setup transfer to swizzle the texture to vram if needed */
-        if (src_linear && !dst_linear && w > 1 && h > 1) {
-           nv04_surface_copy_swizzle(ctx, dst, dx, dy, src, sx, sy, w, h);
-           return;
-        }
-
-        /* Use M2MF instead of the blitter since it always works
-         * Any possible performance drop is likely to be not very significant
-         * and dwarfed anyway by the current buffer management problems
-         */
-        nv04_surface_copy_m2mf(ctx, dst, dx, dy, src, sx, sy, w, h);
-}
-
-static void
-nv04_surface_fill(struct nv04_surface_2d *ctx, struct pipe_surface *dst,
-		  int dx, int dy, int w, int h, unsigned value)
-{
-	struct nouveau_channel *chan = ctx->surf2d->channel;
-	struct nouveau_grobj *surf2d = ctx->surf2d;
-	struct nouveau_grobj *rect = ctx->rect;
-	struct nouveau_bo *dst_bo = ctx->buf(dst);
-	unsigned dst_pitch = ((struct nv04_surface *)dst)->pitch;
-	int cs2d_format, gdirect_format;
-
-	cs2d_format = nv04_surface_format(dst->format);
-	assert(cs2d_format >= 0);
-
-	gdirect_format = nv04_rect_format(dst->format);
-	assert(gdirect_format >= 0);
-
-	MARK_RING (chan, 16, 4);
-	BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2);
-	OUT_RELOCo(chan, dst_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-	OUT_RELOCo(chan, dst_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-	BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_FORMAT, 4);
-	OUT_RING  (chan, cs2d_format);
-	OUT_RING  (chan, (dst_pitch << 16) | dst_pitch);
-	OUT_RELOCl(chan, dst_bo, dst->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-	OUT_RELOCl(chan, dst_bo, dst->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-
-	BEGIN_RING(chan, rect, NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT, 1);
-	OUT_RING  (chan, gdirect_format);
-	BEGIN_RING(chan, rect, NV04_GDI_RECTANGLE_TEXT_COLOR1_A, 1);
-	OUT_RING  (chan, value);
-	BEGIN_RING(chan, rect,
-		   NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT(0), 2);
-	OUT_RING  (chan, (dx << 16) | dy);
-	OUT_RING  (chan, ( w << 16) |  h);
-}
-
-void
-nv04_surface_2d_takedown(struct nv04_surface_2d **pctx)
-{
-	struct nv04_surface_2d *ctx;
-
-	if (!pctx || !*pctx)
-		return;
-	ctx = *pctx;
-	*pctx = NULL;
-
-	nouveau_notifier_free(&ctx->ntfy);
-	nouveau_grobj_free(&ctx->m2mf);
-	nouveau_grobj_free(&ctx->surf2d);
-	nouveau_grobj_free(&ctx->swzsurf);
-	nouveau_grobj_free(&ctx->rect);
-	nouveau_grobj_free(&ctx->blit);
-	nouveau_grobj_free(&ctx->sifm);
-
-	FREE(ctx);
-}
-
-struct nv04_surface_2d *
-nv04_surface_2d_init(struct nouveau_screen *screen)
-{
-	struct nv04_surface_2d *ctx = CALLOC_STRUCT(nv04_surface_2d);
-	struct nouveau_channel *chan = screen->channel;
-	unsigned handle = 0x88000000, class;
-	int ret;
-
-	if (!ctx)
-		return NULL;
-
-	ret = nouveau_notifier_alloc(chan, handle++, 1, &ctx->ntfy);
-	if (ret) {
-		nv04_surface_2d_takedown(&ctx);
-		return NULL;
-	}
-
-	ret = nouveau_grobj_alloc(chan, handle++, 0x0039, &ctx->m2mf);
-	if (ret) {
-		nv04_surface_2d_takedown(&ctx);
-		return NULL;
-	}
-
-	BEGIN_RING(chan, ctx->m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_DMA_NOTIFY, 1);
-	OUT_RING  (chan, ctx->ntfy->handle);
-
-	if (chan->device->chipset < 0x10)
-		class = NV04_CONTEXT_SURFACES_2D;
-	else
-		class = NV10_CONTEXT_SURFACES_2D;
-
-	ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->surf2d);
-	if (ret) {
-		nv04_surface_2d_takedown(&ctx);
-		return NULL;
-	}
-
-	BEGIN_RING(chan, ctx->surf2d,
-			 NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2);
-	OUT_RING  (chan, chan->vram->handle);
-	OUT_RING  (chan, chan->vram->handle);
-
-	if (chan->device->chipset < 0x10)
-		class = NV04_IMAGE_BLIT;
-	else
-		class = NV12_IMAGE_BLIT;
-
-	ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->blit);
-	if (ret) {
-		nv04_surface_2d_takedown(&ctx);
-		return NULL;
-	}
-
-	BEGIN_RING(chan, ctx->blit, NV01_IMAGE_BLIT_DMA_NOTIFY, 1);
-	OUT_RING  (chan, ctx->ntfy->handle);
-	BEGIN_RING(chan, ctx->blit, NV04_IMAGE_BLIT_SURFACE, 1);
-	OUT_RING  (chan, ctx->surf2d->handle);
-	BEGIN_RING(chan, ctx->blit, NV01_IMAGE_BLIT_OPERATION, 1);
-	OUT_RING  (chan, NV01_IMAGE_BLIT_OPERATION_SRCCOPY);
-
-	ret = nouveau_grobj_alloc(chan, handle++, NV04_GDI_RECTANGLE_TEXT,
-				  &ctx->rect);
-	if (ret) {
-		nv04_surface_2d_takedown(&ctx);
-		return NULL;
-	}
-
-	BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_DMA_NOTIFY, 1);
-	OUT_RING  (chan, ctx->ntfy->handle);
-	BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_SURFACE, 1);
-	OUT_RING  (chan, ctx->surf2d->handle);
-	BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_OPERATION, 1);
-	OUT_RING  (chan, NV04_GDI_RECTANGLE_TEXT_OPERATION_SRCCOPY);
-	BEGIN_RING(chan, ctx->rect,
-			 NV04_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT, 1);
-	OUT_RING  (chan, NV04_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT_LE);
-
-	switch (chan->device->chipset & 0xf0) {
-	case 0x00:
-	case 0x10:
-		class = NV04_SWIZZLED_SURFACE;
-		break;
-	case 0x20:
-		class = NV20_SWIZZLED_SURFACE;
-		break;
-	case 0x30:
-		class = NV30_SWIZZLED_SURFACE;
-		break;
-	case 0x40:
-	case 0x60:
-		class = NV40_SWIZZLED_SURFACE;
-		break;
-	default:
-		/* Famous last words: this really can't happen.. */
-		assert(0);
-		break;
-	}
-
-	ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->swzsurf);
-	if (ret) {
-		nv04_surface_2d_takedown(&ctx);
-		return NULL;
-	}
-
-	switch (chan->device->chipset & 0xf0) {
-	case 0x10:
-	case 0x20:
-		class = NV10_SCALED_IMAGE_FROM_MEMORY;
-		break;
-	case 0x30:
-		class = NV30_SCALED_IMAGE_FROM_MEMORY;
-		break;
-	case 0x40:
-	case 0x60:
-		class = NV40_SCALED_IMAGE_FROM_MEMORY;
-		break;
-	default:
-		class = NV04_SCALED_IMAGE_FROM_MEMORY;
-		break;
-	}
-
-	ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->sifm);
-	if (ret) {
-		nv04_surface_2d_takedown(&ctx);
-		return NULL;
-	}
-
-	ctx->copy = nv04_surface_copy;
-	ctx->fill = nv04_surface_fill;
-	return ctx;
-}
-
-struct nv04_surface*
-nv04_surface_wrap_for_render(struct pipe_screen *pscreen,
-			     struct nv04_surface_2d* eng2d, struct nv04_surface* ns)
-{
-	struct pipe_resource templ;
-	struct pipe_resource* temp_tex;
-	struct nv04_surface* temp_ns;
-	int temp_flags;
-
-	temp_flags = ns->base.usage;
-
-	ns->base.usage = 0;
-
-	memset(&templ, 0, sizeof(templ));
-	templ.format = ns->base.texture->format;
-	templ.target = PIPE_TEXTURE_2D;
-	templ.width0 = ns->base.width;
-	templ.height0 = ns->base.height;
-	templ.depth0 = 1;
-	templ.last_level = 0;
-
-	// TODO: this is probably wrong and we should specifically handle multisampling somehow once it is implemented
-	templ.nr_samples = ns->base.texture->nr_samples;
-
-	templ.bind = ns->base.texture->bind | PIPE_BIND_RENDER_TARGET;
-
-	temp_tex = pscreen->resource_create(pscreen, &templ);
-	temp_ns = (struct nv04_surface*)pscreen->get_tex_surface(pscreen, temp_tex, 0, 0, 0, temp_flags);
-	temp_ns->backing = ns;
-
-	if(1) /* hmm */
-		eng2d->copy(eng2d, &temp_ns->backing->base,
-			    0, 0, &ns->base,
-			    0, 0, ns->base.width, ns->base.height);
-
-	return temp_ns;
-}
diff --git a/src/gallium/drivers/nvfx/nv04_surface_2d.h b/src/gallium/drivers/nvfx/nv04_surface_2d.h
deleted file mode 100644
index 2123c3ed08..0000000000
--- a/src/gallium/drivers/nvfx/nv04_surface_2d.h
+++ /dev/null
@@ -1,43 +0,0 @@
-#ifndef __NV04_SURFACE_2D_H__
-#define __NV04_SURFACE_2D_H__
-
-#include "pipe/p_state.h"
-
-struct nouveau_screen;
-
-struct nv04_surface {
-	struct pipe_surface base;
-	unsigned pitch;
-	struct nv04_surface* backing;
-};
-
-struct nv04_surface_2d {
-	struct nouveau_notifier *ntfy;
-	struct nouveau_grobj *surf2d;
-	struct nouveau_grobj *swzsurf;
-	struct nouveau_grobj *m2mf;
-	struct nouveau_grobj *rect;
-	struct nouveau_grobj *blit;
-	struct nouveau_grobj *sifm;
-
-	struct nouveau_bo *(*buf)(struct pipe_surface *);
-
-	void (*copy)(struct nv04_surface_2d *, struct pipe_surface *dst,
-		     int dx, int dy, struct pipe_surface *src, int sx, int sy,
-		     int w, int h);
-	void (*fill)(struct nv04_surface_2d *, struct pipe_surface *dst,
-		     int dx, int dy, int w, int h, unsigned value);
-};
-
-struct nv04_surface_2d *
-nv04_surface_2d_init(struct nouveau_screen *screen);
-
-void
-nv04_surface_2d_takedown(struct nv04_surface_2d **);
-
-struct nv04_surface*
-nv04_surface_wrap_for_render(struct pipe_screen *pscreen, struct nv04_surface_2d* eng2d, struct nv04_surface* ns);
-
-#define NVFX_RESOURCE_FLAG_LINEAR (PIPE_RESOURCE_FLAG_DRV_PRIV << 0)
-
-#endif
diff --git a/src/gallium/drivers/nvfx/nv30_fragtex.c b/src/gallium/drivers/nvfx/nv30_fragtex.c
index dec073ac90..0c3d43fd57 100644
--- a/src/gallium/drivers/nvfx/nv30_fragtex.c
+++ b/src/gallium/drivers/nvfx/nv30_fragtex.c
@@ -1,7 +1,6 @@
 #include "util/u_format.h"
 
 #include "nvfx_context.h"
-#include "nouveau/nouveau_util.h"
 #include "nvfx_tex.h"
 #include "nvfx_resource.h"
 
@@ -10,138 +9,109 @@ nv30_sampler_state_init(struct pipe_context *pipe,
 			  struct nvfx_sampler_state *ps,
 			  const struct pipe_sampler_state *cso)
 {
-	if (cso->max_anisotropy >= 8) {
-		ps->en |= NV34TCL_TX_ENABLE_ANISO_8X;
-	} else
-	if (cso->max_anisotropy >= 4) {
-		ps->en |= NV34TCL_TX_ENABLE_ANISO_4X;
-	} else
-	if (cso->max_anisotropy >= 2) {
-		ps->en |= NV34TCL_TX_ENABLE_ANISO_2X;
-	}
+	float limit; /* LOD bias after clamping to the range the hardware field can hold */
 
+	if (cso->max_anisotropy >= 2) /* anisotropy requested: pick the largest ratio (8x/4x/2x) not above the request */
 	{
-		float limit;
+		if (cso->max_anisotropy >= 8)
+			ps->en |= NV34TCL_TX_ENABLE_ANISO_8X;
+		else if (cso->max_anisotropy >= 4)
+			ps->en |= NV34TCL_TX_ENABLE_ANISO_4X;
+		else if (cso->max_anisotropy >= 2)
+			ps->en |= NV34TCL_TX_ENABLE_ANISO_2X;
+	}
 
-		limit = CLAMP(cso->lod_bias, -16.0, 15.0);
-		ps->filt |= (int)(cso->lod_bias * 256.0) & 0x1fff;
+	limit = CLAMP(cso->lod_bias, -16.0, 15.0 + (255.0 / 256.0)); /* bounds of a 13-bit value scaled by 256 */
+	ps->filt |= (int)(limit * 256.0) & 0x1fff; /* use the clamped bias: the raw cso->lod_bias could wrap inside the 13-bit field */
 
-		limit = CLAMP(cso->max_lod, 0.0, 15.0);
-		ps->en |= (int)(limit) << 14 /*NV34TCL_TX_ENABLE_MIPMAP_MAX_LOD_SHIFT*/;
+	ps->max_lod = (int)CLAMP(cso->max_lod, 0.0, 15.0); /* whole-level limits kept in the sampler state; nv30_fragtex_set packs them per sampler view */
+	ps->min_lod = (int)CLAMP(cso->min_lod, 0.0, 15.0);
 
-		limit = CLAMP(cso->min_lod, 0.0, 15.0);
-		ps->en |= (int)(limit) << 26 /*NV34TCL_TX_ENABLE_MIPMAP_MIN_LOD_SHIFT*/;
-	}
+	ps->en |= NV34TCL_TX_ENABLE_ENABLE; /* enable bit now lives in ps->en instead of being OR'd in at emit time */
 }
 
-#define _(m,tf,ts0x,ts0y,ts0z,ts0w,ts1x,ts1y,ts1z,ts1w)                        \
-{                                                                              \
-  TRUE,                                                                        \
-  PIPE_FORMAT_##m,                                                             \
-  NV34TCL_TX_FORMAT_FORMAT_##tf,                                               \
-  (NV34TCL_TX_SWIZZLE_S0_X_##ts0x | NV34TCL_TX_SWIZZLE_S0_Y_##ts0y |           \
-   NV34TCL_TX_SWIZZLE_S0_Z_##ts0z | NV34TCL_TX_SWIZZLE_S0_W_##ts0w |           \
-   NV34TCL_TX_SWIZZLE_S1_X_##ts1x | NV34TCL_TX_SWIZZLE_S1_Y_##ts1y |           \
-   NV34TCL_TX_SWIZZLE_S1_Z_##ts1z | NV34TCL_TX_SWIZZLE_S1_W_##ts1w)            \
-}
-
-struct nv30_texture_format {
-	boolean defined;
-	uint	pipe;
-	int     format;
-	int     swizzle;
-};
-
-static struct nv30_texture_format
-nv30_texture_formats[] = {
-	_(B8G8R8X8_UNORM, A8R8G8B8,   S1,   S1,   S1,  ONE, X, Y, Z, W),
-	_(B8G8R8A8_UNORM, A8R8G8B8,   S1,   S1,   S1,   S1, X, Y, Z, W),
-	_(B5G5R5A1_UNORM, A1R5G5B5,   S1,   S1,   S1,   S1, X, Y, Z, W),
-	_(B4G4R4A4_UNORM, A4R4G4B4,   S1,   S1,   S1,   S1, X, Y, Z, W),
-	_(B5G6R5_UNORM  , R5G6B5  ,   S1,   S1,   S1,  ONE, X, Y, Z, W),
-	_(L8_UNORM      , L8      ,   S1,   S1,   S1,  ONE, X, X, X, X),
-	_(A8_UNORM      , L8      , ZERO, ZERO, ZERO,   S1, X, X, X, X),
-	_(I8_UNORM      , L8      ,   S1,   S1,   S1,   S1, X, X, X, X),
-	_(L8A8_UNORM    , A8L8    ,   S1,   S1,   S1,   S1, X, X, X, Y),
-	_(Z16_UNORM     , R5G6B5  ,   S1,   S1,   S1,  ONE, X, X, X, X),
-	_(S8_USCALED_Z24_UNORM   , A8R8G8B8,   S1,   S1,   S1,  ONE, X, X, X, X),
-	_(DXT1_RGB      , DXT1    ,   S1,   S1,   S1,  ONE, X, Y, Z, W),
-	_(DXT1_RGBA     , DXT1    ,   S1,   S1,   S1,   S1, X, Y, Z, W),
-	_(DXT3_RGBA     , DXT3    ,   S1,   S1,   S1,   S1, X, Y, Z, W),
-	_(DXT5_RGBA     , DXT5    ,   S1,   S1,   S1,   S1, X, Y, Z, W),
-	{},
-};
-
-static struct nv30_texture_format *
-nv30_fragtex_format(uint pipe_format)
+void
+nv30_sampler_view_init(struct pipe_context *pipe,
+			  struct nvfx_sampler_view *sv) /* fills sv->u.nv30.*, sv->swizzle, sv->lod_offset and sv->max_lod_limit from the view's texture and format */
 {
-	struct nv30_texture_format *tf = nv30_texture_formats;
-
-	while (tf->defined) {
-		if (tf->pipe == pipe_format)
-			return tf;
-		tf++;
-	}
-
-	NOUVEAU_ERR("unknown texture format %s\n", util_format_name(pipe_format));
-	return NULL;
+	struct pipe_resource* pt = sv->base.texture;
+	struct nvfx_texture_format *tf = &nvfx_texture_formats[sv->base.format]; /* table entry indexed directly by the view's pipe format */
+	unsigned txf;
+	unsigned level = pt->target == PIPE_TEXTURE_CUBE ? 0 : sv->base.first_level; /* base level for the size fields: always 0 for cube maps, else the view's first level */
+
+	assert(tf->fmt[0] >= 0); /* NOTE(review): a negative fmt[0] presumably marks an unsupported format -- confirm in nvfx_texture_formats */
+
+	txf = sv->u.init_fmt; /* start from the format bits already stored in the view */
+	txf |= (level != sv->base.last_level ? NV34TCL_TX_FORMAT_MIPMAP : 0); /* mipmapping only when the view spans more than the base level */
+	txf |= util_logbase2(u_minify(pt->width0, level)) << NV34TCL_TX_FORMAT_BASE_SIZE_U_SHIFT; /* log2 of the minified width, height and depth */
+	txf |= util_logbase2(u_minify(pt->height0, level)) << NV34TCL_TX_FORMAT_BASE_SIZE_V_SHIFT;
+	txf |= util_logbase2(u_minify(pt->depth0, level)) << NV34TCL_TX_FORMAT_BASE_SIZE_W_SHIFT;
+	txf |=  0x10000; /* unexplained magic bit; the code this replaces OR'd in the same 0x10000 */
+
+	sv->u.nv30.fmt[0] = tf->fmt[0] | txf; /* four precomputed variants; nv30_fragtex_set indexes them with ps->compare + (use_rect ? 2 : 0) */
+	sv->u.nv30.fmt[1] = tf->fmt[1] | txf;
+	sv->u.nv30.fmt[2] = tf->fmt[2] | txf;
+	sv->u.nv30.fmt[3] = tf->fmt[3] | txf;
+
+	sv->swizzle  |= (nvfx_subresource_pitch(pt, 0) << NV34TCL_TX_SWIZZLE_RECT_PITCH_SHIFT); /* nvfx_subresource_pitch(pt, 0) goes into the RECT_PITCH field */
+
+	if(pt->height0 <= 1 || util_format_is_compressed(sv->base.format)) /* height <= 1 or compressed: no layout is forced, nv30_fragtex_set follows the sampler */
+		sv->u.nv30.rect = -1;
+	else
+		sv->u.nv30.rect = !!(pt->flags & NVFX_RESOURCE_FLAG_LINEAR); /* 1 for resources flagged linear, 0 otherwise */
+
+	sv->lod_offset = sv->base.first_level - level; /* 0 unless the cube-map case above forced level to 0 */
+	sv->max_lod_limit = sv->base.last_level - level; /* highest level, relative to the base level, that the view exposes */
 }
 
-
 void
 nv30_fragtex_set(struct nvfx_context *nvfx, int unit)
 {
 	struct nvfx_sampler_state *ps = nvfx->tex_sampler[unit];
-	struct nvfx_miptree *nv30mt = (struct nvfx_miptree *)nvfx->fragment_sampler_views[unit]->texture;
-	struct pipe_resource *pt = &nv30mt->base.base;
-	struct nouveau_bo *bo = nv30mt->base.bo;
-	struct nv30_texture_format *tf;
+	struct nvfx_sampler_view* sv = (struct nvfx_sampler_view*)nvfx->fragment_sampler_views[unit]; /* bound view carries the state precomputed by nv30_sampler_view_init */
+	struct nouveau_bo *bo = ((struct nvfx_miptree *)sv->base.texture)->base.bo;
 	struct nouveau_channel* chan = nvfx->screen->base.channel;
-	uint32_t txf, txs;
+	unsigned txf;
 	unsigned tex_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD;
+	unsigned use_rect;
+	unsigned max_lod = MIN2(ps->max_lod + sv->lod_offset, sv->max_lod_limit); /* sampler limit shifted by the view's offset, capped at the view's last level */
+	unsigned min_lod = MIN2(ps->min_lod + sv->lod_offset, max_lod) ; /* shifted the same way, never above max_lod */
 
-	tf = nv30_fragtex_format(pt->format);
-	if (!tf)
-		return;
-
-	txf  = tf->format;
-	txf |= ((pt->last_level>0) ? NV34TCL_TX_FORMAT_MIPMAP : 0);
-	txf |= log2i(pt->width0) << NV34TCL_TX_FORMAT_BASE_SIZE_U_SHIFT;
-	txf |= log2i(pt->height0) << NV34TCL_TX_FORMAT_BASE_SIZE_V_SHIFT;
-	txf |= log2i(pt->depth0) << NV34TCL_TX_FORMAT_BASE_SIZE_W_SHIFT;
-	txf |= NV34TCL_TX_FORMAT_NO_BORDER | 0x10000;
-
-	switch (pt->target) {
-	case PIPE_TEXTURE_CUBE:
-		txf |= NV34TCL_TX_FORMAT_CUBIC;
-		/* fall-through */
-	case PIPE_TEXTURE_2D:
-		txf |= NV34TCL_TX_FORMAT_DIMS_2D;
-		break;
-	case PIPE_TEXTURE_3D:
-		txf |= NV34TCL_TX_FORMAT_DIMS_3D;
-		break;
-	case PIPE_TEXTURE_1D:
-		txf |= NV34TCL_TX_FORMAT_DIMS_1D;
-		break;
-	default:
-		NOUVEAU_ERR("Unknown target %d\n", pt->target);
-		return;
+	if(sv->u.nv30.rect < 0) /* the view forces no layout */
+	{
+		/* in the case of compressed or 1D textures, we can get away with this,
+		 * since the layout is the same
+		 */
+		use_rect = ps->fmt; /* follow the sampler; NOTE(review): ps->fmt is presumably nonzero for unnormalized coordinates -- confirm */
+	}
+	else
+	{
+		static boolean warned = FALSE; /* function-static, so the message below is printed at most once */
+		if( !!ps->fmt != sv->u.nv30.rect && !warned) {
+			warned = TRUE;
+			fprintf(stderr,
+					"Unimplemented: coordinate normalization mismatch. Possible reasons:\n"
+					"1. ARB_texture_non_power_of_two is being used despite the fact it isn't supported\n"
+					"2. The state tracker is not using the appropriate coordinate normalization\n"
+					"3. The state tracker is not supported\n");
+		}
+
+		use_rect  = sv->u.nv30.rect; /* the view's layout wins even when the sampler disagrees */
 	}
 
-	txs = tf->swizzle;
+	txf = sv->u.nv30.fmt[ps->compare + (use_rect ? 2 : 0)]; /* entries 2..3 are used for rect; assumes ps->compare is 0 or 1 -- TODO confirm */
 
 	MARK_RING(chan, 9, 2);
 	OUT_RING(chan, RING_3D(NV34TCL_TX_OFFSET(unit), 8));
-	OUT_RELOC(chan, bo, 0, tex_flags | NOUVEAU_BO_LOW, 0, 0);
-	OUT_RELOC(chan, bo, txf, tex_flags | NOUVEAU_BO_OR,
-		      NV34TCL_TX_FORMAT_DMA0, NV34TCL_TX_FORMAT_DMA1);
-	OUT_RING(chan, ps->wrap);
-	OUT_RING(chan, NV34TCL_TX_ENABLE_ENABLE | ps->en);
-	OUT_RING(chan, txs);
-	OUT_RING(chan, ps->filt | 0x2000 /*voodoo*/);
-	OUT_RING(chan, (pt->width0 << NV34TCL_TX_NPOT_SIZE_W_SHIFT) |
-		       pt->height0);
+	OUT_RELOC(chan, bo, sv->offset, tex_flags | NOUVEAU_BO_LOW, 0, 0); /* texture address now starts at the view's offset instead of 0 */
+	OUT_RELOC(chan, bo, txf,
+		tex_flags | NOUVEAU_BO_OR,
+		NV34TCL_TX_FORMAT_DMA0, NV34TCL_TX_FORMAT_DMA1);
+	OUT_RING(chan, (ps->wrap & sv->wrap_mask) | sv->wrap); /* sampler wrap bits filtered by the view's mask, then the view's own bits added */
+	OUT_RING(chan, ps->en | (min_lod << NV34TCL_TX_ENABLE_MIPMAP_MIN_LOD_SHIFT) | (max_lod << NV34TCL_TX_ENABLE_MIPMAP_MAX_LOD_SHIFT)); /* LOD range is packed per emit because it depends on the bound view */
+	OUT_RING(chan, sv->swizzle);
+	OUT_RING(chan, ps->filt | sv->filt);
+	OUT_RING(chan, sv->npot_size);
 	OUT_RING(chan, ps->bcol);
 
 	nvfx->hw_txf[unit] = txf;
diff --git a/src/gallium/drivers/nvfx/nv30_vertprog.h b/src/gallium/drivers/nvfx/nv30_vertprog.h
index ec0444c07f..9a68f5c1fb 100644
--- a/src/gallium/drivers/nvfx/nv30_vertprog.h
+++ b/src/gallium/drivers/nvfx/nv30_vertprog.h
@@ -68,7 +68,7 @@
 #define NV30_VP_INST_DEST_TEMP_ID_SHIFT        16
 #define NV30_VP_INST_DEST_TEMP_ID_MASK        (0x0F << 16)
 #define NV30_VP_INST_COND_UPDATE_ENABLE        (1<<15)
-#define NV30_VP_INST_VEC_DEST_TEMP_MASK      (0xF << 16)
+#define NV30_VP_INST_VEC_DEST_TEMP_MASK      (0x1F << 16)
 #define NV30_VP_INST_COND_TEST_ENABLE        (1<<14)
 #define NV30_VP_INST_COND_SHIFT          11
 #define NV30_VP_INST_COND_MASK          (0x07 << 11)
@@ -111,7 +111,7 @@
 #define NV30_VP_INST_SRC2H_SHIFT        0    /*NV20*/
 #define NV30_VP_INST_SRC2H_MASK          (0x7FF << 0)  /* NV30_VP_SRC2_HIGH_MASK >> 4*/
 #define NV30_VP_INST_IADDR_SHIFT        2
-#define NV30_VP_INST_IADDR_MASK          (0xF <<  28)   /* NV30_VP_SRC2_LOW_MASK << 28 */
+#define NV30_VP_INST_IADDR_MASK          (0x1FF <<  2)   /* 9-bit field starting at bit 2 (NV30_VP_INST_IADDR_SHIFT) */
 
 /* DWORD 3 */
 #define NV30_VP_INST_SRC2L_SHIFT        28    /*NV20*/
@@ -125,7 +125,7 @@
 #define NV30_VP_INST_VDEST_WRITEMASK_SHIFT      12    /*NV20*/
 #define NV30_VP_INST_VDEST_WRITEMASK_MASK      (0x0F << 12)  /*NV20*/
 #define NV30_VP_INST_DEST_SHIFT        2
-#define NV30_VP_INST_DEST_MASK        (0x0F <<  2)
+#define NV30_VP_INST_DEST_MASK        (0x1F <<  2)
 #  define NV30_VP_INST_DEST_POS  0
 #  define NV30_VP_INST_DEST_BFC0  1
 #  define NV30_VP_INST_DEST_BFC1  2
@@ -133,7 +133,8 @@
 #  define NV30_VP_INST_DEST_COL1  4
 #  define NV30_VP_INST_DEST_FOGC  5
 #  define NV30_VP_INST_DEST_PSZ   6
-#  define NV30_VP_INST_DEST_TC(n)  (8+n)
+#  define NV30_VP_INST_DEST_TC(n)  (8+(n)) /* n is parenthesised so an expression argument expands as a single operand */
+#  define NV30_VP_INST_DEST_CLP(n) (17 + (n)) /* NOTE(review): presumably clip-plane outputs; 17 + n has to fit the 5-bit NV30_VP_INST_DEST_MASK -- confirm the valid range of n */
 
 /* Useful to split the source selection regs into their pieces */
 #define NV30_VP_SRC0_HIGH_SHIFT                                                6
diff --git a/src/gallium/drivers/nvfx/nv40_fragtex.c b/src/gallium/drivers/nvfx/nv40_fragtex.c
index 0068b1ba54..106ce71a07 100644
--- a/src/gallium/drivers/nvfx/nv40_fragtex.c
+++ b/src/gallium/drivers/nvfx/nv40_fragtex.c
@@ -8,168 +8,97 @@ nv40_sampler_state_init(struct pipe_context *pipe,
 			  struct nvfx_sampler_state *ps,
 			  const struct pipe_sampler_state *cso)
 {
+	float limit;
 	if (cso->max_anisotropy >= 2) {
 		/* no idea, binary driver sets it, works without it.. meh.. */
 		ps->wrap |= (1 << 5);
 
-		if (cso->max_anisotropy >= 16) {
+		if (cso->max_anisotropy >= 16)
 			ps->en |= NV40TCL_TEX_ENABLE_ANISO_16X;
-		} else
-		if (cso->max_anisotropy >= 12) {
+		else if (cso->max_anisotropy >= 12)
 			ps->en |= NV40TCL_TEX_ENABLE_ANISO_12X;
-		} else
-		if (cso->max_anisotropy >= 10) {
+		else if (cso->max_anisotropy >= 10)
 			ps->en |= NV40TCL_TEX_ENABLE_ANISO_10X;
-		} else
-		if (cso->max_anisotropy >= 8) {
+		else if (cso->max_anisotropy >= 8)
 			ps->en |= NV40TCL_TEX_ENABLE_ANISO_8X;
-		} else
-		if (cso->max_anisotropy >= 6) {
+		else if (cso->max_anisotropy >= 6)
 			ps->en |= NV40TCL_TEX_ENABLE_ANISO_6X;
-		} else
-		if (cso->max_anisotropy >= 4) {
+		else if (cso->max_anisotropy >= 4)
 			ps->en |= NV40TCL_TEX_ENABLE_ANISO_4X;
-		} else {
+		else
 			ps->en |= NV40TCL_TEX_ENABLE_ANISO_2X;
-		}
 	}
 
-	{
-		float limit;
+	limit = CLAMP(cso->lod_bias, -16.0, 15.0 + (255.0 / 256.0));
+	ps->filt |= (int)(limit * 256.0) & 0x1fff; /* use the clamped bias so out-of-range values saturate instead of wrapping in the 13-bit field */
 
-		limit = CLAMP(cso->lod_bias, -16.0, 15.0);
-		ps->filt |= (int)(cso->lod_bias * 256.0) & 0x1fff;
+	ps->max_lod = (int)(CLAMP(cso->max_lod, 0.0, 15.0 + (255.0 / 256.0)) * 256.0);
+	ps->min_lod = (int)(CLAMP(cso->min_lod, 0.0, 15.0 + (255.0 / 256.0)) * 256.0);
 
-		limit = CLAMP(cso->max_lod, 0.0, 15.0);
-		ps->en |= (int)(limit * 256.0) << 7;
-
-		limit = CLAMP(cso->min_lod, 0.0, 15.0);
-		ps->en |= (int)(limit * 256.0) << 19;
-	}
-}
-
-#define _(m,tf,ts0x,ts0y,ts0z,ts0w,ts1x,ts1y,ts1z,ts1w,sx,sy,sz,sw)            \
-{                                                                              \
-  TRUE,                                                                        \
-  PIPE_FORMAT_##m,                                                             \
-  NV40TCL_TEX_FORMAT_FORMAT_##tf,                                              \
-  (NV34TCL_TX_SWIZZLE_S0_X_##ts0x | NV34TCL_TX_SWIZZLE_S0_Y_##ts0y |         \
-   NV34TCL_TX_SWIZZLE_S0_Z_##ts0z | NV34TCL_TX_SWIZZLE_S0_W_##ts0w |         \
-   NV34TCL_TX_SWIZZLE_S1_X_##ts1x | NV34TCL_TX_SWIZZLE_S1_Y_##ts1y |         \
-   NV34TCL_TX_SWIZZLE_S1_Z_##ts1z | NV34TCL_TX_SWIZZLE_S1_W_##ts1w),         \
-  ((NV34TCL_TX_FILTER_SIGNED_RED*sx) | (NV34TCL_TX_FILTER_SIGNED_GREEN*sy) |       \
-   (NV34TCL_TX_FILTER_SIGNED_BLUE*sz) | (NV34TCL_TX_FILTER_SIGNED_ALPHA*sw))       \
+	ps->en |= NV40TCL_TEX_ENABLE_ENABLE;
 }
 
-struct nv40_texture_format {
-	boolean defined;
-	uint	pipe;
-	int     format;
-	int     swizzle;
-	int     sign;
-};
-
-static struct nv40_texture_format
-nv40_texture_formats[] = {
-	_(B8G8R8X8_UNORM, A8R8G8B8,   S1,   S1,   S1,  ONE, X, Y, Z, W, 0, 0, 0, 0),
-	_(B8G8R8A8_UNORM, A8R8G8B8,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
-	_(B5G5R5A1_UNORM, A1R5G5B5,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
-	_(B4G4R4A4_UNORM, A4R4G4B4,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
-	_(B5G6R5_UNORM  , R5G6B5  ,   S1,   S1,   S1,  ONE, X, Y, Z, W, 0, 0, 0, 0),
-	_(L8_UNORM      , L8      ,   S1,   S1,   S1,  ONE, X, X, X, X, 0, 0, 0, 0),
-	_(A8_UNORM      , L8      , ZERO, ZERO, ZERO,   S1, X, X, X, X, 0, 0, 0, 0),
-	_(R16_SNORM     , A16     , ZERO, ZERO,   S1,  ONE, X, X, X, Y, 1, 1, 1, 1),
-	_(I8_UNORM      , L8      ,   S1,   S1,   S1,   S1, X, X, X, X, 0, 0, 0, 0),
-	_(L8A8_UNORM    , A8L8    ,   S1,   S1,   S1,   S1, X, X, X, Y, 0, 0, 0, 0),
-	_(Z16_UNORM     , Z16     ,   S1,   S1,   S1,  ONE, X, X, X, X, 0, 0, 0, 0),
-	_(S8_USCALED_Z24_UNORM   , Z24     ,   S1,   S1,   S1,  ONE, X, X, X, X, 0, 0, 0, 0),
-	_(DXT1_RGB      , DXT1    ,   S1,   S1,   S1,  ONE, X, Y, Z, W, 0, 0, 0, 0),
-	_(DXT1_RGBA     , DXT1    ,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
-	_(DXT3_RGBA     , DXT3    ,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
-	_(DXT5_RGBA     , DXT5    ,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
-	{},
-};
-
-static struct nv40_texture_format *
-nv40_fragtex_format(uint pipe_format)
+void
+nv40_sampler_view_init(struct pipe_context *pipe,
+			  struct nvfx_sampler_view *sv)
 {
-	struct nv40_texture_format *tf = nv40_texture_formats;
-
-	while (tf->defined) {
-		if (tf->pipe == pipe_format)
-			return tf;
-		tf++;
+	struct pipe_resource* pt = sv->base.texture;
+	struct nvfx_miptree* mt = (struct nvfx_miptree*)pt;
+	struct nvfx_texture_format *tf = &nvfx_texture_formats[sv->base.format];
+	unsigned txf;
+	unsigned level = pt->target == PIPE_TEXTURE_CUBE ? 0 : sv->base.first_level;
+	assert(tf->fmt[4] >= 0);
+
+	txf = sv->u.init_fmt;
+	txf |= 0x8000;
+	if(pt->target == PIPE_TEXTURE_CUBE)
+		txf |= ((pt->last_level + 1) << NV40TCL_TEX_FORMAT_MIPMAP_COUNT_SHIFT);
+	else
+		txf |= (((sv->base.last_level - sv->base.first_level) + 1) << NV40TCL_TEX_FORMAT_MIPMAP_COUNT_SHIFT);
+
+	if (!mt->linear_pitch)
+		sv->u.nv40.npot_size2 = 0;
+	else {
+		sv->u.nv40.npot_size2  = mt->linear_pitch;
+		txf |= NV40TCL_TEX_FORMAT_LINEAR;
 	}
 
-	NOUVEAU_ERR("unknown texture format %s\n", util_format_name(pipe_format));
-	return NULL;
-}
+	sv->u.nv40.fmt[0] = tf->fmt[4] | txf;
+	sv->u.nv40.fmt[1] = tf->fmt[5] | txf;
 
+	sv->u.nv40.npot_size2 |= (u_minify(pt->depth0, level) << NV40TCL_TEX_SIZE1_DEPTH_SHIFT);
+
+	sv->lod_offset = (sv->base.first_level - level) * 256;
+	sv->max_lod_limit = (sv->base.last_level - level) * 256;
+}
 
 void
 nv40_fragtex_set(struct nvfx_context *nvfx, int unit)
 {
 	struct nouveau_channel* chan = nvfx->screen->base.channel;
 	struct nvfx_sampler_state *ps = nvfx->tex_sampler[unit];
-	struct nvfx_miptree *nv40mt = (struct nvfx_miptree *)nvfx->fragment_sampler_views[unit]->texture;
-	struct nouveau_bo *bo = nv40mt->base.bo;
-	struct pipe_resource *pt = &nv40mt->base.base;
-	struct nv40_texture_format *tf;
-
-	uint32_t txf, txs, txp;
+	struct nvfx_sampler_view* sv = (struct nvfx_sampler_view*)nvfx->fragment_sampler_views[unit];
+	struct nouveau_bo *bo = ((struct nvfx_miptree *)sv->base.texture)->base.bo;
 	unsigned tex_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD;
+	unsigned txf;
+	unsigned max_lod = MIN2(ps->max_lod + sv->lod_offset, sv->max_lod_limit);
+	unsigned min_lod = MIN2(ps->min_lod + sv->lod_offset, max_lod);
 
-	tf = nv40_fragtex_format(pt->format);
-	if (!tf)
-		assert(0);
-
-	txf  = ps->fmt;
-	txf |= tf->format | 0x8000;
-	txf |= ((pt->last_level + 1) << NV40TCL_TEX_FORMAT_MIPMAP_COUNT_SHIFT);
-
-	if (1) /* XXX */
-		txf |= NV34TCL_TX_FORMAT_NO_BORDER;
-
-	switch (pt->target) {
-	case PIPE_TEXTURE_CUBE:
-		txf |= NV34TCL_TX_FORMAT_CUBIC;
-		/* fall-through */
-	case PIPE_TEXTURE_2D:
-		txf |= NV34TCL_TX_FORMAT_DIMS_2D;
-		break;
-	case PIPE_TEXTURE_3D:
-		txf |= NV34TCL_TX_FORMAT_DIMS_3D;
-		break;
-	case PIPE_TEXTURE_1D:
-		txf |= NV34TCL_TX_FORMAT_DIMS_1D;
-		break;
-	default:
-		NOUVEAU_ERR("Unknown target %d\n", pt->target);
-		return;
-	}
-
-	if (!(pt->flags & NVFX_RESOURCE_FLAG_LINEAR)) {
-		txp = 0;
-	} else {
-		txp  = nv40mt->level[0].pitch;
-		txf |= NV40TCL_TEX_FORMAT_LINEAR;
-	}
-
-	txs = tf->swizzle;
+	txf = sv->u.nv40.fmt[ps->compare] | ps->fmt;
 
-	MARK_RING(chan, 11 + 2 * !unit, 2);
+	MARK_RING(chan, 11, 2);
 	OUT_RING(chan, RING_3D(NV34TCL_TX_OFFSET(unit), 8));
-	OUT_RELOC(chan, bo, 0, tex_flags | NOUVEAU_BO_LOW, 0, 0);
+	OUT_RELOC(chan, bo, sv->offset, tex_flags | NOUVEAU_BO_LOW, 0, 0);
 	OUT_RELOC(chan, bo, txf, tex_flags | NOUVEAU_BO_OR,
 			NV34TCL_TX_FORMAT_DMA0, NV34TCL_TX_FORMAT_DMA1);
-	OUT_RING(chan, ps->wrap);
-	OUT_RING(chan, NV40TCL_TEX_ENABLE_ENABLE | ps->en);
-	OUT_RING(chan, txs);
-	OUT_RING(chan, ps->filt | tf->sign | 0x2000 /*voodoo*/);
-	OUT_RING(chan, (pt->width0 << NV34TCL_TX_NPOT_SIZE_W_SHIFT) | pt->height0);
+	OUT_RING(chan, (ps->wrap & sv->wrap_mask) | sv->wrap);
+	OUT_RING(chan, ps->en | (min_lod << 19) | (max_lod << 7));
+	OUT_RING(chan, sv->swizzle);
+	OUT_RING(chan, ps->filt | sv->filt);
+	OUT_RING(chan, sv->npot_size);
 	OUT_RING(chan, ps->bcol);
 	OUT_RING(chan, RING_3D(NV40TCL_TEX_SIZE1(unit), 1));
-	OUT_RING(chan, (pt->depth0 << NV40TCL_TEX_SIZE1_DEPTH_SHIFT) | txp);
+	OUT_RING(chan, sv->u.nv40.npot_size2);
 
 	nvfx->hw_txf[unit] = txf;
 	nvfx->hw_samplers |= (1 << unit);
diff --git a/src/gallium/drivers/nvfx/nv40_vertprog.h b/src/gallium/drivers/nvfx/nv40_vertprog.h
index 7337293bab..3d0a1fe3d1 100644
--- a/src/gallium/drivers/nvfx/nv40_vertprog.h
+++ b/src/gallium/drivers/nvfx/nv40_vertprog.h
@@ -44,7 +44,7 @@
 #define NV40_VP_INST_SRC1_ABS                                          (1 << 22)
 #define NV40_VP_INST_SRC0_ABS                                          (1 << 21)
 #define NV40_VP_INST_VEC_DEST_TEMP_SHIFT                                      15
-#define NV40_VP_INST_VEC_DEST_TEMP_MASK                             (0x1F << 15)
+#define NV40_VP_INST_VEC_DEST_TEMP_MASK                             (0x3F << 15)
 #define NV40_VP_INST_COND_TEST_ENABLE                                  (1 << 13)
 #define NV40_VP_INST_COND_SHIFT                                               10
 #define NV40_VP_INST_COND_MASK                                       (0x7 << 10)
@@ -100,7 +100,7 @@
 #define NV40_VP_INST_SRC2H_SHIFT                                               0
 #define NV40_VP_INST_SRC2H_MASK                                      (0x3F << 0)
 #define NV40_VP_INST_IADDRH_SHIFT                                              0
-#define NV40_VP_INST_IADDRH_MASK                                     (0x1F << 0)
+#define NV40_VP_INST_IADDRH_MASK                                     (0x3F << 0)
 
 /* ---- OPCODE BITS 31:0 / data DWORD 3 --- */
 #define NV40_VP_INST_IADDRL_SHIFT                                             29
diff --git a/src/gallium/drivers/nvfx/nvfx_buffer.c b/src/gallium/drivers/nvfx/nvfx_buffer.c
index 05b824b8f7..041099e0e5 100644
--- a/src/gallium/drivers/nvfx/nvfx_buffer.c
+++ b/src/gallium/drivers/nvfx/nvfx_buffer.c
@@ -6,115 +6,39 @@
 #include "nouveau/nouveau_screen.h"
 #include "nouveau/nouveau_winsys.h"
 #include "nvfx_resource.h"
+#include "nvfx_screen.h"
 
-
-/* Currently using separate implementations for buffers and textures,
- * even though gallium has a unified abstraction of these objects.
- * Eventually these should be combined, and mechanisms like transfers
- * be adapted to work for both buffer and texture uploads.
- */
-static void nvfx_buffer_destroy(struct pipe_screen *pscreen,
+void nvfx_buffer_destroy(struct pipe_screen *pscreen,
 				struct pipe_resource *presource)
 {
-	struct nvfx_resource *buffer = nvfx_resource(presource);
+	struct nvfx_buffer *buffer = nvfx_buffer(presource);
 
-	nouveau_screen_bo_release(pscreen, buffer->bo);
+	if(!(buffer->base.base.flags & NVFX_RESOURCE_FLAG_USER))
+		align_free(buffer->data);
+	nouveau_screen_bo_release(pscreen, buffer->base.bo);
 	FREE(buffer);
 }
 
-
-
-
-/* Utility functions for transfer create/destroy are hooked in and
- * just record the arguments to those functions.
- */
-static void *
-nvfx_buffer_transfer_map( struct pipe_context *pipe,
-			  struct pipe_transfer *transfer )
-{
-	struct nvfx_resource *buffer = nvfx_resource(transfer->resource);
-	uint8_t *map;
-
-	map = nouveau_screen_bo_map_range( pipe->screen,
-					   buffer->bo,
-					   transfer->box.x,
-					   transfer->box.width,
-					   nouveau_screen_transfer_flags(transfer->usage) );
-	if (map == NULL)
-		return NULL;
-	
-	return map + transfer->box.x;
-}
-
-
-
-static void nvfx_buffer_transfer_flush_region( struct pipe_context *pipe,
-					       struct pipe_transfer *transfer,
-					       const struct pipe_box *box)
-{
-	struct nvfx_resource *buffer = nvfx_resource(transfer->resource);
-
-	nouveau_screen_bo_map_flush_range(pipe->screen,
-					  buffer->bo,
-					  transfer->box.x + box->x,
-					  box->width);
-}
-
-static void nvfx_buffer_transfer_unmap( struct pipe_context *pipe,
-					struct pipe_transfer *transfer )
-{
-	struct nvfx_resource *buffer = nvfx_resource(transfer->resource);
-
-	nouveau_screen_bo_unmap(pipe->screen, buffer->bo);
-}
-
-
-
-
-struct u_resource_vtbl nvfx_buffer_vtbl = 
-{
-	u_default_resource_get_handle,      /* get_handle */
-	nvfx_buffer_destroy,		     /* resource_destroy */
-	NULL,			    /* is_resource_referenced */
-	u_default_get_transfer,	     /* get_transfer */
-	u_default_transfer_destroy,	     /* transfer_destroy */
-	nvfx_buffer_transfer_map,	     /* transfer_map */
-	nvfx_buffer_transfer_flush_region,  /* transfer_flush_region */
-	nvfx_buffer_transfer_unmap,	     /* transfer_unmap */
-	u_default_transfer_inline_write   /* transfer_inline_write */
-};
-
-
-
 struct pipe_resource *
 nvfx_buffer_create(struct pipe_screen *pscreen,
 		   const struct pipe_resource *template)
 {
-	struct nvfx_resource *buffer;
+	struct nvfx_screen* screen = nvfx_screen(pscreen);
+	struct nvfx_buffer* buffer;
 
-	buffer = CALLOC_STRUCT(nvfx_resource);
+	buffer = CALLOC_STRUCT(nvfx_buffer);
 	if (!buffer)
 		return NULL;
 
-	buffer->base = *template;
-	buffer->vtbl = &nvfx_buffer_vtbl;
-	pipe_reference_init(&buffer->base.reference, 1);
-	buffer->base.screen = pscreen;
-
-	buffer->bo = nouveau_screen_bo_new(pscreen,
-					   16,
-					   buffer->base.usage,
-					   buffer->base.bind,
-					   buffer->base.width0);
-
-	if (buffer->bo == NULL)
-		goto fail;
-
-	return &buffer->base;
+	buffer->base.base = *template;
+	buffer->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR;
+	pipe_reference_init(&buffer->base.base.reference, 1);
+	buffer->base.base.screen = pscreen;
+	buffer->size = util_format_get_stride(template->format, template->width0);
+	buffer->bytes_to_draw_until_static = buffer->size * screen->static_reuse_threshold;
+	buffer->data = align_malloc(buffer->size, 16); /* FIXME: allocation result is not checked for NULL */
 
-fail:
-	FREE(buffer);
-	return NULL;
+	return &buffer->base.base;
 }
 
 
@@ -124,30 +48,49 @@ nvfx_user_buffer_create(struct pipe_screen *pscreen,
 			unsigned bytes,
 			unsigned usage)
 {
-	struct nvfx_resource *buffer;
+	struct nvfx_screen* screen = nvfx_screen(pscreen);
+	struct nvfx_buffer* buffer;
 
-	buffer = CALLOC_STRUCT(nvfx_resource);
+	buffer = CALLOC_STRUCT(nvfx_buffer);
 	if (!buffer)
 		return NULL;
 
-	pipe_reference_init(&buffer->base.reference, 1);
-	buffer->vtbl = &nvfx_buffer_vtbl;
-	buffer->base.screen = pscreen;
-	buffer->base.format = PIPE_FORMAT_R8_UNORM;
-	buffer->base.usage = PIPE_USAGE_IMMUTABLE;
-	buffer->base.bind = usage;
-	buffer->base.width0 = bytes;
-	buffer->base.height0 = 1;
-	buffer->base.depth0 = 1;
-
-	buffer->bo = nouveau_screen_bo_user(pscreen, ptr, bytes);
-	if (!buffer->bo)
-		goto fail;
-	
-	return &buffer->base;
-
-fail:
-	FREE(buffer);
-	return NULL;
+	pipe_reference_init(&buffer->base.base.reference, 1);
+	buffer->base.base.flags = NVFX_RESOURCE_FLAG_LINEAR | NVFX_RESOURCE_FLAG_USER;
+	buffer->base.base.screen = pscreen;
+	buffer->base.base.format = PIPE_FORMAT_R8_UNORM;
+	buffer->base.base.usage = PIPE_USAGE_IMMUTABLE;
+	buffer->base.base.bind = usage;
+	buffer->base.base.width0 = bytes;
+	buffer->base.base.height0 = 1;
+	buffer->base.base.depth0 = 1;
+	buffer->data = ptr;
+	buffer->size = bytes;
+	buffer->bytes_to_draw_until_static = bytes * screen->static_reuse_threshold;
+	buffer->dirty_end = bytes;
+
+	return &buffer->base.base;
 }
 
+/* Copy the dirty byte range of buffer->data into the buffer's bo, creating the bo on first use. */
+void nvfx_buffer_upload(struct nvfx_buffer* buffer)
+{
+	unsigned dirty = buffer->dirty_end - buffer->dirty_begin;
+	if(!buffer->base.bo)
+	{
+		buffer->base.bo = nouveau_screen_bo_new(buffer->base.base.screen, 16,
+					   buffer->base.base.usage,
+					   buffer->base.base.bind,
+					   buffer->base.base.width0);
+		if(!buffer->base.bo)
+			return; /* allocation failed; the dirty range is kept so a later call can retry */
+	}
+	if(!dirty)
+		return;
+	// TODO: may want to use a temporary in some cases
+	if(nouveau_bo_map(buffer->base.bo, NOUVEAU_BO_WR | (buffer->dirty_unsynchronized ? NOUVEAU_BO_NOSYNC : 0)))
+		return; /* nonzero means the map failed, so bo->map must not be written */
+	memcpy((uint8_t*)buffer->base.bo->map + buffer->dirty_begin, buffer->data + buffer->dirty_begin, dirty);
+	nouveau_bo_unmap(buffer->base.bo);
+	buffer->dirty_begin = buffer->dirty_end = 0;
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_context.c b/src/gallium/drivers/nvfx/nvfx_context.c
index 7218abff22..5a2fa14c88 100644
--- a/src/gallium/drivers/nvfx/nvfx_context.c
+++ b/src/gallium/drivers/nvfx/nvfx_context.c
@@ -1,5 +1,6 @@
 #include "draw/draw_context.h"
 #include "pipe/p_defines.h"
+#include "util/u_framebuffer.h"
 
 #include "nvfx_context.h"
 #include "nvfx_screen.h"
@@ -14,6 +15,7 @@ nvfx_flush(struct pipe_context *pipe, unsigned flags,
 	struct nouveau_channel *chan = screen->base.channel;
 	struct nouveau_grobj *eng3d = screen->eng3d;
 
+	/* XXX: we need to actually be intelligent here */
 	if (flags & PIPE_FLUSH_TEXTURE_CACHE) {
 		BEGIN_RING(chan, eng3d, 0x1fd8, 1);
 		OUT_RING  (chan, 2);
@@ -31,8 +33,22 @@ nvfx_destroy(struct pipe_context *pipe)
 {
 	struct nvfx_context *nvfx = nvfx_context(pipe);
 
+	if(nvfx->dummy_fs)
+		pipe->delete_fs_state(pipe, nvfx->dummy_fs);
+
+	for(unsigned i = 0; i < nvfx->vtxbuf_nr; ++i)
+		pipe_resource_reference(&nvfx->vtxbuf[i].buffer, 0);
+	pipe_resource_reference(&nvfx->idxbuf.buffer, 0);
+	util_unreference_framebuffer_state(&nvfx->framebuffer);
+	for(unsigned i = 0; i < PIPE_MAX_SAMPLERS; ++i)
+		pipe_sampler_view_reference(&nvfx->fragment_sampler_views[i], 0);
+
 	if (nvfx->draw)
 		draw_destroy(nvfx->draw);
+
+	if(nvfx->screen->cur_ctx == nvfx)
+		nvfx->screen->cur_ctx = NULL;
+
 	FREE(nvfx);
 }
 
@@ -59,14 +75,21 @@ nvfx_create(struct pipe_screen *pscreen, void *priv)
 	nvfx->pipe.clear = nvfx_clear;
 	nvfx->pipe.flush = nvfx_flush;
 
-	screen->base.channel->user_private = nvfx;
-
 	nvfx->is_nv4x = screen->is_nv4x;
+	/* TODO: it seems that nv30 might have fixed function clipping usable with vertex programs
+	 * However, my code for that doesn't work, so use vp clipping for all cards, which works.
+	 */
+	nvfx->use_vp_clipping = TRUE;
 
 	nvfx_init_query_functions(nvfx);
 	nvfx_init_surface_functions(nvfx);
 	nvfx_init_state_functions(nvfx);
+	nvfx_init_sampling_functions(nvfx);
+	nvfx_init_vbo_functions(nvfx);
+	nvfx_init_fragprog_functions(nvfx);
+	nvfx_init_vertprog_functions(nvfx);
 	nvfx_init_resource_functions(&nvfx->pipe);
+	nvfx_init_transfer_functions(&nvfx->pipe);
 
 	/* Create, configure, and install fallback swtnl path */
 	nvfx->draw = draw_create(&nvfx->pipe);
@@ -78,6 +101,12 @@ nvfx_create(struct pipe_screen *pscreen, void *priv)
 
 	/* set these to that we init them on first validation */
 	nvfx->state.scissor_enabled = ~0;
-	nvfx->state.stipple_enabled = ~0;
+	nvfx->hw_pointsprite_control = -1;
+	nvfx->hw_vp_output = -1;
+	nvfx->use_vertex_buffers = -1;
+	nvfx->relocs_needed = NVFX_RELOCATE_ALL;
+
+	LIST_INITHEAD(&nvfx->render_cache);
+
 	return &nvfx->pipe;
 }
diff --git a/src/gallium/drivers/nvfx/nvfx_context.h b/src/gallium/drivers/nvfx/nvfx_context.h
index 89f94c10bd..4c654bfa8b 100644
--- a/src/gallium/drivers/nvfx/nvfx_context.h
+++ b/src/gallium/drivers/nvfx/nvfx_context.h
@@ -11,8 +11,10 @@
 #include "util/u_memory.h"
 #include "util/u_math.h"
 #include "util/u_inlines.h"
+#include "util/u_double_list.h"
 
 #include "draw/draw_vertex.h"
+#include "util/u_blitter.h"
 
 #include "nouveau/nouveau_winsys.h"
 #include "nouveau/nouveau_gldefs.h"
@@ -42,17 +44,26 @@
 #define NVFX_NEW_SR		(1 << 13)
 #define NVFX_NEW_VERTCONST	(1 << 14)
 #define NVFX_NEW_FRAGCONST	(1 << 15)
+#define NVFX_NEW_INDEX	(1 << 16)
+#define NVFX_NEW_SPRITE  (1 << 17)
+
+#define NVFX_RELOCATE_FRAMEBUFFER (1 << 0)
+#define NVFX_RELOCATE_FRAGTEX (1 << 1)
+#define NVFX_RELOCATE_FRAGPROG (1 << 2)
+#define NVFX_RELOCATE_VTXBUF (1 << 3)
+#define NVFX_RELOCATE_IDXBUF (1 << 4)
+#define NVFX_RELOCATE_ALL 0x1f
 
 struct nvfx_rasterizer_state {
 	struct pipe_rasterizer_state pipe;
 	unsigned sb_len;
-	uint32_t sb[32];
+	uint32_t sb[34];
 };
 
 struct nvfx_zsa_state {
 	struct pipe_depth_stencil_alpha_state pipe;
 	unsigned sb_len;
-	uint32_t sb[26];
+	uint32_t sb[24];
 };
 
 struct nvfx_blend_state {
@@ -64,13 +75,57 @@ struct nvfx_blend_state {
 
 struct nvfx_state {
 	unsigned scissor_enabled;
-	unsigned stipple_enabled;
 	unsigned fp_samplers;
+	unsigned render_temps;
+};
+
+struct nvfx_per_vertex_element {
+	unsigned idx;
+        unsigned vertex_buffer_index;
+        unsigned src_offset;
+};
+
+struct nvfx_low_frequency_element {
+	unsigned idx;
+	unsigned vertex_buffer_index;
+	unsigned src_offset;
+        void (*fetch_rgba_float)(float *dst, const uint8_t *src, unsigned i, unsigned j);
+        unsigned ncomp;
+};
+
+struct nvfx_per_instance_element {
+	struct nvfx_low_frequency_element base;
+	unsigned instance_divisor;
+};
+
+struct nvfx_per_vertex_buffer_info
+{
+	unsigned vertex_buffer_index;
+	unsigned per_vertex_size;
 };
 
 struct nvfx_vtxelt_state {
 	struct pipe_vertex_element pipe[16];
 	unsigned num_elements;
+	unsigned vtxfmt[16];
+
+	unsigned num_per_vertex_buffer_infos;
+	struct nvfx_per_vertex_buffer_info per_vertex_buffer_info[16];
+
+	unsigned num_per_vertex;
+	struct nvfx_per_vertex_element per_vertex[16];
+
+	unsigned num_per_instance;
+	struct nvfx_per_instance_element per_instance[16];
+
+	unsigned num_constant;
+	struct nvfx_low_frequency_element constant[16];
+
+	boolean needs_translate;
+	struct translate* translate;
+
+	unsigned vertex_length;
+	unsigned max_vertices_per_packet;
 };
 
 struct nvfx_render_target {
@@ -86,8 +141,11 @@ struct nvfx_context {
 	struct nvfx_screen *screen;
 
 	unsigned is_nv4x; /* either 0 or ~0 */
+	boolean use_vp_clipping;
 
 	struct draw_context *draw;
+	struct blitter_context* blitter;
+	struct list_head render_cache;
 
 	/* HW state derived from pipe states */
 	struct nvfx_state state;
@@ -111,7 +169,7 @@ struct nvfx_context {
 	unsigned stipple[32];
 	struct pipe_clip_state clip;
 	struct nvfx_vertex_program *vertprog;
-	struct nvfx_fragment_program *fragprog;
+	struct nvfx_pipe_fragment_program *fragprog;
 	struct pipe_resource *constbuf[PIPE_SHADER_TYPES];
 	unsigned constbuf_nr[PIPE_SHADER_TYPES];
 	struct nvfx_rasterizer_state *rasterizer;
@@ -122,23 +180,34 @@ struct nvfx_context {
 	struct pipe_viewport_state viewport;
 	struct pipe_framebuffer_state framebuffer;
 	struct pipe_index_buffer idxbuf;
-	struct pipe_resource *idxbuf_buffer;
-	unsigned idxbuf_format;
 	struct nvfx_sampler_state *tex_sampler[PIPE_MAX_SAMPLERS];
 	struct pipe_sampler_view *fragment_sampler_views[PIPE_MAX_SAMPLERS];
+	struct nvfx_pipe_fragment_program* dummy_fs;
+
 	unsigned nr_samplers;
 	unsigned nr_textures;
 	unsigned dirty_samplers;
 	struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS];
 	unsigned vtxbuf_nr;
 	struct nvfx_vtxelt_state *vtxelt;
+	int base_vertex;
+	boolean use_index_buffer;
+	/* -1 = hardware input setup is outdated
+	 * 0 = hardware input setup is for inline vertices
+	 * 1 = hardware input setup is for hardware vertices
+	 */
+	int use_vertex_buffers;
 
-	unsigned vbo_bo;
 	unsigned hw_vtxelt_nr;
 	uint8_t hw_samplers;
 	uint32_t hw_txf[8];
 	struct nvfx_render_target hw_rt[4];
 	struct nvfx_render_target hw_zeta;
+	int hw_pointsprite_control;
+	int hw_vp_output;
+	struct nvfx_fragment_program* hw_fragprog;
+
+	unsigned relocs_needed;
 };
 
 static INLINE struct nvfx_context *
@@ -175,15 +244,12 @@ extern void nvfx_clear(struct pipe_context *pipe, unsigned buffers,
 
 /* nvfx_draw.c */
 extern struct draw_stage *nvfx_draw_render_stage(struct nvfx_context *nvfx);
-extern void nvfx_draw_elements_swtnl(struct pipe_context *pipe,
-                                     struct pipe_resource *idxbuf,
-                                     unsigned ib_size, int ib_bias,
-                                     unsigned mode,
-                                     unsigned start, unsigned count);
+extern void nvfx_draw_vbo_swtnl(struct pipe_context *pipe, const struct pipe_draw_info* info);
 extern void nvfx_vtxfmt_validate(struct nvfx_context *nvfx);
 
 /* nvfx_fb.c */
-extern void nvfx_state_framebuffer_validate(struct nvfx_context *nvfx);
+extern int nvfx_framebuffer_prepare(struct nvfx_context *nvfx);
+extern void nvfx_framebuffer_validate(struct nvfx_context *nvfx, unsigned prepare_result);
 void
 nvfx_framebuffer_relocate(struct nvfx_context *nvfx);
 
@@ -191,19 +257,24 @@ nvfx_framebuffer_relocate(struct nvfx_context *nvfx);
 extern void nvfx_fragprog_destroy(struct nvfx_context *,
 				    struct nvfx_fragment_program *);
 extern void nvfx_fragprog_validate(struct nvfx_context *nvfx);
-extern void
-nvfx_fragprog_relocate(struct nvfx_context *nvfx);
+extern void nvfx_fragprog_relocate(struct nvfx_context *nvfx);
+extern void nvfx_init_fragprog_functions(struct nvfx_context *nvfx);
 
 /* nvfx_fragtex.c */
+extern void nvfx_init_sampling_functions(struct nvfx_context *nvfx);
 extern void nvfx_fragtex_validate(struct nvfx_context *nvfx);
-extern void
-nvfx_fragtex_relocate(struct nvfx_context *nvfx);
+extern void nvfx_fragtex_relocate(struct nvfx_context *nvfx);
+
+struct nvfx_sampler_view;
 
 /* nv30_fragtex.c */
 extern void
 nv30_sampler_state_init(struct pipe_context *pipe,
 			  struct nvfx_sampler_state *ps,
 			  const struct pipe_sampler_state *cso);
+extern void
+nv30_sampler_view_init(struct pipe_context *pipe,
+			  struct nvfx_sampler_view *sv);
 extern void nv30_fragtex_set(struct nvfx_context *nvfx, int unit);
 
 /* nv40_fragtex.c */
@@ -211,6 +282,9 @@ extern void
 nv40_sampler_state_init(struct pipe_context *pipe,
 			  struct nvfx_sampler_state *ps,
 			  const struct pipe_sampler_state *cso);
+extern void
+nv40_sampler_view_init(struct pipe_context *pipe,
+			  struct nvfx_sampler_view *sv);
 extern void nv40_fragtex_set(struct nvfx_context *nvfx, int unit);
 
 /* nvfx_state.c */
@@ -225,23 +299,75 @@ extern void nvfx_state_sr_validate(struct nvfx_context *nvfx);
 extern void nvfx_state_zsa_validate(struct nvfx_context *nvfx);
 
 /* nvfx_state_emit.c */
-extern void nvfx_state_relocate(struct nvfx_context *nvfx);
+extern void nvfx_state_relocate(struct nvfx_context *nvfx, unsigned relocs);
 extern boolean nvfx_state_validate(struct nvfx_context *nvfx);
 extern boolean nvfx_state_validate_swtnl(struct nvfx_context *nvfx);
-extern void nvfx_state_emit(struct nvfx_context *nvfx);
+
+/* Re-emit only the relocations that apply to the current render mode and are still pending in relocs_needed. */
+static INLINE void nvfx_state_emit(struct nvfx_context *nvfx)
+{
+	unsigned relocs = NVFX_RELOCATE_FRAMEBUFFER | NVFX_RELOCATE_FRAGTEX | NVFX_RELOCATE_FRAGPROG;
+	/* vertex/index buffer relocations are only requested in HW render mode */
+	if (nvfx->render_mode == HW) {
+		relocs |= NVFX_RELOCATE_VTXBUF;
+		if (nvfx->use_index_buffer)
+			relocs |= NVFX_RELOCATE_IDXBUF;
+	}
+
+	relocs &= nvfx->relocs_needed;
+	if (relocs)
+		nvfx_state_relocate(nvfx, relocs);
+}
 
 /* nvfx_transfer.c */
-extern void nvfx_init_transfer_functions(struct nvfx_context *nvfx);
+extern void nvfx_init_transfer_functions(struct pipe_context *pipe);
 
 /* nvfx_vbo.c */
 extern boolean nvfx_vbo_validate(struct nvfx_context *nvfx);
 extern void nvfx_vbo_relocate(struct nvfx_context *nvfx);
+extern void nvfx_idxbuf_validate(struct nvfx_context* nvfx);
+extern void nvfx_idxbuf_relocate(struct nvfx_context* nvfx);
 extern void nvfx_draw_vbo(struct pipe_context *pipe,
                           const struct pipe_draw_info *info);
+extern void nvfx_init_vbo_functions(struct nvfx_context *nvfx);
+extern unsigned nvfx_vertex_formats[];
 
 /* nvfx_vertprog.c */
 extern boolean nvfx_vertprog_validate(struct nvfx_context *nvfx);
 extern void nvfx_vertprog_destroy(struct nvfx_context *,
 				  struct nvfx_vertex_program *);
+extern void nvfx_init_vertprog_functions(struct nvfx_context *nvfx);
+
+/* nvfx_push.c */
+extern void nvfx_push_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info);
+
+/* Emit ncomp (1-4) floats from v to vertex attribute `attrib`; other ncomp values emit nothing. Caller must WAIT_RING(chan, ncomp + 1) or equivalent beforehand! */
+static INLINE void nvfx_emit_vtx_attr(struct nouveau_channel* chan, unsigned attrib, const float* v, unsigned ncomp)
+{
+	switch (ncomp) {
+	case 4:
+		OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_4F_X(attrib), 4));
+		OUT_RING(chan, fui(v[0]));
+		OUT_RING(chan, fui(v[1]));
+		OUT_RING(chan, fui(v[2]));
+		OUT_RING(chan, fui(v[3]));
+		break;
+	case 3:
+		OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_3F_X(attrib), 3));
+		OUT_RING(chan, fui(v[0]));
+		OUT_RING(chan, fui(v[1]));
+		OUT_RING(chan, fui(v[2]));
+		break;
+	case 2:
+		OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_2F_X(attrib), 2));
+		OUT_RING(chan, fui(v[0]));
+		OUT_RING(chan, fui(v[1]));
+		break;
+	case 1:
+		OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_1F(attrib), 1));
+		OUT_RING(chan, fui(v[0]));
+		break;
+	}
+}
 
 #endif
diff --git a/src/gallium/drivers/nvfx/nvfx_draw.c b/src/gallium/drivers/nvfx/nvfx_draw.c
index 22cff370b7..2601d5b8e2 100644
--- a/src/gallium/drivers/nvfx/nvfx_draw.c
+++ b/src/gallium/drivers/nvfx/nvfx_draw.c
@@ -9,6 +9,7 @@
 #include "draw/draw_pipe.h"
 
 #include "nvfx_context.h"
+#include "nvfx_resource.h"
 
 /* Simple, but crappy, swtnl path, hopefully we wont need to hit this very
  * often at all.  Uses "quadro style" vertex submission + a fixed vertex
@@ -39,30 +40,21 @@ nvfx_render_vertex(struct nvfx_context *nvfx, const struct vertex_header *v)
 		unsigned idx = nvfx->swtnl.draw[i];
 		unsigned hw = nvfx->swtnl.hw[i];
 
+		WAIT_RING(chan, 5);
 		switch (nvfx->swtnl.emit[i]) {
 		case EMIT_OMIT:
 			break;
 		case EMIT_1F:
-			BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_1F(hw), 1);
-			OUT_RING  (chan, fui(v->data[idx][0]));
+			nvfx_emit_vtx_attr(chan, hw, v->data[idx], 1);
 			break;
 		case EMIT_2F:
-			BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_2F_X(hw), 2);
-			OUT_RING  (chan, fui(v->data[idx][0]));
-			OUT_RING  (chan, fui(v->data[idx][1]));
+			nvfx_emit_vtx_attr(chan, hw, v->data[idx], 2);
 			break;
 		case EMIT_3F:
-			BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_3F_X(hw), 3);
-			OUT_RING  (chan, fui(v->data[idx][0]));
-			OUT_RING  (chan, fui(v->data[idx][1]));
-			OUT_RING  (chan, fui(v->data[idx][2]));
+			nvfx_emit_vtx_attr(chan, hw, v->data[idx], 3);
 			break;
 		case EMIT_4F:
-			BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_4F_X(hw), 4);
-			OUT_RING  (chan, fui(v->data[idx][0]));
-			OUT_RING  (chan, fui(v->data[idx][1]));
-			OUT_RING  (chan, fui(v->data[idx][2]));
-			OUT_RING  (chan, fui(v->data[idx][3]));
+			nvfx_emit_vtx_attr(chan, hw, v->data[idx], 4);
 			break;
 		case 0xff:
 			BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_4F_X(hw), 4);
@@ -231,15 +223,9 @@ nvfx_draw_render_stage(struct nvfx_context *nvfx)
 }
 
 void
-nvfx_draw_elements_swtnl(struct pipe_context *pipe,
-			 struct pipe_resource *idxbuf,
-			 unsigned idxbuf_size, int idxbuf_bias,
-			 unsigned mode, unsigned start, unsigned count)
+nvfx_draw_vbo_swtnl(struct pipe_context *pipe, const struct pipe_draw_info* info)
 {
 	struct nvfx_context *nvfx = nvfx_context(pipe);
-	struct pipe_transfer *vb_transfer[PIPE_MAX_ATTRIBS];
-	struct pipe_transfer *ib_transfer = NULL;
-	struct pipe_transfer *cb_transfer = NULL;
 	unsigned i;
 	void *map;
 
@@ -247,47 +233,28 @@ nvfx_draw_elements_swtnl(struct pipe_context *pipe,
 		return;
 	nvfx_state_emit(nvfx);
 
+	/* these must be passed without adding the offsets */
 	for (i = 0; i < nvfx->vtxbuf_nr; i++) {
-		map = pipe_buffer_map(pipe, nvfx->vtxbuf[i].buffer,
-                                      PIPE_TRANSFER_READ,
-				      &vb_transfer[i]);
+		map = nvfx_buffer(nvfx->vtxbuf[i].buffer)->data;
 		draw_set_mapped_vertex_buffer(nvfx->draw, i, map);
 	}
 
-	if (idxbuf) {
-		map = pipe_buffer_map(pipe, idxbuf,
-				      PIPE_TRANSFER_READ,
-				      &ib_transfer);
-		draw_set_mapped_element_buffer(nvfx->draw, idxbuf_size, idxbuf_bias, map);
-	} else {
-		draw_set_mapped_element_buffer(nvfx->draw, 0, 0, NULL);
-	}
+	map = NULL;
+	if (info->indexed && nvfx->idxbuf.buffer)
+		map = nvfx_buffer(nvfx->idxbuf.buffer)->data;
+	draw_set_mapped_index_buffer(nvfx->draw, map);
 
 	if (nvfx->constbuf[PIPE_SHADER_VERTEX]) {
 		const unsigned nr = nvfx->constbuf_nr[PIPE_SHADER_VERTEX];
 
-		map = pipe_buffer_map(pipe,
-				      nvfx->constbuf[PIPE_SHADER_VERTEX],
-				      PIPE_TRANSFER_READ,
-				      &cb_transfer);
+		map = nvfx_buffer(nvfx->constbuf[PIPE_SHADER_VERTEX])->data;
 		draw_set_mapped_constant_buffer(nvfx->draw, PIPE_SHADER_VERTEX, 0,
                                                 map, nr);
 	}
 
-	draw_arrays(nvfx->draw, mode, start, count);
-
-	for (i = 0; i < nvfx->vtxbuf_nr; i++)
-		pipe_buffer_unmap(pipe, nvfx->vtxbuf[i].buffer, vb_transfer[i]);
-
-	if (idxbuf)
-		pipe_buffer_unmap(pipe, idxbuf, ib_transfer);
-
-	if (nvfx->constbuf[PIPE_SHADER_VERTEX])
-		pipe_buffer_unmap(pipe, nvfx->constbuf[PIPE_SHADER_VERTEX],
-				  cb_transfer);
+	draw_vbo(nvfx->draw, info);
 
 	draw_flush(nvfx->draw);
-	pipe->flush(pipe, 0, NULL);
 }
 
 static INLINE void
@@ -305,19 +272,19 @@ emit_attrib(struct nvfx_context *nvfx, unsigned hw, unsigned emit,
 void
 nvfx_vtxfmt_validate(struct nvfx_context *nvfx)
 {
-	struct nvfx_fragment_program *fp = nvfx->fragprog;
+	struct nvfx_pipe_fragment_program *pfp = nvfx->fragprog;
 	unsigned colour = 0, texcoords = 0, fog = 0, i;
 
 	/* Determine needed fragprog inputs */
-	for (i = 0; i < fp->info.num_inputs; i++) {
-		switch (fp->info.input_semantic_name[i]) {
+	for (i = 0; i < pfp->info.num_inputs; i++) {
+		switch (pfp->info.input_semantic_name[i]) {
 		case TGSI_SEMANTIC_POSITION:
 			break;
 		case TGSI_SEMANTIC_COLOR:
-			colour |= (1 << fp->info.input_semantic_index[i]);
+			colour |= (1 << pfp->info.input_semantic_index[i]);
 			break;
 		case TGSI_SEMANTIC_GENERIC:
-			texcoords |= (1 << fp->info.input_semantic_index[i]);
+			texcoords |= (1 << pfp->info.input_semantic_index[i]);
 			break;
 		case TGSI_SEMANTIC_FOG:
 			fog = 1;
diff --git a/src/gallium/drivers/nvfx/nvfx_fragprog.c b/src/gallium/drivers/nvfx/nvfx_fragprog.c
index ee41f03b9b..275672a31f 100644
--- a/src/gallium/drivers/nvfx/nvfx_fragprog.c
+++ b/src/gallium/drivers/nvfx/nvfx_fragprog.c
@@ -2,25 +2,31 @@
 #include "pipe/p_defines.h"
 #include "pipe/p_state.h"
 #include "util/u_inlines.h"
+#include "util/u_debug.h"
 
 #include "pipe/p_shader_tokens.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_ureg.h"
 
 #include "nvfx_context.h"
 #include "nvfx_shader.h"
+#include "nvfx_resource.h"
 
 #define MAX_CONSTS 128
 #define MAX_IMM 32
+
 struct nvfx_fpc {
+	struct nvfx_pipe_fragment_program* pfp;
 	struct nvfx_fragment_program *fp;
 
-	uint attrib_map[PIPE_MAX_SHADER_INPUTS];
-
-	unsigned r_temps;
-	unsigned r_temps_discard;
-	struct nvfx_sreg r_result[PIPE_MAX_SHADER_OUTPUTS];
-	struct nvfx_sreg *r_temp;
+	unsigned max_temps;
+	unsigned long long r_temps;
+	unsigned long long r_temps_discard;
+	struct nvfx_reg r_result[PIPE_MAX_SHADER_OUTPUTS];
+	struct nvfx_reg *r_temp;
+	unsigned sprite_coord_temp;
 
 	int num_regs;
 
@@ -33,34 +39,40 @@ struct nvfx_fpc {
 	} consts[MAX_CONSTS];
 	int nr_consts;
 
-	struct nvfx_sreg imm[MAX_IMM];
+	struct nvfx_reg imm[MAX_IMM];
 	unsigned nr_imm;
+
+	unsigned char generic_to_slot[256]; /* semantic idx for each input semantic */
+
+	struct util_dynarray if_stack;
+	//struct util_dynarray loop_stack;
+	struct util_dynarray label_relocs;
 };
 
-static INLINE struct nvfx_sreg
+static INLINE struct nvfx_reg
 temp(struct nvfx_fpc *fpc)
 {
-	int idx = ffs(~fpc->r_temps) - 1;
+	int idx = __builtin_ctzll(~fpc->r_temps);
 
-	if (idx < 0) {
+	if (idx >= fpc->max_temps) {
 		NOUVEAU_ERR("out of temps!!\n");
 		assert(0);
-		return nvfx_sr(NVFXSR_TEMP, 0);
+		return nvfx_reg(NVFXSR_TEMP, 0);
 	}
 
-	fpc->r_temps |= (1 << idx);
-	fpc->r_temps_discard |= (1 << idx);
-	return nvfx_sr(NVFXSR_TEMP, idx);
+	fpc->r_temps |= (1ULL << idx);
+	fpc->r_temps_discard |= (1ULL << idx);
+	return nvfx_reg(NVFXSR_TEMP, idx);
 }
 
 static INLINE void
 release_temps(struct nvfx_fpc *fpc)
 {
 	fpc->r_temps &= ~fpc->r_temps_discard;
-	fpc->r_temps_discard = 0;
+	fpc->r_temps_discard = 0ULL;
 }
 
-static INLINE struct nvfx_sreg
+static INLINE struct nvfx_reg
 constant(struct nvfx_fpc *fpc, int pipe, float vals[4])
 {
 	int idx;
@@ -72,16 +84,9 @@ constant(struct nvfx_fpc *fpc, int pipe, float vals[4])
 	fpc->consts[idx].pipe = pipe;
 	if (pipe == -1)
 		memcpy(fpc->consts[idx].vals, vals, 4 * sizeof(float));
-	return nvfx_sr(NVFXSR_CONST, idx);
+	return nvfx_reg(NVFXSR_CONST, idx);
 }
 
-#define arith(cc,s,o,d,m,s0,s1,s2) \
-	nvfx_fp_arith((cc), (s), NVFX_FP_OP_OPCODE_##o, \
-			(d), (m), (s0), (s1), (s2))
-#define tex(cc,s,o,u,d,m,s0,s1,s2) \
-	nvfx_fp_tex((cc), (s), NVFX_FP_OP_OPCODE_##o, (u), \
-		    (d), (m), (s0), none, none)
-
 static void
 grow_insns(struct nvfx_fpc *fpc, int size)
 {
@@ -92,23 +97,29 @@ grow_insns(struct nvfx_fpc *fpc, int size)
 }
 
 static void
-emit_src(struct nvfx_fpc *fpc, int pos, struct nvfx_sreg src)
+emit_src(struct nvfx_fpc *fpc, int pos, struct nvfx_src src)
 {
 	struct nvfx_fragment_program *fp = fpc->fp;
 	uint32_t *hw = &fp->insn[fpc->inst_offset];
 	uint32_t sr = 0;
 
-	switch (src.type) {
+	switch (src.reg.type) {
 	case NVFXSR_INPUT:
 		sr |= (NVFX_FP_REG_TYPE_INPUT << NVFX_FP_REG_TYPE_SHIFT);
-		hw[0] |= (src.index << NVFX_FP_OP_INPUT_SRC_SHIFT);
+		hw[0] |= (src.reg.index << NVFX_FP_OP_INPUT_SRC_SHIFT);
 		break;
 	case NVFXSR_OUTPUT:
 		sr |= NVFX_FP_REG_SRC_HALF;
 		/* fall-through */
 	case NVFXSR_TEMP:
 		sr |= (NVFX_FP_REG_TYPE_TEMP << NVFX_FP_REG_TYPE_SHIFT);
-		sr |= (src.index << NVFX_FP_REG_SRC_SHIFT);
+		sr |= (src.reg.index << NVFX_FP_REG_SRC_SHIFT);
+		break;
+	case NVFXSR_RELOCATED:
+		sr |= (NVFX_FP_REG_TYPE_TEMP << NVFX_FP_REG_TYPE_SHIFT);
+		sr |= (fpc->sprite_coord_temp << NVFX_FP_REG_SRC_SHIFT);
+		//printf("adding relocation at %x for %x\n", fpc->inst_offset, src.index);
+		util_dynarray_append(&fpc->fp->slot_relocations[src.reg.index], unsigned, fpc->inst_offset + pos + 1);
 		break;
 	case NVFXSR_CONST:
 		if (!fpc->have_const) {
@@ -117,18 +128,18 @@ emit_src(struct nvfx_fpc *fpc, int pos, struct nvfx_sreg src)
 		}
 
 		hw = &fp->insn[fpc->inst_offset];
-		if (fpc->consts[src.index].pipe >= 0) {
+		if (fpc->consts[src.reg.index].pipe >= 0) {
 			struct nvfx_fragment_program_data *fpd;
 
 			fp->consts = realloc(fp->consts, ++fp->nr_consts *
 					     sizeof(*fpd));
 			fpd = &fp->consts[fp->nr_consts - 1];
 			fpd->offset = fpc->inst_offset + 4;
-			fpd->index = fpc->consts[src.index].pipe;
+			fpd->index = fpc->consts[src.reg.index].pipe;
 			memset(&fp->insn[fpd->offset], 0, sizeof(uint32_t) * 4);
 		} else {
 			memcpy(&fp->insn[fpc->inst_offset + 4],
-				fpc->consts[src.index].vals,
+				fpc->consts[src.reg.index].vals,
 				sizeof(uint32_t) * 4);
 		}
 
@@ -156,7 +167,7 @@ emit_src(struct nvfx_fpc *fpc, int pos, struct nvfx_sreg src)
 }
 
 static void
-emit_dst(struct nvfx_fpc *fpc, struct nvfx_sreg dst)
+emit_dst(struct nvfx_fpc *fpc, struct nvfx_reg dst)
 {
 	struct nvfx_fragment_program *fp = fpc->fp;
 	uint32_t *hw = &fp->insn[fpc->inst_offset];
@@ -184,9 +195,7 @@ emit_dst(struct nvfx_fpc *fpc, struct nvfx_sreg dst)
 }
 
 static void
-nvfx_fp_arith(struct nvfx_fpc *fpc, int sat, int op,
-	      struct nvfx_sreg dst, int mask,
-	      struct nvfx_sreg s0, struct nvfx_sreg s1, struct nvfx_sreg s2)
+nvfx_fp_emit(struct nvfx_fpc *fpc, struct nvfx_insn insn)
 {
 	struct nvfx_fragment_program *fp = fpc->fp;
 	uint32_t *hw;
@@ -197,68 +206,225 @@ nvfx_fp_arith(struct nvfx_fpc *fpc, int sat, int op,
 	hw = &fp->insn[fpc->inst_offset];
 	memset(hw, 0, sizeof(uint32_t) * 4);
 
-	if (op == NVFX_FP_OP_OPCODE_KIL)
+	if (insn.op == NVFX_FP_OP_OPCODE_KIL)
 		fp->fp_control |= NV34TCL_FP_CONTROL_USES_KIL;
-	hw[0] |= (op << NVFX_FP_OP_OPCODE_SHIFT);
-	hw[0] |= (mask << NVFX_FP_OP_OUTMASK_SHIFT);
-	hw[2] |= (dst.dst_scale << NVFX_FP_OP_DST_SCALE_SHIFT);
+	hw[0] |= (insn.op << NVFX_FP_OP_OPCODE_SHIFT);
+	hw[0] |= (insn.mask << NVFX_FP_OP_OUTMASK_SHIFT);
+	hw[2] |= (insn.scale << NVFX_FP_OP_DST_SCALE_SHIFT);
 
-	if (sat)
+	if (insn.sat)
 		hw[0] |= NVFX_FP_OP_OUT_SAT;
 
-	if (dst.cc_update)
+	if (insn.cc_update)
 		hw[0] |= NVFX_FP_OP_COND_WRITE_ENABLE;
-	hw[1] |= (dst.cc_test << NVFX_FP_OP_COND_SHIFT);
-	hw[1] |= ((dst.cc_swz[0] << NVFX_FP_OP_COND_SWZ_X_SHIFT) |
-		  (dst.cc_swz[1] << NVFX_FP_OP_COND_SWZ_Y_SHIFT) |
-		  (dst.cc_swz[2] << NVFX_FP_OP_COND_SWZ_Z_SHIFT) |
-		  (dst.cc_swz[3] << NVFX_FP_OP_COND_SWZ_W_SHIFT));
-
-	emit_dst(fpc, dst);
-	emit_src(fpc, 0, s0);
-	emit_src(fpc, 1, s1);
-	emit_src(fpc, 2, s2);
+	hw[1] |= (insn.cc_test << NVFX_FP_OP_COND_SHIFT);
+	hw[1] |= ((insn.cc_swz[0] << NVFX_FP_OP_COND_SWZ_X_SHIFT) |
+		  (insn.cc_swz[1] << NVFX_FP_OP_COND_SWZ_Y_SHIFT) |
+		  (insn.cc_swz[2] << NVFX_FP_OP_COND_SWZ_Z_SHIFT) |
+		  (insn.cc_swz[3] << NVFX_FP_OP_COND_SWZ_W_SHIFT));
+
+	if(insn.unit >= 0)
+	{
+		hw[0] |= (insn.unit << NVFX_FP_OP_TEX_UNIT_SHIFT);
+		fp->samplers |= (1 << insn.unit);
+	}
+
+	emit_dst(fpc, insn.dst);
+	emit_src(fpc, 0, insn.src[0]);
+	emit_src(fpc, 1, insn.src[1]);
+	emit_src(fpc, 2, insn.src[2]);
 }
 
+#define arith(s,o,d,m,s0,s1,s2) \
+       nvfx_insn((s), NVFX_FP_OP_OPCODE_##o, -1, \
+                       (d), (m), (s0), (s1), (s2))
+
+#define tex(s,o,u,d,m,s0,s1,s2) \
+	nvfx_insn((s), NVFX_FP_OP_OPCODE_##o, (u), \
+                   (d), (m), (s0), none, none)
+
+/* IF src.x != 0, as TGSI specifies */
 static void
-nvfx_fp_tex(struct nvfx_fpc *fpc, int sat, int op, int unit,
-	    struct nvfx_sreg dst, int mask,
-	    struct nvfx_sreg s0, struct nvfx_sreg s1, struct nvfx_sreg s2)
+nv40_fp_if(struct nvfx_fpc *fpc, struct nvfx_src src)
 {
-	struct nvfx_fragment_program *fp = fpc->fp;
+	const struct nvfx_src none = nvfx_src(nvfx_reg(NVFXSR_NONE, 0));
+	struct nvfx_insn insn = arith(0, MOV, none.reg, NVFX_FP_MASK_X, src, none, none);
+	uint32_t *hw;
+	insn.cc_update = 1;
+	nvfx_fp_emit(fpc, insn);
 
-	nvfx_fp_arith(fpc, sat, op, dst, mask, s0, s1, s2);
+	fpc->inst_offset = fpc->fp->insn_len;
+	grow_insns(fpc, 4);
+	hw = &fpc->fp->insn[fpc->inst_offset];
+	/* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */
+	hw[0] = (NV40_FP_OP_BRA_OPCODE_IF << NVFX_FP_OP_OPCODE_SHIFT) |
+		NV40_FP_OP_OUT_NONE |
+		(NVFX_FP_PRECISION_FP16 << NVFX_FP_OP_PRECISION_SHIFT);
+	/* Use .xxxx swizzle so that we check only src[0].x*/
+	hw[1] = (0 << NVFX_FP_OP_COND_SWZ_X_SHIFT) |
+			(0 << NVFX_FP_OP_COND_SWZ_Y_SHIFT) |
+			(0 << NVFX_FP_OP_COND_SWZ_Z_SHIFT) |
+			(0 << NVFX_FP_OP_COND_SWZ_W_SHIFT) |
+			(NVFX_FP_OP_COND_NE << NVFX_FP_OP_COND_SHIFT);
+	hw[2] = 0; /* | NV40_FP_OP_OPCODE_IS_BRANCH | else_offset */
+	hw[3] = 0; /* | endif_offset */
+	util_dynarray_append(&fpc->if_stack, unsigned, fpc->inst_offset);
+}
+
+/* Emit a CAL; the call target is recorded as a label relocation on hw[2] */
+static void
+nv40_fp_cal(struct nvfx_fpc *fpc, unsigned target)
+{
+        struct nvfx_relocation reloc;
+        uint32_t *hw;
+        fpc->inst_offset = fpc->fp->insn_len;
+        grow_insns(fpc, 4);
+        hw = &fpc->fp->insn[fpc->inst_offset];
+        /* unlike IF/REP, no output-mask or fp16 precision bits are set for CAL */
+        hw[0] = (NV40_FP_OP_BRA_OPCODE_CAL << NVFX_FP_OP_OPCODE_SHIFT);
+        /* identity condition swizzle with the TR (always-true) test: unconditional call */
+        hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_ALL_SHIFT) |
+                        (NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT);
+        hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH; /* | call_offset */
+        hw[3] = 0;
+        reloc.target = target;
+        reloc.location = fpc->inst_offset + 2;
+        util_dynarray_append(&fpc->label_relocs, struct nvfx_relocation, reloc);
+}
+
+static void
+nv40_fp_ret(struct nvfx_fpc *fpc)
+{
+	uint32_t *hw;
+	fpc->inst_offset = fpc->fp->insn_len;
+	grow_insns(fpc, 4);
+	hw = &fpc->fp->insn[fpc->inst_offset];
+	/* unlike IF/REP, no output-mask or fp16 precision bits are set for RET */
+	hw[0] = (NV40_FP_OP_BRA_OPCODE_RET << NVFX_FP_OP_OPCODE_SHIFT);
+	/* identity condition swizzle with the TR (always-true) test: unconditional return */
+	hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_ALL_SHIFT) |
+			(NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT);
+	hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH; /* | call_offset */
+	hw[3] = 0;
+}
 
-	fp->insn[fpc->inst_offset] |= (unit << NVFX_FP_OP_TEX_UNIT_SHIFT);
-	fp->samplers |= (1 << unit);
+static void
+nv40_fp_rep(struct nvfx_fpc *fpc, unsigned count, unsigned target)
+{
+        struct nvfx_relocation reloc;
+        uint32_t *hw;
+        fpc->inst_offset = fpc->fp->insn_len;
+        grow_insns(fpc, 4);
+        hw = &fpc->fp->insn[fpc->inst_offset];
+        /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */
+        hw[0] = (NV40_FP_OP_BRA_OPCODE_REP << NVFX_FP_OP_OPCODE_SHIFT) |
+                        NV40_FP_OP_OUT_NONE |
+                        (NVFX_FP_PRECISION_FP16 << NVFX_FP_OP_PRECISION_SHIFT);
+        /* identity condition swizzle with the TR (always-true) test, not an .xxxx swizzle */
+        hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_ALL_SHIFT) |
+                        (NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT);
+        hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH |
+                        (count << NV40_FP_OP_REP_COUNT1_SHIFT) |
+                        (count << NV40_FP_OP_REP_COUNT2_SHIFT) |
+                        (count << NV40_FP_OP_REP_COUNT3_SHIFT);
+        hw[3] = 0; /* | end_offset */
+        reloc.target = target;
+        reloc.location = fpc->inst_offset + 3;
+        util_dynarray_append(&fpc->label_relocs, struct nvfx_relocation, reloc);
+        //util_dynarray_append(&fpc->loop_stack, unsigned, target);
 }
 
-static INLINE struct nvfx_sreg
+/* warning: this only works forward, and probably only if not inside any IF */
+static void
+nv40_fp_bra(struct nvfx_fpc *fpc, unsigned target)
+{
+        struct nvfx_relocation reloc;
+        uint32_t *hw;
+        fpc->inst_offset = fpc->fp->insn_len;
+        grow_insns(fpc, 4);
+        hw = &fpc->fp->insn[fpc->inst_offset];
+        /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */
+        hw[0] = (NV40_FP_OP_BRA_OPCODE_IF << NVFX_FP_OP_OPCODE_SHIFT) |
+                NV40_FP_OP_OUT_NONE |
+                (NVFX_FP_PRECISION_FP16 << NVFX_FP_OP_PRECISION_SHIFT);
+        /* FL (never-true) test: the IF body is always skipped, so control goes to the else/endif offsets, both relocated to target */
+        hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_X_SHIFT) |
+                        (NVFX_FP_OP_COND_FL << NVFX_FP_OP_COND_SHIFT);
+        hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH; /* | else_offset */
+        hw[3] = 0; /* | endif_offset */
+        reloc.target = target;
+        reloc.location = fpc->inst_offset + 2;
+        util_dynarray_append(&fpc->label_relocs, struct nvfx_relocation, reloc);
+        reloc.target = target;
+        reloc.location = fpc->inst_offset + 3;
+        util_dynarray_append(&fpc->label_relocs, struct nvfx_relocation, reloc);
+}
+
+static void
+nv40_fp_brk(struct nvfx_fpc *fpc)
+{
+	uint32_t *hw;
+	fpc->inst_offset = fpc->fp->insn_len;
+	grow_insns(fpc, 4);
+	hw = &fpc->fp->insn[fpc->inst_offset];
+	/* no fp16 precision bits here, only the BRK opcode and OUT_NONE */
+	hw[0] = (NV40_FP_OP_BRA_OPCODE_BRK << NVFX_FP_OP_OPCODE_SHIFT) |
+		NV40_FP_OP_OUT_NONE;
+	/* identity swizzle placed at the X shift with the TR (always-true) test: unconditional break */
+	hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_X_SHIFT) |
+			(NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT);
+	hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH;
+	hw[3] = 0;
+}
+
+static INLINE struct nvfx_src
 tgsi_src(struct nvfx_fpc *fpc, const struct tgsi_full_src_register *fsrc)
 {
-	struct nvfx_sreg src = { 0 };
+	struct nvfx_src src;
 
 	switch (fsrc->Register.File) {
 	case TGSI_FILE_INPUT:
-		src = nvfx_sr(NVFXSR_INPUT,
-			      fpc->attrib_map[fsrc->Register.Index]);
+		if(fpc->pfp->info.input_semantic_name[fsrc->Register.Index] == TGSI_SEMANTIC_POSITION) {
+			assert(fpc->pfp->info.input_semantic_index[fsrc->Register.Index] == 0);
+			src.reg = nvfx_reg(NVFXSR_INPUT, NVFX_FP_OP_INPUT_SRC_POSITION);
+		} else if(fpc->pfp->info.input_semantic_name[fsrc->Register.Index] == TGSI_SEMANTIC_COLOR) {
+			if(fpc->pfp->info.input_semantic_index[fsrc->Register.Index] == 0)
+				src.reg = nvfx_reg(NVFXSR_INPUT, NVFX_FP_OP_INPUT_SRC_COL0);
+			else if(fpc->pfp->info.input_semantic_index[fsrc->Register.Index] == 1)
+				src.reg = nvfx_reg(NVFXSR_INPUT, NVFX_FP_OP_INPUT_SRC_COL1);
+			else
+				assert(0);
+		} else if(fpc->pfp->info.input_semantic_name[fsrc->Register.Index] == TGSI_SEMANTIC_FOG) {
+			assert(fpc->pfp->info.input_semantic_index[fsrc->Register.Index] == 0);
+			src.reg = nvfx_reg(NVFXSR_INPUT, NVFX_FP_OP_INPUT_SRC_FOGC);
+		} else if(fpc->pfp->info.input_semantic_name[fsrc->Register.Index] == TGSI_SEMANTIC_FACE) {
+			/* TODO: check this has the correct values */
+			/* XXX: what do we do for nv30 here (assuming it lacks facing)?!  */
+			assert(fpc->pfp->info.input_semantic_index[fsrc->Register.Index] == 0);
+			src.reg = nvfx_reg(NVFXSR_INPUT, NV40_FP_OP_INPUT_SRC_FACING);
+		} else {
+			assert(fpc->pfp->info.input_semantic_name[fsrc->Register.Index] == TGSI_SEMANTIC_GENERIC);
+			src.reg = nvfx_reg(NVFXSR_RELOCATED, fpc->generic_to_slot[fpc->pfp->info.input_semantic_index[fsrc->Register.Index]]);
+		}
 		break;
 	case TGSI_FILE_CONSTANT:
-		src = constant(fpc, fsrc->Register.Index, NULL);
+		src.reg = constant(fpc, fsrc->Register.Index, NULL);
 		break;
 	case TGSI_FILE_IMMEDIATE:
 		assert(fsrc->Register.Index < fpc->nr_imm);
-		src = fpc->imm[fsrc->Register.Index];
+		src.reg = fpc->imm[fsrc->Register.Index];
 		break;
 	case TGSI_FILE_TEMPORARY:
-		src = fpc->r_temp[fsrc->Register.Index];
+		src.reg = fpc->r_temp[fsrc->Register.Index];
 		break;
 	/* NV40 fragprog result regs are just temps, so this is simple */
 	case TGSI_FILE_OUTPUT:
-		src = fpc->r_result[fsrc->Register.Index];
+		src.reg = fpc->r_result[fsrc->Register.Index];
 		break;
 	default:
 		NOUVEAU_ERR("bad src file\n");
+		src.reg.index = 0;
+		src.reg.type = 0;
 		break;
 	}
 
@@ -271,7 +437,7 @@ tgsi_src(struct nvfx_fpc *fpc, const struct tgsi_full_src_register *fsrc)
 	return src;
 }
 
-static INLINE struct nvfx_sreg
+static INLINE struct nvfx_reg
 tgsi_dst(struct nvfx_fpc *fpc, const struct tgsi_full_dst_register *fdst) {
 	switch (fdst->Register.File) {
 	case TGSI_FILE_OUTPUT:
@@ -279,10 +445,10 @@ tgsi_dst(struct nvfx_fpc *fpc, const struct tgsi_full_dst_register *fdst) {
 	case TGSI_FILE_TEMPORARY:
 		return fpc->r_temp[fdst->Register.Index];
 	case TGSI_FILE_NULL:
-		return nvfx_sr(NVFXSR_NONE, 0);
+		return nvfx_reg(NVFXSR_NONE, 0);
 	default:
 		NOUVEAU_ERR("bad dst file %d\n", fdst->Register.File);
-		return nvfx_sr(NVFXSR_NONE, 0);
+		return nvfx_reg(NVFXSR_NONE, 0);
 	}
 }
 
@@ -302,8 +468,10 @@ static boolean
 nvfx_fragprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_fpc *fpc,
 				const struct tgsi_full_instruction *finst)
 {
-	const struct nvfx_sreg none = nvfx_sr(NVFXSR_NONE, 0);
-	struct nvfx_sreg src[3], dst, tmp;
+	const struct nvfx_src none = nvfx_src(nvfx_reg(NVFXSR_NONE, 0));
+	struct nvfx_insn insn;
+	struct nvfx_src src[3], tmp, tmp2;
+	struct nvfx_reg dst;
 	int mask, sat, unit = 0;
 	int ai = -1, ci = -1, ii = -1;
 	int i;
@@ -331,9 +499,8 @@ nvfx_fragprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_fpc *fpc,
 				ai = fsrc->Register.Index;
 				src[i] = tgsi_src(fpc, fsrc);
 			} else {
-				src[i] = temp(fpc);
-				arith(fpc, 0, MOV, src[i], NVFX_FP_MASK_ALL,
-				      tgsi_src(fpc, fsrc), none, none);
+				src[i] = nvfx_src(temp(fpc));
+				nvfx_fp_emit(fpc, arith(0, MOV, src[i].reg, NVFX_FP_MASK_ALL, tgsi_src(fpc, fsrc), none, none));
 			}
 			break;
 		case TGSI_FILE_CONSTANT:
@@ -342,9 +509,8 @@ nvfx_fragprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_fpc *fpc,
 				ci = fsrc->Register.Index;
 				src[i] = tgsi_src(fpc, fsrc);
 			} else {
-				src[i] = temp(fpc);
-				arith(fpc, 0, MOV, src[i], NVFX_FP_MASK_ALL,
-				      tgsi_src(fpc, fsrc), none, none);
+				src[i] = nvfx_src(temp(fpc));
+				nvfx_fp_emit(fpc, arith(0, MOV, src[i].reg, NVFX_FP_MASK_ALL, tgsi_src(fpc, fsrc), none, none));
 			}
 			break;
 		case TGSI_FILE_IMMEDIATE:
@@ -353,9 +519,8 @@ nvfx_fragprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_fpc *fpc,
 				ii = fsrc->Register.Index;
 				src[i] = tgsi_src(fpc, fsrc);
 			} else {
-				src[i] = temp(fpc);
-				arith(fpc, 0, MOV, src[i], NVFX_FP_MASK_ALL,
-				      tgsi_src(fpc, fsrc), none, none);
+				src[i] = nvfx_src(temp(fpc));
+				nvfx_fp_emit(fpc, arith(0, MOV, src[i].reg, NVFX_FP_MASK_ALL, tgsi_src(fpc, fsrc), none, none));
 			}
 			break;
 		case TGSI_FILE_TEMPORARY:
@@ -378,277 +543,345 @@ nvfx_fragprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_fpc *fpc,
 
 	switch (finst->Instruction.Opcode) {
 	case TGSI_OPCODE_ABS:
-		arith(fpc, sat, MOV, dst, mask, abs(src[0]), none, none);
+		nvfx_fp_emit(fpc, arith(sat, MOV, dst, mask, abs(src[0]), none, none));
 		break;
 	case TGSI_OPCODE_ADD:
-		arith(fpc, sat, ADD, dst, mask, src[0], src[1], none);
+		nvfx_fp_emit(fpc, arith(sat, ADD, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_CMP:
-		tmp = nvfx_sr(NVFXSR_NONE, 0);
-		tmp.cc_update = 1;
-		arith(fpc, 0, MOV, tmp, 0xf, src[0], none, none);
-		dst.cc_test = NVFX_COND_GE;
-		arith(fpc, sat, MOV, dst, mask, src[2], none, none);
-		dst.cc_test = NVFX_COND_LT;
-		arith(fpc, sat, MOV, dst, mask, src[1], none, none);
+		insn = arith(0, MOV, none.reg, mask, src[0], none, none);
+		insn.cc_update = 1;
+		nvfx_fp_emit(fpc, insn);
+
+		insn = arith(sat, MOV, dst, mask, src[2], none, none);
+		insn.cc_test = NVFX_COND_GE;
+		nvfx_fp_emit(fpc, insn);
+
+		insn = arith(sat, MOV, dst, mask, src[1], none, none);
+		insn.cc_test = NVFX_COND_LT;
+		nvfx_fp_emit(fpc, insn);
 		break;
 	case TGSI_OPCODE_COS:
-		arith(fpc, sat, COS, dst, mask, src[0], none, none);
+		nvfx_fp_emit(fpc, arith(sat, COS, dst, mask, src[0], none, none));
 		break;
 	case TGSI_OPCODE_DDX:
 		if (mask & (NVFX_FP_MASK_Z | NVFX_FP_MASK_W)) {
-			tmp = temp(fpc);
-			arith(fpc, sat, DDX, tmp, NVFX_FP_MASK_X | NVFX_FP_MASK_Y,
-			      swz(src[0], Z, W, Z, W), none, none);
-			arith(fpc, 0, MOV, tmp, NVFX_FP_MASK_Z | NVFX_FP_MASK_W,
-			      swz(tmp, X, Y, X, Y), none, none);
-			arith(fpc, sat, DDX, tmp, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, src[0],
-			      none, none);
-			arith(fpc, 0, MOV, dst, mask, tmp, none, none);
+			tmp = nvfx_src(temp(fpc));
+			nvfx_fp_emit(fpc, arith(sat, DDX, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, swz(src[0], Z, W, Z, W), none, none));
+			nvfx_fp_emit(fpc, arith(0, MOV, tmp.reg, NVFX_FP_MASK_Z | NVFX_FP_MASK_W, swz(tmp, X, Y, X, Y), none, none));
+			nvfx_fp_emit(fpc, arith(sat, DDX, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, src[0], none, none));
+			nvfx_fp_emit(fpc, arith(0, MOV, dst, mask, tmp, none, none));
 		} else {
-			arith(fpc, sat, DDX, dst, mask, src[0], none, none);
+			nvfx_fp_emit(fpc, arith(sat, DDX, dst, mask, src[0], none, none));
 		}
 		break;
 	case TGSI_OPCODE_DDY:
 		if (mask & (NVFX_FP_MASK_Z | NVFX_FP_MASK_W)) {
-			tmp = temp(fpc);
-			arith(fpc, sat, DDY, tmp, NVFX_FP_MASK_X | NVFX_FP_MASK_Y,
-			      swz(src[0], Z, W, Z, W), none, none);
-			arith(fpc, 0, MOV, tmp, NVFX_FP_MASK_Z | NVFX_FP_MASK_W,
-			      swz(tmp, X, Y, X, Y), none, none);
-			arith(fpc, sat, DDY, tmp, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, src[0],
-			      none, none);
-			arith(fpc, 0, MOV, dst, mask, tmp, none, none);
+			tmp = nvfx_src(temp(fpc));
+			nvfx_fp_emit(fpc, arith(sat, DDY, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, swz(src[0], Z, W, Z, W), none, none));
+			nvfx_fp_emit(fpc, arith(0, MOV, tmp.reg, NVFX_FP_MASK_Z | NVFX_FP_MASK_W, swz(tmp, X, Y, X, Y), none, none));
+			nvfx_fp_emit(fpc, arith(sat, DDY, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, src[0], none, none));
+			nvfx_fp_emit(fpc, arith(0, MOV, dst, mask, tmp, none, none));
 		} else {
-			arith(fpc, sat, DDY, dst, mask, src[0], none, none);
+			nvfx_fp_emit(fpc, arith(sat, DDY, dst, mask, src[0], none, none));
 		}
 		break;
+	case TGSI_OPCODE_DP2:
+		tmp = nvfx_src(temp(fpc));
+		nvfx_fp_emit(fpc, arith(0, MUL, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, src[0], src[1], none));
+		nvfx_fp_emit(fpc, arith(0, ADD, dst, mask, swz(tmp, X, X, X, X), swz(tmp, Y, Y, Y, Y), none));
+		break;
 	case TGSI_OPCODE_DP3:
-		arith(fpc, sat, DP3, dst, mask, src[0], src[1], none);
+		nvfx_fp_emit(fpc, arith(sat, DP3, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_DP4:
-		arith(fpc, sat, DP4, dst, mask, src[0], src[1], none);
+		nvfx_fp_emit(fpc, arith(sat, DP4, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_DPH:
-		tmp = temp(fpc);
-		arith(fpc, 0, DP3, tmp, NVFX_FP_MASK_X, src[0], src[1], none);
-		arith(fpc, sat, ADD, dst, mask, swz(tmp, X, X, X, X),
-		      swz(src[1], W, W, W, W), none);
+		tmp = nvfx_src(temp(fpc));
+		nvfx_fp_emit(fpc, arith(0, DP3, tmp.reg, NVFX_FP_MASK_X, src[0], src[1], none));
+		nvfx_fp_emit(fpc, arith(sat, ADD, dst, mask, swz(tmp, X, X, X, X), swz(src[1], W, W, W, W), none));
 		break;
 	case TGSI_OPCODE_DST:
-		arith(fpc, sat, DST, dst, mask, src[0], src[1], none);
+		nvfx_fp_emit(fpc, arith(sat, DST, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_EX2:
-		arith(fpc, sat, EX2, dst, mask, src[0], none, none);
+		nvfx_fp_emit(fpc, arith(sat, EX2, dst, mask, src[0], none, none));
 		break;
 	case TGSI_OPCODE_FLR:
-		arith(fpc, sat, FLR, dst, mask, src[0], none, none);
+		nvfx_fp_emit(fpc, arith(sat, FLR, dst, mask, src[0], none, none));
 		break;
 	case TGSI_OPCODE_FRC:
-		arith(fpc, sat, FRC, dst, mask, src[0], none, none);
+		nvfx_fp_emit(fpc, arith(sat, FRC, dst, mask, src[0], none, none));
 		break;
 	case TGSI_OPCODE_KILP:
-		arith(fpc, 0, KIL, none, 0, none, none, none);
+		nvfx_fp_emit(fpc, arith(0, KIL, none.reg, 0, none, none, none));
 		break;
 	case TGSI_OPCODE_KIL:
-		dst = nvfx_sr(NVFXSR_NONE, 0);
-		dst.cc_update = 1;
-		arith(fpc, 0, MOV, dst, NVFX_FP_MASK_ALL, src[0], none, none);
-		dst.cc_update = 0; dst.cc_test = NVFX_COND_LT;
-		arith(fpc, 0, KIL, dst, 0, none, none, none);
+		insn = arith(0, MOV, none.reg, NVFX_FP_MASK_ALL, src[0], none, none);
+		insn.cc_update = 1;
+		nvfx_fp_emit(fpc, insn);
+
+		insn = arith(0, KIL, none.reg, 0, none, none, none);
+		insn.cc_test = NVFX_COND_LT;
+		nvfx_fp_emit(fpc, insn);
 		break;
 	case TGSI_OPCODE_LG2:
-		arith(fpc, sat, LG2, dst, mask, src[0], none, none);
+		nvfx_fp_emit(fpc, arith(sat, LG2, dst, mask, src[0], none, none));
 		break;
 //	case TGSI_OPCODE_LIT:
 	case TGSI_OPCODE_LRP:
 		if(!nvfx->is_nv4x)
-			arith(fpc, sat, LRP_NV30, dst, mask, src[0], src[1], src[2]);
+			nvfx_fp_emit(fpc, arith(sat, LRP_NV30, dst, mask, src[0], src[1], src[2]));
 		else {
-			tmp = temp(fpc);
-			arith(fpc, 0, MAD, tmp, mask, neg(src[0]), src[2], src[2]);
-			arith(fpc, sat, MAD, dst, mask, src[0], src[1], tmp);
+			tmp = nvfx_src(temp(fpc));
+			nvfx_fp_emit(fpc, arith(0, MAD, tmp.reg, mask, neg(src[0]), src[2], src[2]));
+			nvfx_fp_emit(fpc, arith(sat, MAD, dst, mask, src[0], src[1], tmp));
 		}
 		break;
 	case TGSI_OPCODE_MAD:
-		arith(fpc, sat, MAD, dst, mask, src[0], src[1], src[2]);
+		nvfx_fp_emit(fpc, arith(sat, MAD, dst, mask, src[0], src[1], src[2]));
 		break;
 	case TGSI_OPCODE_MAX:
-		arith(fpc, sat, MAX, dst, mask, src[0], src[1], none);
+		nvfx_fp_emit(fpc, arith(sat, MAX, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_MIN:
-		arith(fpc, sat, MIN, dst, mask, src[0], src[1], none);
+		nvfx_fp_emit(fpc, arith(sat, MIN, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_MOV:
-		arith(fpc, sat, MOV, dst, mask, src[0], none, none);
+		nvfx_fp_emit(fpc, arith(sat, MOV, dst, mask, src[0], none, none));
 		break;
 	case TGSI_OPCODE_MUL:
-		arith(fpc, sat, MUL, dst, mask, src[0], src[1], none);
+		nvfx_fp_emit(fpc, arith(sat, MUL, dst, mask, src[0], src[1], none));
+		break;
+	case TGSI_OPCODE_NOP:
 		break;
 	case TGSI_OPCODE_POW:
 		if(!nvfx->is_nv4x)
-			arith(fpc, sat, POW_NV30, dst, mask, src[0], src[1], none);
+			nvfx_fp_emit(fpc, arith(sat, POW_NV30, dst, mask, src[0], src[1], none));
 		else {
-			tmp = temp(fpc);
-			arith(fpc, 0, LG2, tmp, NVFX_FP_MASK_X,
-			      swz(src[0], X, X, X, X), none, none);
-			arith(fpc, 0, MUL, tmp, NVFX_FP_MASK_X, swz(tmp, X, X, X, X),
-			      swz(src[1], X, X, X, X), none);
-			arith(fpc, sat, EX2, dst, mask,
-			      swz(tmp, X, X, X, X), none, none);
+			tmp = nvfx_src(temp(fpc));
+			nvfx_fp_emit(fpc, arith(0, LG2, tmp.reg, NVFX_FP_MASK_X, swz(src[0], X, X, X, X), none, none));
+			nvfx_fp_emit(fpc, arith(0, MUL, tmp.reg, NVFX_FP_MASK_X, swz(tmp, X, X, X, X), swz(src[1], X, X, X, X), none));
+			nvfx_fp_emit(fpc, arith(sat, EX2, dst, mask, swz(tmp, X, X, X, X), none, none));
 		}
 		break;
 	case TGSI_OPCODE_RCP:
-		arith(fpc, sat, RCP, dst, mask, src[0], none, none);
-		break;
-	case TGSI_OPCODE_RET:
-		assert(0);
+		nvfx_fp_emit(fpc, arith(sat, RCP, dst, mask, src[0], none, none));
 		break;
 	case TGSI_OPCODE_RFL:
 		if(!nvfx->is_nv4x)
-			arith(fpc, 0, RFL_NV30, dst, mask, src[0], src[1], none);
+			nvfx_fp_emit(fpc, arith(0, RFL_NV30, dst, mask, src[0], src[1], none));
 		else {
-			tmp = temp(fpc);
-			arith(fpc, 0, DP3, tmp, NVFX_FP_MASK_X, src[0], src[0], none);
-			arith(fpc, 0, DP3, tmp, NVFX_FP_MASK_Y, src[0], src[1], none);
-			arith(fpc, 0, DIV, scale(tmp, 2X), NVFX_FP_MASK_Z,
-			      swz(tmp, Y, Y, Y, Y), swz(tmp, X, X, X, X), none);
-			arith(fpc, sat, MAD, dst, mask,
-			      swz(tmp, Z, Z, Z, Z), src[0], neg(src[1]));
+			tmp = nvfx_src(temp(fpc));
+			nvfx_fp_emit(fpc, arith(0, DP3, tmp.reg, NVFX_FP_MASK_X, src[0], src[0], none));
+			nvfx_fp_emit(fpc, arith(0, DP3, tmp.reg, NVFX_FP_MASK_Y, src[0], src[1], none));
+			insn = arith(0, DIV, tmp.reg, NVFX_FP_MASK_Z, swz(tmp, Y, Y, Y, Y), swz(tmp, X, X, X, X), none);
+			insn.scale = NVFX_FP_OP_DST_SCALE_2X;
+			nvfx_fp_emit(fpc, insn);
+			nvfx_fp_emit(fpc, arith(sat, MAD, dst, mask, swz(tmp, Z, Z, Z, Z), src[0], neg(src[1])));
 		}
 		break;
 	case TGSI_OPCODE_RSQ:
 		if(!nvfx->is_nv4x)
-			arith(fpc, sat, RSQ_NV30, dst, mask, abs(swz(src[0], X, X, X, X)), none, none);
+			nvfx_fp_emit(fpc, arith(sat, RSQ_NV30, dst, mask, abs(swz(src[0], X, X, X, X)), none, none));
 		else {
-			tmp = temp(fpc);
-			arith(fpc, 0, LG2, scale(tmp, INV_2X), NVFX_FP_MASK_X,
-			      abs(swz(src[0], X, X, X, X)), none, none);
-			arith(fpc, sat, EX2, dst, mask,
-			      neg(swz(tmp, X, X, X, X)), none, none);
+			tmp = nvfx_src(temp(fpc));
+			insn = arith(0, LG2, tmp.reg, NVFX_FP_MASK_X, abs(swz(src[0], X, X, X, X)), none, none);
+			insn.scale = NVFX_FP_OP_DST_SCALE_INV_2X;
+			nvfx_fp_emit(fpc, insn);
+			nvfx_fp_emit(fpc, arith(sat, EX2, dst, mask, neg(swz(tmp, X, X, X, X)), none, none));
 		}
 		break;
 	case TGSI_OPCODE_SCS:
 		/* avoid overwriting the source */
 		if(src[0].swz[NVFX_SWZ_X] != NVFX_SWZ_X)
 		{
-			if (mask & NVFX_FP_MASK_X) {
-				arith(fpc, sat, COS, dst, NVFX_FP_MASK_X,
-				      swz(src[0], X, X, X, X), none, none);
-			}
-			if (mask & NVFX_FP_MASK_Y) {
-				arith(fpc, sat, SIN, dst, NVFX_FP_MASK_Y,
-				      swz(src[0], X, X, X, X), none, none);
-			}
+			if (mask & NVFX_FP_MASK_X)
+				nvfx_fp_emit(fpc, arith(sat, COS, dst, NVFX_FP_MASK_X, swz(src[0], X, X, X, X), none, none));
+			if (mask & NVFX_FP_MASK_Y)
+				nvfx_fp_emit(fpc, arith(sat, SIN, dst, NVFX_FP_MASK_Y, swz(src[0], X, X, X, X), none, none));
 		}
 		else
 		{
-			if (mask & NVFX_FP_MASK_Y) {
-				arith(fpc, sat, SIN, dst, NVFX_FP_MASK_Y,
-				      swz(src[0], X, X, X, X), none, none);
-			}
-			if (mask & NVFX_FP_MASK_X) {
-				arith(fpc, sat, COS, dst, NVFX_FP_MASK_X,
-				      swz(src[0], X, X, X, X), none, none);
-			}
+			if (mask & NVFX_FP_MASK_Y)
+				nvfx_fp_emit(fpc, arith(sat, SIN, dst, NVFX_FP_MASK_Y, swz(src[0], X, X, X, X), none, none));
+			if (mask & NVFX_FP_MASK_X)
+				nvfx_fp_emit(fpc, arith(sat, COS, dst, NVFX_FP_MASK_X, swz(src[0], X, X, X, X), none, none));
 		}
 		break;
 	case TGSI_OPCODE_SEQ:
-		arith(fpc, sat, SEQ, dst, mask, src[0], src[1], none);
+		nvfx_fp_emit(fpc, arith(sat, SEQ, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_SFL:
-		arith(fpc, sat, SFL, dst, mask, src[0], src[1], none);
+		nvfx_fp_emit(fpc, arith(sat, SFL, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_SGE:
-		arith(fpc, sat, SGE, dst, mask, src[0], src[1], none);
+		nvfx_fp_emit(fpc, arith(sat, SGE, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_SGT:
-		arith(fpc, sat, SGT, dst, mask, src[0], src[1], none);
+		nvfx_fp_emit(fpc, arith(sat, SGT, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_SIN:
-		arith(fpc, sat, SIN, dst, mask, src[0], none, none);
+		nvfx_fp_emit(fpc, arith(sat, SIN, dst, mask, src[0], none, none));
 		break;
 	case TGSI_OPCODE_SLE:
-		arith(fpc, sat, SLE, dst, mask, src[0], src[1], none);
+		nvfx_fp_emit(fpc, arith(sat, SLE, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_SLT:
-		arith(fpc, sat, SLT, dst, mask, src[0], src[1], none);
+		nvfx_fp_emit(fpc, arith(sat, SLT, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_SNE:
-		arith(fpc, sat, SNE, dst, mask, src[0], src[1], none);
+		nvfx_fp_emit(fpc, arith(sat, SNE, dst, mask, src[0], src[1], none));
+		break;
+	case TGSI_OPCODE_SSG: /* sign(x) built as (x > 0) - (x < 0), comparing against constant slot 0 */
+		tmp = nvfx_src(temp(fpc)); /* receives the SGT result */
+		tmp2 = nvfx_src(temp(fpc)); /* receives the SLT result */
+		nvfx_fp_emit(fpc, arith(0, SGT, tmp.reg, mask, src[0], nvfx_src(nvfx_reg(NVFXSR_CONST, 0)), none));
+		nvfx_fp_emit(fpc, arith(0, SLT, tmp2.reg, mask, src[0], nvfx_src(nvfx_reg(NVFXSR_CONST, 0)), none)); /* must target tmp2: writing tmp here would clobber the SGT result and leave tmp2 unwritten */
+		nvfx_fp_emit(fpc, arith(sat, ADD, dst, mask, tmp, neg(tmp2), none)); /* dst = tmp - tmp2 */
+		break;
 	case TGSI_OPCODE_STR:
-		arith(fpc, sat, STR, dst, mask, src[0], src[1], none);
+		nvfx_fp_emit(fpc, arith(sat, STR, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_SUB:
-		arith(fpc, sat, ADD, dst, mask, src[0], neg(src[1]), none);
+		nvfx_fp_emit(fpc, arith(sat, ADD, dst, mask, src[0], neg(src[1]), none));
 		break;
 	case TGSI_OPCODE_TEX:
-		tex(fpc, sat, TEX, unit, dst, mask, src[0], none, none);
+		nvfx_fp_emit(fpc, tex(sat, TEX, unit, dst, mask, src[0], none, none));
 		break;
-	case TGSI_OPCODE_TXB:
-		tex(fpc, sat, TXB, unit, dst, mask, src[0], none, none);
+        case TGSI_OPCODE_TRUNC:
+                tmp = nvfx_src(temp(fpc));
+                insn = arith(0, MOV, none.reg, mask, src[0], none, none);
+                insn.cc_update = 1;
+                nvfx_fp_emit(fpc, insn);
+
+                nvfx_fp_emit(fpc, arith(0, FLR, tmp.reg, mask, abs(src[0]), none, none));
+                nvfx_fp_emit(fpc, arith(sat, MOV, dst, mask, tmp, none, none));
+
+                insn = arith(sat, MOV, dst, mask, neg(tmp), none, none);
+                insn.cc_test = NVFX_COND_LT;
+                nvfx_fp_emit(fpc, insn);
+                break;
+        case TGSI_OPCODE_TXB:
+                nvfx_fp_emit(fpc, tex(sat, TXB, unit, dst, mask, src[0], none, none));
+                break;
+        case TGSI_OPCODE_TXL:
+                if(nvfx->is_nv4x)
+                        nvfx_fp_emit(fpc, tex(sat, TXL_NV40, unit, dst, mask, src[0], none, none));
+                else /* unsupported on nv30, use TEX and hope they like it */
+                        nvfx_fp_emit(fpc, tex(sat, TEX, unit, dst, mask, src[0], none, none));
+                break;
+        case TGSI_OPCODE_TXP:
+                nvfx_fp_emit(fpc, tex(sat, TXP, unit, dst, mask, src[0], none, none));
+                break;
+	case TGSI_OPCODE_XPD:
+		tmp = nvfx_src(temp(fpc));
+		nvfx_fp_emit(fpc, arith(0, MUL, tmp.reg, mask, swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none));
+		nvfx_fp_emit(fpc, arith(sat, MAD, dst, (mask & ~NVFX_FP_MASK_W), swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y), neg(tmp)));
 		break;
-	case TGSI_OPCODE_TXP:
-		tex(fpc, sat, TXP, unit, dst, mask, src[0], none, none);
+
+	case TGSI_OPCODE_IF:
+		// MOVRC0 R31 (TR0.xyzw), R<src>:
+		// IF (NE.xxxx) ELSE <else> END <end>
+		if(!nvfx->is_nv4x)
+			goto nv3x_cflow;
+		nv40_fp_if(fpc, src[0]);
 		break;
-	case TGSI_OPCODE_XPD:
-		tmp = temp(fpc);
-		arith(fpc, 0, MUL, tmp, mask,
-		      swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none);
-		arith(fpc, sat, MAD, dst, (mask & ~NVFX_FP_MASK_W),
-		      swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y),
-		      neg(tmp));
+
+	case TGSI_OPCODE_ELSE:
+	{
+		uint32_t *hw;
+		if(!nvfx->is_nv4x)
+			goto nv3x_cflow;
+		assert(util_dynarray_contains(&fpc->if_stack, unsigned));
+		hw = &fpc->fp->insn[util_dynarray_top(&fpc->if_stack, unsigned)];
+		hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH | fpc->fp->insn_len;
 		break;
-	default:
-		NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
-		return FALSE;
 	}
 
-	release_temps(fpc);
-	return TRUE;
-}
+	case TGSI_OPCODE_ENDIF:
+	{
+		uint32_t *hw;
+		if(!nvfx->is_nv4x)
+			goto nv3x_cflow;
+		assert(util_dynarray_contains(&fpc->if_stack, unsigned));
+		hw = &fpc->fp->insn[util_dynarray_pop(&fpc->if_stack, unsigned)];
+		if(!hw[2])
+			hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH | fpc->fp->insn_len;
+		hw[3] = fpc->fp->insn_len;
+		break;
+	}
 
-static boolean
-nvfx_fragprog_parse_decl_attrib(struct nvfx_context* nvfx, struct nvfx_fpc *fpc,
-				const struct tgsi_full_declaration *fdec)
-{
-	int hw;
+	case TGSI_OPCODE_BRA:
+		/* This can in limited cases be implemented with an IF with the else and endif labels pointing to the target */
+		/* no state tracker uses this, so don't implement this for now */
+		assert(0);
+		nv40_fp_bra(fpc, finst->Label.Label);
+		break;
 
-	switch (fdec->Semantic.Name) {
-	case TGSI_SEMANTIC_POSITION:
-		hw = NVFX_FP_OP_INPUT_SRC_POSITION;
+	case TGSI_OPCODE_BGNSUB:
+	case TGSI_OPCODE_ENDSUB:
+		/* nothing to do here */
 		break;
-	case TGSI_SEMANTIC_COLOR:
-		if (fdec->Semantic.Index == 0) {
-			hw = NVFX_FP_OP_INPUT_SRC_COL0;
-		} else
-		if (fdec->Semantic.Index == 1) {
-			hw = NVFX_FP_OP_INPUT_SRC_COL1;
-		} else {
-			NOUVEAU_ERR("bad colour semantic index\n");
-			return FALSE;
-		}
+
+	case TGSI_OPCODE_CAL:
+		if(!nvfx->is_nv4x)
+			goto nv3x_cflow;
+		nv40_fp_cal(fpc, finst->Label.Label);
 		break;
-	case TGSI_SEMANTIC_FOG:
-		hw = NVFX_FP_OP_INPUT_SRC_FOGC;
+
+	case TGSI_OPCODE_RET:
+		if(!nvfx->is_nv4x)
+			goto nv3x_cflow;
+		nv40_fp_ret(fpc);
 		break;
-	case TGSI_SEMANTIC_GENERIC:
-		if (fdec->Semantic.Index <= 7) {
-			hw = NVFX_FP_OP_INPUT_SRC_TC(fdec->Semantic.
-						     Index);
-		} else {
-			NOUVEAU_ERR("bad generic semantic index\n");
-			return FALSE;
+
+	case TGSI_OPCODE_BGNLOOP:
+		if(!nvfx->is_nv4x)
+			goto nv3x_cflow;
+		/* TODO: we should support using two nested REPs to allow a > 255 iteration count */
+		nv40_fp_rep(fpc, 255, finst->Label.Label);
+		break;
+
+	case TGSI_OPCODE_ENDLOOP:
+		break;
+
+	case TGSI_OPCODE_BRK:
+		if(!nvfx->is_nv4x)
+			goto nv3x_cflow;
+		nv40_fp_brk(fpc);
+		break;
+
+	case TGSI_OPCODE_CONT:
+	{
+		static int warned = 0;
+		if(!warned) {
+			NOUVEAU_ERR("Sorry, the continue keyword is not implemented: ignoring it.\n");
+			warned = 1;
 		}
 		break;
-	default:
-		NOUVEAU_ERR("bad input semantic\n");
+	}
+
+        default:
+		NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
 		return FALSE;
 	}
 
-	fpc->attrib_map[fdec->Range.First] = hw;
+out:
+	release_temps(fpc);
 	return TRUE;
+nv3x_cflow:
+	{
+		static int warned = 0;
+		if(!warned) {
+			NOUVEAU_ERR(
+					"Sorry, control flow instructions are not supported in hardware on nv3x: ignoring them\n"
+					"If rendering is incorrect, try to disable GLSL support in the application.\n");
+			warned = 1;
+		}
+	}
+	goto out;
 }
 
 static boolean
@@ -680,8 +913,8 @@ nvfx_fragprog_parse_decl_output(struct nvfx_context* nvfx, struct nvfx_fpc *fpc,
 		return FALSE;
 	}
 
-	fpc->r_result[idx] = nvfx_sr(NVFXSR_OUTPUT, hw);
-	fpc->r_temps |= (1 << hw);
+	fpc->r_result[idx] = nvfx_reg(NVFXSR_OUTPUT, hw);
+	fpc->r_temps |= (1ULL << hw);
 	return TRUE;
 }
 
@@ -690,8 +923,22 @@ nvfx_fragprog_prepare(struct nvfx_context* nvfx, struct nvfx_fpc *fpc)
 {
 	struct tgsi_parse_context p;
 	int high_temp = -1, i;
+	struct util_semantic_set set;
+	float const0v[4] = {0, 0, 0, 0};
+	struct nvfx_reg const0;
+
+	fpc->fp->num_slots = util_semantic_set_from_program_file(&set, fpc->pfp->pipe.tokens, TGSI_FILE_INPUT);
+	if(fpc->fp->num_slots > 8)
+		return FALSE;
+	util_semantic_layout_from_set(fpc->fp->slot_to_generic, &set, 0, 8);
+	util_semantic_table_from_layout(fpc->generic_to_slot, fpc->fp->slot_to_generic, 0, 8);
 
-	tgsi_parse_init(&p, fpc->fp->pipe.tokens);
+	memset(fpc->fp->slot_to_fp_input, 0xff, sizeof(fpc->fp->slot_to_fp_input));
+
+	const0 = constant(fpc, -1, const0v);
+	assert(const0.index == 0);
+
+	tgsi_parse_init(&p, fpc->pfp->pipe.tokens);
 	while (!tgsi_parse_end_of_tokens(&p)) {
 		const union tgsi_full_token *tok = &p.FullToken;
 
@@ -702,10 +949,6 @@ nvfx_fragprog_prepare(struct nvfx_context* nvfx, struct nvfx_fpc *fpc)
 			const struct tgsi_full_declaration *fdec;
 			fdec = &p.FullToken.FullDeclaration;
 			switch (fdec->Declaration.File) {
-			case TGSI_FILE_INPUT:
-				if (!nvfx_fragprog_parse_decl_attrib(nvfx, fpc, fdec))
-					goto out_err;
-				break;
 			case TGSI_FILE_OUTPUT:
 				if (!nvfx_fragprog_parse_decl_output(nvfx, fpc, fdec))
 					goto out_err;
@@ -744,40 +987,66 @@ nvfx_fragprog_prepare(struct nvfx_context* nvfx, struct nvfx_fpc *fpc)
 	tgsi_parse_free(&p);
 
 	if (++high_temp) {
-		fpc->r_temp = CALLOC(high_temp, sizeof(struct nvfx_sreg));
+		fpc->r_temp = CALLOC(high_temp, sizeof(struct nvfx_reg));
 		for (i = 0; i < high_temp; i++)
 			fpc->r_temp[i] = temp(fpc);
-		fpc->r_temps_discard = 0;
+		fpc->r_temps_discard = 0ULL;
 	}
 
 	return TRUE;
 
 out_err:
-	if (fpc->r_temp)
+	if (fpc->r_temp) {
 		FREE(fpc->r_temp);
+		fpc->r_temp = NULL;
+	}
 	tgsi_parse_free(&p);
 	return FALSE;
 }
 
-static void
+DEBUG_GET_ONCE_BOOL_OPTION(nvfx_dump_fp, "NVFX_DUMP_FP", FALSE)
+
+static struct nvfx_fragment_program*
 nvfx_fragprog_translate(struct nvfx_context *nvfx,
-			struct nvfx_fragment_program *fp)
+			struct nvfx_pipe_fragment_program *pfp,
+			boolean emulate_sprite_flipping)
 {
 	struct tgsi_parse_context parse;
 	struct nvfx_fpc *fpc = NULL;
+	struct util_dynarray insns;
+	struct nvfx_fragment_program* fp = NULL;
+        const int min_size = 4096;
 
-	fpc = CALLOC(1, sizeof(struct nvfx_fpc));
+	fp = CALLOC_STRUCT(nvfx_fragment_program);
+	if(!fp)
+		goto out_err;
+
+	fpc = CALLOC_STRUCT(nvfx_fpc);
 	if (!fpc)
-		return;
+		goto out_err;
+
+	fpc->max_temps = nvfx->is_nv4x ? 48 : 32;
+	fpc->pfp = pfp;
 	fpc->fp = fp;
 	fpc->num_regs = 2;
 
-	if (!nvfx_fragprog_prepare(nvfx, fpc)) {
-		FREE(fpc);
-		return;
-	}
+	if (!nvfx_fragprog_prepare(nvfx, fpc))
+		goto out_err;
 
-	tgsi_parse_init(&parse, fp->pipe.tokens);
+	tgsi_parse_init(&parse, pfp->pipe.tokens);
+	util_dynarray_init(&insns);
+
+	if(emulate_sprite_flipping)
+	{
+		struct nvfx_reg reg = temp(fpc);
+		struct nvfx_src sprite_input = nvfx_src(nvfx_reg(NVFXSR_RELOCATED, fp->num_slots));
+		float v[4] = {1, -1, 0, 0};
+		struct nvfx_src imm = nvfx_src(constant(fpc, -1, v));
+
+		fpc->sprite_coord_temp = reg.index;
+		fpc->r_temps_discard = 0ULL;
+		nvfx_fp_emit(fpc, arith(0, MAD, reg, NVFX_FP_MASK_ALL, sprite_input, swz(imm, X, Y, X, X), swz(imm, Z, X, Z, Z)));
+	}
 
 	while (!tgsi_parse_end_of_tokens(&parse)) {
 		tgsi_parse_token(&parse);
@@ -787,6 +1056,7 @@ nvfx_fragprog_translate(struct nvfx_context *nvfx,
 		{
 			const struct tgsi_full_instruction *finst;
 
+			util_dynarray_append(&insns, unsigned, fp->insn_len);
 			finst = &parse.FullToken.FullInstruction;
 			if (!nvfx_fragprog_parse_instruction(nvfx, fpc, finst))
 				goto out_err;
@@ -796,6 +1066,14 @@ nvfx_fragprog_translate(struct nvfx_context *nvfx,
 			break;
 		}
 	}
+	util_dynarray_append(&insns, unsigned, fp->insn_len);
+
+	for(unsigned i = 0; i < fpc->label_relocs.size; i += sizeof(struct nvfx_relocation))
+	{
+		struct nvfx_relocation* label_reloc = (struct nvfx_relocation*)((char*)fpc->label_relocs.data + i);
+		fp->insn[label_reloc->location] |= ((unsigned*)insns.data)[label_reloc->target];
+	}
+	util_dynarray_fini(&insns);
 
 	if(!nvfx->is_nv4x)
 		fp->fp_control |= (fpc->num_regs-1)/2;
@@ -804,9 +1082,9 @@ nvfx_fragprog_translate(struct nvfx_context *nvfx,
 
 	/* Terminate final instruction */
 	if(fp->insn)
-                fp->insn[fpc->inst_offset] |= 0x00000001;
+		fp->insn[fpc->inst_offset] |= 0x00000001;
 
-	/* Append NOP + END instruction, may or may not be necessary. */
+	/* Append NOP + END instruction for branches to the end of the program */
 	fpc->inst_offset = fp->insn_len;
 	grow_insns(fpc, 4);
 	fp->insn[fpc->inst_offset + 0] = 0x00000001;
@@ -814,12 +1092,48 @@ nvfx_fragprog_translate(struct nvfx_context *nvfx,
 	fp->insn[fpc->inst_offset + 2] = 0x00000000;
 	fp->insn[fpc->inst_offset + 3] = 0x00000000;
 
-	fp->translated = TRUE;
-out_err:
+	if(debug_get_option_nvfx_dump_fp())
+	{
+		debug_printf("\n");
+		tgsi_dump(pfp->pipe.tokens, 0);
+
+		debug_printf("\n%s fragment program:\n", nvfx->is_nv4x ? "nv4x" : "nv3x");
+		for (unsigned i = 0; i < fp->insn_len; i += 4)
+			debug_printf("%3u: %08x %08x %08x %08x\n", i >> 2, fp->insn[i], fp->insn[i + 1], fp->insn[i + 2], fp->insn[i + 3]);
+		debug_printf("\n");
+	}
+
+        fp->prog_size = (fp->insn_len * 4 + 63) & ~63;
+
+        if(fp->prog_size >= min_size)
+                fp->progs_per_bo = 1;
+        else
+                fp->progs_per_bo = min_size / fp->prog_size;
+        fp->bo_prog_idx = fp->progs_per_bo - 1;
+
+out:
 	tgsi_parse_free(&parse);
-	if (fpc->r_temp)
-		FREE(fpc->r_temp);
-	FREE(fpc);
+	if(fpc)
+	{
+		if (fpc->r_temp)
+			FREE(fpc->r_temp);
+		util_dynarray_fini(&fpc->if_stack);
+		util_dynarray_fini(&fpc->label_relocs);
+		//util_dynarray_fini(&fpc->loop_stack);
+		FREE(fpc);
+	}
+	return fp;
+
+out_err:
+	_debug_printf("Error: failed to compile this fragment program:\n");
+	tgsi_dump(pfp->pipe.tokens, 0);
+
+	if(fp)
+	{
+		FREE(fp);
+		fp = NULL;
+	}
+	goto out;
 }
 
 static inline void
@@ -836,53 +1150,189 @@ nvfx_fp_memcpy(void* dst, const void* src, size_t len)
 #endif
 }
 
+/* The hardware only supports immediate constants inside the fragment program,
+ * and at least on nv30 doesn't support an indirect linkage table.
+ *
+ * Hence, we need to patch the fragment program itself both to update constants
+ * and update linkage.
+ *
+ * Using a single fragment program would entail unacceptable stalls if the GPU is
+ * already rendering with that fragment program.
+ * Thus, we instead use a "rotating queue" of buffer objects, each of which is
+ * packed with multiple versions of the same program.
+ *
+ * Whenever we need to patch something, we move to the next program and
+ * patch it. If all buffer objects are in use by the GPU, we allocate another one,
+ * expanding the queue.
+ *
+ * As an additional optimization, we record when all the programs have the
+ * current input slot configuration, and at that point we stop patching inputs.
+ * This happens, for instance, if a given fragment program is always used with
+ * the same vertex program (i.e. always with GLSL), or if the layouts match
+ * enough (non-GLSL).
+ *
+ * Note that instead of using multiple programs, we could push commands
+ * on the FIFO to patch a single program: it's not fully clear which option is
+ * faster, but my guess is that the current way is faster.
+ *
+ * We also track the previous slot assignments for each version and don't
+ * patch if they are the same (this could perhaps be removed).
+ */
+
 void
 nvfx_fragprog_validate(struct nvfx_context *nvfx)
 {
 	struct nouveau_channel* chan = nvfx->screen->base.channel;
-	struct nvfx_fragment_program *fp = nvfx->fragprog;
-	int update = 0;
-
-	if (!fp->translated)
+	struct nvfx_pipe_fragment_program *pfp = nvfx->fragprog;
+	struct nvfx_vertex_program* vp;
+	/* Gallium always puts the point coord in GENERIC[0]
+	 * TODO: this is wrong, Gallium needs to be fixed
+	 */
+	unsigned sprite_coord_enable = nvfx->rasterizer->pipe.point_quad_rasterization * (nvfx->rasterizer->pipe.sprite_coord_enable | 1);
+
+	boolean emulate_sprite_flipping = sprite_coord_enable && nvfx->rasterizer->pipe.sprite_coord_mode;
+	unsigned key = emulate_sprite_flipping;
+	struct nvfx_fragment_program* fp;
+
+	fp = pfp->fps[key];
+	if (!fp)
 	{
-		const int min_size = 4096;
+		fp = nvfx_fragprog_translate(nvfx, pfp, emulate_sprite_flipping);
 
-		nvfx_fragprog_translate(nvfx, fp);
-		if (!fp->translated) {
-			static unsigned dummy[8] = {1, 0, 0, 0, 1, 0, 0, 0};
-			static int warned = 0;
-			if(!warned)
+		if(!fp)
+		{
+			if(!nvfx->dummy_fs)
 			{
-				fprintf(stderr, "nvfx: failed to translate fragment program!\n");
-				warned = 1;
+				struct ureg_program *ureg = ureg_create( TGSI_PROCESSOR_FRAGMENT );
+				if (ureg)
+				{
+					ureg_END( ureg );
+					nvfx->dummy_fs = ureg_create_shader_and_destroy( ureg, &nvfx->pipe );
+				}
+
+				if(!nvfx->dummy_fs)
+				{
+					_debug_printf("Error: unable to create a dummy fragment shader: aborting.");
+					abort();
+				}
 			}
 
-			/* use dummy program: we cannot fail here */
-			fp->translated = TRUE;
-			fp->insn = malloc(sizeof(dummy));
-			memcpy(fp->insn, dummy, sizeof(dummy));
-			fp->insn_len = sizeof(dummy) / sizeof(dummy[0]);
+			fp = nvfx_fragprog_translate(nvfx, nvfx->dummy_fs, FALSE);
+			emulate_sprite_flipping = FALSE;
+
+			if(!fp)
+			{
+				_debug_printf("Error: unable to compile even a dummy fragment shader: aborting.");
+				abort();
+			}
 		}
-		update = TRUE;
 
-		fp->prog_size = (fp->insn_len * 4 + 63) & ~63;
+		pfp->fps[key] = fp;
+	}
+
+	vp = nvfx->render_mode == HW ? nvfx->vertprog : nvfx->swtnl.vertprog;
 
-		if(fp->prog_size >= min_size)
-			fp->progs_per_bo = 1;
+	if (fp->last_vp_id != vp->id || fp->last_sprite_coord_enable != sprite_coord_enable) {
+		int sprite_real_input = -1;
+		int sprite_reloc_input;
+		unsigned i;
+		fp->last_vp_id = vp->id;
+		fp->last_sprite_coord_enable = sprite_coord_enable;
+
+		if(sprite_coord_enable)
+		{
+			sprite_real_input = vp->sprite_fp_input;
+			if(sprite_real_input < 0)
+			{
+				unsigned used_texcoords = 0;
+				for(unsigned i = 0; i < fp->num_slots; ++i) {
+					unsigned generic = fp->slot_to_generic[i];
+					if(!((1 << generic) & sprite_coord_enable))
+					{
+						unsigned char slot_mask = vp->generic_to_fp_input[generic];
+						if(slot_mask >= 0xf0)
+							used_texcoords |= 1 << ((slot_mask & 0xf) - NVFX_FP_OP_INPUT_SRC_TC0);
+					}
+				}
+
+				sprite_real_input = NVFX_FP_OP_INPUT_SRC_TC(__builtin_ctz(~used_texcoords));
+			}
+
+			fp->point_sprite_control |= (1 << (sprite_real_input - NVFX_FP_OP_INPUT_SRC_TC0 + 8));
+		}
 		else
-			fp->progs_per_bo = min_size / fp->prog_size;
-		fp->bo_prog_idx = fp->progs_per_bo - 1;
-	}
+			fp->point_sprite_control = 0;
 
-	/* we must update constants even on "just" fragprog changes, because
-	   we don't check whether the current constant buffer matches the latest
-	   one bound to this fragment program */
-	if (nvfx->dirty & (NVFX_NEW_FRAGCONST | NVFX_NEW_FRAGPROG))
-		update = TRUE;
+		if(emulate_sprite_flipping)
+		   sprite_reloc_input = 0;
+		else
+		   sprite_reloc_input = sprite_real_input;
 
-	if(update) {
+		for(i = 0; i < fp->num_slots; ++i) {
+			unsigned generic = fp->slot_to_generic[i];
+			if((1 << generic) & sprite_coord_enable)
+			{
+				if(fp->slot_to_fp_input[i] != sprite_reloc_input)
+					goto update_slots;
+			}
+			else
+			{
+				unsigned char slot_mask = vp->generic_to_fp_input[generic];
+				if((slot_mask >> 4) & (slot_mask ^ fp->slot_to_fp_input[i]))
+					goto update_slots;
+			}
+		}
+
+		if(emulate_sprite_flipping)
+		{
+			if(fp->slot_to_fp_input[fp->num_slots] != sprite_real_input)
+				goto update_slots;
+		}
+
+		if(0)
+		{
+update_slots:
+			/* optimization: we start updating from the slot we found the first difference in */
+			for(; i < fp->num_slots; ++i)
+			{
+				unsigned generic = fp->slot_to_generic[i];
+				if((1 << generic) & sprite_coord_enable)
+					fp->slot_to_fp_input[i] = sprite_reloc_input;
+				else
+					fp->slot_to_fp_input[i] = vp->generic_to_fp_input[generic] & 0xf;
+			}
+
+			fp->slot_to_fp_input[fp->num_slots] = sprite_real_input;
+
+			if(nvfx->is_nv4x)
+			{
+				fp->or = 0;
+				for(i = 0; i <= fp->num_slots; ++i) {
+					unsigned fp_input = fp->slot_to_fp_input[i];
+					if(fp_input == NVFX_FP_OP_INPUT_SRC_TC(8))
+						fp->or |= (1 << 12);
+					else if(fp_input == NVFX_FP_OP_INPUT_SRC_TC(9))
+						fp->or |= (1 << 13);
+					else if(fp_input >= NVFX_FP_OP_INPUT_SRC_TC(0) && fp_input <= NVFX_FP_OP_INPUT_SRC_TC(7))
+						fp->or |= (1 << (fp_input - NVFX_FP_OP_INPUT_SRC_TC0 + 14));
+				}
+			}
+
+			fp->progs_left_with_obsolete_slot_assignments = fp->progs;
+			goto update;
+		}
+	}
+
+	/* We must update constants even on "just" fragprog changes, because
+	  * we don't check whether the current constant buffer matches the latest
+	  * one bound to this fragment program.
+	  * Doing such a check would likely be a pessimization.
+	  */
+	if ((nvfx->hw_fragprog != fp) || (nvfx->dirty & (NVFX_NEW_FRAGPROG | NVFX_NEW_FRAGCONST))) {
 		int offset;
+		uint32_t* fpmap;
 
+update:
 		++fp->bo_prog_idx;
 		if(fp->bo_prog_idx >= fp->progs_per_bo)
 		{
@@ -892,10 +1342,12 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx)
 			}
 			else
 			{
-				struct nvfx_fragment_program_bo* fpbo = os_malloc_aligned(sizeof(struct nvfx_fragment_program) + fp->prog_size * fp->progs_per_bo, 16);
-				char *map, *buf;
-				int i;
+				struct nvfx_fragment_program_bo* fpbo = os_malloc_aligned(sizeof(struct nvfx_fragment_program) + (fp->prog_size + 8) * fp->progs_per_bo, 16);
+				uint8_t* map;
+				uint8_t* buf;
 
+				fpbo->slots = (unsigned char*)&fpbo->insn[(fp->prog_size) * fp->progs_per_bo];
+				memset(fpbo->slots, 0, 8 * fp->progs_per_bo);
 				if(fp->fpbo)
 				{
 					fpbo->next = fp->fpbo->next;
@@ -905,12 +1357,14 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx)
 					fpbo->next = fpbo;
 				fp->fpbo = fpbo;
 				fpbo->bo = 0;
+				fp->progs += fp->progs_per_bo;
+				fp->progs_left_with_obsolete_slot_assignments += fp->progs_per_bo;
 				nouveau_bo_new(nvfx->screen->base.device, NOUVEAU_BO_VRAM | NOUVEAU_BO_MAP, 64, fp->prog_size * fp->progs_per_bo, &fpbo->bo);
 				nouveau_bo_map(fpbo->bo, NOUVEAU_BO_NOSYNC);
 
 				map = fpbo->bo->map;
-				buf = fpbo->insn;
-				for(i = 0; i < fp->progs_per_bo; ++i)
+				buf = (uint8_t*)fpbo->insn;
+				for(unsigned i = 0; i < fp->progs_per_bo; ++i)
 				{
 					memcpy(buf, fp->insn, fp->insn_len * 4);
 					nvfx_fp_memcpy(map, fp->insn, fp->insn_len * 4);
@@ -922,13 +1376,11 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx)
 		}
 
 		offset = fp->bo_prog_idx * fp->prog_size;
+		fpmap = (uint32_t*)((char*)fp->fpbo->bo->map + offset);
 
 		if(nvfx->constbuf[PIPE_SHADER_FRAGMENT]) {
 			struct pipe_resource* constbuf = nvfx->constbuf[PIPE_SHADER_FRAGMENT];
-			// TODO: avoid using transfers, just directly the buffer
-			struct pipe_transfer* transfer;
-			// TODO: does this check make any sense, or should we do this unconditionally?
-			uint32_t* map = pipe_buffer_map(&nvfx->pipe, constbuf, PIPE_TRANSFER_READ, &transfer);
+			uint32_t* map = (uint32_t*)nvfx_buffer(constbuf)->data;
 			uint32_t* fpmap = (uint32_t*)((char*)fp->fpbo->bo->map + offset);
 			uint32_t* buf = (uint32_t*)((char*)fp->fpbo->insn + offset);
 			int i;
@@ -942,12 +1394,61 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx)
 					nvfx_fp_memcpy(&fpmap[off], &map[idx], 4 * sizeof(uint32_t));
 				}
 			}
-			pipe_buffer_unmap(&nvfx->pipe, constbuf, transfer);
 		}
-	}
 
-	if(update || (nvfx->dirty & NVFX_NEW_FRAGPROG)) {
-		int offset = fp->bo_prog_idx * fp->prog_size;
+		/* we only do this if we aren't sure that all program versions have the
+		 * current slot assignments, otherwise we just update constants for speed
+		 */
+		if(fp->progs_left_with_obsolete_slot_assignments) {
+			unsigned char* fpbo_slots = &fp->fpbo->slots[fp->bo_prog_idx * 8];
+			/* also relocate sprite coord slot, if any */
+			for(unsigned i = 0; i <= fp->num_slots; ++i) {
+				unsigned value = fp->slot_to_fp_input[i];;
+				if(value != fpbo_slots[i]) {
+					unsigned* p;
+					unsigned* begin = (unsigned*)fp->slot_relocations[i].data;
+					unsigned* end = (unsigned*)((char*)fp->slot_relocations[i].data + fp->slot_relocations[i].size);
+					//printf("fp %p reloc slot %u/%u: %u -> %u\n", fp, i, fp->num_slots, fpbo_slots[i], value);
+					if(value == 0)
+					{
+						/* was relocated to an input, switch type to temporary */
+						for(p = begin; p != end; ++p) {
+							unsigned off = *p;
+							unsigned dw = fp->insn[off];
+							dw &=~ NVFX_FP_REG_TYPE_MASK;
+							//printf("reloc_tmp at %x\n", off);
+							nvfx_fp_memcpy(&fpmap[off], &dw, sizeof(dw));
+						}
+					} else {
+						if(!fpbo_slots[i])
+						{
+							/* was relocated to a temporary, switch type to input */
+							for(p= begin; p != end; ++p) {
+								unsigned off = *p;
+								unsigned dw = fp->insn[off];
+								//printf("reloc_in at %x\n", off);
+								dw |= NVFX_FP_REG_TYPE_INPUT << NVFX_FP_REG_TYPE_SHIFT;
+								nvfx_fp_memcpy(&fpmap[off], &dw, sizeof(dw));
+							}
+						}
+
+						/* set the correct input index */
+						for(p = begin; p != end; ++p) {
+							unsigned off = *p & ~3;
+							unsigned dw = fp->insn[off];
+							//printf("reloc&~3 at %x\n", off);
+							dw = (dw & ~NVFX_FP_OP_INPUT_SRC_MASK) | (value << NVFX_FP_OP_INPUT_SRC_SHIFT);
+							nvfx_fp_memcpy(&fpmap[off], &dw, sizeof(dw));
+						}
+					}
+					fpbo_slots[i] = value;
+				}
+			}
+			--fp->progs_left_with_obsolete_slot_assignments;
+		}
+
+		nvfx->hw_fragprog = fp;
+
 		MARK_RING(chan, 8, 1);
 		OUT_RING(chan, RING_3D(NV34TCL_FP_ACTIVE_PROGRAM, 1));
 		OUT_RELOC(chan, fp->fpbo->bo, offset, NOUVEAU_BO_VRAM |
@@ -963,13 +1464,26 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx)
 			OUT_RING(chan, fp->samplers);
 		}
 	}
+
+	{
+		unsigned pointsprite_control = fp->point_sprite_control | nvfx->rasterizer->pipe.point_quad_rasterization;
+		if(pointsprite_control != nvfx->hw_pointsprite_control)
+		{
+			WAIT_RING(chan, 2);
+			OUT_RING(chan, RING_3D(NV34TCL_POINT_SPRITE, 1));
+			OUT_RING(chan, pointsprite_control);
+			nvfx->hw_pointsprite_control = pointsprite_control;
+		}
+	}
+
+	nvfx->relocs_needed &=~ NVFX_RELOCATE_FRAGPROG;
 }
 
 void
 nvfx_fragprog_relocate(struct nvfx_context *nvfx)
 {
 	struct nouveau_channel* chan = nvfx->screen->base.channel;
-	struct nvfx_fragment_program *fp = nvfx->fragprog;
+	struct nvfx_fragment_program *fp = nvfx->hw_fragprog;
 	struct nouveau_bo* bo = fp->fpbo->bo;
 	int offset = fp->bo_prog_idx * fp->prog_size;
 	unsigned fp_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_RD; // TODO: GART?
@@ -979,12 +1493,14 @@ nvfx_fragprog_relocate(struct nvfx_context *nvfx)
 	OUT_RELOC(chan, bo, offset, fp_flags | NOUVEAU_BO_LOW |
 		      NOUVEAU_BO_OR, NV34TCL_FP_ACTIVE_PROGRAM_DMA0,
 		      NV34TCL_FP_ACTIVE_PROGRAM_DMA1);
+	nvfx->relocs_needed &=~ NVFX_RELOCATE_FRAGPROG;
 }
 
 void
 nvfx_fragprog_destroy(struct nvfx_context *nvfx,
 		      struct nvfx_fragment_program *fp)
 {
+	unsigned i;
 	struct nvfx_fragment_program_bo* fpbo = fp->fpbo;
 	if(fpbo)
 	{
@@ -999,7 +1515,60 @@ nvfx_fragprog_destroy(struct nvfx_context *nvfx,
 		while(fpbo != fp->fpbo);
 	}
 
+	for(i = 0; i < Elements(fp->slot_relocations); ++i)
+		util_dynarray_fini(&fp->slot_relocations[i]);
+
 	if (fp->insn_len)
 		FREE(fp->insn);
 }
 
+static void *
+nvfx_fp_state_create(struct pipe_context *pipe,
+                     const struct pipe_shader_state *cso)
+{
+        /* Zeroed wrapper around a private copy of the TGSI tokens; returns NULL if the allocation fails. */
+        struct nvfx_pipe_fragment_program *pfp = CALLOC_STRUCT(nvfx_pipe_fragment_program);
+        if (!pfp)
+                return NULL;
+
+        pfp->pipe.tokens = tgsi_dup_tokens(cso->tokens);
+        tgsi_scan_shader(pfp->pipe.tokens, &pfp->info);
+        return (void *)pfp;
+}
+
+static void
+nvfx_fp_state_bind(struct pipe_context *pipe, void *hwcso)
+{ /* Record hwcso as the context's current fragment program and mark it dirty. */
+        struct nvfx_context *nvfx = nvfx_context(pipe);
+
+        nvfx->fragprog = hwcso;
+        nvfx->dirty |= NVFX_NEW_FRAGPROG; /* nvfx_fragprog_validate tests this bit before re-uploading */
+}
+
+static void
+nvfx_fp_state_delete(struct pipe_context *pipe, void *hwcso)
+{ /* Free every translated variant of the CSO, then its token copy and the CSO itself. */
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nvfx_pipe_fragment_program *pfp = hwcso;
+	unsigned i;
+
+	for(i = 0; i < Elements(pfp->fps); ++i) /* fps[] is indexed by the variant key chosen in nvfx_fragprog_validate */
+	{
+		if(pfp->fps[i])
+		{
+			nvfx_fragprog_destroy(nvfx, pfp->fps[i]);
+			FREE(pfp->fps[i]); /* the struct itself is released here, after destroy() */
+		}
+	}
+
+        FREE((void*)pfp->pipe.tokens); /* cast drops the const qualifier on the token copy */
+        FREE(pfp); /* NOTE(review): nvfx->fragprog / nvfx->hw_fragprog are not cleared here - confirm they cannot still reference this CSO */
+}
+
+void
+nvfx_init_fragprog_functions(struct nvfx_context *nvfx)
+{ /* Install the fragment-shader CSO create/bind/delete callbacks on nvfx->pipe. */
+        nvfx->pipe.create_fs_state = nvfx_fp_state_create;
+        nvfx->pipe.bind_fs_state = nvfx_fp_state_bind;
+        nvfx->pipe.delete_fs_state = nvfx_fp_state_delete;
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_fragtex.c b/src/gallium/drivers/nvfx/nvfx_fragtex.c
index 0b4a434fec..6503c7afcb 100644
--- a/src/gallium/drivers/nvfx/nvfx_fragtex.c
+++ b/src/gallium/drivers/nvfx/nvfx_fragtex.c
@@ -1,5 +1,177 @@
 #include "nvfx_context.h"
 #include "nvfx_resource.h"
+#include "nvfx_tex.h"
+
+static void *
+nvfx_sampler_state_create(struct pipe_context *pipe,
+			  const struct pipe_sampler_state *cso)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nvfx_sampler_state *ps = MALLOC(sizeof(struct nvfx_sampler_state));
+
+	if (!ps) /* allocation failed: return NULL instead of dereferencing it below */
+		return NULL;
+	/* on nv30, we use this as an internal flag */
+	ps->fmt = cso->normalized_coords ? 0 : NV40TCL_TEX_FORMAT_RECT;
+	ps->en = 0;
+	ps->filt = nvfx_tex_filter(cso) | 0x2000; /*voodoo*/
+	ps->wrap = (nvfx_tex_wrap_mode(cso->wrap_s) << NV34TCL_TX_WRAP_S_SHIFT) |
+		    (nvfx_tex_wrap_mode(cso->wrap_t) << NV34TCL_TX_WRAP_T_SHIFT) |
+		    (nvfx_tex_wrap_mode(cso->wrap_r) << NV34TCL_TX_WRAP_R_SHIFT);
+	ps->compare = FALSE;
+
+	if(cso->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE)
+	{
+		ps->wrap |= nvfx_tex_wrap_compare_mode(cso->compare_func); /* compare function bits are OR-ed into the wrap word */
+		ps->compare = TRUE;
+	}
+	ps->bcol = nvfx_tex_border_color(cso->border_color);
+
+	if(nvfx->is_nv4x) /* remaining fields are filled in by the generation-specific init */
+		nv40_sampler_state_init(pipe, ps, cso);
+	else
+		nv30_sampler_state_init(pipe, ps, cso);
+
+	return (void *)ps;
+}
+
+static void
+nvfx_sampler_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	FREE(hwcso); /* the sampler CSO is the single block allocated by nvfx_sampler_state_create */
+}
+
+static void
+nvfx_sampler_state_bind(struct pipe_context *pipe, unsigned nr, void **sampler)
+{ /* Bind the first nr sampler CSOs and clear any units left over from a larger previous bind. */
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	unsigned unit;
+
+	for (unit = 0; unit < nr; unit++) { /* install the new samplers */
+		nvfx->tex_sampler[unit] = sampler[unit];
+		nvfx->dirty_samplers |= (1 << unit);
+	}
+
+	for (unit = nr; unit < nvfx->nr_samplers; unit++) { /* units [nr, previous count) become unbound */
+		nvfx->tex_sampler[unit] = NULL;
+		nvfx->dirty_samplers |= (1 << unit);
+	}
+
+	nvfx->nr_samplers = nr;
+	nvfx->dirty |= NVFX_NEW_SAMPLER;
+}
+
+static struct pipe_sampler_view *
+nvfx_create_sampler_view(struct pipe_context *pipe,
+			 struct pipe_resource *pt,
+			 const struct pipe_sampler_view *templ)
+{ /* Build an nvfx_sampler_view for pt from templ; returns NULL if the allocation fails. */
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nvfx_sampler_view *sv = CALLOC_STRUCT(nvfx_sampler_view);
+	struct nvfx_texture_format *tf = &nvfx_texture_formats[templ->format];
+	unsigned txf;
+
+	if (!sv)
+		return NULL;
+
+	sv->base = *templ; /* start from the caller's template */
+	sv->base.reference.count = 1;
+	sv->base.texture = NULL; /* presumably so the reference call below does not release the template's texture - confirm */
+	pipe_resource_reference(&sv->base.texture, pt);
+	sv->base.context = pipe;
+
+	txf = NV34TCL_TX_FORMAT_NO_BORDER;
+
+	switch (pt->target) { /* select the dimensionality bits of the format word */
+	case PIPE_TEXTURE_CUBE:
+		txf |= NV34TCL_TX_FORMAT_CUBIC;
+		/* fall-through */
+	case PIPE_TEXTURE_2D:
+	case PIPE_TEXTURE_RECT:
+		txf |= NV34TCL_TX_FORMAT_DIMS_2D;
+		break;
+	case PIPE_TEXTURE_3D:
+		txf |= NV34TCL_TX_FORMAT_DIMS_3D;
+		break;
+	case PIPE_TEXTURE_1D:
+		txf |= NV34TCL_TX_FORMAT_DIMS_1D;
+		break;
+	default:
+		assert(0);
+	}
+	sv->u.init_fmt = txf;
+
+	sv->swizzle = 0 /* note: swizzle_r feeds the Z fields and swizzle_b the X fields */
+			| (tf->src[sv->base.swizzle_r] << NV34TCL_TX_SWIZZLE_S0_Z_SHIFT)
+			| (tf->src[sv->base.swizzle_g] << NV34TCL_TX_SWIZZLE_S0_Y_SHIFT)
+			| (tf->src[sv->base.swizzle_b] << NV34TCL_TX_SWIZZLE_S0_X_SHIFT)
+			| (tf->src[sv->base.swizzle_a] << NV34TCL_TX_SWIZZLE_S0_W_SHIFT)
+			| (tf->comp[sv->base.swizzle_r] << NV34TCL_TX_SWIZZLE_S1_Z_SHIFT)
+			| (tf->comp[sv->base.swizzle_g] << NV34TCL_TX_SWIZZLE_S1_Y_SHIFT)
+			| (tf->comp[sv->base.swizzle_b] << NV34TCL_TX_SWIZZLE_S1_X_SHIFT)
+			| (tf->comp[sv->base.swizzle_a] << NV34TCL_TX_SWIZZLE_S1_W_SHIFT);
+
+	sv->filt = tf->sign;
+	sv->wrap = tf->wrap;
+	sv->wrap_mask = ~0;
+
+	if (pt->target == PIPE_TEXTURE_CUBE)
+	{ /* cube maps use offset 0 and the level-0 size; first_level is not applied on this path */
+		sv->offset = 0;
+		sv->npot_size = (pt->width0 << NV34TCL_TX_NPOT_SIZE_W_SHIFT) | pt->height0;
+	}
+	else
+	{ /* other targets start at first_level: offset and size are both taken for that level */
+		sv->offset = nvfx_subresource_offset(pt, 0, sv->base.first_level, 0);
+		sv->npot_size = (u_minify(pt->width0, sv->base.first_level) << NV34TCL_TX_NPOT_SIZE_W_SHIFT) | u_minify(pt->height0, sv->base.first_level);
+
+		/* apparently, we need to ignore the t coordinate for 1D textures to fix piglit tex1d-2dborder */
+		if(pt->target == PIPE_TEXTURE_1D)
+		{
+			sv->wrap_mask &=~ NV34TCL_TX_WRAP_T_MASK;
+			sv->wrap |= NV34TCL_TX_WRAP_T_REPEAT;
+		}
+	}
+
+	if(nvfx->is_nv4x) /* generation-specific part of the view setup */
+		nv40_sampler_view_init(pipe, sv);
+	else
+		nv30_sampler_view_init(pipe, sv);
+
+	return &sv->base;
+}
+
+static void
+nvfx_sampler_view_destroy(struct pipe_context *pipe,
+			  struct pipe_sampler_view *view)
+{
+	pipe_resource_reference(&view->texture, NULL); /* drop the texture reference taken in nvfx_create_sampler_view */
+	FREE(view);
+}
+
+static void
+nvfx_set_fragment_sampler_views(struct pipe_context *pipe,
+				unsigned nr,
+				struct pipe_sampler_view **views)
+{ /* Bind the first nr sampler views and release any units left over from a larger previous bind. */
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	unsigned unit;
+
+	for (unit = 0; unit < nr; unit++) { /* install the new views */
+		pipe_sampler_view_reference(&nvfx->fragment_sampler_views[unit],
+                                            views[unit]);
+		nvfx->dirty_samplers |= (1 << unit);
+	}
+
+	for (unit = nr; unit < nvfx->nr_textures; unit++) { /* units [nr, previous count) are reset to NULL */
+		pipe_sampler_view_reference(&nvfx->fragment_sampler_views[unit],
+                                            NULL);
+		nvfx->dirty_samplers |= (1 << unit);
+	}
+
+	nvfx->nr_textures = nr;
+	nvfx->dirty |= NVFX_NEW_SAMPLER;
+}
 
 void
 nvfx_fragtex_validate(struct nvfx_context *nvfx)
@@ -16,6 +188,10 @@ nvfx_fragtex_validate(struct nvfx_context *nvfx)
 		samplers &= ~(1 << unit);
 
 		if(nvfx->fragment_sampler_views[unit] && nvfx->tex_sampler[unit]) {
+			util_dirty_surfaces_use_for_sampling(&nvfx->pipe,
+					&((struct nvfx_miptree*)nvfx->fragment_sampler_views[unit]->texture)->dirty_surfaces,
+					nvfx_surface_flush);
+
 			if(!nvfx->is_nv4x)
 				nv30_fragtex_set(nvfx, unit);
 			else
@@ -29,6 +205,7 @@ nvfx_fragtex_validate(struct nvfx_context *nvfx)
 		}
 	}
 	nvfx->dirty_samplers = 0;
+	nvfx->relocs_needed &=~ NVFX_RELOCATE_FRAGTEX;
 }
 
 void
@@ -55,4 +232,128 @@ nvfx_fragtex_relocate(struct nvfx_context *nvfx)
 		OUT_RELOC(chan, bo, nvfx->hw_txf[unit], tex_flags | NOUVEAU_BO_OR | NOUVEAU_BO_DUMMY,
 				NV34TCL_TX_FORMAT_DMA0, NV34TCL_TX_FORMAT_DMA1);
 	}
+	nvfx->relocs_needed &=~ NVFX_RELOCATE_FRAGTEX;
+}
+
+void
+nvfx_init_sampling_functions(struct nvfx_context *nvfx)
+{ /* Install the nvfx sampler-state and sampler-view callbacks into nvfx->pipe. */
+	nvfx->pipe.create_sampler_state = nvfx_sampler_state_create;
+	nvfx->pipe.bind_fragment_sampler_states = nvfx_sampler_state_bind;
+	nvfx->pipe.delete_sampler_state = nvfx_sampler_state_delete;
+	nvfx->pipe.set_fragment_sampler_views = nvfx_set_fragment_sampler_views;
+	nvfx->pipe.create_sampler_view = nvfx_create_sampler_view;
+	nvfx->pipe.sampler_view_destroy = nvfx_sampler_view_destroy;
+}
+
+#define NV34TCL_TX_FORMAT_FORMAT_DXT1_RECT NV34TCL_TX_FORMAT_FORMAT_DXT1
+#define NV34TCL_TX_FORMAT_FORMAT_DXT3_RECT NV34TCL_TX_FORMAT_FORMAT_DXT3
+#define NV34TCL_TX_FORMAT_FORMAT_DXT5_RECT NV34TCL_TX_FORMAT_FORMAT_DXT5
+
+#define NV40TCL_TEX_FORMAT_FORMAT_HILO16 NV40TCL_TEX_FORMAT_FORMAT_A16L16
+
+#define NV34TCL_TX_FORMAT_FORMAT_RGBA16F 0x00004a00
+#define NV34TCL_TX_FORMAT_FORMAT_RGBA16F_RECT NV34TCL_TX_FORMAT_FORMAT_RGBA16F
+#define NV34TCL_TX_FORMAT_FORMAT_RGBA32F 0x00004b00
+#define NV34TCL_TX_FORMAT_FORMAT_RGBA32F_RECT NV34TCL_TX_FORMAT_FORMAT_RGBA32F
+#define NV34TCL_TX_FORMAT_FORMAT_R32F 0x00004c00
+#define NV34TCL_TX_FORMAT_FORMAT_R32F_RECT NV34TCL_TX_FORMAT_FORMAT_R32F
+
+// TODO: guess!
+#define NV40TCL_TEX_FORMAT_FORMAT_R32F 0x00001c00
+
+#define SRGB 0x00700000
+
+#define __(m,tf,tfc,ts0x,ts0y,ts0z,ts0w,ts1x,ts1y,ts1z,ts1w,sign,wrap) \
+[PIPE_FORMAT_##m] = { \
+  {NV34TCL_TX_FORMAT_FORMAT_##tf, \
+  NV34TCL_TX_FORMAT_FORMAT_##tfc, \
+  NV34TCL_TX_FORMAT_FORMAT_##tf##_RECT, \
+  NV34TCL_TX_FORMAT_FORMAT_##tfc##_RECT, \
+  NV40TCL_TEX_FORMAT_FORMAT_##tf, \
+  NV40TCL_TEX_FORMAT_FORMAT_##tfc}, \
+  sign, wrap, \
+  {ts0z, ts0y, ts0x, ts0w, 0, 1}, {ts1z, ts1y, ts1x, ts1w, 0, 0} \
 }
+
+#define _(m,tf,ts0x,ts0y,ts0z,ts0w,ts1x,ts1y,ts1z,ts1w,sign, wrap) \
+	__(m,tf,tf,ts0x,ts0y,ts0z,ts0w,ts1x,ts1y,ts1z,ts1w,sign, wrap)
+
+/* Depth formats work by reading the most significant 8/16 bits of the depth value.
+ * We are losing precision, but nVidia loses even more by using A8R8G8B8 instead of HILO16
+ * There is no 32-bit integer texture support, so other things are infeasible.
+ *
+ * TODO: is it possible to read 16 bits for Z16? A16 doesn't seem to work, either due to normalization or endianness issues
+ */
+
+#define T 2
+
+#define X 3
+#define Y 2
+#define Z 1
+#define W 0
+
+#define SNORM ((NV34TCL_TX_FILTER_SIGNED_RED) | (NV34TCL_TX_FILTER_SIGNED_GREEN) | (NV34TCL_TX_FILTER_SIGNED_BLUE) | (NV34TCL_TX_FILTER_SIGNED_ALPHA))
+#define UNORM 0
+
+struct nvfx_texture_format
+nvfx_texture_formats[PIPE_FORMAT_COUNT] = {
+	[0 ... PIPE_FORMAT_COUNT - 1] = {{-1, -1, -1, -1, -1, -1}},
+	_(B8G8R8X8_UNORM,	A8R8G8B8,	T, T, T, 1, X, Y, Z, W, UNORM, 0),
+	_(B8G8R8X8_SRGB,	A8R8G8B8,	T, T, T, 1, X, Y, Z, W, UNORM, SRGB),
+	_(B8G8R8A8_UNORM,	A8R8G8B8,	T, T, T, T, X, Y, Z, W, UNORM, 0),
+	_(B8G8R8A8_SRGB,	A8R8G8B8,	T, T, T, T, X, Y, Z, W, UNORM, SRGB),
+
+	_(R8G8B8A8_UNORM,	A8R8G8B8,	T, T, T, T, Z, Y, X, W, UNORM, 0),
+	_(R8G8B8A8_SRGB,	A8R8G8B8,	T, T, T, T, Z, Y, X, W, UNORM, SRGB),
+	_(R8G8B8X8_UNORM,	A8R8G8B8,	T, T, T, 1, Z, Y, X, W, UNORM, 0),
+
+	_(A8R8G8B8_UNORM,	A8R8G8B8,	T, T, T, T, W, Z, Y, X, UNORM, 0),
+	_(A8R8G8B8_SRGB,	A8R8G8B8,	T, T, T, T, W, Z, Y, X, UNORM, SRGB),
+	_(A8B8G8R8_UNORM,	A8R8G8B8,	T, T, T, T, W, X, Y, Z, UNORM, 0),
+	_(A8B8G8R8_SRGB,	A8R8G8B8,	T, T, T, T, W, X, Y, Z, UNORM, SRGB),
+	_(X8R8G8B8_UNORM,	A8R8G8B8,	T, T, T, 1, W, Z, Y, X, UNORM, 0),
+	_(X8R8G8B8_SRGB,	A8R8G8B8,	T, T, T, 1, W, Z, Y, X, UNORM, SRGB),
+
+	_(B5G5R5A1_UNORM,	A1R5G5B5, 	T, T, T, T, X, Y, Z, W, UNORM, 0),
+	_(B5G5R5X1_UNORM,	A1R5G5B5, 	T, T, T, 1, X, Y, Z, W, UNORM, 0),
+
+	_(B4G4R4A4_UNORM,	A4R4G4B4, 	T, T, T, T, X, Y, Z, W, UNORM, 0),
+	_(B4G4R4X4_UNORM,	A4R4G4B4, 	T, T, T, 1, X, Y, Z, W, UNORM, 0),
+
+	_(B5G6R5_UNORM,		R5G6B5, 	T, T, T, 1, X, Y, Z, W, UNORM, 0),
+
+	_(R8_UNORM,		L8,		T, 0, 0, 1, X, X, X, X, UNORM, 0),
+	_(R8_SNORM,		L8,		T, 0, 0, 1, X, X, X, X, SNORM, 0),
+	_(L8_UNORM,		L8,		T, T, T, 1, X, X, X, X, UNORM, 0),
+	_(L8_SRGB,		L8,		T, T, T, 1, X, X, X, X, UNORM, SRGB),
+	_(A8_UNORM,		L8, 		0, 0, 0, T, X, X, X, X, UNORM, 0),
+	_(I8_UNORM,		L8, 		T, T, T, T, X, X, X, X, UNORM, 0),
+
+	_(R8G8_UNORM,		A8L8, 		T, T, T, T, X, X, X, W, UNORM, 0),
+	_(R8G8_SNORM,		A8L8, 		T, T, T, T, X, X, X, W, SNORM, 0),
+	_(L8A8_UNORM,		A8L8, 		T, T, T, T, X, X, X, W, UNORM, 0),
+	_(L8A8_SRGB,		A8L8,		T, T, T, T, X, X, X, W, UNORM, SRGB),
+
+	_(DXT1_RGB,		DXT1,		T, T, T, 1, X, Y, Z, W, UNORM, 0),
+	_(DXT1_SRGB,		DXT1,		T, T, T, 1, X, Y, Z, W, UNORM, SRGB),
+	_(DXT1_RGBA,		DXT1,		T, T, T, T, X, Y, Z, W, UNORM, 0),
+	_(DXT1_SRGBA,		DXT1,		T, T, T, T, X, Y, Z, W, UNORM, SRGB),
+	_(DXT3_RGBA,		DXT3,		T, T, T, T, X, Y, Z, W, UNORM, 0),
+	_(DXT3_SRGBA,		DXT3,		T, T, T, T, X, Y, Z, W, UNORM, SRGB),
+	_(DXT5_RGBA,		DXT5,		T, T, T, T, X, Y, Z, W, UNORM, 0),
+	_(DXT5_SRGBA,		DXT5,		T, T, T, T, X, Y, Z, W, UNORM, SRGB),
+
+	__(Z16_UNORM,		A8L8, Z16,	T, T, T, 1, W, W, W, W, UNORM, 0),
+	__(S8_USCALED_Z24_UNORM,HILO16,Z24,	T, T, T, 1, W, W, W, W, UNORM, 0),
+	__(X8Z24_UNORM,		HILO16,Z24,	T, T, T, 1, W, W, W, W, UNORM, 0),
+
+	_(R16_UNORM,		A16,		T, 0, 0, 1, X, X, X, X, UNORM, 0),
+	_(R16_SNORM,		A16,		T, 0, 0, 1, X, X, X, X, SNORM, 0),
+	_(R16G16_UNORM,		HILO16,		T, T, 0, 1, X, Y, X, X, UNORM, 0),
+	_(R16G16_SNORM,		HILO16,		T, T, 0, 1, X, Y, X, X, SNORM, 0),
+
+	_(R16G16B16A16_FLOAT,		RGBA16F,	T, T, T, T, X, Y, Z, W, UNORM, 0),
+	_(R32G32B32A32_FLOAT,		RGBA32F,	T, T, T, T, X, Y, Z, W, UNORM, 0),
+	_(R32_FLOAT,		R32F,	T, 0, 0, 1, X, X, X, X, UNORM, 0)
+};
diff --git a/src/gallium/drivers/nvfx/nvfx_miptree.c b/src/gallium/drivers/nvfx/nvfx_miptree.c
index b5639bb464..0916aaa828 100644
--- a/src/gallium/drivers/nvfx/nvfx_miptree.c
+++ b/src/gallium/drivers/nvfx/nvfx_miptree.c
@@ -2,309 +2,220 @@
 #include "pipe/p_defines.h"
 #include "util/u_inlines.h"
 #include "util/u_format.h"
+#include "util/u_memory.h"
 #include "util/u_math.h"
-
-#include "nvfx_context.h"
+#include "util/u_staging.h"
+#include "state_tracker/drm_driver.h"
+#include "nouveau/nouveau_winsys.h"
+#include "nouveau/nouveau_screen.h"
+#include "nvfx_screen.h"
 #include "nvfx_resource.h"
-#include "nvfx_transfer.h"
-#include "nv04_surface_2d.h"
-
-/* Currently using separate implementations for buffers and textures,
- * even though gallium has a unified abstraction of these objects.
- * Eventually these should be combined, and mechanisms like transfers
- * be adapted to work for both buffer and texture uploads.
- */
 
 static void
-nvfx_miptree_layout(struct nvfx_miptree *mt)
+nvfx_miptree_choose_format(struct nvfx_miptree *mt)
 {
 	struct pipe_resource *pt = &mt->base.base;
-	uint width = pt->width0;
-	uint offset = 0;
-	int nr_faces, l, f;
-	uint wide_pitch = pt->bind & (PIPE_BIND_SAMPLER_VIEW |
-				      PIPE_BIND_DEPTH_STENCIL |
-				      PIPE_BIND_RENDER_TARGET |
-				      PIPE_BIND_DISPLAY_TARGET |
-				      PIPE_BIND_SCANOUT);
-
-	if (pt->target == PIPE_TEXTURE_CUBE) {
-		nr_faces = 6;
-	} else
-	if (pt->target == PIPE_TEXTURE_3D) {
-		nr_faces = pt->depth0;
-	} else {
-		nr_faces = 1;
+	unsigned uniform_pitch = 0;
+	static int no_swizzle = -1;
+	if(no_swizzle < 0)
+		no_swizzle = debug_get_bool_option("NV40_NO_SWIZZLE", FALSE); /* this will break things on nv30 */
+
+	if (!util_is_power_of_two(pt->width0) ||
+	    !util_is_power_of_two(pt->height0) ||
+	    !util_is_power_of_two(pt->depth0) ||
+	    (!nvfx_screen(pt->screen)->is_nv4x && pt->target == PIPE_TEXTURE_RECT)
+	    )
+		uniform_pitch = 1;
+
+	if (
+		(pt->bind & (PIPE_BIND_SCANOUT | PIPE_BIND_DISPLAY_TARGET))
+		|| (pt->usage & PIPE_USAGE_DYNAMIC) || (pt->usage & PIPE_USAGE_STAGING)
+		|| util_format_is_compressed(pt->format)
+		|| no_swizzle
+	)
+		mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR;
+
+	/* non compressed formats with uniform pitch must be linear, and vice versa */
+	if(!util_format_is_s3tc(pt->format)
+		&& (uniform_pitch || mt->base.base.flags & NVFX_RESOURCE_FLAG_LINEAR))
+	{
+		mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR;
+		uniform_pitch = 1;
 	}
 
-	for (l = 0; l <= pt->last_level; l++) {
-		if (wide_pitch && (pt->flags & NVFX_RESOURCE_FLAG_LINEAR))
-			mt->level[l].pitch = align(util_format_get_stride(pt->format, pt->width0), 64);
-		else
-			mt->level[l].pitch = util_format_get_stride(pt->format, width);
+	if(uniform_pitch)
+	{
+		mt->linear_pitch = util_format_get_stride(pt->format, pt->width0);
 
-		mt->level[l].image_offset =
-			CALLOC(nr_faces, sizeof(unsigned));
+		// TODO: this is only a constraint for rendering and not sampling, apparently
+		// we may also want this unconditionally
+		if(pt->bind & (PIPE_BIND_SAMPLER_VIEW |
+			PIPE_BIND_DEPTH_STENCIL |
+			PIPE_BIND_RENDER_TARGET |
+			PIPE_BIND_DISPLAY_TARGET |
+			PIPE_BIND_SCANOUT))
+			mt->linear_pitch = align(mt->linear_pitch, 64);
+	}
+	else
+		mt->linear_pitch = 0;
+}
+
+static unsigned
+nvfx_miptree_layout(struct nvfx_miptree *mt)
+{ /* Fill mt->level_offset[] and mt->face_size; returns the total storage size of the miptree. */
+	struct pipe_resource* pt = &mt->base.base;
+        uint offset = 0;
 
-		width  = u_minify(width, 1);
+	if(!nvfx_screen(pt->screen)->is_nv4x)
+	{
+		assert(pt->target == PIPE_TEXTURE_RECT
+			|| (util_is_power_of_two(pt->width0) && util_is_power_of_two(pt->height0)));
 	}
 
-	for (f = 0; f < nr_faces; f++) {
-		for (l = 0; l < pt->last_level; l++) {
-			mt->level[l].image_offset[f] = offset;
+	for (unsigned l = 0; l <= pt->last_level; l++)
+	{
+		unsigned size;
+		mt->level_offset[l] = offset; /* offset of level l relative to the start of a face */
 
-			if (!(pt->flags & NVFX_RESOURCE_FLAG_LINEAR) &&
-			    u_minify(pt->width0, l + 1) > 1 && u_minify(pt->height0, l + 1) > 1)
-				offset += align(mt->level[l].pitch * u_minify(pt->height0, l), 64);
-			else
-				offset += mt->level[l].pitch * u_minify(pt->height0, l);
-		}
+		if(mt->linear_pitch)
+			size = mt->linear_pitch; /* a non-zero linear_pitch is used as the pitch of every level */
+		else
+			size = util_format_get_stride(pt->format, u_minify(pt->width0, l));
+		size = util_format_get_2d_size(pt->format, size, u_minify(pt->height0, l)); /* pitch -> size of one 2D image */
 
-		mt->level[l].image_offset[f] = offset;
-		offset += mt->level[l].pitch * u_minify(pt->height0, l);
+		if(pt->target == PIPE_TEXTURE_3D)
+			size *= u_minify(pt->depth0, l); /* one 2D image per depth slice of this level */
+
+		offset += size;
 	}
 
-	mt->total_size = offset;
+	offset = align(offset, 128);
+	mt->face_size = offset;
+	if(mt->base.base.target == PIPE_TEXTURE_CUBE)
+		offset += 5 * mt->face_size; /* offset already counts one face; add the other five */
+	return offset;
 }
 
-static boolean
-nvfx_miptree_get_handle(struct pipe_screen *pscreen,
-			struct pipe_resource *ptexture,
-			struct winsys_handle *whandle)
+static void
+nvfx_miptree_surface_final_destroy(struct pipe_surface* ps)
 {
-	struct nvfx_miptree* mt = (struct nvfx_miptree*)ptexture;
-
-	if (!mt || !mt->base.bo)
-		return FALSE;
-
-	return nouveau_screen_bo_get_handle(pscreen,
-					    mt->base.bo,
-					    mt->level[0].pitch,
-					    whandle);
+	struct nvfx_surface* ns = (struct nvfx_surface*)ps;
+	pipe_resource_reference(&ps->texture, 0); /* drop the surface's reference to its texture */
+	pipe_resource_reference((struct pipe_resource**)&ns->temp, 0); /* and to its temporary resource, if any */
+	FREE(ps);
 }
 
-
-static void
+void
 nvfx_miptree_destroy(struct pipe_screen *screen, struct pipe_resource *pt)
 {
 	struct nvfx_miptree *mt = (struct nvfx_miptree *)pt;
-	int l;
-
+	util_surfaces_destroy(&mt->surfaces, pt, nvfx_miptree_surface_final_destroy); /* tear down surfaces still tracked by the miptree */
 	nouveau_screen_bo_release(screen, mt->base.bo);
-
-	for (l = 0; l <= pt->last_level; l++) {
-		if (mt->level[l].image_offset)
-			FREE(mt->level[l].image_offset);
-	}
-
 	FREE(mt);
 }
 
-
-
-
-struct u_resource_vtbl nvfx_miptree_vtbl = 
+static struct nvfx_miptree*
+nvfx_miptree_create_skeleton(struct pipe_screen *pscreen, const struct pipe_resource *pt)
 {
-   nvfx_miptree_get_handle,	      /* get_handle */
-   nvfx_miptree_destroy,	      /* resource_destroy */
-   NULL,			      /* is_resource_referenced */
-   nvfx_miptree_transfer_new,	      /* get_transfer */
-   nvfx_miptree_transfer_del,     /* transfer_destroy */
-   nvfx_miptree_transfer_map,	      /* transfer_map */
-   u_default_transfer_flush_region,   /* transfer_flush_region */
-   nvfx_miptree_transfer_unmap,	      /* transfer_unmap */
-   u_default_transfer_inline_write    /* transfer_inline_write */
-};
+        struct nvfx_miptree *mt;
 
+        if(pt->width0 > 4096 || pt->height0 > 4096)
+                return NULL;
 
+        mt = CALLOC_STRUCT(nvfx_miptree);
+        if (!mt)
+                return NULL;
 
-struct pipe_resource *
-nvfx_miptree_create(struct pipe_screen *pscreen, const struct pipe_resource *pt)
-{
-	struct nvfx_miptree *mt;
-	static int no_swizzle = -1;
-	if(no_swizzle < 0)
-		no_swizzle = debug_get_bool_option("NOUVEAU_NO_SWIZZLE", FALSE);
-
-	mt = CALLOC_STRUCT(nvfx_miptree);
-	if (!mt)
-		return NULL;
-
-	mt->base.base = *pt;
-	mt->base.vtbl = &nvfx_miptree_vtbl;
-	pipe_reference_init(&mt->base.base.reference, 1);
-	mt->base.base.screen = pscreen;
+        mt->base.base = *pt;
+        util_dirty_surfaces_init(&mt->dirty_surfaces);
 
-	/* Swizzled textures must be POT */
-	if (pt->width0 & (pt->width0 - 1) ||
-	    pt->height0 & (pt->height0 - 1))
-		mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR;
-	else
-	if (pt->bind & (PIPE_BIND_SCANOUT |
-			PIPE_BIND_DISPLAY_TARGET |
-			PIPE_BIND_DEPTH_STENCIL))
-		mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR;
-	else
-	if (pt->usage == PIPE_USAGE_DYNAMIC)
-		mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR;
-	else {
-		switch (pt->format) {
-		case PIPE_FORMAT_B5G6R5_UNORM:
-		case PIPE_FORMAT_L8A8_UNORM:
-		case PIPE_FORMAT_A8_UNORM:
-		case PIPE_FORMAT_L8_UNORM:
-		case PIPE_FORMAT_I8_UNORM:
-			/* TODO: we can actually swizzle these formats on nv40, we
-				are just preserving the pre-unification behavior.
-				The whole 2D code is going to be rewritten anyway. */
-			if(nvfx_screen(pscreen)->is_nv4x) {
-				mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR;
-				break;
-			}
-		/* TODO: Figure out which formats can be swizzled */
-		case PIPE_FORMAT_B8G8R8A8_UNORM:
-		case PIPE_FORMAT_B8G8R8X8_UNORM:
-		case PIPE_FORMAT_R16_SNORM:
-		{
-			if (no_swizzle)
-				mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR;
-			break;
-		}
-		default:
-			mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR;
-		}
-	}
+        pipe_reference_init(&mt->base.base.reference, 1);
+        mt->base.base.screen = pscreen;
 
-	/* apparently we can't render to swizzled surfaces smaller than 64 bytes, so make them linear.
-	 * If the user did not ask for a render target, they can still render to it, but it will cost them an extra copy.
-	 * This also happens for small mipmaps of large textures. */
-	if (pt->bind & PIPE_BIND_RENDER_TARGET &&
-	    util_format_get_stride(pt->format, pt->width0) < 64)
-		mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR;
+        // set this to the actual capabilities, we use it to decide whether to use the 3D engine for copies
+        // TODO: is this the correct way to use Gallium?
+        mt->base.base.bind = pt->bind | PIPE_BIND_RENDER_TARGET | PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_DEPTH_STENCIL;
 
-	nvfx_miptree_layout(mt);
+        // on our current driver (and the driver too), format support does not depend on geometry, so don't bother computing it
+        // TODO: may want to revisit this
+        if(!pscreen->is_format_supported(pscreen, pt->format, pt->target, 0, PIPE_BIND_RENDER_TARGET, 0))
+                mt->base.base.bind &=~ PIPE_BIND_RENDER_TARGET;
+        if(!pscreen->is_format_supported(pscreen, pt->format, pt->target, 0, PIPE_BIND_SAMPLER_VIEW, 0))
+                mt->base.base.bind &=~ PIPE_BIND_SAMPLER_VIEW;
+        if(!pscreen->is_format_supported(pscreen, pt->format, pt->target, 0, PIPE_BIND_DEPTH_STENCIL, 0))
+                mt->base.base.bind &=~ PIPE_BIND_DEPTH_STENCIL;
 
-	mt->base.bo = nouveau_screen_bo_new(pscreen, 256,
-            pt->usage, pt->bind, mt->total_size);
-	if (!mt->base.bo) {
-		FREE(mt);
-		return NULL;
-	}
-	return &mt->base.base;
+        return mt;
 }
 
 
-
-
 struct pipe_resource *
-nvfx_miptree_from_handle(struct pipe_screen *pscreen,
-			 const struct pipe_resource *template,
-			 struct winsys_handle *whandle)
+nvfx_miptree_create(struct pipe_screen *pscreen, const struct pipe_resource *pt)
 {
-	struct nvfx_miptree *mt;
-	unsigned stride;
+	struct nvfx_miptree* mt = nvfx_miptree_create_skeleton(pscreen, pt);
+	if (!mt) /* skeleton is NULL for textures larger than 4096 or on allocation failure */
+		return NULL;
 
-	/* Only supports 2D, non-mipmapped textures for the moment */
-	if (template->target != PIPE_TEXTURE_2D ||
-	    template->last_level != 0 ||
-	    template->depth0 != 1)
-		return NULL;
+	nvfx_miptree_choose_format(mt); /* must precede layout: it sets the linear flag and linear_pitch */
 
-	mt = CALLOC_STRUCT(nvfx_miptree);
-	if (!mt)
-		return NULL;
+	mt->base.bo = nouveau_screen_bo_new(pscreen, 256, pt->usage, pt->bind, nvfx_miptree_layout(mt)); /* layout returns the size to allocate */
 
-	mt->base.bo = nouveau_screen_bo_from_handle(pscreen, whandle, &stride);
-	if (mt->base.bo == NULL) {
+	if (!mt->base.bo) {
 		FREE(mt);
 		return NULL;
 	}
-
-	mt->base.base = *template;
-	mt->base.vtbl = &nvfx_miptree_vtbl;
-	pipe_reference_init(&mt->base.base.reference, 1);
-	mt->base.base.screen = pscreen;
-	mt->level[0].pitch = stride;
-	mt->level[0].image_offset = CALLOC(1, sizeof(unsigned));
-
-	/* Assume whoever created this buffer expects it to be linear for now */
-	mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR;
-
-	/* XXX: Need to adjust bo refcount??
-	 */
-	/* nouveau_bo_ref(bo, &mt->base.bo); */
 	return &mt->base.base;
 }
 
+// TODO: redo this, just calling miptree_layout
+struct pipe_resource *
+nvfx_miptree_from_handle(struct pipe_screen *pscreen, const struct pipe_resource *template, struct winsys_handle *whandle)
+{
+        struct nvfx_miptree* mt = nvfx_miptree_create_skeleton(pscreen, template); /* NOTE(review): can be NULL, but mt is dereferenced below without a check */
+        unsigned stride; /* filled in by nouveau_screen_bo_from_handle; not read afterwards in this function */
+        if(whandle->stride) {
+		mt->linear_pitch = whandle->stride; /* a caller-supplied stride forces a linear layout with that pitch */
+		mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR;
+        } else
+		nvfx_miptree_choose_format(mt);
 
+        nvfx_miptree_layout(mt); /* called for its side effects (level offsets, face_size); the returned size is ignored */
 
+        mt->base.bo = nouveau_screen_bo_from_handle(pscreen, whandle, &stride);
+        if (mt->base.bo == NULL) {
+                FREE(mt);
+                return NULL;
+        }
+        return &mt->base.base;
+}
 
-
-/* Surface helpers, not strictly required to implement the resource vtbl:
- */
 struct pipe_surface *
 nvfx_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_resource *pt,
 			 unsigned face, unsigned level, unsigned zslice,
 			 unsigned flags)
 {
-	struct nvfx_miptree *mt = (struct nvfx_miptree *)pt;
-	struct nv04_surface *ns;
-
-	ns = CALLOC_STRUCT(nv04_surface);
-	if (!ns)
-		return NULL;
-	pipe_resource_reference(&ns->base.texture, pt);
-	ns->base.format = pt->format;
-	ns->base.width = u_minify(pt->width0, level);
-	ns->base.height = u_minify(pt->height0, level);
-	ns->base.usage = flags;
-	pipe_reference_init(&ns->base.reference, 1);
-	ns->base.face = face;
-	ns->base.level = level;
-	ns->base.zslice = zslice;
-	ns->pitch = mt->level[level].pitch;
-
-	if (pt->target == PIPE_TEXTURE_CUBE) {
-		ns->base.offset = mt->level[level].image_offset[face];
-	} else
-	if (pt->target == PIPE_TEXTURE_3D) {
-		ns->base.offset = mt->level[level].image_offset[zslice];
-	} else {
-		ns->base.offset = mt->level[level].image_offset[0];
-	}
-
-	/* create a linear temporary that we can render into if
-	 * necessary.
-	 *
-	 * Note that ns->pitch is always a multiple of 64 for linear
-	 * surfaces and swizzled surfaces are POT, so ns->pitch & 63
-	 * is equivalent to (ns->pitch < 64 && swizzled)
-	 */
-
-	if ((ns->pitch & 63) && 
-	    (ns->base.usage & PIPE_BIND_RENDER_TARGET))
-	{
-		struct nv04_surface_2d* eng2d  =
-			((struct nvfx_screen*)pscreen)->eng2d;
-
-		ns = nv04_surface_wrap_for_render(pscreen, eng2d, ns);
+	struct nvfx_miptree* mt = (struct nvfx_miptree*)pt;
+	struct nvfx_surface *ns;
+
+	ns = (struct nvfx_surface*)util_surfaces_get(&mt->surfaces, sizeof(struct nvfx_surface), pscreen, pt, face, level, zslice, flags);
+	if(ns->base.base.offset == ~0) {
+		util_dirty_surface_init(&ns->base);
+		ns->pitch = nvfx_subresource_pitch(pt, level);
+		ns->base.base.offset = nvfx_subresource_offset(pt, face, level, zslice);
 	}
 
-	return &ns->base;
+	return &ns->base.base;
 }
 
 void
 nvfx_miptree_surface_del(struct pipe_surface *ps)
 {
-	struct nv04_surface* ns = (struct nv04_surface*)ps;
-	if(ns->backing)
+	struct nvfx_surface* ns = (struct nvfx_surface*)ps;
+
+	if(!ns->temp) /* a surface that still has a temporary is not freed here; presumably it is freed with the miptree - TODO confirm */
 	{
-		struct nvfx_screen* screen = (struct nvfx_screen*)ps->texture->screen;
-		if(1 /*ns->backing->base.usage & PIPE_BIND_BLIT_DESTINATION*/)
-			screen->eng2d->copy(screen->eng2d, &ns->backing->base, 0, 0, ps, 0, 0, ns->base.width, ns->base.height);
-		nvfx_miptree_surface_del(&ns->backing->base);
+		util_surfaces_detach(&((struct nvfx_miptree*)ps->texture)->surfaces, ps); /* remove it from the miptree's surface tracking first */
+		pipe_resource_reference(&ps->texture, 0);
+		FREE(ps);
 	}
-
-	pipe_resource_reference(&ps->texture, NULL);
-	FREE(ps);
 }
diff --git a/src/gallium/drivers/nvfx/nvfx_push.c b/src/gallium/drivers/nvfx/nvfx_push.c
new file mode 100644
index 0000000000..ffe7e98357
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_push.c
@@ -0,0 +1,414 @@
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+#include "util/u_inlines.h"
+#include "util/u_format.h"
+#include "util/u_split_prim.h"
+#include "translate/translate.h"
+
+#include "nvfx_context.h"
+#include "nvfx_resource.h"
+
+struct push_context {
+	struct nouveau_channel* chan; /* channel that all emit callbacks write to */
+
+	void *idxbuf; /* CPU pointer to the index data; NULL when no index buffer is read on the CPU */
+	int32_t idxbias; /* value added to indices by the emit_elt* callbacks */
+
+	float edgeflag;
+	int edgeflag_attr; /* nvfx_push_vbo sets this to 0xff; it only tests whether the value is < 16 */
+
+	unsigned vertex_length; /* per-vertex size in ring entries; times the vertex count gives a packet's length */
+	unsigned max_vertices_per_packet; /* upper bound on vertices per VERTEX_DATA packet */
+
+	struct translate* translate; /* writes vertices directly into the ring */
+};
+
+static void
+emit_edgeflag(void *priv, boolean enabled)
+{ /* Edge callback (installed as s.edge): writes 1 or 0 to NV34TCL_EDGEFLAG_ENABLE. */
+	struct push_context* ctx = priv;
+	struct nouveau_channel *chan = ctx->chan;
+
+	OUT_RING(chan, RING_3D(NV34TCL_EDGEFLAG_ENABLE, 1));
+	OUT_RING(chan, enabled ? 1 : 0);
+}
+
+static void
+emit_vertices_lookup8(void *priv, unsigned start, unsigned count)
+{ /* Push `count` vertices inline, selected through 8-bit indices beginning at element `start`. */
+        struct push_context *ctx = priv;
+        uint8_t* elts = (uint8_t*)ctx->idxbuf + start;
+
+        while(count)
+        {
+                unsigned push = MIN2(count, ctx->max_vertices_per_packet); /* split the run into bounded packets */
+                unsigned length = push * ctx->vertex_length;
+
+                OUT_RING(ctx->chan, RING_3D_NI(NV34TCL_VERTEX_DATA, length));
+                ctx->translate->run_elts8(ctx->translate, elts, push, 0, ctx->chan->cur); /* output goes straight into the ring */
+                ctx->chan->cur += length; /* advance the ring cursor by hand past the packet's data */
+
+                count -= push;
+                elts += push;
+        }
+}
+
+static void
+emit_vertices_lookup16(void *priv, unsigned start, unsigned count)
+{ /* Push `count` vertices inline, selected through 16-bit indices beginning at element `start`. */
+	struct push_context *ctx = priv;
+        uint16_t* elts = (uint16_t*)ctx->idxbuf + start;
+
+        while(count)
+        {
+                unsigned push = MIN2(count, ctx->max_vertices_per_packet); /* split the run into bounded packets */
+                unsigned length = push * ctx->vertex_length;
+
+                OUT_RING(ctx->chan, RING_3D_NI(NV34TCL_VERTEX_DATA, length));
+                ctx->translate->run_elts16(ctx->translate, elts, push, 0, ctx->chan->cur); /* output goes straight into the ring */
+                ctx->chan->cur += length; /* advance the ring cursor by hand past the packet's data */
+
+                count -= push;
+                elts += push;
+        }
+}
+
+static void
+emit_vertices_lookup32(void *priv, unsigned start, unsigned count)
+{ /* Push `count` vertices inline, selected through 32-bit indices beginning at element `start`. */
+        struct push_context *ctx = priv;
+        uint32_t* elts = (uint32_t*)ctx->idxbuf + start;
+
+        while(count)
+        {
+                unsigned push = MIN2(count, ctx->max_vertices_per_packet); /* split the run into bounded packets */
+                unsigned length = push * ctx->vertex_length;
+
+                OUT_RING(ctx->chan, RING_3D_NI(NV34TCL_VERTEX_DATA, length));
+                ctx->translate->run_elts(ctx->translate, elts, push, 0, ctx->chan->cur); /* output goes straight into the ring */
+                ctx->chan->cur += length; /* advance the ring cursor by hand past the packet's data */
+
+                count -= push;
+                elts += push;
+        }
+}
+
+static void
+emit_vertices(void *priv, unsigned start, unsigned count)
+{ /* Push `count` consecutive vertices inline, beginning at vertex `start` (non-indexed path). */
+        struct push_context *ctx = priv;
+
+        while(count)
+        {
+		unsigned push = MIN2(count, ctx->max_vertices_per_packet); /* split the run into bounded packets */
+		unsigned length = push * ctx->vertex_length;
+
+		OUT_RING(ctx->chan, RING_3D_NI(NV34TCL_VERTEX_DATA, length));
+		ctx->translate->run(ctx->translate, start, push, 0, ctx->chan->cur); /* output goes straight into the ring */
+		ctx->chan->cur += length; /* advance the ring cursor by hand past the packet's data */
+
+		count -= push;
+		start += push;
+        }
+}
+
+static void
+emit_ranges(void* priv, unsigned start, unsigned vc, unsigned reg)
+{ /* Emit `vc` consecutive elements from `start` to method `reg`, as words of the form ((count - 1) << 24) | first. */
+	struct push_context* ctx = priv;
+	struct nouveau_channel *chan = ctx->chan;
+	unsigned nr = (vc & 0xff); /* the remainder that does not fill a 256-element batch is sent first */
+	if (nr) {
+		OUT_RING(chan, RING_3D(reg, 1));
+		OUT_RING  (chan, ((nr - 1) << 24) | start);
+		start += nr;
+	}
+
+	nr = vc >> 8; /* number of full 256-element batches */
+	while (nr) {
+		unsigned push = nr > 2047 ? 2047 : nr; /* at most 2047 batch words per packet */
+
+		nr -= push;
+
+		OUT_RING(chan, RING_3D_NI(reg, push));
+		while (push--) {
+			OUT_RING(chan, ((0x100 - 1) << 24) | start);
+			start += 0x100;
+		}
+	}
+}
+
+static void
+emit_ib_ranges(void* priv, unsigned start, unsigned vc)
+{ /* emit_ranges() targeting NV34TCL_VB_INDEX_BATCH. */
+	emit_ranges(priv, start, vc, NV34TCL_VB_INDEX_BATCH);
+}
+
+static void
+emit_vb_ranges(void* priv, unsigned start, unsigned vc)
+{ /* emit_ranges() targeting NV34TCL_VB_VERTEX_BATCH. */
+	emit_ranges(priv, start, vc, NV34TCL_VB_VERTEX_BATCH);
+}
+
+static INLINE void
+emit_elt8(void* priv, unsigned start, unsigned vc)
+{ /* Emit `vc` 8-bit indices from element `start`, each biased by ctx->idxbias, packed two per word. */
+	struct push_context* ctx = priv;
+	struct nouveau_channel *chan = ctx->chan;
+	uint8_t *elts = (uint8_t *)ctx->idxbuf + start;
+	int idxbias = ctx->idxbias;
+
+	if (vc & 1) { /* odd count: send one index on its own so the rest can be paired */
+		OUT_RING(chan, RING_3D(NV34TCL_VB_ELEMENT_U32, 1));
+		OUT_RING  (chan, elts[0] + idxbias); /* the bias applies to this index just like the paired ones below */
+		elts++; vc--;
+	}
+
+	while (vc) {
+		unsigned i;
+		unsigned push = MIN2(vc, 2047 * 2); /* 2047 words per packet, two indices per word */
+
+		OUT_RING(chan, RING_3D_NI(NV34TCL_VB_ELEMENT_U16, push >> 1));
+		for (i = 0; i < push; i+=2)
+			OUT_RING(chan, ((elts[i+1] + idxbias) << 16) | (elts[i] + idxbias));
+
+		vc -= push;
+		elts += push;
+	}
+}
+
+static INLINE void
+emit_elt16(void* priv, unsigned start, unsigned vc)
+{ /* Emit `vc` 16-bit indices from element `start`, each biased by ctx->idxbias, packed two per word. */
+	struct push_context* ctx = priv;
+	struct nouveau_channel *chan = ctx->chan;
+	uint16_t *elts = (uint16_t *)ctx->idxbuf + start;
+	int idxbias = ctx->idxbias;
+
+	if (vc & 1) { /* odd count: send one index on its own so the rest can be paired */
+		OUT_RING(chan, RING_3D(NV34TCL_VB_ELEMENT_U32, 1));
+		OUT_RING  (chan, elts[0] + idxbias); /* the bias applies to this index just like the paired ones below */
+		elts++; vc--;
+	}
+
+	while (vc) {
+		unsigned i;
+		unsigned push = MIN2(vc, 2047 * 2); /* 2047 words per packet, two indices per word */
+
+		OUT_RING(chan, RING_3D_NI(NV34TCL_VB_ELEMENT_U16, push >> 1));
+		for (i = 0; i < push; i+=2)
+			OUT_RING(chan, ((elts[i+1] + idxbias) << 16) | (elts[i] + idxbias));
+
+		vc -= push;
+		elts += push;
+	}
+}
+
+static INLINE void
+emit_elt32(void* priv, unsigned start, unsigned vc)
+{ /* Emit `vc` 32-bit indices from element `start`, one per word, each biased by ctx->idxbias. */
+	struct push_context* ctx = priv;
+	struct nouveau_channel *chan = ctx->chan;
+	uint32_t *elts = (uint32_t *)ctx->idxbuf + start;
+	int idxbias = ctx->idxbias;
+
+	while (vc) {
+		unsigned push = MIN2(vc, 2047); /* at most 2047 indices per packet */
+
+		OUT_RING(chan, RING_3D_NI(NV34TCL_VB_ELEMENT_U32, push));
+		assert(AVAIL_RING(chan) >= push);
+		if(idxbias)
+		{
+			for(unsigned i = 0; i < push; ++i)
+				OUT_RING(chan, elts[i] + idxbias);
+		}
+		else
+			OUT_RINGp(chan, elts, push); /* no bias: copy the indices in one go */
+
+		vc -= push;
+		elts += push;
+	}
+}
+
+void
+nvfx_push_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
+{ /* Draw `info` by pushing vertices, vertex/index ranges or indices through the command ring, once per instance. */
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nouveau_channel *chan = nvfx->screen->base.channel;
+	struct push_context ctx;
+	struct util_split_prim s;
+	unsigned instances_left = info->instance_count;
+	int vtx_value; /* per-vertex ring cost; its encoding is described where max_verts is computed */
+	unsigned hw_mode = nvgl_primitive(info->mode);
+	int i;
+	struct
+	{
+		uint8_t* map;
+		unsigned step;
+	} per_instance[16]; /* current source pointer and divisor counter of each per-instance element */
+	unsigned p_overhead = 64 /* magic fix: the 32 two-word dummy commands emitted per batch on nv4x */
+			+ 4 /* begin/end */
+			+ 4; /* potential edgeflag enable/disable */
+
+	ctx.chan = nvfx->screen->base.channel;
+	ctx.translate = nvfx->vtxelt->translate;
+	ctx.idxbuf = NULL;
+	ctx.vertex_length = nvfx->vtxelt->vertex_length;
+	ctx.max_vertices_per_packet = nvfx->vtxelt->max_vertices_per_packet;
+	ctx.edgeflag = 0.5f;
+	// TODO: figure out if we really want to handle this, and do so in that case
+	ctx.edgeflag_attr = 0xff; // nvfx->vertprog->cfg.edgeflag_in;
+
+	if(!nvfx->use_vertex_buffers)
+	{
+		for(i = 0; i < nvfx->vtxelt->num_per_vertex_buffer_infos; ++i)
+		{
+			struct nvfx_per_vertex_buffer_info* vbi = &nvfx->vtxelt->per_vertex_buffer_info[i];
+			struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[vbi->vertex_buffer_index];
+			uint8_t* data = nvfx_buffer(vb->buffer)->data + vb->buffer_offset;
+			if(info->indexed)
+				data += info->index_bias * vb->stride; /* apply the index bias by shifting the buffer base */
+			ctx.translate->set_buffer(ctx.translate, i, data, vb->stride, ~0);
+		}
+
+		if(ctx.edgeflag_attr < 16)
+			vtx_value = -(ctx.vertex_length + 3);  /* vertex data and edgeflag header and value */
+		else
+		{
+			p_overhead += 1; /* initial vertex_data header */
+			vtx_value = -ctx.vertex_length;  /* vertex data only: no edgeflag words are counted on this path */
+		}
+
+		if (info->indexed) {
+			// XXX: this case is broken and probably needs a new VTX_ATTR push path
+			if (nvfx->idxbuf.index_size == 1)
+				s.emit = emit_vertices_lookup8;
+			else if (nvfx->idxbuf.index_size == 2)
+				s.emit = emit_vertices_lookup16;
+			else
+				s.emit = emit_vertices_lookup32;
+		} else
+			s.emit = emit_vertices;
+	}
+	else
+	{
+		if(!info->indexed || nvfx->use_index_buffer)
+		{
+			s.emit = info->indexed ? emit_ib_ranges : emit_vb_ranges;
+			p_overhead += 3;
+			vtx_value = 0;
+		}
+		else if (nvfx->idxbuf.index_size == 4)
+		{
+			s.emit = emit_elt32;
+			p_overhead += 1;
+			vtx_value = 8;
+		}
+		else
+		{
+			s.emit = (nvfx->idxbuf.index_size == 2) ? emit_elt16 : emit_elt8;
+			p_overhead += 3;
+			vtx_value = 7;
+		}
+	}
+
+	ctx.idxbias = info->index_bias;
+	if(nvfx->use_vertex_buffers)
+		ctx.idxbias -= nvfx->base_vertex;
+
+	/* map index buffer, if present */
+	if (info->indexed && !nvfx->use_index_buffer)
+		ctx.idxbuf = nvfx_buffer(nvfx->idxbuf.buffer)->data + nvfx->idxbuf.offset;
+
+	s.priv = &ctx;
+	s.edge = emit_edgeflag;
+
+	for (i = 0; i < nvfx->vtxelt->num_per_instance; ++i) /* emit the initial value of every per-instance attribute */
+	{
+		struct nvfx_per_instance_element *ve = &nvfx->vtxelt->per_instance[i];
+		struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->base.vertex_buffer_index];
+		float v[4];
+		per_instance[i].step = info->start_instance % ve->instance_divisor;
+		per_instance[i].map = nvfx_buffer(vb->buffer)->data + vb->buffer_offset + ve->base.src_offset; /* NOTE(review): not advanced by start_instance / instance_divisor strides - confirm this is intended */
+
+		nvfx->vtxelt->per_instance[i].base.fetch_rgba_float(v, per_instance[i].map, 0, 0);
+
+		WAIT_RING(chan, 5);
+		nvfx_emit_vtx_attr(chan, nvfx->vtxelt->per_instance[i].base.idx, v, nvfx->vtxelt->per_instance[i].base.ncomp);
+	}
+
+	/* per-instance loop */
+	while (instances_left--) {
+		int max_verts;
+		boolean done;
+
+		util_split_prim_init(&s, info->mode, info->start, info->count);
+		nvfx_state_emit(nvfx);
+		for(;;) {
+			max_verts  = AVAIL_RING(chan);
+			max_verts -= p_overhead;
+
+			/* if vtx_value < 0, each vertex is -vtx_value words long
+			 * otherwise, each vertex is 2^(vtx_value) / 255 words long (this is an approximation)
+			 */
+			if(vtx_value < 0)
+			{
+				max_verts /= -vtx_value;
+				max_verts -= (max_verts >> 10); /* vertex data headers */
+			}
+			else
+			{
+				if(max_verts >= (1 << 23)) /* avoid overflow here */
+					max_verts = (1 << 23);
+				max_verts = (max_verts * 255) >> vtx_value;
+			}
+
+			//printf("avail %u max_verts %u\n", AVAIL_RING(chan), max_verts);
+
+			if(max_verts >= 16) /* otherwise too little ring space: fall through to flush and retry */
+			{
+				/* XXX: emitting any command many times seems to (mostly) fix corruption that would otherwise happen */
+				/* this seems to cause issues on nv3x, and also be unneeded there */
+				if(nvfx->is_nv4x)
+				{
+					int i;
+					for(i = 0; i < 32; ++i)
+					{
+						OUT_RING(chan, RING_3D(0x1dac, 1));
+						OUT_RING(chan, 0);
+					}
+				}
+
+				OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
+				OUT_RING(chan, hw_mode);
+				done = util_split_prim_next(&s, max_verts); /* emits up to max_verts vertices through s.emit */
+				OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
+				OUT_RING(chan, 0);
+
+				if(done)
+					break;
+			}
+
+			FIRE_RING(chan); /* flush, then re-emit state before the next batch */
+			nvfx_state_emit(nvfx);
+		}
+
+		/* set data for the next instance, if any changed */
+		for (i = 0; i < nvfx->vtxelt->num_per_instance; ++i)
+		{
+			struct nvfx_per_instance_element *ve = &nvfx->vtxelt->per_instance[i];
+			struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->base.vertex_buffer_index];
+
+			if(++per_instance[i].step == ve->instance_divisor) /* advance the source once every instance_divisor instances */
+			{
+				float v[4];
+				per_instance[i].map += vb->stride;
+				per_instance[i].step = 0;
+
+				nvfx->vtxelt->per_instance[i].base.fetch_rgba_float(v, per_instance[i].map, 0, 0);
+				WAIT_RING(chan, 5);
+				nvfx_emit_vtx_attr(chan, nvfx->vtxelt->per_instance[i].base.idx, v, nvfx->vtxelt->per_instance[i].base.ncomp);
+			}
+		}
+	}
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_resource.c b/src/gallium/drivers/nvfx/nvfx_resource.c
index 10cdeed2a3..39ae893f1b 100644
--- a/src/gallium/drivers/nvfx/nvfx_resource.c
+++ b/src/gallium/drivers/nvfx/nvfx_resource.c
@@ -1,23 +1,15 @@
 
 #include "pipe/p_context.h"
+#include "util/u_staging.h"
 #include "nvfx_resource.h"
 #include "nouveau/nouveau_screen.h"
 
-
-/* This doesn't look quite right - this query is supposed to ask
- * whether the particular context has references to the resource in
- * any unflushed rendering command buffer, and hence requires a
- * pipe->flush() for serializing some modification to that resource.
- *
- * This seems to be answering the question of whether the resource is
- * currently on hardware.
- */
 static unsigned int
 nvfx_resource_is_referenced(struct pipe_context *pipe,
-			    struct pipe_resource *resource,
+			    struct pipe_resource *pr,
 			    unsigned face, unsigned level)
 {
-	return nouveau_reference_flags(nvfx_resource(resource)->bo);
+	return !!nouveau_reference_flags(nvfx_resource(pr)->bo);
 }
 
 static struct pipe_resource *
@@ -30,6 +22,15 @@ nvfx_resource_create(struct pipe_screen *screen,
 		return nvfx_miptree_create(screen, template);
 }
 
+/* Destroy pr through the buffer or miptree path, selected by pr->target. */
+static void nvfx_resource_destroy(struct pipe_screen *screen, struct pipe_resource *pr)
+{
+	if (pr->target == PIPE_BUFFER)
+		nvfx_buffer_destroy(screen, pr);
+	else
+		nvfx_miptree_destroy(screen, pr);
+}
+
 static struct pipe_resource *
 nvfx_resource_from_handle(struct pipe_screen * screen,
 			  const struct pipe_resource *template,
@@ -41,15 +42,22 @@ nvfx_resource_from_handle(struct pipe_screen * screen,
 		return nvfx_miptree_from_handle(screen, template, whandle);
 }
 
+static boolean
+nvfx_resource_get_handle(struct pipe_screen *pscreen,
+                        struct pipe_resource *pr,
+                        struct winsys_handle *whandle)
+{
+	struct nvfx_resource* res = (struct nvfx_resource*)pr;
+
+	if (!res || !res->bo)
+		return FALSE;
+
+	return nouveau_screen_bo_get_handle(pscreen, res->bo, nvfx_subresource_pitch(pr, 0), whandle);
+}
+
 void
 nvfx_init_resource_functions(struct pipe_context *pipe)
 {
-	pipe->get_transfer = u_get_transfer_vtbl;
-	pipe->transfer_map = u_transfer_map_vtbl;
-	pipe->transfer_flush_region = u_transfer_flush_region_vtbl;
-	pipe->transfer_unmap = u_transfer_unmap_vtbl;
-	pipe->transfer_destroy = u_transfer_destroy_vtbl;
-	pipe->transfer_inline_write = u_transfer_inline_write_vtbl;
 	pipe->is_resource_referenced = nvfx_resource_is_referenced;
 }
 
@@ -58,10 +66,10 @@ nvfx_screen_init_resource_functions(struct pipe_screen *pscreen)
 {
 	pscreen->resource_create = nvfx_resource_create;
 	pscreen->resource_from_handle = nvfx_resource_from_handle;
-	pscreen->resource_get_handle = u_resource_get_handle_vtbl;
-	pscreen->resource_destroy = u_resource_destroy_vtbl;
+	pscreen->resource_get_handle = nvfx_resource_get_handle;
+	pscreen->resource_destroy = nvfx_resource_destroy;
 	pscreen->user_buffer_create = nvfx_user_buffer_create;
-   
+
 	pscreen->get_tex_surface = nvfx_miptree_surface_new;
 	pscreen->tex_surface_destroy = nvfx_miptree_surface_del;
 }
diff --git a/src/gallium/drivers/nvfx/nvfx_resource.h b/src/gallium/drivers/nvfx/nvfx_resource.h
index a68c14cf3f..583be4de2a 100644
--- a/src/gallium/drivers/nvfx/nvfx_resource.h
+++ b/src/gallium/drivers/nvfx/nvfx_resource.h
@@ -1,44 +1,82 @@
-
 #ifndef NVFX_RESOURCE_H
 #define NVFX_RESOURCE_H
 
 #include "util/u_transfer.h"
+#include "util/u_format.h"
+#include "util/u_math.h"
+#include "util/u_double_list.h"
+#include "util/u_surfaces.h"
+#include "util/u_dirty_surfaces.h"
+#include <nouveau/nouveau_bo.h>
 
 struct pipe_resource;
-struct nouveau_bo;
-
+struct nv04_region;
 
-/* This gets further specialized into either buffer or texture
- * structures.  In the future we'll want to remove much of that
- * distinction, but for now try to keep as close to the existing code
- * as possible and use the vtbl struct to choose between the two
- * underlying implementations.
- */
 struct nvfx_resource {
 	struct pipe_resource base;
-	struct u_resource_vtbl *vtbl;
 	struct nouveau_bo *bo;
 };
 
+static INLINE
+struct nvfx_resource *nvfx_resource(struct pipe_resource *resource)
+{
+	return (struct nvfx_resource *)resource;
+}
+
+#define NVFX_RESOURCE_FLAG_LINEAR (PIPE_RESOURCE_FLAG_DRV_PRIV << 0)
+#define NVFX_RESOURCE_FLAG_USER (PIPE_RESOURCE_FLAG_DRV_PRIV << 1)
+
+/* is resource mapped into the GPU's address space (i.e. VRAM or GART)? TRUE iff bo->handle is non-zero */
+static INLINE boolean
+nvfx_resource_mapped_by_gpu(struct pipe_resource *resource)
+{
+	return !!nvfx_resource(resource)->bo->handle; /* normalize to 0/1 so a narrow boolean cannot truncate the handle */
+}
+
+/* is resource in VRAM? (the domain check is compiled out, so this currently always returns TRUE) */
+static inline int
+nvfx_resource_on_gpu(struct pipe_resource* pr)
+{
+#if 0
+	// a compiler error here means you need to apply libdrm-nouveau-add-domain.patch to libdrm
+	// TODO: return FALSE if not VRAM and on a PCI-E system
+	return ((struct nvfx_resource*)pr)->bo->domain & (NOUVEAU_BO_VRAM | NOUVEAU_BO_GART);
+#else
+	return TRUE;
+#endif
+}
+
 #define NVFX_MAX_TEXTURE_LEVELS  16
 
+/* We have the following invariants for render temporaries
+ *
+ * 1. Render temporaries are always linear
+ * 2. Render temporaries are always up to date
+ * 3. Currently, render temporaries are destroyed when the resource is used for sampling, but kept for any other use
+ *
+ * Also, we do NOT flush temporaries on any pipe->flush().
+ * This is fine, as long as scanout targets and shared resources never need temps.
+ *
+ * TODO: we may want to also support swizzled temporaries to improve performance in some cases.
+ */
+
 struct nvfx_miptree {
-	struct nvfx_resource base;
-	uint total_size;
+        struct nvfx_resource base;
 
-	struct {
-		uint pitch;
-		uint *image_offset;
-	} level[NVFX_MAX_TEXTURE_LEVELS];
+        unsigned linear_pitch; /* for linear textures, 0 for swizzled and compressed textures with level-dependent minimal pitch */
+        unsigned face_size; /* 128-byte aligned face/total size */
+        unsigned level_offset[NVFX_MAX_TEXTURE_LEVELS];
 
-	unsigned image_nr;
+        struct util_surfaces surfaces;
+        struct util_dirty_surfaces dirty_surfaces;
 };
 
-static INLINE 
-struct nvfx_resource *nvfx_resource(struct pipe_resource *resource)
-{
-	return (struct nvfx_resource *)resource;
-}
+struct nvfx_surface {
+	struct util_dirty_surface base;
+	unsigned pitch;
+
+	struct nvfx_miptree* temp;
+};
 
 static INLINE struct nouveau_bo *
 nvfx_surface_buffer(struct pipe_surface *surf)
@@ -48,6 +86,12 @@ nvfx_surface_buffer(struct pipe_surface *surf)
 	return mt->bo;
 }
 
+static INLINE struct util_dirty_surfaces*
+nvfx_surface_get_dirty_surfaces(struct pipe_surface* surf)
+{
+	struct nvfx_miptree *mt = (struct nvfx_miptree *)surf->texture;
+	return &mt->dirty_surfaces;
+}
 
 void
 nvfx_init_resource_functions(struct pipe_context *pipe);
@@ -62,30 +106,118 @@ nvfx_screen_init_resource_functions(struct pipe_screen *pscreen);
 struct pipe_resource *
 nvfx_miptree_create(struct pipe_screen *pscreen, const struct pipe_resource *pt);
 
+void
+nvfx_miptree_destroy(struct pipe_screen *pscreen,
+                     struct pipe_resource *presource);
+
 struct pipe_resource *
 nvfx_miptree_from_handle(struct pipe_screen *pscreen,
 			 const struct pipe_resource *template,
 			 struct winsys_handle *whandle);
 
+void
+nvfx_miptree_surface_del(struct pipe_surface *ps);
+
+struct pipe_surface *
+nvfx_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_resource *pt,
+			 unsigned face, unsigned level, unsigned zslice,
+			 unsigned flags);
+
+/* only for miptrees, don't use for buffers */
+
+/* NOTE: for swizzled 3D textures, this just returns the offset of the mipmap level */
+static inline unsigned
+nvfx_subresource_offset(struct pipe_resource* pt, unsigned face, unsigned level, unsigned zslice)
+{
+	if(pt->target == PIPE_BUFFER)
+		return 0;
+	else
+	{
+		struct nvfx_miptree *mt = (struct nvfx_miptree *)pt;
+		/* start of the mipmap level; cube faces and linear 3D slices are added below */
+		unsigned offset = mt->level_offset[level];
+		if (pt->target == PIPE_TEXTURE_CUBE)
+			offset += mt->face_size * face;
+		else if (pt->target == PIPE_TEXTURE_3D && mt->linear_pitch)
+			offset += zslice * util_format_get_2d_size(pt->format, mt->linear_pitch, u_minify(pt->height0, level));
+		return offset;
+	}
+}
+
+static inline unsigned
+nvfx_subresource_pitch(struct pipe_resource* pt, unsigned level)
+{
+	if(pt->target == PIPE_BUFFER)
+		return ((struct nvfx_resource*)pt)->bo->size;
+	else
+	{
+		struct nvfx_miptree *mt = (struct nvfx_miptree *)pt;
+
+		if(mt->linear_pitch)
+			return mt->linear_pitch;
+		else
+			return util_format_get_stride(pt->format, u_minify(pt->width0, level));
+	}
+}
+
+void
+nvfx_surface_create_temp(struct pipe_context* pipe, struct pipe_surface* surf);
+
+void
+nvfx_surface_flush(struct pipe_context* pipe, struct pipe_surface* surf);
+
+struct nvfx_buffer
+{
+	struct nvfx_resource base;
+	uint8_t* data;
+	unsigned size;
+
+	/* the range of data not yet uploaded to the GPU bo */
+	unsigned dirty_begin;
+	unsigned dirty_end;
+
+	/* whether all transfers were unsynchronized */
+	boolean dirty_unsynchronized;
+
+	/* whether it would have been profitable to upload
+	 * the latest updated data to the GPU immediately */
+	boolean last_update_static;
+
+	/* how many bytes we need to draw before we deem
+	 * the buffer to be static
+	 */
+	long long bytes_to_draw_until_static;
+};
+
+static inline struct nvfx_buffer* nvfx_buffer(struct pipe_resource* pr)
+{
+	return (struct nvfx_buffer*)pr;
+}
+
+/* this is a heuristic to determine whether we are better off uploading the
+ * buffer to the GPU, or just continuing pushing it on the FIFO
+ */
+static inline boolean nvfx_buffer_seems_static(struct nvfx_buffer* buffer)
+{
+	return buffer->last_update_static
+		|| buffer->bytes_to_draw_until_static < 0;
+}
+
 struct pipe_resource *
 nvfx_buffer_create(struct pipe_screen *pscreen,
 		   const struct pipe_resource *template);
 
+void
+nvfx_buffer_destroy(struct pipe_screen *pscreen,
+                    struct pipe_resource *presource);
+
 struct pipe_resource *
 nvfx_user_buffer_create(struct pipe_screen *screen,
 			void *ptr,
 			unsigned bytes,
 			unsigned usage);
 
-
-
 void
-nvfx_miptree_surface_del(struct pipe_surface *ps);
-
-struct pipe_surface *
-nvfx_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_resource *pt,
-			 unsigned face, unsigned level, unsigned zslice,
-			 unsigned flags);
-
+nvfx_buffer_upload(struct nvfx_buffer* buffer);
 
 #endif
diff --git a/src/gallium/drivers/nvfx/nvfx_screen.c b/src/gallium/drivers/nvfx/nvfx_screen.c
index f2525ccb38..65ca265d45 100644
--- a/src/gallium/drivers/nvfx/nvfx_screen.c
+++ b/src/gallium/drivers/nvfx/nvfx_screen.c
@@ -8,23 +8,12 @@
 #include "nvfx_context.h"
 #include "nvfx_screen.h"
 #include "nvfx_resource.h"
+#include "nvfx_tex.h"
 
 #define NV30TCL_CHIPSET_3X_MASK 0x00000003
 #define NV34TCL_CHIPSET_3X_MASK 0x00000010
 #define NV35TCL_CHIPSET_3X_MASK 0x000001e0
 
-/* FIXME: It seems I should not include directly ../../winsys/drm/nouveau/drm/nouveau_drm_api.h
-* to get the pointer to the context front buffer, so I copied nouveau_winsys here.
-* nv30_screen_surface_format_supported() can then use it to enforce creating fbo
-* with same number of bits everywhere.
-*/
-struct nouveau_winsys {
-	struct pipe_winsys base;
-
-	struct pipe_screen *pscreen;
-
-	struct pipe_surface *front;
-};
 #define NV4X_GRCLASS4097_CHIPSETS 0x00000baf
 #define NV4X_GRCLASS4497_CHIPSETS 0x00005450
 #define NV6X_GRCLASS4497_CHIPSETS 0x00000088
@@ -43,7 +32,7 @@ nvfx_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_TWO_SIDED_STENCIL:
 		return 1;
 	case PIPE_CAP_GLSL:
-		return 0;
+		return 1;
 	case PIPE_CAP_ANISOTROPIC_FILTER:
 		return 1;
 	case PIPE_CAP_POINT_SPRITE:
@@ -162,77 +151,74 @@ nvfx_screen_get_paramf(struct pipe_screen *pscreen, enum pipe_cap param)
 }
 
 static boolean
-nvfx_screen_surface_format_supported(struct pipe_screen *pscreen,
+nvfx_screen_is_format_supported(struct pipe_screen *pscreen,
 				     enum pipe_format format,
 				     enum pipe_texture_target target,
 				     unsigned sample_count,
-				     unsigned tex_usage, unsigned geom_flags)
+				     unsigned bind, unsigned geom_flags)
 {
 	struct nvfx_screen *screen = nvfx_screen(pscreen);
-	struct pipe_surface *front = ((struct nouveau_winsys *) pscreen->winsys)->front;
 
 	 if (sample_count > 1)
 		return FALSE;
 
-	if (tex_usage & PIPE_BIND_RENDER_TARGET) {
+	if (bind & PIPE_BIND_RENDER_TARGET) {
 		switch (format) {
 		case PIPE_FORMAT_B8G8R8A8_UNORM:
 		case PIPE_FORMAT_B8G8R8X8_UNORM:
 		case PIPE_FORMAT_B5G6R5_UNORM:
-			return TRUE;
-		default:
 			break;
+		default:
+			return FALSE;
 		}
-	} else
-	if (tex_usage & PIPE_BIND_DEPTH_STENCIL) {
+	}
+
+	if (bind & PIPE_BIND_DEPTH_STENCIL) {
 		switch (format) {
 		case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
 		case PIPE_FORMAT_X8Z24_UNORM:
-			return TRUE;
 		case PIPE_FORMAT_Z16_UNORM:
-			/* TODO: this nv30 limitation probably does not exist */
-			if (!screen->is_nv4x && front)
-				return (front->format == PIPE_FORMAT_B5G6R5_UNORM);
-			return TRUE;
-		default:
 			break;
+		default:
+			return FALSE;
 		}
-	} else {
-		switch (format) {
-		if (tex_usage & PIPE_BIND_SAMPLER_VIEW) {
-			switch (format) {
-			case PIPE_FORMAT_DXT1_RGB:
-			case PIPE_FORMAT_DXT1_RGBA:
-			case PIPE_FORMAT_DXT3_RGBA:
-			case PIPE_FORMAT_DXT5_RGBA:
-				return util_format_s3tc_enabled;
-			default:
-				break;
-			}
+	}
+
+	if (bind & PIPE_BIND_SAMPLER_VIEW) {
+		struct nvfx_texture_format* tf = &nvfx_texture_formats[format];
+		if(util_format_is_s3tc(format) && !util_format_s3tc_enabled)
+			return FALSE;
+
+		if(screen->is_nv4x)
+		{
+			if(tf->fmt[4] < 0)
+				return FALSE;
 		}
-		case PIPE_FORMAT_B8G8R8A8_UNORM:
-		case PIPE_FORMAT_B8G8R8X8_UNORM:
-		case PIPE_FORMAT_B5G5R5A1_UNORM:
-		case PIPE_FORMAT_B4G4R4A4_UNORM:
-		case PIPE_FORMAT_B5G6R5_UNORM:
-		case PIPE_FORMAT_L8_UNORM:
-		case PIPE_FORMAT_A8_UNORM:
-		case PIPE_FORMAT_I8_UNORM:
-		case PIPE_FORMAT_L8A8_UNORM:
-		case PIPE_FORMAT_Z16_UNORM:
-		case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
-			return TRUE;
-		/* TODO: does nv30 support this? */
-		case PIPE_FORMAT_R16_SNORM:
-			return !!screen->is_nv4x;
-		default:
-			break;
+		else
+		{
+			if(tf->fmt[0] < 0)
+				return FALSE;
 		}
 	}
 
-	return FALSE;
-}
+	// note that we do actually support everything through translate
+	if (bind & PIPE_BIND_VERTEX_BUFFER) {
+		unsigned type = nvfx_vertex_formats[format];
+		if(!type)
+			return FALSE;
+	}
+
+	if (bind & PIPE_BIND_INDEX_BUFFER) {
+		// 8-bit indices supported, but not in hardware index buffer
+		if(format != PIPE_FORMAT_R16_USCALED && format != PIPE_FORMAT_R32_USCALED)
+			return FALSE;
+	}
+
+	if(bind & PIPE_BIND_STREAM_OUTPUT)
+		return FALSE;
 
+	return TRUE;
+}
 
 static void
 nvfx_screen_destroy(struct pipe_screen *pscreen)
@@ -245,7 +231,7 @@ nvfx_screen_destroy(struct pipe_screen *pscreen)
 	nouveau_notifier_free(&screen->query);
 	nouveau_notifier_free(&screen->sync);
 	nouveau_grobj_free(&screen->eng3d);
-	nv04_surface_2d_takedown(&screen->eng2d);
+	nvfx_screen_surface_takedown(pscreen);
 
 	nouveau_screen_fini(&screen->base);
 
@@ -374,6 +360,14 @@ nvfx_screen_get_vertex_buffer_flags(struct nvfx_screen* screen)
 	return vram_hack ? NOUVEAU_BO_VRAM : NOUVEAU_BO_GART;
 }
 
+static void nvfx_channel_flush_notify(struct nouveau_channel* chan)
+{
+	struct nvfx_screen* screen = chan->user_private;
+	struct nvfx_context* nvfx = screen->cur_ctx;
+	if(nvfx)
+		nvfx->relocs_needed = NVFX_RELOCATE_ALL;
+}
+
 struct pipe_screen *
 nvfx_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 {
@@ -395,12 +389,15 @@ nvfx_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 		return NULL;
 	}
 	chan = screen->base.channel;
+	screen->cur_ctx = NULL;
+	chan->user_private = screen;
+	chan->flush_notify = nvfx_channel_flush_notify;
 
 	pscreen->winsys = ws;
 	pscreen->destroy = nvfx_screen_destroy;
 	pscreen->get_param = nvfx_screen_get_param;
 	pscreen->get_paramf = nvfx_screen_get_paramf;
-	pscreen->is_format_supported = nvfx_screen_surface_format_supported;
+	pscreen->is_format_supported = nvfx_screen_is_format_supported;
 	pscreen->context_create = nvfx_create;
 
 	switch (dev->chipset & 0xf0) {
@@ -432,6 +429,11 @@ nvfx_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 	}
 
 	screen->force_swtnl = debug_get_bool_option("NOUVEAU_SWTNL", FALSE);
+	screen->trace_draw = debug_get_bool_option("NVFX_TRACE_DRAW", FALSE);
+
+	screen->buffer_allocation_cost = debug_get_num_option("NVFX_BUFFER_ALLOCATION_COST", 16384);
+	screen->inline_cost_per_hardware_cost = atof(debug_get_option("NVFX_INLINE_COST_PER_HARDWARE_COST", "1.0"));
+	screen->static_reuse_threshold = atof(debug_get_option("NVFX_STATIC_REUSE_THRESHOLD", "2.0"));
 
 	screen->vertex_buffer_reloc_flags = nvfx_screen_get_vertex_buffer_flags(screen);
 
@@ -451,8 +453,7 @@ nvfx_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 	}
 
 	/* 2D engine setup */
-	screen->eng2d = nv04_surface_2d_init(&screen->base);
-	screen->eng2d->buf = nvfx_surface_buffer;
+	nvfx_screen_surface_init(pscreen);
 
 	/* Notifier for sync purposes */
 	ret = nouveau_notifier_alloc(chan, 0xbeef0301, 1, &screen->sync);
diff --git a/src/gallium/drivers/nvfx/nvfx_screen.h b/src/gallium/drivers/nvfx/nvfx_screen.h
index 5e1c3945ae..1b79235ae0 100644
--- a/src/gallium/drivers/nvfx/nvfx_screen.h
+++ b/src/gallium/drivers/nvfx/nvfx_screen.h
@@ -1,11 +1,11 @@
 #ifndef __NVFX_SCREEN_H__
 #define __NVFX_SCREEN_H__
 
+#include "pipe/p_compiler.h"
 #include "util/u_double_list.h"
 #include "nouveau/nouveau_screen.h"
-#include "nv04_surface_2d.h"
 
-struct nvfx_context;
+struct pipe_screen;
 
 struct nvfx_screen {
 	struct nouveau_screen base;
@@ -16,11 +16,11 @@ struct nvfx_screen {
 
 	unsigned is_nv4x; /* either 0 or ~0 */
 	boolean force_swtnl;
+	boolean trace_draw;
 	unsigned vertex_buffer_reloc_flags;
 	unsigned index_buffer_reloc_flags;
 
 	/* HW graphics objects */
-	struct nv04_surface_2d *eng2d;
 	struct nouveau_grobj *eng3d;
 	struct nouveau_notifier *sync;
 
@@ -32,6 +32,20 @@ struct nvfx_screen {
 	/* Vtxprog resources */
 	struct nouveau_resource *vp_exec_heap;
 	struct nouveau_resource *vp_data_heap;
+
+	struct nv04_2d_context* eng2d;
+
+	/* Once the amount of bytes drawn from the buffer reaches the updated size times this value,
+	 * we will assume that the buffer will be drawn a huge number of times before the
+	 * next modification
+	 */
+	float static_reuse_threshold;
+
+	/* Cost of allocating a buffer in terms of the cost of copying a byte to a hardware buffer */
+	unsigned buffer_allocation_cost;
+
+	/* inline_cost/hardware_cost conversion ratio */
+	float inline_cost_per_hardware_cost;
 };
 
 static INLINE struct nvfx_screen *
@@ -40,4 +54,7 @@ nvfx_screen(struct pipe_screen *screen)
 	return (struct nvfx_screen *)screen;
 }
 
+int nvfx_screen_surface_init(struct pipe_screen *pscreen);
+void nvfx_screen_surface_takedown(struct pipe_screen *pscreen);
+
 #endif
diff --git a/src/gallium/drivers/nvfx/nvfx_shader.h b/src/gallium/drivers/nvfx/nvfx_shader.h
index 50830b3916..35006eec3d 100644
--- a/src/gallium/drivers/nvfx/nvfx_shader.h
+++ b/src/gallium/drivers/nvfx/nvfx_shader.h
@@ -1,6 +1,12 @@
 #ifndef __NVFX_SHADER_H__
 #define __NVFX_SHADER_H__
 
+#include <stdint.h>
+
+#include "pipe/p_compiler.h"
+
+#define NVFX_SWZ_IDENTITY ((3 << 6) | (2 << 4) | (1 << 2) | (0 << 0))
+
 /* this will resolve to either the NV30 or the NV40 version
  * depending on the current hardware */
 /* unusual, but very fast and compact method */
@@ -71,11 +77,58 @@
 /*
  * Each fragment program opcode appears to be comprised of 4 32-bit values.
  *
- *   0 - Opcode, output reg/mask, ATTRIB source
- *   1 - Source 0
- *   2 - Source 1
- *   3 - Source 2
+ * 0: OPDEST
+ * 	0: program end
+ * 	1-6: destination register
+ * 	7: destination register is fp16?? (use for outputs)
+ * 	8: set condition code
+ * 	9: writemask x
+ *  	10: writemask y
+ *  	11: writemask z
+ *  	12: writemask w
+ *  	13-16: source attribute register number (e.g. COL0)
+ *  	17-20: texture unit number
+ *  	21: expand value on texture operation (x -> 2x - 1)
+ *  	22-23: precision 0 = fp32, 1 = fp16, 2 = s1.10 fixed, 3 = s0.8 fixed (nv40-only)
+ * 	24-29: opcode
+ * 	30: no destination
+ * 	31: saturate
+ * 1 - SRC0
+ * 	0-17: see common source fields
+ * 	18: execute if condition code less
+ * 	19: execute if condition code equal
+ * 	20: execute if condition code greater
+ * 	21-22: condition code swizzle x source component
+ * 	23-24: condition code swizzle y source component
+ * 	25-26: condition code swizzle z source component
+ * 	27-28: condition code swizzle w source component
+ * 	29: source 0 absolute
+ * 	30: always 0 in renouveau tests
+ * 	31: always 0 in renouveau tests
+ * 2 - SRC1
+ * 	0-17: see common source fields
+ * 	18: source 1 absolute
+ * 	19-20: input precision 0 = fp32, 1 = fp16, 2 = s1.10 fixed, 3 = ???
+ * 	21-27: always 0 in renouveau tests
+ * 	28-30: scale (0 = 1x, 1 = 2x, 2 = 4x, 3 = 8x, 4 = ???, 5 = 1/2, 6 = 1/4, 7 = 1/8)
+ * 	31: opcode is branch
+ * 3 - SRC2
+ * 	0-17: see common source fields
+ * 	18: source 2 absolute
+ * 	19-29: address register displacement
+ * 	30: use index register
+ * 	31: disable perspective-correct interpolation?
  *
+ * Common fields of SRC0, SRC1, SRC2
+ * 	0-1: source register type (0 = temp, 1 = input, 2 = immediate, 3 = ???)
+ * 	2-7: source temp register index
+ * 	8: source register is fp16??
+ * 	9-10: source swizzle x source component
+ * 	11-12: source swizzle y source component
+ * 	13-14: source swizzle z source component
+ * 	15-16: source swizzle w source component
+ * 	17: negate
+ *
  * There appears to be no special difference between result regs and temp regs.
  *     result.color == R0.xyzw
  *     result.depth == R1.z
@@ -210,6 +263,7 @@
 
 /* NV40 only fragment program opcodes */
 #define NVFX_FP_OP_OPCODE_TXL_NV40 0x2F
+
 /* The use of these instructions appears to be indicated by bit 31 of DWORD 2.*/
 #define NV40_FP_OP_BRA_OPCODE_BRK                                    0x0
 #define NV40_FP_OP_BRA_OPCODE_CAL                                    0x1
@@ -218,10 +272,11 @@
 #define NV40_FP_OP_BRA_OPCODE_REP                                    0x4
 #define NV40_FP_OP_BRA_OPCODE_RET                                    0x5
 
+#define NV40_FP_OP_OUT_NONE         (1 << 30)
 #define NVFX_FP_OP_OUT_SAT          (1 << 31)
 
 /* high order bits of SRC0 */
-#define NVFX_FP_OP_OUT_ABS          (1 << 29)
+#define NVFX_FP_OP_SRC0_ABS          (1 << 29)
 #define NVFX_FP_OP_COND_SWZ_W_SHIFT        27
 #define NVFX_FP_OP_COND_SWZ_W_MASK        (3 << 27)
 #define NVFX_FP_OP_COND_SWZ_Z_SHIFT        25
@@ -254,6 +309,7 @@
 #define NVFX_FP_OP_DST_SCALE_INV_2X                                            5
 #define NVFX_FP_OP_DST_SCALE_INV_4X                                            6
 #define NVFX_FP_OP_DST_SCALE_INV_8X                                            7
+#define NVFX_FP_OP_SRC1_ABS          (1 << 18)
 
 /* SRC1 LOOP */
 #define NV40_FP_OP_LOOP_INCR_SHIFT                                            19
@@ -263,13 +319,13 @@
 #define NV40_FP_OP_LOOP_COUNT_SHIFT                                            2
 #define NV40_FP_OP_LOOP_COUNT_MASK                                   (0xFF << 2)
 
-/* SRC1 IF */
-#define NV40_FP_OP_ELSE_ID_SHIFT                                               2
-#define NV40_FP_OP_ELSE_ID_MASK                                      (0xFF << 2)
+/* SRC1 IF: absolute offset in dwords */
+#define NV40_FP_OP_ELSE_OFFSET_SHIFT                                           0
+#define NV40_FP_OP_ELSE_OFFSET_MASK                             (0x7FFFFFFF << 0)
 
 /* SRC1 CAL */
-#define NV40_FP_OP_IADDR_SHIFT                                                 2
-#define NV40_FP_OP_IADDR_MASK                                        (0xFF << 2)
+#define NV40_FP_OP_SUB_OFFSET_SHIFT                                                 0
+#define NV40_FP_OP_SUB_OFFSET_MASK                                   (0x7FFFFFFF << 0)
 
 /* SRC1 REP
  *   I have no idea why there are 3 count values here..  but they
@@ -283,9 +339,9 @@
 #define NV40_FP_OP_REP_COUNT3_SHIFT                                           19
 #define NV40_FP_OP_REP_COUNT3_MASK                                  (0xFF << 19)
 
-/* SRC2 REP/IF */
-#define NV40_FP_OP_END_ID_SHIFT                                                2
-#define NV40_FP_OP_END_ID_MASK                                       (0xFF << 2)
+/* SRC2 REP/IF: absolute offset in dwords */
+#define NV40_FP_OP_END_OFFSET_SHIFT                                            0
+#define NV40_FP_OP_END_OFFSET_MASK                              (0x7FFFFFFF << 0)
 
 /* high order bits of SRC2 */
 #define NVFX_FP_OP_INDEX_INPUT          (1 << 30)
@@ -323,6 +379,7 @@
 #define NVFXSR_INPUT	2
 #define NVFXSR_TEMP	3
 #define NVFXSR_CONST	4
+#define NVFXSR_RELOCATED	5
 
 #define NVFX_COND_FL  0
 #define NVFX_COND_LT  1
@@ -352,51 +409,88 @@
 #define NVFX_SWZ_Z 2
 #define NVFX_SWZ_W 3
 
-#define swz(s,x,y,z,w) nvfx_sr_swz((s), NVFX_SWZ_##x, NVFX_SWZ_##y, NVFX_SWZ_##z, NVFX_SWZ_##w)
-#define neg(s) nvfx_sr_neg((s))
-#define abs(s) nvfx_sr_abs((s))
-#define scale(s,v) nvfx_sr_scale((s), NVFX_FP_OP_DST_SCALE_##v)
+#define swz(s,x,y,z,w) nvfx_src_swz((s), NVFX_SWZ_##x, NVFX_SWZ_##y, NVFX_SWZ_##z, NVFX_SWZ_##w)
+#define neg(s) nvfx_src_neg((s))
+#define abs(s) nvfx_src_abs((s))
 
-struct nvfx_sreg {
-	int type;
-	int index;
+struct nvfx_reg {
+	uint8_t type;
+	uint32_t index;
+};
 
-	int dst_scale;
+struct nvfx_src {
+	struct nvfx_reg reg;
 
-	int negate;
-	int abs;
-	int swz[4];
+	/* src only */
+	uint8_t negate : 1;
+	uint8_t abs : 1;
+	uint8_t swz[4];
+};
 
-	int cc_update;
-	int cc_update_reg;
-	int cc_test;
-	int cc_test_reg;
-	int cc_swz[4];
+struct nvfx_insn
+{
+	uint8_t op;
+	char scale;
+	int8_t unit;
+	uint8_t mask;
+	uint8_t cc_swz[4];
+
+	uint8_t sat : 1;
+	uint8_t cc_update : 1;
+	uint8_t cc_update_reg : 1;
+	uint8_t cc_test : 3;
+	uint8_t cc_test_reg : 1;
+
+	struct nvfx_reg dst;
+	struct nvfx_src src[3];
 };
 
-static INLINE struct nvfx_sreg
-nvfx_sr(int type, int index)
+static INLINE struct nvfx_insn
+nvfx_insn(boolean sat, unsigned op, int unit, struct nvfx_reg dst, unsigned mask, struct nvfx_src s0, struct nvfx_src s1, struct nvfx_src s2)
 {
-	struct nvfx_sreg temp = {
-		.type = type,
-		.index = index,
-		.dst_scale = 0,
-		.abs = 0,
-		.negate = 0,
-		.swz = { 0, 1, 2, 3 },
+	struct nvfx_insn insn = {
+		.op = op,
+		.scale = 0,
+		.unit = unit,
+		.sat = sat,
+		.mask = mask,
 		.cc_update = 0,
 		.cc_update_reg = 0,
 		.cc_test = NVFX_COND_TR,
 		.cc_test_reg = 0,
 		.cc_swz = { 0, 1, 2, 3 },
+		.dst = dst,
+		.src = {s0, s1, s2}
+	};
+	return insn;
+}
+
+static INLINE struct nvfx_reg
+nvfx_reg(int type, int index)
+{
+	struct nvfx_reg temp = {
+		.type = type,
+		.index = index,
 	};
 	return temp;
 }
 
-static INLINE struct nvfx_sreg
-nvfx_sr_swz(struct nvfx_sreg src, int x, int y, int z, int w)
+static INLINE struct nvfx_src
+nvfx_src(struct nvfx_reg reg)
 {
-	struct nvfx_sreg dst = src;
+	struct nvfx_src temp = {
+		.reg = reg,
+		.abs = 0,
+		.negate = 0,
+		.swz = { 0, 1, 2, 3 },
+	};
+	return temp;
+}
+
+static INLINE struct nvfx_src
+nvfx_src_swz(struct nvfx_src src, int x, int y, int z, int w)
+{
+	struct nvfx_src dst = src;
 
 	dst.swz[NVFX_SWZ_X] = src.swz[x];
 	dst.swz[NVFX_SWZ_Y] = src.swz[y];
@@ -405,25 +499,23 @@ nvfx_sr_swz(struct nvfx_sreg src, int x, int y, int z, int w)
 	return dst;
 }
 
-static INLINE struct nvfx_sreg
-nvfx_sr_neg(struct nvfx_sreg src)
+static INLINE struct nvfx_src
+nvfx_src_neg(struct nvfx_src src)
 {
 	src.negate = !src.negate;
 	return src;
 }
 
-static INLINE struct nvfx_sreg
-nvfx_sr_abs(struct nvfx_sreg src)
+static INLINE struct nvfx_src
+nvfx_src_abs(struct nvfx_src src)
 {
 	src.abs = 1;
 	return src;
 }
 
-static INLINE struct nvfx_sreg
-nvfx_sr_scale(struct nvfx_sreg src, int scale)
-{
-	src.dst_scale = scale;
-	return src;
-}
+struct nvfx_relocation {
+        unsigned location;
+        unsigned target;
+};
 
 #endif
diff --git a/src/gallium/drivers/nvfx/nvfx_state.c b/src/gallium/drivers/nvfx/nvfx_state.c
index cd58e439d7..5bd7dc07f0 100644
--- a/src/gallium/drivers/nvfx/nvfx_state.c
+++ b/src/gallium/drivers/nvfx/nvfx_state.c
@@ -1,6 +1,7 @@
 #include "pipe/p_state.h"
 #include "pipe/p_defines.h"
 #include "util/u_inlines.h"
+#include "util/u_framebuffer.h"
 
 #include "draw/draw_context.h"
 
@@ -81,111 +82,6 @@ nvfx_blend_state_delete(struct pipe_context *pipe, void *hwcso)
 }
 
 static void *
-nvfx_sampler_state_create(struct pipe_context *pipe,
-			  const struct pipe_sampler_state *cso)
-{
-	struct nvfx_context *nvfx = nvfx_context(pipe);
-	struct nvfx_sampler_state *ps;
-
-	ps = MALLOC(sizeof(struct nvfx_sampler_state));
-
-	/* on nv30, we use this as an internal flag */
-	ps->fmt = cso->normalized_coords ? 0 : NV40TCL_TEX_FORMAT_RECT;
-	ps->en = 0;
-	ps->filt = nvfx_tex_filter(cso);
-	ps->wrap = (nvfx_tex_wrap_mode(cso->wrap_s) << NV34TCL_TX_WRAP_S_SHIFT) |
-		    (nvfx_tex_wrap_mode(cso->wrap_t) << NV34TCL_TX_WRAP_T_SHIFT) |
-		    (nvfx_tex_wrap_mode(cso->wrap_r) << NV34TCL_TX_WRAP_R_SHIFT) |
-		    nvfx_tex_wrap_compare_mode(cso);
-	ps->bcol = nvfx_tex_border_color(cso->border_color);
-
-	if(nvfx->is_nv4x)
-		nv40_sampler_state_init(pipe, ps, cso);
-	else
-		nv30_sampler_state_init(pipe, ps, cso);
-
-	return (void *)ps;
-}
-
-static void
-nvfx_sampler_state_bind(struct pipe_context *pipe, unsigned nr, void **sampler)
-{
-	struct nvfx_context *nvfx = nvfx_context(pipe);
-	unsigned unit;
-
-	for (unit = 0; unit < nr; unit++) {
-		nvfx->tex_sampler[unit] = sampler[unit];
-		nvfx->dirty_samplers |= (1 << unit);
-	}
-
-	for (unit = nr; unit < nvfx->nr_samplers; unit++) {
-		nvfx->tex_sampler[unit] = NULL;
-		nvfx->dirty_samplers |= (1 << unit);
-	}
-
-	nvfx->nr_samplers = nr;
-	nvfx->dirty |= NVFX_NEW_SAMPLER;
-}
-
-static void
-nvfx_sampler_state_delete(struct pipe_context *pipe, void *hwcso)
-{
-	FREE(hwcso);
-}
-
-static void
-nvfx_set_fragment_sampler_views(struct pipe_context *pipe,
-				unsigned nr,
-				struct pipe_sampler_view **views)
-{
-	struct nvfx_context *nvfx = nvfx_context(pipe);
-	unsigned unit;
-
-	for (unit = 0; unit < nr; unit++) {
-		pipe_sampler_view_reference(&nvfx->fragment_sampler_views[unit],
-                                            views[unit]);
-		nvfx->dirty_samplers |= (1 << unit);
-	}
-
-	for (unit = nr; unit < nvfx->nr_textures; unit++) {
-		pipe_sampler_view_reference(&nvfx->fragment_sampler_views[unit],
-                                            NULL);
-		nvfx->dirty_samplers |= (1 << unit);
-	}
-
-	nvfx->nr_textures = nr;
-	nvfx->dirty |= NVFX_NEW_SAMPLER;
-}
-
-
-static struct pipe_sampler_view *
-nvfx_create_sampler_view(struct pipe_context *pipe,
-			 struct pipe_resource *texture,
-			 const struct pipe_sampler_view *templ)
-{
-	struct pipe_sampler_view *view = CALLOC_STRUCT(pipe_sampler_view);
-
-	if (view) {
-		*view = *templ;
-		view->reference.count = 1;
-		view->texture = NULL;
-		pipe_resource_reference(&view->texture, texture);
-		view->context = pipe;
-	}
-
-	return view;
-}
-
-
-static void
-nvfx_sampler_view_destroy(struct pipe_context *pipe,
-			  struct pipe_sampler_view *view)
-{
-	pipe_resource_reference(&view->texture, NULL);
-	FREE(view);
-}
-
-static void *
 nvfx_rasterizer_state_create(struct pipe_context *pipe,
 			     const struct pipe_rasterizer_state *cso)
 {
@@ -195,6 +91,7 @@ nvfx_rasterizer_state_create(struct pipe_context *pipe,
 	/*XXX: ignored:
 	 * 	point_smooth -nohw
 	 * 	multisample
+	 *     sprite_coord_origin
 	 */
 
 	sb_method(sb, NV34TCL_SHADE_MODEL, 1);
@@ -254,19 +151,8 @@ nvfx_rasterizer_state_create(struct pipe_context *pipe,
 		sb_data(sb, fui(cso->offset_units * 2));
 	}
 
-	sb_method(sb, NV34TCL_POINT_SPRITE, 1);
-	if (cso->point_quad_rasterization) {
-		unsigned psctl = (1 << 0), i;
-
-		for (i = 0; i < 8; i++) {
-			if ((cso->sprite_coord_enable >> i) & 1)
-				psctl |= (1 << (8 + i));
-		}
-
-		sb_data(sb, psctl);
-	} else {
-		sb_data(sb, 0);
-	}
+	sb_method(sb, NV34TCL_FLATSHADE_FIRST, 1);
+	sb_data(sb, cso->flatshade_first);
 
 	rsso->pipe = *cso;
 	rsso->sb_len = sb_len(sb, rsso->sb);
@@ -287,11 +173,11 @@ nvfx_rasterizer_state_bind(struct pipe_context *pipe, void *hwcso)
 			nvfx->draw_dirty |= NVFX_NEW_SCISSOR;
 		}
 
-		if(((struct nvfx_rasterizer_state*)hwcso)->pipe.poly_stipple_enable
-					!= nvfx->rasterizer->pipe.poly_stipple_enable)
+		if(((struct nvfx_rasterizer_state*)hwcso)->pipe.point_quad_rasterization != nvfx->rasterizer->pipe.point_quad_rasterization
+				|| ((struct nvfx_rasterizer_state*)hwcso)->pipe.sprite_coord_enable != nvfx->rasterizer->pipe.sprite_coord_enable
+				|| ((struct nvfx_rasterizer_state*)hwcso)->pipe.sprite_coord_mode != nvfx->rasterizer->pipe.sprite_coord_mode)
 		{
-			nvfx->dirty |= NVFX_NEW_STIPPLE;
-			nvfx->draw_dirty |= NVFX_NEW_STIPPLE;
+			nvfx->dirty |= NVFX_NEW_SPRITE;
 		}
 	}
 
@@ -315,10 +201,8 @@ nvfx_depth_stencil_alpha_state_create(struct pipe_context *pipe,
 	struct nvfx_zsa_state *zsaso = CALLOC(1, sizeof(*zsaso));
 	struct nouveau_statebuf_builder sb = sb_init(zsaso->sb);
 
-	sb_method(sb, NV34TCL_DEPTH_FUNC, 3);
+	sb_method(sb, NV34TCL_DEPTH_FUNC, 1);
 	sb_data  (sb, nvgl_comparison_op(cso->depth.func));
-	sb_data  (sb, cso->depth.writemask ? 1 : 0);
-	sb_data  (sb, cso->depth.enabled ? 1 : 0);
 
 	sb_method(sb, NV34TCL_ALPHA_FUNC_ENABLE, 3);
 	sb_data  (sb, cso->alpha.enabled ? 1 : 0);
@@ -377,76 +261,6 @@ nvfx_depth_stencil_alpha_state_delete(struct pipe_context *pipe, void *hwcso)
 	FREE(zsaso);
 }
 
-static void *
-nvfx_vp_state_create(struct pipe_context *pipe,
-		     const struct pipe_shader_state *cso)
-{
-	struct nvfx_context *nvfx = nvfx_context(pipe);
-	struct nvfx_vertex_program *vp;
-
-	vp = CALLOC(1, sizeof(struct nvfx_vertex_program));
-	vp->pipe.tokens = tgsi_dup_tokens(cso->tokens);
-	vp->draw = draw_create_vertex_shader(nvfx->draw, &vp->pipe);
-
-	return (void *)vp;
-}
-
-static void
-nvfx_vp_state_bind(struct pipe_context *pipe, void *hwcso)
-{
-	struct nvfx_context *nvfx = nvfx_context(pipe);
-
-	nvfx->vertprog = hwcso;
-	nvfx->dirty |= NVFX_NEW_VERTPROG;
-	nvfx->draw_dirty |= NVFX_NEW_VERTPROG;
-}
-
-static void
-nvfx_vp_state_delete(struct pipe_context *pipe, void *hwcso)
-{
-	struct nvfx_context *nvfx = nvfx_context(pipe);
-	struct nvfx_vertex_program *vp = hwcso;
-
-	draw_delete_vertex_shader(nvfx->draw, vp->draw);
-	nvfx_vertprog_destroy(nvfx, vp);
-	FREE((void*)vp->pipe.tokens);
-	FREE(vp);
-}
-
-static void *
-nvfx_fp_state_create(struct pipe_context *pipe,
-		     const struct pipe_shader_state *cso)
-{
-	struct nvfx_fragment_program *fp;
-
-	fp = CALLOC(1, sizeof(struct nvfx_fragment_program));
-	fp->pipe.tokens = tgsi_dup_tokens(cso->tokens);
-
-	tgsi_scan_shader(fp->pipe.tokens, &fp->info);
-
-	return (void *)fp;
-}
-
-static void
-nvfx_fp_state_bind(struct pipe_context *pipe, void *hwcso)
-{
-	struct nvfx_context *nvfx = nvfx_context(pipe);
-
-	nvfx->fragprog = hwcso;
-	nvfx->dirty |= NVFX_NEW_FRAGPROG;
-}
-
-static void
-nvfx_fp_state_delete(struct pipe_context *pipe, void *hwcso)
-{
-	struct nvfx_context *nvfx = nvfx_context(pipe);
-	struct nvfx_fragment_program *fp = hwcso;
-
-	nvfx_fragprog_destroy(nvfx, fp);
-	FREE((void*)fp->pipe.tokens);
-	FREE(fp);
-}
-
 static void
 nvfx_set_blend_color(struct pipe_context *pipe,
 		     const struct pipe_blend_color *bcol)
@@ -507,7 +321,10 @@ nvfx_set_framebuffer_state(struct pipe_context *pipe,
 {
 	struct nvfx_context *nvfx = nvfx_context(pipe);
 
-	nvfx->framebuffer = *fb;
+	if(fb)
+		util_copy_framebuffer_state(&nvfx->framebuffer, fb);
+	else
+		util_unreference_framebuffer_state(&nvfx->framebuffer);
 	nvfx->dirty |= NVFX_NEW_FB;
 }
 
@@ -542,65 +359,6 @@ nvfx_set_viewport_state(struct pipe_context *pipe,
 	nvfx->draw_dirty |= NVFX_NEW_VIEWPORT;
 }
 
-static void
-nvfx_set_vertex_buffers(struct pipe_context *pipe, unsigned count,
-			const struct pipe_vertex_buffer *vb)
-{
-	struct nvfx_context *nvfx = nvfx_context(pipe);
-
-	memcpy(nvfx->vtxbuf, vb, sizeof(*vb) * count);
-	nvfx->vtxbuf_nr = count;
-
-	nvfx->dirty |= NVFX_NEW_ARRAYS;
-	nvfx->draw_dirty |= NVFX_NEW_ARRAYS;
-}
-
-static void
-nvfx_set_index_buffer(struct pipe_context *pipe,
-		      const struct pipe_index_buffer *ib)
-{
-	struct nvfx_context *nvfx = nvfx_context(pipe);
-
-	if (ib)
-		memcpy(&nvfx->idxbuf, ib, sizeof(nvfx->idxbuf));
-	else
-		memset(&nvfx->idxbuf, 0, sizeof(nvfx->idxbuf));
-
-	/* TODO make this more like a state */
-}
-
-static void *
-nvfx_vtxelts_state_create(struct pipe_context *pipe,
-			  unsigned num_elements,
-			  const struct pipe_vertex_element *elements)
-{
-	struct nvfx_vtxelt_state *cso = CALLOC_STRUCT(nvfx_vtxelt_state);
-
-	assert(num_elements < 16); /* not doing fallbacks yet */
-	cso->num_elements = num_elements;
-	memcpy(cso->pipe, elements, num_elements * sizeof(*elements));
-
-/*	nvfx_vtxelt_construct(cso);*/
-
-	return (void *)cso;
-}
-
-static void
-nvfx_vtxelts_state_delete(struct pipe_context *pipe, void *hwcso)
-{
-	FREE(hwcso);
-}
-
-static void
-nvfx_vtxelts_state_bind(struct pipe_context *pipe, void *hwcso)
-{
-	struct nvfx_context *nvfx = nvfx_context(pipe);
-
-	nvfx->vtxelt = hwcso;
-	nvfx->dirty |= NVFX_NEW_ARRAYS;
-	/*nvfx->draw_dirty |= NVFX_NEW_ARRAYS;*/
-}
-
 void
 nvfx_init_state_functions(struct nvfx_context *nvfx)
 {
@@ -608,13 +366,6 @@ nvfx_init_state_functions(struct nvfx_context *nvfx)
 	nvfx->pipe.bind_blend_state = nvfx_blend_state_bind;
 	nvfx->pipe.delete_blend_state = nvfx_blend_state_delete;
 
-	nvfx->pipe.create_sampler_state = nvfx_sampler_state_create;
-	nvfx->pipe.bind_fragment_sampler_states = nvfx_sampler_state_bind;
-	nvfx->pipe.delete_sampler_state = nvfx_sampler_state_delete;
-	nvfx->pipe.set_fragment_sampler_views = nvfx_set_fragment_sampler_views;
-        nvfx->pipe.create_sampler_view = nvfx_create_sampler_view;
-        nvfx->pipe.sampler_view_destroy = nvfx_sampler_view_destroy;
-
 	nvfx->pipe.create_rasterizer_state = nvfx_rasterizer_state_create;
 	nvfx->pipe.bind_rasterizer_state = nvfx_rasterizer_state_bind;
 	nvfx->pipe.delete_rasterizer_state = nvfx_rasterizer_state_delete;
@@ -626,14 +377,6 @@ nvfx_init_state_functions(struct nvfx_context *nvfx)
 	nvfx->pipe.delete_depth_stencil_alpha_state =
 		nvfx_depth_stencil_alpha_state_delete;
 
-	nvfx->pipe.create_vs_state = nvfx_vp_state_create;
-	nvfx->pipe.bind_vs_state = nvfx_vp_state_bind;
-	nvfx->pipe.delete_vs_state = nvfx_vp_state_delete;
-
-	nvfx->pipe.create_fs_state = nvfx_fp_state_create;
-	nvfx->pipe.bind_fs_state = nvfx_fp_state_bind;
-	nvfx->pipe.delete_fs_state = nvfx_fp_state_delete;
-
 	nvfx->pipe.set_blend_color = nvfx_set_blend_color;
         nvfx->pipe.set_stencil_ref = nvfx_set_stencil_ref;
 	nvfx->pipe.set_clip_state = nvfx_set_clip_state;
@@ -643,11 +386,4 @@ nvfx_init_state_functions(struct nvfx_context *nvfx)
 	nvfx->pipe.set_polygon_stipple = nvfx_set_polygon_stipple;
 	nvfx->pipe.set_scissor_state = nvfx_set_scissor_state;
 	nvfx->pipe.set_viewport_state = nvfx_set_viewport_state;
-
-	nvfx->pipe.create_vertex_elements_state = nvfx_vtxelts_state_create;
-	nvfx->pipe.delete_vertex_elements_state = nvfx_vtxelts_state_delete;
-	nvfx->pipe.bind_vertex_elements_state = nvfx_vtxelts_state_bind;
-
-	nvfx->pipe.set_vertex_buffers = nvfx_set_vertex_buffers;
-	nvfx->pipe.set_index_buffer = nvfx_set_index_buffer;
 }
diff --git a/src/gallium/drivers/nvfx/nvfx_state.h b/src/gallium/drivers/nvfx/nvfx_state.h
index 9ceb2577ec..e9c1f2c26d 100644
--- a/src/gallium/drivers/nvfx/nvfx_state.h
+++ b/src/gallium/drivers/nvfx/nvfx_state.h
@@ -4,11 +4,11 @@
 #include "pipe/p_state.h"
 #include "tgsi/tgsi_scan.h"
 #include "nouveau/nouveau_statebuf.h"
+#include "util/u_dynarray.h"
+#include "util/u_linkage.h"
 
 struct nvfx_vertex_program_exec {
 	uint32_t data[4];
-	boolean has_branch_offset;
-	int const_index;
 };
 
 struct nvfx_vertex_program_data {
@@ -18,18 +18,20 @@ struct nvfx_vertex_program_data {
 
 struct nvfx_vertex_program {
 	struct pipe_shader_state pipe;
+	unsigned long long id;
 
 	struct draw_vertex_shader *draw;
 
 	boolean translated;
 
-	struct pipe_clip_state ucp;
-
 	struct nvfx_vertex_program_exec *insns;
 	unsigned nr_insns;
 	struct nvfx_vertex_program_data *consts;
 	unsigned nr_consts;
 
+	char generic_to_fp_input[256];
+	int sprite_fp_input;
+
 	struct nouveau_resource *exec;
 	unsigned exec_start;
 	struct nouveau_resource *data;
@@ -38,7 +40,10 @@ struct nvfx_vertex_program {
 
 	uint32_t ir;
 	uint32_t or;
-	uint32_t clip_ctrl;
+	int clip_nr;
+
+	struct util_dynarray branch_relocs;
+	struct util_dynarray const_relocs;
 };
 
 struct nvfx_fragment_program_data {
@@ -49,15 +54,14 @@ struct nvfx_fragment_program_data {
 struct nvfx_fragment_program_bo {
 	struct nvfx_fragment_program_bo* next;
 	struct nouveau_bo* bo;
+	unsigned char* slots;
 	char insn[] __attribute__((aligned(16)));
 };
 
 struct nvfx_fragment_program {
-	struct pipe_shader_state pipe;
-	struct tgsi_shader_info info;
-
-	boolean translated;
 	unsigned samplers;
+	unsigned point_sprite_control;
+	unsigned or;
 
 	uint32_t *insn;
 	int       insn_len;
@@ -65,13 +69,36 @@ struct nvfx_fragment_program {
 	struct nvfx_fragment_program_data *consts;
 	unsigned nr_consts;
 
+	/* the slot at num_slots is for the sprite coordinate, if any */
+	unsigned num_slots; /* how many input semantics? */
+	unsigned char slot_to_generic[10]; /* semantics */
+	unsigned char slot_to_fp_input[11]; /* current assignment of slots for each used semantic */
+	struct util_dynarray slot_relocations[11];
+
+	/* This is reset to progs on any relocation update, and decreases every time we
+	 * move to a new prog due to a constant update.
+	 * When this reaches 0, applying relocations is presumably no longer necessary (NOTE(review): this said "the same as progs", which is its value right after a reset - confirm).
+	 */
+	unsigned progs_left_with_obsolete_slot_assignments;
+
+	unsigned long long last_vp_id;
+	unsigned last_sprite_coord_enable;
+
 	uint32_t fp_control;
 
 	unsigned bo_prog_idx;
 	unsigned prog_size;
 	unsigned progs_per_bo;
+	unsigned progs;
+
 	struct nvfx_fragment_program_bo* fpbo;
 };
 
+struct nvfx_pipe_fragment_program {
+        struct pipe_shader_state pipe;
+        struct tgsi_shader_info info;
+
+        struct nvfx_fragment_program* fps[2];
+};
 
 #endif
diff --git a/src/gallium/drivers/nvfx/nvfx_state_emit.c b/src/gallium/drivers/nvfx/nvfx_state_emit.c
index f91ae19ecd..390bca8cdb 100644
--- a/src/gallium/drivers/nvfx/nvfx_state_emit.c
+++ b/src/gallium/drivers/nvfx/nvfx_state_emit.c
@@ -1,15 +1,54 @@
 #include "nvfx_context.h"
 #include "nvfx_state.h"
+#include "nvfx_resource.h"
 #include "draw/draw_context.h"
 
 static boolean
 nvfx_state_validate_common(struct nvfx_context *nvfx)
 {
 	struct nouveau_channel* chan = nvfx->screen->base.channel;
-	unsigned dirty = nvfx->dirty;
+	unsigned dirty;
+	unsigned still_dirty = 0;
+	int all_swizzled = -1;
+	boolean flush_tex_cache = FALSE;
+	unsigned render_temps;
 
 	if(nvfx != nvfx->screen->cur_ctx)
-		dirty = ~0;
+	{
+		nvfx->dirty = ~0;
+		nvfx->hw_vtxelt_nr = 16;
+		nvfx->hw_pointsprite_control = -1;
+		nvfx->hw_vp_output = -1;
+		nvfx->screen->cur_ctx = nvfx;
+		nvfx->relocs_needed = NVFX_RELOCATE_ALL;
+	}
+
+	/* These can trigger the use of the 3D engine to copy temporaries.
+	 * That will recurse here and thus dirty all 3D state, so we need to do this before anything else, and in a loop.
+	 * This converges to having clean temps, then binding both fragtexes and framebuffers.
+	 */
+	while(nvfx->dirty & (NVFX_NEW_FB | NVFX_NEW_SAMPLER))
+	{
+		if(nvfx->dirty & NVFX_NEW_SAMPLER)
+		{
+			nvfx->dirty &=~ NVFX_NEW_SAMPLER;
+			nvfx_fragtex_validate(nvfx);
+
+			// TODO: only set this if really necessary
+			flush_tex_cache = TRUE;
+		}
+
+		if(nvfx->dirty & NVFX_NEW_FB)
+		{
+			nvfx->dirty &=~ NVFX_NEW_FB;
+			all_swizzled = nvfx_framebuffer_prepare(nvfx);
+
+			// TODO: make sure this doesn't happen, i.e. fbs have matching formats
+			assert(all_swizzled >= 0);
+		}
+	}
+
+	dirty = nvfx->dirty;
 
 	if(nvfx->render_mode == HW)
 	{
@@ -19,11 +58,19 @@ nvfx_state_validate_common(struct nvfx_context *nvfx)
 				return FALSE;
 		}
 
-		if(dirty & (NVFX_NEW_ARRAYS))
+		if(dirty & NVFX_NEW_ARRAYS)
 		{
 			if(!nvfx_vbo_validate(nvfx))
 				return FALSE;
 		}
+
+		if(dirty & NVFX_NEW_INDEX)
+		{
+			if(nvfx->use_index_buffer)
+				nvfx_idxbuf_validate(nvfx);
+			else
+				still_dirty = NVFX_NEW_INDEX;
+		}
 	}
 	else
 	{
@@ -31,13 +78,10 @@ nvfx_state_validate_common(struct nvfx_context *nvfx)
 		if(dirty & (NVFX_NEW_VERTPROG | NVFX_NEW_UCP))
 			nvfx_vertprog_validate(nvfx);
 
-		if(dirty & (NVFX_NEW_ARRAYS | NVFX_NEW_FRAGPROG))
+		if(dirty & (NVFX_NEW_ARRAYS | NVFX_NEW_INDEX | NVFX_NEW_FRAGPROG))
 			nvfx_vtxfmt_validate(nvfx);
 	}
 
-	if(dirty & NVFX_NEW_FB)
-		nvfx_state_framebuffer_validate(nvfx);
-
 	if(dirty & NVFX_NEW_RAST)
 		sb_emit(chan, nvfx->rasterizer->sb, nvfx->rasterizer->sb_len);
 
@@ -47,11 +91,97 @@ nvfx_state_validate_common(struct nvfx_context *nvfx)
 	if(dirty & NVFX_NEW_STIPPLE)
 		nvfx_state_stipple_validate(nvfx);
 
-	if(dirty & (NVFX_NEW_FRAGPROG | NVFX_NEW_FRAGCONST))
+       if(nvfx->dirty & NVFX_NEW_UCP)
+	{
+		unsigned enables[7] =
+		{
+				0,
+				NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE0,
+				NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE0 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE1,
+				NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE0 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE1 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE2,
+				NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE0 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE1 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE2 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE3,
+				NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE0 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE1 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE2 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE3 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE4,
+				NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE0 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE1 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE2 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE3 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE4 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE5,
+		};
+
+		if(!nvfx->use_vp_clipping)
+		{
+			WAIT_RING(chan, 2);
+			OUT_RING(chan, RING_3D(NV34TCL_VP_CLIP_PLANES_ENABLE, 1));
+			OUT_RING(chan, 0);
+
+			WAIT_RING(chan, 6 * 4 + 1);
+			OUT_RING(chan, RING_3D(NV34TCL_VP_CLIP_PLANE_A(0), nvfx->clip.nr * 4));
+			OUT_RINGp(chan, &nvfx->clip.ucp[0][0], nvfx->clip.nr * 4);
+		}
+
+		WAIT_RING(chan, 2);
+		OUT_RING(chan, RING_3D(NV34TCL_VP_CLIP_PLANES_ENABLE, 1));
+		OUT_RING(chan, enables[nvfx->clip.nr]);
+	}
+
+	if(nvfx->use_vp_clipping && (nvfx->dirty & (NVFX_NEW_UCP | NVFX_NEW_VERTPROG)))
+	{
+		unsigned i;
+		struct nvfx_vertex_program* vp = nvfx->vertprog;
+		if(nvfx->clip.nr != vp->clip_nr)
+		{
+			unsigned idx;
+			WAIT_RING(chan, 14);
+
+			/* remove last instruction bit */
+			if(vp->clip_nr >= 0)
+			{
+				idx = vp->nr_insns - 7 + vp->clip_nr;
+				OUT_RING(chan, RING_3D(NV34TCL_VP_UPLOAD_FROM_ID, 1));
+				OUT_RING(chan,  vp->exec->start + idx);
+				OUT_RING(chan, RING_3D(NV34TCL_VP_UPLOAD_INST(0), 4));
+				OUT_RINGp (chan, vp->insns[idx].data, 4);
+			}
+
+			 /* set last instruction bit */
+			idx = vp->nr_insns - 7 + nvfx->clip.nr;
+			OUT_RING(chan, RING_3D(NV34TCL_VP_UPLOAD_FROM_ID, 1));
+			OUT_RING(chan,  vp->exec->start + idx);
+			OUT_RING(chan, RING_3D(NV34TCL_VP_UPLOAD_INST(0), 4));
+			OUT_RINGp(chan, vp->insns[idx].data, 3);
+			OUT_RING(chan, vp->insns[idx].data[3] | 1);
+			vp->clip_nr = nvfx->clip.nr;
+		}
+
+		// TODO: only do this for the ones changed
+		WAIT_RING(chan, 6 * 6);
+		for(i = 0; i < nvfx->clip.nr; ++i)
+		{
+			OUT_RING(chan, RING_3D(NV34TCL_VP_UPLOAD_CONST_ID, 5));
+			OUT_RING(chan, vp->data->start + i);
+			OUT_RINGp (chan, nvfx->clip.ucp[i], 4);
+		}
+	}
+
+	if(dirty & (NVFX_NEW_FRAGPROG | NVFX_NEW_FRAGCONST | NVFX_NEW_VERTPROG | NVFX_NEW_SPRITE))
+	{
 		nvfx_fragprog_validate(nvfx);
+		if(dirty & NVFX_NEW_FRAGPROG)
+			flush_tex_cache = TRUE; // TODO: do we need this?
+	}
 
-	if(dirty & NVFX_NEW_SAMPLER)
-		nvfx_fragtex_validate(nvfx);
+	if(nvfx->is_nv4x)
+	{
+		unsigned vp_output = nvfx->vertprog->or | nvfx->hw_fragprog->or;
+		vp_output |= (1 << (nvfx->clip.nr + 6)) - (1 << 6);
+
+		if(vp_output != nvfx->hw_vp_output)
+		{
+			WAIT_RING(chan, 2);
+			OUT_RING(chan, RING_3D(NV40TCL_VP_RESULT_EN, 1));
+			OUT_RING(chan, vp_output);
+			nvfx->hw_vp_output = vp_output;
+		}
+	}
+
+	if(all_swizzled >= 0)
+		nvfx_framebuffer_validate(nvfx, all_swizzled);
 
 	if(dirty & NVFX_NEW_BLEND)
 		sb_emit(chan, nvfx->blend->sb, nvfx->blend->sb_len);
@@ -65,31 +195,62 @@ nvfx_state_validate_common(struct nvfx_context *nvfx)
 	if(dirty & NVFX_NEW_SR)
 		nvfx_state_sr_validate(nvfx);
 
-/* Having this depend on FB looks wrong, but it seems
-   necessary to make this work on nv3x
+/* All these dependencies are wrong, but otherwise
+   etracer, neverball, foobillard, glest totally misrender
    TODO: find the right fix
 */
-	if(dirty & (NVFX_NEW_VIEWPORT | NVFX_NEW_FB))
+	if(dirty & (NVFX_NEW_VIEWPORT | NVFX_NEW_RAST | NVFX_NEW_ZSA) || (all_swizzled >= 0))
+	{
 		nvfx_state_viewport_validate(nvfx);
+	}
+
+	if(dirty & NVFX_NEW_ZSA || (all_swizzled >= 0))
+	{
+		WAIT_RING(chan, 3);
+		OUT_RING(chan, RING_3D(NV34TCL_DEPTH_WRITE_ENABLE, 2));
+		OUT_RING(chan, nvfx->framebuffer.zsbuf && nvfx->zsa->pipe.depth.writemask);
+	        OUT_RING(chan, nvfx->framebuffer.zsbuf && nvfx->zsa->pipe.depth.enabled);
+	}
 
-	/* TODO: could nv30 need this or something similar too? */
-	if((dirty & (NVFX_NEW_FRAGPROG | NVFX_NEW_SAMPLER)) && nvfx->is_nv4x) {
-		WAIT_RING(chan, 4);
-		OUT_RING(chan, RING_3D(NV40TCL_TEX_CACHE_CTL, 1));
-		OUT_RING(chan, 2);
-		OUT_RING(chan, RING_3D(NV40TCL_TEX_CACHE_CTL, 1));
-		OUT_RING(chan, 1);
+	if(flush_tex_cache)
+	{
+		// TODO: what about nv30?
+		if(nvfx->is_nv4x)
+		{
+			WAIT_RING(chan, 4);
+			OUT_RING(chan, RING_3D(NV40TCL_TEX_CACHE_CTL, 1));
+			OUT_RING(chan, 2);
+			OUT_RING(chan, RING_3D(NV40TCL_TEX_CACHE_CTL, 1));
+			OUT_RING(chan, 1);
+		}
 	}
-	nvfx->dirty = 0;
+
+	nvfx->dirty = dirty & still_dirty;
+
+	render_temps = nvfx->state.render_temps;
+	if(render_temps)
+	{
+		for(int i = 0; i < nvfx->framebuffer.nr_cbufs; ++i)
+		{
+			if(render_temps & (1 << i))
+				util_dirty_surface_set_dirty(nvfx_surface_get_dirty_surfaces(nvfx->framebuffer.cbufs[i]),
+						(struct util_dirty_surface*)nvfx->framebuffer.cbufs[i]);
+		}
+
+		if(render_temps & 0x80)
+			util_dirty_surface_set_dirty(nvfx_surface_get_dirty_surfaces(nvfx->framebuffer.zsbuf),
+					(struct util_dirty_surface*)nvfx->framebuffer.zsbuf);
+	}
+
 	return TRUE;
 }
 
-void
-nvfx_state_emit(struct nvfx_context *nvfx)
+inline void
+nvfx_state_relocate(struct nvfx_context *nvfx, unsigned relocs)
 {
 	struct nouveau_channel* chan = nvfx->screen->base.channel;
 	/* we need to ensure there is enough space to output relocations in one go */
-	unsigned max_relocs = 0
+	const unsigned max_relocs = 0
 	      + 16 /* vertex buffers, incl. dma flag */
 	      + 2 /* index buffer plus format+dma flag */
 	      + 2 * 5 /* 4 cbufs + zsbuf, plus dma objects */
@@ -97,18 +258,19 @@ nvfx_state_emit(struct nvfx_context *nvfx)
 	      + 2 * 4 /* vertex textures plus format+dma flag */
 	      + 1 /* fragprog incl dma flag */
 	      ;
+
 	MARK_RING(chan, max_relocs * 2, max_relocs * 2);
-	nvfx_state_relocate(nvfx);
-}
 
-void
-nvfx_state_relocate(struct nvfx_context *nvfx)
-{
-	nvfx_framebuffer_relocate(nvfx);
-	nvfx_fragtex_relocate(nvfx);
-	nvfx_fragprog_relocate(nvfx);
-	if (nvfx->render_mode == HW)
+	if(relocs & NVFX_RELOCATE_FRAMEBUFFER)
+		nvfx_framebuffer_relocate(nvfx);
+	if(relocs & NVFX_RELOCATE_FRAGTEX)
+		nvfx_fragtex_relocate(nvfx);
+	if(relocs & NVFX_RELOCATE_FRAGPROG)
+		nvfx_fragprog_relocate(nvfx);
+	if(relocs & NVFX_RELOCATE_VTXBUF)
 		nvfx_vbo_relocate(nvfx);
+	if(relocs & NVFX_RELOCATE_IDXBUF)
+		nvfx_idxbuf_relocate(nvfx);
 }
 
 boolean
@@ -173,6 +335,9 @@ nvfx_state_validate_swtnl(struct nvfx_context *nvfx)
 		draw_set_vertex_elements(draw, nvfx->vtxelt->num_elements, nvfx->vtxelt->pipe);
 	}
 
+	if (nvfx->draw_dirty & NVFX_NEW_INDEX)
+		draw_set_index_buffer(draw, &nvfx->idxbuf);
+
 	nvfx_state_validate_common(nvfx);
 
 	nvfx->draw_dirty = 0;
diff --git a/src/gallium/drivers/nvfx/nvfx_state_fb.c b/src/gallium/drivers/nvfx/nvfx_state_fb.c
index 360e569f77..3b869d43a1 100644
--- a/src/gallium/drivers/nvfx/nvfx_state_fb.c
+++ b/src/gallium/drivers/nvfx/nvfx_state_fb.c
@@ -1,21 +1,55 @@
 #include "nvfx_context.h"
 #include "nvfx_resource.h"
-#include "nouveau/nouveau_util.h"
+#include "util/u_format.h"
 
+static inline boolean
+nvfx_surface_linear_renderable(struct pipe_surface* surf)
+{
+	return (surf->texture->flags & NVFX_RESOURCE_FLAG_LINEAR)
+		&& !(surf->offset & 63)
+		&& !(((struct nvfx_surface*)surf)->pitch & 63);
+}
 
+static inline boolean
+nvfx_surface_swizzled_renderable(struct pipe_framebuffer_state* fb, struct pipe_surface* surf)
+{
+	/* TODO: return FALSE if we have a format not supporting swizzled rendering (e.g. r8); currently those are not supported at all */
+	return !((struct nvfx_miptree*)surf->texture)->linear_pitch
+		&& (surf->texture->target != PIPE_TEXTURE_3D || u_minify(surf->texture->depth0, surf->level) <= 1)
+		&& !(surf->offset & 127)
+		&& (surf->width == fb->width)
+		&& (surf->height == fb->height)
+		&& !((struct nvfx_surface*)surf)->temp;
+}
 
-void
-nvfx_state_framebuffer_validate(struct nvfx_context *nvfx)
+static boolean
+nvfx_surface_get_render_target(struct pipe_surface* surf, int all_swizzled, struct nvfx_render_target* target)
+{
+	struct nvfx_surface* ns = (struct nvfx_surface*)surf;
+	if(!ns->temp)
+	{
+		target->bo = ((struct nvfx_miptree*)surf->texture)->base.bo;
+		target->offset = surf->offset;
+		target->pitch = align(ns->pitch, 64);
+		assert(target->pitch);
+		return FALSE;
+	}
+	else
+	{
+		target->offset = 0;
+		target->pitch = ns->temp->linear_pitch;
+		target->bo = ns->temp->base.bo;
+		assert(target->pitch);
+		return TRUE;
+	}
+}
+
+int
+nvfx_framebuffer_prepare(struct nvfx_context *nvfx)
 {
 	struct pipe_framebuffer_state *fb = &nvfx->framebuffer;
-	struct nouveau_channel *chan = nvfx->screen->base.channel;
-	uint32_t rt_enable = 0, rt_format = 0;
-	int i, colour_format = 0, zeta_format = 0;
-	int depth_only = 0;
-	unsigned rt_flags = NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM;
-	unsigned w = fb->width;
-	unsigned h = fb->height;
-	int colour_bits = 32, zeta_bits = 32;
+	int i, color_format = 0, zeta_format = 0;
+	int all_swizzled = 1;
 
 	if(!nvfx->is_nv4x)
 		assert(fb->nr_cbufs <= 2);
@@ -23,113 +57,135 @@ nvfx_state_framebuffer_validate(struct nvfx_context *nvfx)
 		assert(fb->nr_cbufs <= 4);
 
 	for (i = 0; i < fb->nr_cbufs; i++) {
-		if (colour_format)
-			assert(colour_format == fb->cbufs[i]->format);
-		else
-			colour_format = fb->cbufs[i]->format;
-
-		rt_enable |= (NV34TCL_RT_ENABLE_COLOR0 << i);
-		nvfx->hw_rt[i].bo = nvfx_surface_buffer(fb->cbufs[i]);
-		nvfx->hw_rt[i].offset = fb->cbufs[i]->offset;
-		nvfx->hw_rt[i].pitch = ((struct nv04_surface *)fb->cbufs[i])->pitch;
+		if (color_format) {
+			if(color_format != fb->cbufs[i]->format)
+				return -1;
+		} else
+			color_format = fb->cbufs[i]->format;
+
+		if(!nvfx_surface_swizzled_renderable(fb, fb->cbufs[i]))
+			all_swizzled = 0;
 	}
-	for(; i < 4; ++i)
-		nvfx->hw_rt[i].bo = 0;
 
+	if (fb->zsbuf) {
+		/* TODO: return FALSE if we have a format not supporting a depth buffer (e.g. r8); currently those are not supported at all */
+		if(!nvfx_surface_swizzled_renderable(fb, fb->zsbuf))
+			all_swizzled = 0;
+		zeta_format = fb->zsbuf->format; /* drop swizzled mode if color and zeta block sizes differ; no color buffer means nothing to compare */
+		if(all_swizzled && fb->nr_cbufs && util_format_get_blocksize(color_format) != util_format_get_blocksize(zeta_format))
+			all_swizzled = 0;
+	}
+
+	for (i = 0; i < fb->nr_cbufs; i++) {
+		if(!((struct nvfx_surface*)fb->cbufs[i])->temp && !all_swizzled && !nvfx_surface_linear_renderable(fb->cbufs[i]))
+			nvfx_surface_create_temp(&nvfx->pipe, fb->cbufs[i]);
+	}
+
+	if(fb->zsbuf) {
+		if(!((struct nvfx_surface*)fb->zsbuf)->temp && !all_swizzled && !nvfx_surface_linear_renderable(fb->zsbuf))
+			nvfx_surface_create_temp(&nvfx->pipe, fb->zsbuf);
+	}
+
+	return all_swizzled;
+}
+
+void
+nvfx_framebuffer_validate(struct nvfx_context *nvfx, unsigned prepare_result)
+{
+	struct pipe_framebuffer_state *fb = &nvfx->framebuffer;
+	struct nouveau_channel *chan = nvfx->screen->base.channel;
+	uint32_t rt_enable, rt_format;
+	int i;
+	unsigned rt_flags = NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM;
+	unsigned w = fb->width;
+	unsigned h = fb->height;
+
+	rt_enable = (NV34TCL_RT_ENABLE_COLOR0 << fb->nr_cbufs) - 1;
 	if (rt_enable & (NV34TCL_RT_ENABLE_COLOR1 |
 			 NV40TCL_RT_ENABLE_COLOR2 | NV40TCL_RT_ENABLE_COLOR3))
 		rt_enable |= NV34TCL_RT_ENABLE_MRT;
 
+	nvfx->state.render_temps = 0;
+
+	for (i = 0; i < fb->nr_cbufs; i++)
+		nvfx->state.render_temps |= nvfx_surface_get_render_target(fb->cbufs[i], prepare_result, &nvfx->hw_rt[i]) << i;
+
+	for(; i < 4; ++i)
+		nvfx->hw_rt[i].bo = 0;
+
 	if (fb->zsbuf) {
-		zeta_format = fb->zsbuf->format;
-		nvfx->hw_zeta.bo = nvfx_surface_buffer(fb->zsbuf);
-		nvfx->hw_zeta.offset = fb->zsbuf->offset;
-		nvfx->hw_zeta.pitch = ((struct nv04_surface *)fb->zsbuf)->pitch;
-	}
-	else
-		nvfx->hw_zeta.bo = 0;
-
-	if (rt_enable & (NV34TCL_RT_ENABLE_COLOR0 | NV34TCL_RT_ENABLE_COLOR1 |
-		NV40TCL_RT_ENABLE_COLOR2 | NV40TCL_RT_ENABLE_COLOR3)) {
-		/* Render to at least a colour buffer */
-		if (!(fb->cbufs[0]->texture->flags & NVFX_RESOURCE_FLAG_LINEAR)) {
-			assert(!(fb->width & (fb->width - 1)) && !(fb->height & (fb->height - 1)));
-			for (i = 1; i < fb->nr_cbufs; i++)
-				assert(!(fb->cbufs[i]->texture->flags & NVFX_RESOURCE_FLAG_LINEAR));
-
-			rt_format = NV34TCL_RT_FORMAT_TYPE_SWIZZLED |
-				(log2i(fb->cbufs[0]->width) << NV34TCL_RT_FORMAT_LOG2_WIDTH_SHIFT) |
-				(log2i(fb->cbufs[0]->height) << NV34TCL_RT_FORMAT_LOG2_HEIGHT_SHIFT);
-		}
-		else
-			rt_format = NV34TCL_RT_FORMAT_TYPE_LINEAR;
-	} else if (fb->zsbuf) {
-		depth_only = 1;
-
-		/* Render to depth buffer only */
-		if (!(fb->zsbuf->texture->usage & NVFX_RESOURCE_FLAG_LINEAR)) {
-			assert(!(fb->width & (fb->width - 1)) && !(fb->height & (fb->height - 1)));
-
-			rt_format = NV34TCL_RT_FORMAT_TYPE_SWIZZLED |
-				(log2i(fb->zsbuf->width) << NV34TCL_RT_FORMAT_LOG2_WIDTH_SHIFT) |
-				(log2i(fb->zsbuf->height) << NV34TCL_RT_FORMAT_LOG2_HEIGHT_SHIFT);
-		}
-		else
-			rt_format = NV34TCL_RT_FORMAT_TYPE_LINEAR;
-	} else {
-		return;
+		nvfx->state.render_temps |= nvfx_surface_get_render_target(fb->zsbuf, prepare_result, &nvfx->hw_zeta) << 7;
+
+		assert(util_format_get_stride(fb->zsbuf->format, fb->width) <= nvfx->hw_zeta.pitch);
+		assert(nvfx->hw_zeta.offset + nvfx->hw_zeta.pitch * fb->height <= nvfx->hw_zeta.bo->size);
 	}
 
-	switch (colour_format) {
-	case PIPE_FORMAT_B8G8R8X8_UNORM:
-		rt_format |= NV34TCL_RT_FORMAT_COLOR_X8R8G8B8;
-		break;
-	case PIPE_FORMAT_B8G8R8A8_UNORM:
-	case 0:
-		rt_format |= NV34TCL_RT_FORMAT_COLOR_A8R8G8B8;
-		break;
-	case PIPE_FORMAT_B5G6R5_UNORM:
+	if (prepare_result) {
+		assert(!(fb->width & (fb->width - 1)) && !(fb->height & (fb->height - 1)));
+
+		rt_format = NV34TCL_RT_FORMAT_TYPE_SWIZZLED |
+			(util_logbase2(fb->width) << NV34TCL_RT_FORMAT_LOG2_WIDTH_SHIFT) |
+			(util_logbase2(fb->height) << NV34TCL_RT_FORMAT_LOG2_HEIGHT_SHIFT);
+	} else
+		rt_format = NV34TCL_RT_FORMAT_TYPE_LINEAR;
+
+	if(fb->nr_cbufs > 0) {
+		switch (fb->cbufs[0]->format) {
+		case PIPE_FORMAT_B8G8R8X8_UNORM:
+			rt_format |= NV34TCL_RT_FORMAT_COLOR_X8R8G8B8;
+			break;
+		case PIPE_FORMAT_B8G8R8A8_UNORM:
+		case 0:
+			rt_format |= NV34TCL_RT_FORMAT_COLOR_A8R8G8B8;
+			break;
+		case PIPE_FORMAT_B5G6R5_UNORM:
+			rt_format |= NV34TCL_RT_FORMAT_COLOR_R5G6B5;
+			break;
+		default:
+			assert(0);
+		}
+	} else if(fb->zsbuf && util_format_get_blocksize(fb->zsbuf->format) == 2)
 		rt_format |= NV34TCL_RT_FORMAT_COLOR_R5G6B5;
-		colour_bits = 16;
-		break;
-	default:
-		assert(0);
-	}
+	else
+		rt_format |= NV34TCL_RT_FORMAT_COLOR_A8R8G8B8;
 
-	switch (zeta_format) {
-	case PIPE_FORMAT_Z16_UNORM:
+	if(fb->zsbuf) {
+		switch (fb->zsbuf->format) {
+		case PIPE_FORMAT_Z16_UNORM:
+			rt_format |= NV34TCL_RT_FORMAT_ZETA_Z16;
+			break;
+		case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
+		case PIPE_FORMAT_X8Z24_UNORM:
+		case 0:
+			rt_format |= NV34TCL_RT_FORMAT_ZETA_Z24S8;
+			break;
+		default:
+			assert(0);
+		}
+	} else if(fb->nr_cbufs && util_format_get_blocksize(fb->cbufs[0]->format) == 2)
 		rt_format |= NV34TCL_RT_FORMAT_ZETA_Z16;
-		zeta_bits = 16;
-		break;
-	case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
-	case PIPE_FORMAT_X8Z24_UNORM:
-	case 0:
+	else
 		rt_format |= NV34TCL_RT_FORMAT_ZETA_Z24S8;
-		break;
-	default:
-		assert(0);
-	}
 
-	if ((!nvfx->is_nv4x) && colour_bits > zeta_bits) {
-		/* TODO: does this limitation really exist?
-		   TODO: can it be worked around somehow? */
-		assert(0);
-	}
+	if ((rt_enable & NV34TCL_RT_ENABLE_COLOR0) || fb->zsbuf) {
+		struct nvfx_render_target *rt0 = &nvfx->hw_rt[0];
+		uint32_t pitch;
 
-	if ((rt_enable & NV34TCL_RT_ENABLE_COLOR0)
-		|| ((!nvfx->is_nv4x) && depth_only)) {
-		struct nvfx_render_target *rt0 = (depth_only ? &nvfx->hw_zeta : &nvfx->hw_rt[0]);
-		uint32_t pitch = rt0->pitch;
+		if(!(rt_enable & NV34TCL_RT_ENABLE_COLOR0))
+			rt0 = &nvfx->hw_zeta;
+
+		pitch = rt0->pitch;
 
 		if(!nvfx->is_nv4x)
 		{
-			if (nvfx->hw_zeta.bo) {
+			if (nvfx->hw_zeta.bo)
 				pitch |= (nvfx->hw_zeta.pitch << 16);
-			} else {
+			else
 				pitch |= (pitch << 16);
-			}
 		}
 
+		//printf("rendering to bo %p [%i] at offset %i with pitch %i\n", rt0->bo, rt0->bo->handle, rt0->offset, pitch);
+
 		OUT_RING(chan, RING_3D(NV34TCL_DMA_COLOR0, 1));
 		OUT_RELOC(chan, rt0->bo, 0,
 			      rt_flags | NOUVEAU_BO_OR,
@@ -182,7 +238,7 @@ nvfx_state_framebuffer_validate(struct nvfx_context *nvfx)
 		}
 	}
 
-	if (zeta_format) {
+	if (fb->zsbuf) {
 		OUT_RING(chan, RING_3D(NV34TCL_DMA_ZETA, 1));
 		OUT_RELOC(chan, nvfx->hw_zeta.bo, 0,
 			      rt_flags | NOUVEAU_BO_OR,
@@ -196,6 +252,10 @@ nvfx_state_framebuffer_validate(struct nvfx_context *nvfx)
 			OUT_RING(chan, nvfx->hw_zeta.pitch);
 		}
 	}
+	else if(nvfx->is_nv4x) {
+		OUT_RING(chan, RING_3D(NV40TCL_ZETA_PITCH, 1));
+		OUT_RING(chan, 64);
+	}
 
 	OUT_RING(chan, RING_3D(NV34TCL_RT_ENABLE, 1));
 	OUT_RING(chan, rt_enable);
@@ -218,6 +278,7 @@ nvfx_state_framebuffer_validate(struct nvfx_context *nvfx)
 		OUT_RING(chan, RING_3D(NV34TCL_VIEWPORT_TX_ORIGIN, 1));
 		OUT_RING(chan, 0);
 	}
+	nvfx->relocs_needed &=~ NVFX_RELOCATE_FRAMEBUFFER;
 }
 
 void
@@ -247,4 +308,5 @@ nvfx_framebuffer_relocate(struct nvfx_context *nvfx)
 	DO(NV40, 3);
 
 	DO_(nvfx->hw_zeta, NV34, ZETA);
+	nvfx->relocs_needed &=~ NVFX_RELOCATE_FRAMEBUFFER;
 }
diff --git a/src/gallium/drivers/nvfx/nvfx_state_stipple.c b/src/gallium/drivers/nvfx/nvfx_state_stipple.c
index 4da968f093..b76e9dd382 100644
--- a/src/gallium/drivers/nvfx/nvfx_state_stipple.c
+++ b/src/gallium/drivers/nvfx/nvfx_state_stipple.c
@@ -4,23 +4,8 @@ void
 nvfx_state_stipple_validate(struct nvfx_context *nvfx)
 {
 	struct nouveau_channel *chan = nvfx->screen->base.channel;
-	struct pipe_rasterizer_state *rast = &nvfx->rasterizer->pipe;
 
-	if ((rast->poly_stipple_enable == 0 && nvfx->state.stipple_enabled == 0))
-		return;
-
-	if (rast->poly_stipple_enable) {
-		unsigned i;
-
-		WAIT_RING(chan, 35);
-		OUT_RING(chan, RING_3D(NV34TCL_POLYGON_STIPPLE_ENABLE, 1));
-		OUT_RING(chan, 1);
-		OUT_RING(chan, RING_3D(NV34TCL_POLYGON_STIPPLE_PATTERN(0), 32));
-		for (i = 0; i < 32; i++)
-			OUT_RING(chan, nvfx->stipple[i]);
-	} else {
-		WAIT_RING(chan, 2);
-		OUT_RING(chan, RING_3D(NV34TCL_POLYGON_STIPPLE_ENABLE, 1));
-		OUT_RING(chan, 0);
-	}
+	WAIT_RING(chan, 33);
+	OUT_RING(chan, RING_3D(NV34TCL_POLYGON_STIPPLE_PATTERN(0), 32));
+	OUT_RINGp(chan, nvfx->stipple, 32);
 }
diff --git a/src/gallium/drivers/nvfx/nvfx_surface.c b/src/gallium/drivers/nvfx/nvfx_surface.c
index a605d2b754..a5931b6e15 100644
--- a/src/gallium/drivers/nvfx/nvfx_surface.c
+++ b/src/gallium/drivers/nvfx/nvfx_surface.c
@@ -26,33 +26,421 @@
  *
  **************************************************************************/
 
+#include "pipe/p_context.h"
+#include "pipe/p_format.h"
+#include "util/u_format.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_pack_color.h"
+#include "util/u_blitter.h"
+
+#include "nouveau/nouveau_winsys.h"
+#include "nouveau/nouveau_screen.h"
 #include "nvfx_context.h"
+#include "nvfx_screen.h"
 #include "nvfx_resource.h"
-#include "pipe/p_defines.h"
-#include "util/u_inlines.h"
-#include "util/u_pack_color.h"
+#include "nv04_2d.h"
+
+#include <nouveau/nouveau_bo.h>
+
+static INLINE void
+nvfx_region_set_format(struct nv04_region* rgn, enum pipe_format format) /* derive rgn->bpps from the format's block size; rgn->x/y must already be set */
+{
+	unsigned bits = util_format_get_blocksizebits(format);
+	switch(bits)
+	{
+	case 8:
+		rgn->bpps = 0; /* bpps is log2 of the bytes per region element */
+		break;
+	case 16:
+		rgn->bpps = 1;
+		break;
+	case 32:
+		rgn->bpps = 2;
+		break;
+	default: /* any other block size is expressed as several 32-bit elements per block */
+		{
+			int shift;
+			assert(util_is_power_of_two(bits)); /* NOTE(review): a non-power-of-two size (e.g. 24 bits) is only caught by this assert */
+			shift = util_logbase2(bits) - 3; /* log2 of the block size in bytes */
+			assert(shift >= 2); /* 8/16/32 were handled above, so only 64 bits and up are expected here */
+			rgn->bpps = 2;
+			shift -= 2; /* now log2 of the number of 32-bit elements per block */
+
+			rgn->x = util_format_get_nblocksx(format, rgn->x) << shift; /* pixel x -> 32-bit element x */
+			rgn->y = util_format_get_nblocksy(format, rgn->y); /* pixel y -> block row */
+		}
+	}
+}
+
+static INLINE void
+nvfx_region_fixup_swizzled(struct nv04_region* rgn, unsigned zslice, unsigned width, unsigned height, unsigned depth) /* adjust a region that addresses a non-linear surface */
+{
+	// TODO: move this code to surface creation?
+	if((depth <= 1) && (height <= 1 || width <= 2)) /* tiny 2D level: described with a tight linear pitch */
+		rgn->pitch = width << rgn->bpps;
+	else if(depth > 1 && height <= 2 && width <= 2) /* tiny 3D level: linear pitch, slice selected through the offset */
+	{
+		rgn->pitch = width << rgn->bpps;
+		rgn->offset += (zslice * width * height) << rgn->bpps;
+	}
+	else
+	{
+		rgn->pitch = 0; /* presumably pitch == 0 marks the region as swizzled for the nv04_2d code - TODO confirm */
+		rgn->z = zslice;
+		rgn->w = width;
+		rgn->h = height;
+		rgn->d = depth;
+	}
+}
+
+static INLINE void
+nvfx_region_init_for_surface(struct nv04_region* rgn, struct nvfx_surface* surf, unsigned x, unsigned y, bool for_write) /* fill rgn so it addresses surf at (x, y) */
+{
+	rgn->x = x;
+	rgn->y = y;
+	rgn->z = 0;
+	nvfx_region_set_format(rgn, surf->base.base.format); /* sets bpps and may rescale x/y, so x/y are assigned first */
+
+	if(surf->temp) /* the surface has a temporary: address the temporary's bo instead of the texture */
+	{
+		rgn->bo = surf->temp->base.bo;
+		rgn->offset = 0;
+		rgn->pitch = surf->temp->linear_pitch;
+
+		if(for_write) /* the write lands in the temporary, so record the surface as dirty */
+			util_dirty_surface_set_dirty(nvfx_surface_get_dirty_surfaces(&surf->base.base), &surf->base);
+	} else {
+		rgn->bo = ((struct nvfx_resource*)surf->base.base.texture)->bo;
+		rgn->offset = surf->base.base.offset;
+		rgn->pitch = surf->pitch;
+
+	        if(!(surf->base.base.texture->flags & NVFX_RESOURCE_FLAG_LINEAR)) /* non-linear texture: apply the swizzled fixup for this level's depth */
+		        nvfx_region_fixup_swizzled(rgn, surf->base.base.zslice, surf->base.base.width, surf->base.base.height, u_minify(surf->base.base.texture->depth0, surf->base.base.level));
+	}
+}
+
+static INLINE void
+nvfx_region_init_for_subresource(struct nv04_region* rgn, struct pipe_resource* pt, struct pipe_subresource sub, unsigned x, unsigned y, unsigned z, bool for_write) /* fill rgn for (face, level, z) of pt at (x, y) */
+{
+	if(pt->target != PIPE_BUFFER)
+	{
+		struct nvfx_surface* ns = (struct nvfx_surface*)util_surfaces_peek(&((struct nvfx_miptree*)pt)->surfaces, pt, sub.face, sub.level, z);
+		if(ns && util_dirty_surface_is_dirty(&ns->base)) /* a dirty surface exists for this subresource: use the surface path (which handles its temporary) */
+		{
+			nvfx_region_init_for_surface(rgn, ns, x, y, for_write);
+			return;
+		}
+	}
+
+	rgn->bo = ((struct nvfx_resource*)pt)->bo; /* otherwise address the resource's own storage */
+	rgn->offset = nvfx_subresource_offset(pt, sub.face, sub.level, z);
+	rgn->pitch = nvfx_subresource_pitch(pt, sub.level);
+	rgn->x = x;
+	rgn->y = y;
+	rgn->z = 0;
+
+	nvfx_region_set_format(rgn, pt->format); /* must follow the x/y assignment: it may rescale them */
+	if(!(pt->flags & NVFX_RESOURCE_FLAG_LINEAR))
+		nvfx_region_fixup_swizzled(rgn, z, u_minify(pt->width0, sub.level), u_minify(pt->height0, sub.level), u_minify(pt->depth0, sub.level));
+}
+
+// TODO: actually test this for all formats, it's probably wrong for some...
+
+static INLINE int
+nvfx_surface_format(enum pipe_format format) /* NV04_CONTEXT_SURFACES_2D format chosen by block size alone, or -1 */
+{
+	switch(util_format_get_blocksize(format)) { /* block size in bytes; the channel layout is not inspected */
+	case 1:
+		return NV04_CONTEXT_SURFACES_2D_FORMAT_Y8;
+	case 2:
+		//return NV04_CONTEXT_SURFACES_2D_FORMAT_Y16;
+		return NV04_CONTEXT_SURFACES_2D_FORMAT_R5G6B5;
+	case 4:
+		//if(format == PIPE_FORMAT_B8G8R8X8_UNORM || format == PIPE_FORMAT_B8G8R8A8_UNORM)
+			return NV04_CONTEXT_SURFACES_2D_FORMAT_A8R8G8B8;
+		//else
+		//	return NV04_CONTEXT_SURFACES_2D_FORMAT_Y32;
+	default:
+		return -1; /* no 2D surface format for any other block size */
+	}
+}
+
+static INLINE int
+nv04_scaled_image_format(enum pipe_format format) /* NV03_SCALED_IMAGE_FROM_MEMORY color format for the block size, or -1 */
+{
+	switch(util_format_get_blocksize(format)) { /* block size in bytes */
+	case 1:
+		return NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_Y8;
+	case 2:
+		//if(format == PIPE_FORMAT_B5G5R5A1_UNORM)
+		//	return NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_A1R5G5B5;
+		//else
+			return NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_R5G6B5;
+	case 4:
+		if(format == PIPE_FORMAT_B8G8R8X8_UNORM) /* the only 4-byte format mapped to the X8 variant */
+			return NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_X8R8G8B8;
+		else
+			return NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_A8R8G8B8;
+	default:
+		return -1; /* no scaled-image format for any other block size */
+	}
+}
+
+// XXX: must save index buffer too!
+static struct blitter_context*
+nvfx_get_blitter(struct pipe_context* pipe, int copy) /* return the context's blitter with the current state saved into it */
+{
+	struct nvfx_context* nvfx = nvfx_context(pipe);
+
+	struct blitter_context* blitter = nvfx->blitter;
+	if(!blitter) /* created lazily on first use and cached in the context */
+		nvfx->blitter = blitter = util_blitter_create(pipe); /* NOTE(review): result is not checked before the save calls below */
+
+	util_blitter_save_blend(blitter, nvfx->blend);
+	util_blitter_save_depth_stencil_alpha(blitter, nvfx->zsa);
+	util_blitter_save_stencil_ref(blitter, &nvfx->stencil_ref);
+	util_blitter_save_rasterizer(blitter, nvfx->rasterizer);
+	util_blitter_save_fragment_shader(blitter, nvfx->fragprog);
+	util_blitter_save_vertex_shader(blitter, nvfx->vertprog);
+	util_blitter_save_viewport(blitter, &nvfx->viewport);
+	util_blitter_save_framebuffer(blitter, &nvfx->framebuffer);
+	util_blitter_save_clip(blitter, &nvfx->clip);
+	util_blitter_save_vertex_elements(blitter, nvfx->vtxelt);
+	util_blitter_save_vertex_buffers(blitter, nvfx->vtxbuf_nr, nvfx->vtxbuf);
+
+	if(copy) /* sampler state is saved only when the caller asks for a copy (non-zero) */
+	{
+		util_blitter_save_fragment_sampler_states(blitter, nvfx->nr_samplers, (void**)nvfx->tex_sampler);
+		util_blitter_save_fragment_sampler_views(blitter, nvfx->nr_textures, nvfx->fragment_sampler_views);
+	}
+
+	return blitter;
+}
+
+static unsigned
+nvfx_region_clone(struct nv04_2d_context* ctx, struct nv04_region* rgn, unsigned w, unsigned h, boolean for_read) /* re-point rgn at a new mappable GART bo covering its w x h extent; returns the byte offset where that extent began */
+{
+	unsigned begin = nv04_region_begin(rgn, w, h);
+	unsigned end = nv04_region_end(rgn, w, h);
+	unsigned size = end - begin; /* bytes spanned by the region */
+	struct nouveau_bo* bo = 0;
+	nouveau_bo_new(rgn->bo->device, NOUVEAU_BO_MAP | NOUVEAU_BO_GART, 256, size, &bo); /* NOTE(review): return value is not checked; bo stays 0 on failure */
+
+	if(for_read || (size > ((w * h) << rgn->bpps))) /* copy old contents for reads, or when the span holds more bytes than the w*h elements */
+		nv04_memcpy(ctx, bo, 0, rgn->bo, rgn->offset + begin, size);
+
+	rgn->bo = bo;
+	rgn->offset = -begin; /* unsigned wrap is deliberate: offset + begin is 0 in the new bo */
+	return begin;
+}
 
 static void
-nvfx_surface_copy(struct pipe_context *pipe,
-		  struct pipe_resource *dest, struct pipe_subresource subdst,
-		  unsigned destx, unsigned desty, unsigned destz,
-		  struct pipe_resource *src, struct pipe_subresource subsrc,
+nvfx_resource_copy_region(struct pipe_context *pipe, /* copy a w x h pixel rectangle from srcr to dstr */
+		  struct pipe_resource *dstr, struct pipe_subresource subdst,
+		  unsigned dstx, unsigned dsty, unsigned dstz,
+		  struct pipe_resource *srcr, struct pipe_subresource subsrc,
 		  unsigned srcx, unsigned srcy, unsigned srcz,
-		  unsigned width, unsigned height)
+		  unsigned w, unsigned h)
 {
-	struct nvfx_context *nvfx = nvfx_context(pipe);
-	struct nv04_surface_2d *eng2d = nvfx->screen->eng2d;
-	struct pipe_surface *ps_dst, *ps_src;
+	static int copy_threshold = -1; /* read once from NOUVEAU_COPY_THRESHOLD (default 4) */
+	struct nv04_2d_context *ctx = nvfx_screen(pipe->screen)->eng2d;
+	struct nv04_region dst, src;
+	int dst_to_gpu, src_on_gpu;
+	const unsigned pix_w = w, pix_h = h; /* extent in pixels: w/h are rescaled to region units below, the blitter needs pixels */
+	boolean small;
+	int ret;
+
+	if(!w || !h) /* empty rectangle: nothing to copy */
+		return;
+
+	if(copy_threshold < 0)
+		copy_threshold = debug_get_num_option("NOUVEAU_COPY_THRESHOLD", 4);
+
+	dst_to_gpu = dstr->usage != PIPE_USAGE_DYNAMIC && dstr->usage != PIPE_USAGE_STAGING;
+	src_on_gpu = nvfx_resource_on_gpu(srcr);
+
+	nvfx_region_init_for_subresource(&dst, dstr, subdst, dstx, dsty, dstz, TRUE);
+	nvfx_region_init_for_subresource(&src, srcr, subsrc, srcx, srcy, srcz, FALSE);
+	w = util_format_get_stride(dstr->format, w) >> dst.bpps; /* width in region elements */
+	h = util_format_get_nblocksy(dstr->format, h); /* height in block rows */
 
-	ps_src = nvfx_miptree_surface_new(pipe->screen, src, subsrc.face,
-					  subsrc.level, srcz, 0 /* bind flags */);
-	ps_dst = nvfx_miptree_surface_new(pipe->screen, dest, subdst.face,
-					  subdst.level, destz, 0 /* bindflags */);
+	small = (w * h <= copy_threshold);
+	if((!dst_to_gpu || !src_on_gpu) && small)
+		ret = -1; /* use the CPU */
+	else
+		ret = nv04_region_copy_2d(ctx, &dst, &src, w, h,
+			dstr->target == PIPE_BUFFER ? -1 : nvfx_surface_format(dstr->format),
+			dstr->target == PIPE_BUFFER ? -1 : nv04_scaled_image_format(dstr->format),
+			dst_to_gpu, src_on_gpu);
+	if(!ret) /* zero: nothing further to do */
+	{}
+	else if(ret > 0 && dstr->bind & PIPE_BIND_RENDER_TARGET && srcr->bind & PIPE_BIND_SAMPLER_VIEW)
+	{
+		struct blitter_context* blitter = nvfx_get_blitter(pipe, 1);
+		util_blitter_copy_region(blitter, dstr, subdst, dstx, dsty, dstz, srcr, subsrc, srcx, srcy, srcz, pix_w, pix_h, TRUE); /* pixel extent, not the rescaled w/h */
+	}
+	else /* CPU copy, staged through cloned buffers when the copy is not small */
+	{
+		struct nv04_region dstt = dst;
+		struct nv04_region srct = src;
+		unsigned dstbegin = 0;
 
-	eng2d->copy(eng2d, ps_dst, destx, desty, ps_src, srcx, srcy, width, height);
+		if(!small)
+		{
+			if(src_on_gpu)
+				nvfx_region_clone(ctx, &srct, w, h, TRUE);
 
-	nvfx_miptree_surface_del(ps_src);
-	nvfx_miptree_surface_del(ps_dst);
+			if(dst_to_gpu)
+				dstbegin = nvfx_region_clone(ctx, &dstt, w, h, FALSE);
+		}
+
+		nv04_region_copy_cpu(&dstt, &srct, w, h);
+
+		if(srct.bo != src.bo) /* a source clone was made: release it */
+			nouveau_screen_bo_release(pipe->screen, srct.bo);
+
+		if(dstt.bo != dst.bo) /* a destination clone was made: write it back, then release it */
+		{
+			nv04_memcpy(ctx, dst.bo, dst.offset + dstbegin, dstt.bo, 0, dstt.bo->size);
+			nouveau_screen_bo_release(pipe->screen, dstt.bo);
+		}
+	}
+}
+
+static int
+nvfx_surface_fill(struct pipe_context* pipe, struct pipe_surface *dsts, /* fill a w x h rectangle at (dx, dy) with value; returns 1 when the caller should fall back to another path, else 0 */
+		  unsigned dx, unsigned dy, unsigned w, unsigned h, unsigned value)
+{
+	struct nv04_2d_context *ctx = nvfx_screen(pipe->screen)->eng2d;
+	struct nv04_region dst;
+	int ret;
+	/* Always try to use the GPU right now, if possible
+	 * If the user wanted the surface data on the CPU, he would have cleared with memset (hopefully) */
+
+	// we don't care about interior pixel order since we set all them to the same value
+	nvfx_region_init_for_surface(&dst, (struct nvfx_surface*)dsts, dx, dy, TRUE);
+
+	w = util_format_get_stride(dsts->format, w) >> dst.bpps; /* width in region elements */
+	h = util_format_get_nblocksy(dsts->format, h); /* height in block rows */
+
+	ret = nv04_region_fill_2d(ctx, &dst, w, h, value);
+	if(ret > 0 && dsts->texture->bind & PIPE_BIND_RENDER_TARGET) /* positive result on a render target: report it to the caller */
+		return 1;
+	else if(ret) /* any other non-zero result: fill on the CPU */
+	{
+		struct nv04_region dstt = dst;
+		unsigned dstbegin = 0;
+
+		if(nvfx_resource_on_gpu(dsts->texture)) /* stage the fill in a cloned buffer */
+			dstbegin = nvfx_region_clone(ctx, &dstt, w, h, FALSE);
+
+		nv04_region_fill_cpu(&dstt, w, h, value);
+
+		if(dstt.bo != dst.bo) /* a clone was made: write it back, then release it */
+		{
+			nv04_memcpy(ctx, dst.bo, dst.offset + dstbegin, dstt.bo, 0, dstt.bo->size);
+			nouveau_screen_bo_release(pipe->screen, dstt.bo);
+		}
+	}
+
+	return 0;
+}
+
+
+void
+nvfx_screen_surface_takedown(struct pipe_screen *pscreen) /* tear down the screen's nv04 2D context */
+{
+	nv04_2d_context_takedown(nvfx_screen(pscreen)->eng2d);
+	nvfx_screen(pscreen)->eng2d = 0; /* clear the pointer so it is not left dangling */
+}
+
+int
+nvfx_screen_surface_init(struct pipe_screen *pscreen) /* create the screen's nv04 2D context; returns 0 on success, -1 on failure */
+{
+	struct nv04_2d_context* ctx = nv04_2d_context_init(nouveau_screen(pscreen)->channel);
+	if(!ctx)
+		return -1; /* eng2d is left untouched on failure */
+	nvfx_screen(pscreen)->eng2d = ctx;
+	return 0;
+}
+
+static void
+nvfx_surface_copy_temp(struct pipe_context* pipe, struct pipe_surface* surf, int to_temp) /* copy the whole surface to its temporary (to_temp != 0) or back from it */
+{
+	struct nvfx_surface* ns = (struct nvfx_surface*)surf;
+	struct pipe_subresource tempsr, surfsr;
+	struct nvfx_context* nvfx = nvfx_context(pipe);
+
+	// TODO: we really should do this validation before setting these variable in draw calls
+	unsigned use_vertex_buffers = nvfx->use_vertex_buffers; /* saved here and restored after the copy below */
+	boolean use_index_buffer = nvfx->use_index_buffer;
+	unsigned base_vertex = nvfx->base_vertex;
+
+	tempsr.face = 0; /* the copy addresses the temporary at face 0, level 0 */
+	tempsr.level = 0;
+	surfsr.face = surf->face;
+	surfsr.level = surf->level;
+
+	if(to_temp)
+		nvfx_resource_copy_region(pipe, &ns->temp->base.base, tempsr, 0, 0, 0, surf->texture, surfsr, 0, 0, surf->zslice, surf->width, surf->height);
+	else
+		nvfx_resource_copy_region(pipe, surf->texture, surfsr, 0, 0, surf->zslice, &ns->temp->base.base, tempsr, 0, 0, 0, surf->width, surf->height);
+
+	nvfx->use_vertex_buffers = use_vertex_buffers; /* put back the draw state saved above */
+	nvfx->use_index_buffer = use_index_buffer;
+        nvfx->base_vertex = base_vertex;
+
+	nvfx->dirty |= NVFX_NEW_ARRAYS; /* flag the vertex arrays as dirty in both dirty masks */
+	nvfx->draw_dirty |= NVFX_NEW_ARRAYS;
+}
+
+void
+nvfx_surface_create_temp(struct pipe_context* pipe, struct pipe_surface* surf) /* give surf a linear temporary and fill it with the surface's current contents */
+{
+	struct nvfx_surface* ns = (struct nvfx_surface*)surf;
+	struct pipe_resource template;
+	memset(&template, 0, sizeof(struct pipe_resource)); /* fields not set below (bind, usage, last_level, ...) stay 0 */
+	template.target = PIPE_TEXTURE_2D;
+	template.format = surf->format;
+	template.width0 = surf->width;
+	template.height0 = surf->height;
+	template.depth0 = 1;
+	template.nr_samples = surf->texture->nr_samples;
+	template.flags = NVFX_RESOURCE_FLAG_LINEAR; /* the temporary is always linear */
+
+	ns->temp = (struct nvfx_miptree*)nvfx_miptree_create(pipe->screen, &template); /* NOTE(review): result is not checked before the copy below uses ns->temp */
+	nvfx_surface_copy_temp(pipe, surf, 1);
+}
+
+void
+nvfx_surface_flush(struct pipe_context* pipe, struct pipe_surface* surf) /* write the temporary back to the real surface and drop it if the surface is not bound */
+{
+	struct nvfx_context* nvfx = (struct nvfx_context*)pipe;
+	struct nvfx_surface* ns = (struct nvfx_surface*)surf;
+	boolean bound = FALSE;
+
+	/* must be done before the copy, otherwise the copy will use the temp as destination */
+	util_dirty_surface_set_clean(nvfx_surface_get_dirty_surfaces(surf), &ns->base);
+
+	nvfx_surface_copy_temp(pipe, surf, 0); /* 0: copy from the temporary to the surface */
+
+	if(nvfx->framebuffer.zsbuf == surf) /* is surf still part of the current framebuffer? */
+		bound = TRUE;
+	else
+	{
+		for(unsigned i = 0; i < nvfx->framebuffer.nr_cbufs; ++i)
+		{
+			if(nvfx->framebuffer.cbufs[i] == surf)
+			{
+				bound = TRUE;
+				break;
+			}
+		}
+	}
+
+	if(!bound) /* not bound: drop the reference to the temporary; a bound surface keeps it */
+		pipe_resource_reference((struct pipe_resource**)&ns->temp, 0);
 }
 
 static void
@@ -62,12 +450,16 @@ nvfx_clear_render_target(struct pipe_context *pipe,
 			 unsigned dstx, unsigned dsty,
 			 unsigned width, unsigned height)
 {
-	struct nvfx_context *nvfx = nvfx_context(pipe);
-	struct nv04_surface_2d *eng2d = nvfx->screen->eng2d;
 	union util_color uc;
 	util_pack_color(rgba, dst->format, &uc);
 
-	eng2d->fill(eng2d, dst, dstx, dsty, width, height, uc.ui);
+	if(util_format_get_blocksizebits(dst->format) > 32
+		|| nvfx_surface_fill(pipe, dst, dstx, dsty, width, height, uc.ui))
+	{
+		// TODO: probably should use hardware clear here instead if possible
+		struct blitter_context* blitter = nvfx_get_blitter(pipe, 0);
+		util_blitter_clear_render_target(blitter, dst, rgba, dstx, dsty, width, height);
+	}
 }
 
 static void
@@ -79,18 +471,20 @@ nvfx_clear_depth_stencil(struct pipe_context *pipe,
 			 unsigned dstx, unsigned dsty,
 			 unsigned width, unsigned height)
 {
-	struct nvfx_context *nvfx = nvfx_context(pipe);
-	struct nv04_surface_2d *eng2d = nvfx->screen->eng2d;
-
-	eng2d->fill(eng2d, dst, dstx, dsty, width, height,
-		    util_pack_z_stencil(dst->format, depth, stencil));
+	if(util_format_get_blocksizebits(dst->format) > 32
+		|| nvfx_surface_fill(pipe, dst, dstx, dsty, width, height, util_pack_z_stencil(dst->format, depth, stencil)))
+	{
+		// TODO: probably should use hardware clear here instead if possible
+		struct blitter_context* blitter = nvfx_get_blitter(pipe, 0);
+		util_blitter_clear_depth_stencil(blitter, dst, clear_flags, depth, stencil, dstx, dsty, width, height);
+	}
 }
 
 
 void
 nvfx_init_surface_functions(struct nvfx_context *nvfx)
 {
-	nvfx->pipe.resource_copy_region = nvfx_surface_copy;
+	nvfx->pipe.resource_copy_region = nvfx_resource_copy_region; /* renamed copy entry point defined earlier in this file */
 	nvfx->pipe.clear_render_target = nvfx_clear_render_target;
 	nvfx->pipe.clear_depth_stencil = nvfx_clear_depth_stencil;
 }
diff --git a/src/gallium/drivers/nvfx/nvfx_tex.h b/src/gallium/drivers/nvfx/nvfx_tex.h
index 69187a79e7..34be936a89 100644
--- a/src/gallium/drivers/nvfx/nvfx_tex.h
+++ b/src/gallium/drivers/nvfx/nvfx_tex.h
@@ -1,6 +1,11 @@
 #ifndef NVFX_TEX_H_
 #define NVFX_TEX_H_
 
+#include "util/u_math.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include <nouveau/nouveau_class.h>
+
 static inline unsigned
 nvfx_tex_wrap_mode(unsigned wrap) {
 	unsigned ret;
@@ -31,7 +36,7 @@ nvfx_tex_wrap_mode(unsigned wrap) {
 		ret = NV40TCL_TEX_WRAP_S_MIRROR_CLAMP;
 		break;
 	default:
-		NOUVEAU_ERR("unknown wrap mode: %d\n", wrap);
+		assert(0);
 		ret = NV34TCL_TX_WRAP_S_REPEAT;
 		break;
 	}
@@ -40,31 +45,29 @@ nvfx_tex_wrap_mode(unsigned wrap) {
 }
 
 static inline unsigned
-nvfx_tex_wrap_compare_mode(const struct pipe_sampler_state* cso)
+nvfx_tex_wrap_compare_mode(unsigned func) /* map a PIPE_FUNC_* value to its NV34TCL_TX_WRAP_RCOMP_* value; the compare_mode test is no longer done here */
 {
-	if (cso->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
-		switch (cso->compare_func) {
-		case PIPE_FUNC_NEVER:
-			return NV34TCL_TX_WRAP_RCOMP_NEVER;
-		case PIPE_FUNC_GREATER:
-			return NV34TCL_TX_WRAP_RCOMP_GREATER;
-		case PIPE_FUNC_EQUAL:
-			return NV34TCL_TX_WRAP_RCOMP_EQUAL;
-		case PIPE_FUNC_GEQUAL:
-			return NV34TCL_TX_WRAP_RCOMP_GEQUAL;
-		case PIPE_FUNC_LESS:
-			return NV34TCL_TX_WRAP_RCOMP_LESS;
-		case PIPE_FUNC_NOTEQUAL:
-			return NV34TCL_TX_WRAP_RCOMP_NOTEQUAL;
-		case PIPE_FUNC_LEQUAL:
-			return NV34TCL_TX_WRAP_RCOMP_LEQUAL;
-		case PIPE_FUNC_ALWAYS:
-			return NV34TCL_TX_WRAP_RCOMP_ALWAYS;
-		default:
-			break;
-		}
+	switch (func) {
+	case PIPE_FUNC_NEVER:
+		return NV34TCL_TX_WRAP_RCOMP_NEVER;
+	case PIPE_FUNC_GREATER:
+		return NV34TCL_TX_WRAP_RCOMP_GREATER;
+	case PIPE_FUNC_EQUAL:
+		return NV34TCL_TX_WRAP_RCOMP_EQUAL;
+	case PIPE_FUNC_GEQUAL:
+		return NV34TCL_TX_WRAP_RCOMP_GEQUAL;
+	case PIPE_FUNC_LESS:
+		return NV34TCL_TX_WRAP_RCOMP_LESS;
+	case PIPE_FUNC_NOTEQUAL:
+		return NV34TCL_TX_WRAP_RCOMP_NOTEQUAL;
+	case PIPE_FUNC_LEQUAL:
+		return NV34TCL_TX_WRAP_RCOMP_LEQUAL;
+	case PIPE_FUNC_ALWAYS:
+		return NV34TCL_TX_WRAP_RCOMP_ALWAYS;
+	default:
+		assert(0); /* unknown compare function */
+		return 0; /* value returned when the assert is compiled out */
 	}
-	return 0;
 }
 
 static inline unsigned nvfx_tex_filter(const struct pipe_sampler_state* cso)
@@ -128,6 +131,45 @@ struct nvfx_sampler_state {
 	uint32_t en;
 	uint32_t filt;
 	uint32_t bcol;
+	uint32_t min_lod;
+	uint32_t max_lod;
+	boolean compare;
+};
+
+struct nvfx_sampler_view {
+	struct pipe_sampler_view base; /* first member, so the struct can be used where a pipe_sampler_view is expected */
+	int offset;
+	uint32_t swizzle;
+	uint32_t npot_size;
+	uint32_t filt;
+	uint32_t wrap_mask;
+	uint32_t wrap;
+	uint32_t lod_offset;
+	uint32_t max_lod_limit;
+	union
+	{
+		struct
+		{
+			uint32_t fmt[4]; /* nv30 has 4 entries, nv40 one */
+			int rect;
+		} nv30;
+		struct
+		{
+			uint32_t fmt[2]; /* NOTE(review): was a copy of the nv30 comment ("nv40 one"), yet this array has 2 entries - confirm how many nv40 uses */
+			uint32_t npot_size2; /* nv40 only */
+		} nv40;
+		uint32_t init_fmt; /* overlays the first fmt entry of either variant */
+	} u;
 };
 
+struct nvfx_texture_format { /* element type of the nvfx_texture_formats[PIPE_FORMAT_COUNT] table declared below */
+	int fmt[6]; /* NOTE(review): meaning of the 6 slots is not visible here - confirm against the table definition */
+	unsigned sign;
+	unsigned wrap;
+	unsigned char src[6];
+	unsigned char comp[6];
+};
+
+extern struct nvfx_texture_format nvfx_texture_formats[PIPE_FORMAT_COUNT];
+
 #endif /* NVFX_TEX_H_ */
diff --git a/src/gallium/drivers/nvfx/nvfx_transfer.c b/src/gallium/drivers/nvfx/nvfx_transfer.c
index 9ff0a93d30..7cb47a20f6 100644
--- a/src/gallium/drivers/nvfx/nvfx_transfer.c
+++ b/src/gallium/drivers/nvfx/nvfx_transfer.c
@@ -4,204 +4,218 @@
 #include "util/u_format.h"
 #include "util/u_memory.h"
 #include "util/u_math.h"
-#include "nouveau/nouveau_winsys.h"
+#include "util/u_staging.h"
 #include "nvfx_context.h"
 #include "nvfx_screen.h"
 #include "nvfx_state.h"
 #include "nvfx_resource.h"
 #include "nvfx_transfer.h"
 
-struct nvfx_transfer {
-	struct pipe_transfer base;
-	struct pipe_surface *surface;
-	boolean direct;
-};
-
-static void
-nvfx_compatible_transfer_tex(struct pipe_resource *pt, unsigned width, unsigned height,
-			     unsigned bind,
-                             struct pipe_resource *template)
-{
-	memset(template, 0, sizeof(struct pipe_resource));
-	template->target = pt->target;
-	template->format = pt->format;
-	template->width0 = width;
-	template->height0 = height;
-	template->depth0 = 1;
-	template->last_level = 0;
-	template->nr_samples = pt->nr_samples;
-	template->bind = bind;
-	template->usage = PIPE_USAGE_DYNAMIC;
-	template->flags = NVFX_RESOURCE_FLAG_LINEAR;
-}
-
-
-static unsigned nvfx_transfer_bind_flags( unsigned transfer_usage )
+struct nvfx_staging_transfer
 {
-	unsigned bind = 0;
+	struct util_staging_transfer base; /* first member: the pipe_transfer handed to callers is base.base */
 
-#if 0
-	if (transfer_usage & PIPE_TRANSFER_WRITE)
-		bind |= PIPE_BIND_BLIT_SOURCE;
-
-	if (transfer_usage & PIPE_TRANSFER_READ)
-		bind |= PIPE_BIND_BLIT_DESTINATION;
-#endif
-
-	return bind;
-}
+	unsigned offset; /* byte offset added to the mapped bo pointer in nvfx_transfer_map */
+	unsigned map_count; /* outstanding map calls; the bo is unmapped when it returns to 0 */
+};
 
 struct pipe_transfer *
-nvfx_miptree_transfer_new(struct pipe_context *pipe,
+nvfx_transfer_new(struct pipe_context *pipe, /* create a transfer for a buffer or texture subresource; returns NULL on failure */
 			  struct pipe_resource *pt,
 			  struct pipe_subresource sr,
 			  unsigned usage,
 			  const struct pipe_box *box)
 {
-	struct pipe_screen *pscreen = pipe->screen;
-	struct nvfx_miptree *mt = (struct nvfx_miptree *)pt;
-	struct nvfx_transfer *tx;
-	struct pipe_resource tx_tex_template, *tx_tex;
-	static int no_transfer = -1;
-	unsigned bind = nvfx_transfer_bind_flags(usage);
-	if(no_transfer < 0)
-		no_transfer = debug_get_bool_option("NOUVEAU_NO_TRANSFER", FALSE);
-
-
-	tx = CALLOC_STRUCT(nvfx_transfer);
-	if (!tx)
-		return NULL;
-
-	/* Don't handle 3D transfers yet.
-	 */
-	assert(box->depth == 1);
-
-	pipe_resource_reference(&tx->base.resource, pt);
-	tx->base.sr = sr;
-	tx->base.usage = usage;
-	tx->base.box = *box;
-	tx->base.stride = mt->level[sr.level].pitch;
-
-	/* Direct access to texture */
-	if ((pt->usage == PIPE_USAGE_DYNAMIC ||
-	     no_transfer) &&
-	    pt->flags & NVFX_RESOURCE_FLAG_LINEAR)
+        if((usage & (PIPE_TRANSFER_UNSYNCHRONIZED | PIPE_TRANSFER_DONTBLOCK)) == PIPE_TRANSFER_DONTBLOCK) /* DONTBLOCK set, UNSYNCHRONIZED not set */
+        {
+                struct nouveau_bo* bo = ((struct nvfx_resource*)pt)->bo;
+                if(bo && nouveau_bo_busy(bo, NOUVEAU_BO_WR)) /* bo is busy: fail rather than wait */
+                        return NULL;
+        }
+
+	if(pt->target == PIPE_BUFFER) /* buffers get a plain pipe_transfer pointing into buffer->data */
 	{
-		tx->direct = true;
-
-		/* XXX: just call the internal nvfx function.  
-		 */
-		tx->surface = pscreen->get_tex_surface(pscreen, pt,
-	                                               sr.face, sr.level,
-						       box->z,
-	                                               bind);
-		return &tx->base;
-	}
+		// it would be nice if we could avoid all this ridiculous overhead...
+		struct pipe_transfer* tx;
+		struct nvfx_buffer* buffer = nvfx_buffer(pt);
+
+		tx = CALLOC_STRUCT(pipe_transfer);
+		if (!tx)
+			return NULL;
 
-	tx->direct = false;
+		pipe_resource_reference(&tx->resource, pt);
+		tx->sr = sr;
+		tx->usage = usage;
+		tx->box = *box;
 
-	nvfx_compatible_transfer_tex(pt, box->width, box->height, bind, &tx_tex_template);
+		tx->slice_stride = tx->stride = util_format_get_stride(pt->format, box->width); /* byte length of the mapped range */
+		tx->data = buffer->data + util_format_get_stride(pt->format, box->x); /* data is set at creation; map just returns it */
 
-	tx_tex = pscreen->resource_create(pscreen, &tx_tex_template);
-	if (!tx_tex)
+		return tx;
+	}
+	else
 	{
-		FREE(tx);
-		return NULL;
+	        struct nvfx_staging_transfer* tx;
+	        bool direct = !nvfx_resource_on_gpu(pt) && pt->flags & NVFX_RESOURCE_FLAG_LINEAR; /* linear and not on the GPU: map the resource itself */
+
+	        tx = CALLOC_STRUCT(nvfx_staging_transfer);
+	        if(!tx)
+	        	return NULL;
+
+	        util_staging_transfer_init(pipe, pt, sr, usage, box, direct, &tx->base); /* NOTE(review): return value is ignored - confirm whether it can fail */
+
+		if(direct)
+		{
+			tx->base.base.stride = nvfx_subresource_pitch(pt, sr.level);
+			tx->base.base.slice_stride = tx->base.base.stride * u_minify(pt->height0, sr.level);
+			tx->offset = nvfx_subresource_offset(pt, sr.face, sr.level, box->z) /* subresource start + box->y rows + box->x */
+				+ util_format_get_2d_size(pt->format, tx->base.base.stride, box->y)
+				+ util_format_get_stride(pt->format, box->x);
+		}
+		else /* otherwise the strides come from level 0 of the staging resource */
+		{
+			tx->base.base.stride = nvfx_subresource_pitch(tx->base.staging_resource, 0);
+			tx->base.base.slice_stride = tx->base.base.stride * tx->base.staging_resource->height0;
+			tx->offset = 0;
+		}
+
+		assert(tx->base.base.stride);
+
+		return &tx->base.base;
 	}
+}
 
-	tx->base.stride = ((struct nvfx_miptree*)tx_tex)->level[0].pitch;
-
-	tx->surface = pscreen->get_tex_surface(pscreen, tx_tex,
-	                                       0, 0, 0,
-	                                       bind);
-
-	pipe_resource_reference(&tx_tex, NULL);
-
-	if (!tx->surface)
+static void nvfx_buffer_dirty_interval(struct nvfx_buffer* buffer, unsigned begin, unsigned size, boolean unsynchronized) /* merge [begin, begin + size) into the buffer's dirty range */
+{
+	struct nvfx_screen* screen = nvfx_screen(buffer->base.base.screen);
+	buffer->last_update_static = buffer->bytes_to_draw_until_static < 0;
+	if(buffer->dirty_begin == buffer->dirty_end) /* empty dirty range: start a new one */
 	{
-		pipe_surface_reference(&tx->surface, NULL);
-		FREE(tx);
-		return NULL;
+		buffer->dirty_begin = begin;
+		buffer->dirty_end = begin + size;
+		buffer->dirty_unsynchronized = unsynchronized;
+	}
+	else /* grow the existing range to the union of both */
+	{
+		buffer->dirty_begin = MIN2(buffer->dirty_begin, begin);
+		buffer->dirty_end = MAX2(buffer->dirty_end, begin + size);
+		buffer->dirty_unsynchronized &= unsynchronized; /* stays set only if every merged update was unsynchronized */
 	}
 
-	if (usage & PIPE_TRANSFER_READ) {
-		struct nvfx_screen *nvscreen = nvfx_screen(pscreen);
-		struct pipe_surface *src;
+	if(unsynchronized)
+	{
+		// TODO: revisit this, it doesn't seem quite right
+		//printf("UNSYNC UPDATE %p %u %u\n", buffer, begin, size);
+		buffer->bytes_to_draw_until_static += size * screen->static_reuse_threshold; /* unsynchronized updates add to the budget */
+	}
+	else
+		buffer->bytes_to_draw_until_static = buffer->size * screen->static_reuse_threshold; /* synchronized updates reset it from the whole buffer size */
+}
 
-		src = pscreen->get_tex_surface(pscreen, pt,
-	                                       sr.face, sr.level, box->z,
-	                                       0 /*PIPE_BIND_BLIT_SOURCE*/);
+static void nvfx_transfer_flush_region( struct pipe_context *pipe, /* record an explicitly flushed sub-range of a buffer transfer as dirty */
+				      struct pipe_transfer *ptx,
+				      const struct pipe_box *box)
+{
+	if(ptx->resource->target == PIPE_BUFFER && (ptx->usage & PIPE_TRANSFER_FLUSH_EXPLICIT)) /* does nothing for textures or without FLUSH_EXPLICIT */
+	{
+		struct nvfx_buffer* buffer = nvfx_buffer(ptx->resource);
+		nvfx_buffer_dirty_interval(buffer,
+				(uint8_t*)ptx->data - buffer->data + util_format_get_stride(buffer->base.base.format, box->x), /* transfer start within the buffer + box->x */
+				util_format_get_stride(buffer->base.base.format, box->width),
+				!!(ptx->usage & PIPE_TRANSFER_UNSYNCHRONIZED));
+	}
+}
 
-		/* TODO: Check if SIFM can deal with x,y,w,h when swizzling */
-		/* TODO: Check if SIFM can un-swizzle */
-		nvscreen->eng2d->copy(nvscreen->eng2d,
-		                      tx->surface, 0, 0,
-		                      src,
-				      box->x, box->y,
-		                      box->width, box->height);
+static void
+nvfx_transfer_destroy(struct pipe_context *pipe, struct pipe_transfer *ptx) /* release a transfer created by nvfx_transfer_new */
+{
+	if(ptx->resource->target == PIPE_BUFFER)
+	{
+		struct nvfx_buffer* buffer = nvfx_buffer(ptx->resource);
+		if((ptx->usage & (PIPE_TRANSFER_WRITE | PIPE_TRANSFER_FLUSH_EXPLICIT)) == PIPE_TRANSFER_WRITE) /* write without FLUSH_EXPLICIT: dirty the whole mapped range */
+			nvfx_buffer_dirty_interval(buffer,
+				(uint8_t*)ptx->data - buffer->data,
+				ptx->stride, /* for buffers, stride was set to the mapped byte length at creation */
+				!!(ptx->usage & PIPE_TRANSFER_UNSYNCHRONIZED));
+		pipe_resource_reference(&ptx->resource, 0);
+		FREE(ptx);
+	}
+	else
+	{
+		struct nouveau_channel* chan = nvfx_context(pipe)->screen->base.channel; /* fetched before the destroy call releases ptx */
+		util_staging_transfer_destroy(pipe, ptx);
 
-		pipe_surface_reference(&src, NULL);
+		FIRE_RING(chan); /* kick the channel after the staging transfer is destroyed */
 	}
+}
 
-	return &tx->base;
+void *
+nvfx_transfer_map(struct pipe_context *pipe, struct pipe_transfer *ptx) /* return a CPU pointer to the transfer's data */
+{
+	if(ptx->resource->target == PIPE_BUFFER)
+		return ptx->data; /* buffers: the pointer was already set in nvfx_transfer_new */
+	else
+	{
+		struct nvfx_staging_transfer *tx = (struct nvfx_staging_transfer *)ptx;
+		if(!ptx->data) /* first map: map the bo and cache the pointer */
+		{
+			struct nvfx_miptree *mt = (struct nvfx_miptree *)tx->base.staging_resource;
+			uint8_t *map = nouveau_screen_bo_map(pipe->screen, mt->base.bo, nouveau_screen_transfer_flags(ptx->usage)); /* NOTE(review): a failed map is not checked before the offset is added */
+			ptx->data = map + tx->offset;
+		}
+
+		++tx->map_count; /* paired with the decrement in nvfx_transfer_unmap */
+		return ptx->data;
+	}
 }
 
 void
-nvfx_miptree_transfer_del(struct pipe_context *pipe,
-			  struct pipe_transfer *ptx)
+nvfx_transfer_unmap(struct pipe_context *pipe, struct pipe_transfer *ptx) /* undo one nvfx_transfer_map call; does nothing for buffers */
 {
-	struct nvfx_transfer *tx = (struct nvfx_transfer *)ptx;
-
-	if (!tx->direct && (ptx->usage & PIPE_TRANSFER_WRITE)) {
-		struct pipe_screen *pscreen = pipe->screen;
-		struct nvfx_screen *nvscreen = nvfx_screen(pscreen);
-		struct pipe_surface *dst;
-
-		dst = pscreen->get_tex_surface(pscreen,
-					       ptx->resource,
-	                                       ptx->sr.face,
-					       ptx->sr.level,
-					       ptx->box.z,
-	                                       0 /*PIPE_BIND_BLIT_DESTINATION*/);
-
-		/* TODO: Check if SIFM can deal with x,y,w,h when swizzling */
-		nvscreen->eng2d->copy(nvscreen->eng2d,
-		                      dst, ptx->box.x, ptx->box.y,
-		                      tx->surface, 0, 0,
-		                      ptx->box.width, ptx->box.height);
-
-		pipe_surface_reference(&dst, NULL);
+	if(ptx->resource->target != PIPE_BUFFER)
+	{
+		struct nvfx_staging_transfer *tx = (struct nvfx_staging_transfer *)ptx;
+		struct nvfx_miptree *mt = (struct nvfx_miptree *)tx->base.staging_resource;
+
+		if(!--tx->map_count) /* last outstanding map: really unmap; NOTE(review): an unmatched unmap would wrap the unsigned count */
+		{
+			nouveau_screen_bo_unmap(pipe->screen, mt->base.bo);
+			ptx->data = 0; /* forces the next nvfx_transfer_map to map the bo again */
+		}
 	}
-
-	pipe_surface_reference(&tx->surface, NULL);
-	pipe_resource_reference(&ptx->resource, NULL);
-	FREE(ptx);
 }
 
-void *
-nvfx_miptree_transfer_map(struct pipe_context *pipe, struct pipe_transfer *ptx)
+static void nvfx_transfer_inline_write( struct pipe_context *pipe, /* write data straight into a resource without a map/unmap pair */
+				      struct pipe_resource *pr,
+				      struct pipe_subresource sr,
+				      unsigned usage,
+				      const struct pipe_box *box,
+				      const void *data,
+				      unsigned stride,
+				      unsigned slice_stride)
 {
-	struct pipe_screen *pscreen = pipe->screen;
-	struct nvfx_transfer *tx = (struct nvfx_transfer *)ptx;
-	struct nv04_surface *ns = (struct nv04_surface *)tx->surface;
-	struct nvfx_miptree *mt = (struct nvfx_miptree *)tx->surface->texture;
-	uint8_t *map = nouveau_screen_bo_map(pscreen, mt->base.bo,
-					     nouveau_screen_transfer_flags(ptx->usage));
-
-	if(!tx->direct)
-		return map + ns->base.offset;
+	if(pr->target != PIPE_BUFFER) /* textures go through the generic transfer-based helper */
+	{
+		u_default_transfer_inline_write(pipe, pr, sr, usage, box, data, stride, slice_stride);
+	}
 	else
-		return (map + ns->base.offset + 
-			ptx->box.y * ns->pitch + 
-			ptx->box.x * util_format_get_blocksize(ptx->resource->format));
+	{
+		struct nvfx_buffer* buffer = nvfx_buffer(pr);
+		unsigned begin = util_format_get_stride(pr->format, box->x); /* byte offset of the write within the buffer */
+		unsigned size = util_format_get_stride(pr->format, box->width); /* byte length of the write */
+		memcpy(buffer->data + begin, data, size);
+		nvfx_buffer_dirty_interval(buffer, begin, size,
+				!!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)); /* the flag is a transfer-usage bit, so test usage, not pr->flags */
+	}
 }
 
 void
-nvfx_miptree_transfer_unmap(struct pipe_context *pipe, struct pipe_transfer *ptx)
+nvfx_init_transfer_functions(struct pipe_context *pipe) /* install this file's transfer entry points on the context */
 {
-	struct pipe_screen *pscreen = pipe->screen;
-	struct nvfx_transfer *tx = (struct nvfx_transfer *)ptx;
-	struct nvfx_miptree *mt = (struct nvfx_miptree *)tx->surface->texture;
-
-	nouveau_screen_bo_unmap(pscreen, mt->base.bo);
+	pipe->get_transfer = nvfx_transfer_new;
+	pipe->transfer_map = nvfx_transfer_map;
+	pipe->transfer_flush_region = nvfx_transfer_flush_region;
+	pipe->transfer_unmap = nvfx_transfer_unmap;
+	pipe->transfer_destroy = nvfx_transfer_destroy;
+	pipe->transfer_inline_write = nvfx_transfer_inline_write; /* each hook handles both buffers and textures */
 }
diff --git a/src/gallium/drivers/nvfx/nvfx_transfer.h b/src/gallium/drivers/nvfx/nvfx_transfer.h
index 3e3317b2c7..20f20d5b0b 100644
--- a/src/gallium/drivers/nvfx/nvfx_transfer.h
+++ b/src/gallium/drivers/nvfx/nvfx_transfer.h
@@ -7,19 +7,17 @@
 
 
 struct pipe_transfer *
-nvfx_miptree_transfer_new(struct pipe_context *pcontext,
+nvfx_transfer_new(struct pipe_context *pcontext,
 			  struct pipe_resource *pt,
 			  struct pipe_subresource sr,
 			  unsigned usage,
 			  const struct pipe_box *box);
-void
-nvfx_miptree_transfer_del(struct pipe_context *pcontext,
-			  struct pipe_transfer *ptx);
+
 void *
-nvfx_miptree_transfer_map(struct pipe_context *pcontext,
+nvfx_transfer_map(struct pipe_context *pcontext,
 			  struct pipe_transfer *ptx);
 void
-nvfx_miptree_transfer_unmap(struct pipe_context *pcontext,
+nvfx_transfer_unmap(struct pipe_context *pcontext,
 			    struct pipe_transfer *ptx);
 
 
diff --git a/src/gallium/drivers/nvfx/nvfx_vbo.c b/src/gallium/drivers/nvfx/nvfx_vbo.c
index 4aa3793842..e6e9a8f2e4 100644
--- a/src/gallium/drivers/nvfx/nvfx_vbo.c
+++ b/src/gallium/drivers/nvfx/nvfx_vbo.c
@@ -2,6 +2,7 @@
 #include "pipe/p_state.h"
 #include "util/u_inlines.h"
 #include "util/u_format.h"
+#include "translate/translate.h"
 
 #include "nvfx_context.h"
 #include "nvfx_state.h"
@@ -10,646 +11,595 @@
 #include "nouveau/nouveau_channel.h"
 #include "nouveau/nouveau_class.h"
 #include "nouveau/nouveau_pushbuf.h"
-#include "nouveau/nouveau_util.h"
 
-static INLINE int
-nvfx_vbo_format_to_hw(enum pipe_format pipe, unsigned *fmt, unsigned *ncomp)
+static inline unsigned
+util_guess_unique_indices_count(unsigned mode, unsigned indices)
 {
-	switch (pipe) {
-	case PIPE_FORMAT_R32_FLOAT:
-	case PIPE_FORMAT_R32G32_FLOAT:
-	case PIPE_FORMAT_R32G32B32_FLOAT:
-	case PIPE_FORMAT_R32G32B32A32_FLOAT:
-		*fmt = NV34TCL_VTXFMT_TYPE_FLOAT;
-		break;
-	case PIPE_FORMAT_R16_FLOAT:
-	case PIPE_FORMAT_R16G16_FLOAT:
-	case PIPE_FORMAT_R16G16B16_FLOAT:
-	case PIPE_FORMAT_R16G16B16A16_FLOAT:
-		*fmt = NV34TCL_VTXFMT_TYPE_HALF;
-		break;
-	case PIPE_FORMAT_R8_UNORM:
-	case PIPE_FORMAT_R8G8_UNORM:
-	case PIPE_FORMAT_R8G8B8_UNORM:
-	case PIPE_FORMAT_R8G8B8A8_UNORM:
-		*fmt = NV34TCL_VTXFMT_TYPE_UBYTE;
-		break;
-	case PIPE_FORMAT_R16_SSCALED:
-	case PIPE_FORMAT_R16G16_SSCALED:
-	case PIPE_FORMAT_R16G16B16_SSCALED:
-	case PIPE_FORMAT_R16G16B16A16_SSCALED:
-		*fmt = NV34TCL_VTXFMT_TYPE_USHORT;
-		break;
-	default:
-		NOUVEAU_ERR("Unknown format %s\n", util_format_name(pipe));
-		return 1;
+	/* Euler's formula gives V =
+	 * = E - F + 2 =
+	 * = F * (polygon_edges / 2 - 1) + 2 =
+	 * =  F * (polygon_edges - 2) / 2 + 2 =
+	 * =  indices * (polygon_edges - 2) / (2 * indices_per_face) + 2
+	 * =  indices * (1 / 2 - 1 / polygon_edges) + 2
+	 */
+	switch(mode)
+	{
+	case PIPE_PRIM_LINES:
+		return indices >> 1;
+	case PIPE_PRIM_TRIANGLES:
+	{
+		/* A closed triangle mesh has about indices / 6 + 2 unique vertices
+		 * (Euler's formula above with polygon_edges == 3).
+		 *
+		 * Halve first, then divide by 3: floor(floor(n / 2) / 3) equals
+		 * floor(n / 6), so this is exactly indices / 6 + 2.
+		 *
+		 * A hand-rolled multiplication by the inverse of 3 modulo 2^32 is
+		 * not worth it here: it needs fix-ups for non-multiples of 3, wraps
+		 * around for small counts, and compilers normally lower a division
+		 * by a constant to a multiply and shift on their own.
+		 */
+		indices >>= 1;
+		return indices / 3 + 2;
 	}
-
-	switch (pipe) {
-	case PIPE_FORMAT_R8_UNORM:
-	case PIPE_FORMAT_R32_FLOAT:
-	case PIPE_FORMAT_R16_FLOAT:
-	case PIPE_FORMAT_R16_SSCALED:
-		*ncomp = 1;
-		break;
-	case PIPE_FORMAT_R8G8_UNORM:
-	case PIPE_FORMAT_R32G32_FLOAT:
-	case PIPE_FORMAT_R16G16_FLOAT:
-	case PIPE_FORMAT_R16G16_SSCALED:
-		*ncomp = 2;
-		break;
-	case PIPE_FORMAT_R8G8B8_UNORM:
-	case PIPE_FORMAT_R32G32B32_FLOAT:
-	case PIPE_FORMAT_R16G16B16_FLOAT:
-	case PIPE_FORMAT_R16G16B16_SSCALED:
-		*ncomp = 3;
-		break;
-	case PIPE_FORMAT_R8G8B8A8_UNORM:
-	case PIPE_FORMAT_R32G32B32A32_FLOAT:
-	case PIPE_FORMAT_R16G16B16A16_FLOAT:
-	case PIPE_FORMAT_R16G16B16A16_SSCALED:
-		*ncomp = 4;
-		break;
+	// guess that indexed quads are created by successive connections, since a closed mesh seems unlikely
+	case PIPE_PRIM_QUADS:
+		return (indices >> 1) + 2;
+	//	return (indices >> 2) + 2; // if it is a closed mesh
 	default:
-		NOUVEAU_ERR("Unknown format %s\n", util_format_name(pipe));
-		return 1;
+		return indices;
 	}
-
-	return 0;
 }
 
-static boolean
-nvfx_vbo_set_idxbuf(struct nvfx_context *nvfx, struct pipe_resource *ib,
-		    unsigned ib_size)
+static unsigned nvfx_decide_upload_mode(struct pipe_context *pipe, const struct pipe_draw_info *info)
 {
-	unsigned type;
-
-	if (!ib) {
-		nvfx->idxbuf_buffer = NULL;
-		nvfx->idxbuf_format = 0xdeadbeef;
-		return FALSE;
+	struct nvfx_context* nvfx = nvfx_context(pipe);
+	unsigned hardware_cost = 0;
+	unsigned inline_cost = 0;
+	unsigned unique_vertices;
+	unsigned upload_mode;
+	float best_index_cost_for_hardware_vertices_as_inline_cost;
+	boolean prefer_hardware_indices;
+	unsigned index_inline_cost;
+	unsigned index_hardware_cost;
+	if (info->indexed)
+		unique_vertices = util_guess_unique_indices_count(info->mode, info->count);
+	else
+		unique_vertices = info->count;
+
+	/* Here we try to figure out if we are better off writing vertex data directly on the FIFO,
+	 * or creating hardware buffer objects and pointing the hardware to them.
+	 *
+	 * This is done by computing the total memcpy cost of each option, ignoring uploads
+	 * if we think that the buffer is static and thus the upload cost will be amortized over
+	 * future draw calls.
+	 *
+	 * For instance, if everything looks static, we will always create buffer objects, while if
+	 * everything is a user buffer and we are not doing indexed drawing, we never do.
+	 *
+	 * Other interesting cases are a small user vertex buffer with a huge user index buffer,
+	 * where we will upload the vertex buffer so that we can use hardware index lookup, and
+	 * the opposite case, where we instead do index lookup in software to avoid uploading
+	 * a huge amount of vertex data that is not going to be used.
+	 *
+	 * Otherwise, we generally move a buffer to the GPU after it has been pushed
+	 * NVFX_STATIC_BUFFER_MIN_REUSE_TIMES times to the GPU without having
+	 * been updated with a transfer (or just the buffer having been destroyed).
+	 *
+	 * There is no special handling for user buffers, since applications can use
+	 * OpenGL VBOs in a one-shot fashion. OpenGL 3/4 core profile forces this
+	 * by the way.
+	 *
+	 * Note that currently we don't support only putting some data on the FIFO, and
+	 * some on vertex buffers (constant and instanced data is independent from this).
+	 *
+	 * nVidia doesn't seem to do this either, even though it should be at least
+	 * doable with VTX_ATTR and possibly with VERTEX_DATA too if not indexed.
+	 */
+
+	for (unsigned i = 0; i < nvfx->vtxelt->num_per_vertex_buffer_infos; i++)
+	{
+		struct nvfx_per_vertex_buffer_info* vbi = &nvfx->vtxelt->per_vertex_buffer_info[i];
+		struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[vbi->vertex_buffer_index];
+		struct nvfx_buffer* buffer = nvfx_buffer(vb->buffer);
+		buffer->bytes_to_draw_until_static -= vbi->per_vertex_size * unique_vertices;
+		if (!nvfx_buffer_seems_static(buffer))
+		{
+			hardware_cost += buffer->dirty_end - buffer->dirty_begin;
+			if (!buffer->base.bo)
+				hardware_cost += nvfx->screen->buffer_allocation_cost;
+		}
+		inline_cost += vbi->per_vertex_size * info->count;
 	}
 
-	if (!nvfx->screen->index_buffer_reloc_flags || ib_size == 1)
-		return FALSE;
+	best_index_cost_for_hardware_vertices_as_inline_cost = 0.0f;
+	prefer_hardware_indices = FALSE;
+	index_inline_cost = 0;
+	index_hardware_cost = 0;
 
-	switch (ib_size) {
-	case 2:
-		type = NV34TCL_IDXBUF_FORMAT_TYPE_U16;
-		break;
-	case 4:
-		type = NV34TCL_IDXBUF_FORMAT_TYPE_U32;
-		break;
-	default:
-		return FALSE;
-	}
+	if (info->indexed)
+	{
+		index_inline_cost = nvfx->idxbuf.index_size * info->count;
+		if (nvfx->screen->index_buffer_reloc_flags
+			&& (nvfx->idxbuf.index_size == 2 || nvfx->idxbuf.index_size == 4)
+			&& !(nvfx->idxbuf.offset & (nvfx->idxbuf.index_size - 1)))
+		{
+			struct nvfx_buffer* buffer = nvfx_buffer(nvfx->idxbuf.buffer);
+			buffer->bytes_to_draw_until_static -= index_inline_cost;
 
-	if (ib != nvfx->idxbuf_buffer ||
-	    type != nvfx->idxbuf_format) {
-		nvfx->dirty |= NVFX_NEW_ARRAYS;
-		nvfx->idxbuf_buffer = ib;
-		nvfx->idxbuf_format = type;
-	}
+			/* prefer_hardware_indices stays FALSE here: only the cost comparison below may select the hardware index path */
 
-	return TRUE;
-}
+			if (!nvfx_buffer_seems_static(buffer))
+			{
+				index_hardware_cost = buffer->dirty_end - buffer->dirty_begin;
+				if (!buffer->base.bo)
+					index_hardware_cost += nvfx->screen->buffer_allocation_cost;
+			}
 
-// type must be floating point
-static inline void
-nvfx_vbo_static_attrib(struct nvfx_context *nvfx,
-		       int attrib, struct pipe_vertex_element *ve,
-		       struct pipe_vertex_buffer *vb, unsigned ncomp)
-{
-	struct pipe_transfer *transfer;
-	struct nouveau_channel* chan = nvfx->screen->base.channel;
-	void *map;
-	float *v;
-
-	map  = pipe_buffer_map(&nvfx->pipe, vb->buffer, PIPE_TRANSFER_READ, &transfer);
-	map = (uint8_t *) map + vb->buffer_offset + ve->src_offset;
-
-	v = map;
-
-	switch (ncomp) {
-	case 4:
-		OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_4F_X(attrib), 4));
-		OUT_RING(chan, fui(v[0]));
-		OUT_RING(chan, fui(v[1]));
-		OUT_RING(chan,  fui(v[2]));
-		OUT_RING(chan,  fui(v[3]));
-		break;
-	case 3:
-		OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_3F_X(attrib), 3));
-		OUT_RING(chan,  fui(v[0]));
-		OUT_RING(chan,  fui(v[1]));
-		OUT_RING(chan,  fui(v[2]));
-		break;
-	case 2:
-		OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_2F_X(attrib), 2));
-		OUT_RING(chan,  fui(v[0]));
-		OUT_RING(chan,  fui(v[1]));
-		break;
-	case 1:
-		OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_1F(attrib), 1));
-		OUT_RING(chan,  fui(v[0]));
-		break;
+			if ((float) index_inline_cost < (float) index_hardware_cost * nvfx->screen->inline_cost_per_hardware_cost)
+			{
+				best_index_cost_for_hardware_vertices_as_inline_cost = (float) index_inline_cost;
+			}
+			else
+			{
+				best_index_cost_for_hardware_vertices_as_inline_cost = (float) index_hardware_cost * nvfx->screen->inline_cost_per_hardware_cost;
+				prefer_hardware_indices = TRUE;
+			}
+		}
 	}
 
-	pipe_buffer_unmap(&nvfx->pipe, vb->buffer, transfer);
+	/* let's finally figure out which of the 3 paths we want to take */
+	if ((float) (inline_cost + index_inline_cost) > ((float) hardware_cost * nvfx->screen->inline_cost_per_hardware_cost + best_index_cost_for_hardware_vertices_as_inline_cost))
+		upload_mode = 1 + prefer_hardware_indices;
+	else
+		upload_mode = 0;
+
+#ifdef DEBUG
+        if (unlikely(nvfx->screen->trace_draw))
+          {
+                  fprintf(stderr, "DRAW");
+                  if (info->indexed)
+                  {
+                          fprintf(stderr, "_IDX%u", nvfx->idxbuf.index_size);
+                          if (info->index_bias)
+                                  fprintf(stderr, " biased %u", info->index_bias);
+                          fprintf(stderr, " idxrange %u -> %u", info->min_index, info->max_index);
+                  }
+                  if (info->instance_count > 1)
+                          fprintf(stderr, " %u instances from %u", info->instance_count, info->indexed);
+                  fprintf(stderr, " start %u count %u prim %u", info->start, info->count, info->mode);
+                  if (!upload_mode)
+                          fprintf(stderr, " -> inline vertex data");
+                  else if (upload_mode == 2 || !info->indexed)
+                          fprintf(stderr, " -> buffer range");
+                  else
+                          fprintf(stderr, " -> inline indices");
+                  fprintf(stderr, " [ivtx %u hvtx %u iidx %u hidx %u bidx %f] <", inline_cost, hardware_cost, index_inline_cost, index_hardware_cost, best_index_cost_for_hardware_vertices_as_inline_cost);
+                  for (unsigned i = 0; i < nvfx->vtxelt->num_per_vertex_buffer_infos; ++i)
+                  {
+                          struct nvfx_per_vertex_buffer_info* vbi = &nvfx->vtxelt->per_vertex_buffer_info[i];
+                          struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[vbi->vertex_buffer_index];
+                          struct nvfx_buffer* buffer = nvfx_buffer(vb->buffer);
+                          if (i)
+                                  fprintf(stderr, ", ");
+                          fprintf(stderr, "%p%s left %Li", buffer, buffer->last_update_static ? " static" : "", buffer->bytes_to_draw_until_static);
+                  }
+                  fprintf(stderr, ">\n");
+          }
+#endif
+
+	return upload_mode;
 }
 
-static void
-nvfx_draw_arrays(struct pipe_context *pipe,
-		 unsigned mode, unsigned start, unsigned count)
+void nvfx_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
 {
 	struct nvfx_context *nvfx = nvfx_context(pipe);
-	struct nvfx_screen *screen = nvfx->screen;
-	struct nouveau_channel *chan = screen->base.channel;
-	unsigned restart = 0;
-
-	nvfx_vbo_set_idxbuf(nvfx, NULL, 0);
-	if (nvfx->screen->force_swtnl || !nvfx_state_validate(nvfx)) {
-		nvfx_draw_elements_swtnl(pipe, NULL, 0, 0,
-                                           mode, start, count);
-                return;
-	}
+	unsigned upload_mode = 0;
 
-	while (count) {
-		unsigned vc, nr, avail;
+	if (!nvfx->vtxelt->needs_translate)
+		upload_mode = nvfx_decide_upload_mode(pipe, info);
 
-		nvfx_state_emit(nvfx);
+	nvfx->use_index_buffer = upload_mode > 1;
 
-		avail = AVAIL_RING(chan);
-		avail -= 16 + (avail >> 10); /* for the BEGIN_RING_NIs, conservatively assuming one every 1024, plus 16 for safety */
+	if ((upload_mode > 0) != nvfx->use_vertex_buffers)
+	{
+		nvfx->use_vertex_buffers = (upload_mode > 0);
+		nvfx->dirty |= NVFX_NEW_ARRAYS;
+		nvfx->draw_dirty |= NVFX_NEW_ARRAYS;
+	}
 
-		vc = nouveau_vbuf_split(avail, 6, 256,
-					mode, start, count, &restart);
-		if (!vc) {
-			FIRE_RING(chan);
-			continue;
+	if (upload_mode > 0)
+	{
+		for (unsigned i = 0; i < nvfx->vtxelt->num_per_vertex_buffer_infos; i++)
+		{
+			struct nvfx_per_vertex_buffer_info* vbi = &nvfx->vtxelt->per_vertex_buffer_info[i];
+			struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[vbi->vertex_buffer_index];
+			nvfx_buffer_upload(nvfx_buffer(vb->buffer));
 		}
 
-		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
-		OUT_RING  (chan, nvgl_primitive(mode));
+		if (upload_mode > 1)
+		{
+			nvfx_buffer_upload(nvfx_buffer(nvfx->idxbuf.buffer));
 
-		nr = (vc & 0xff);
-		if (nr) {
-			OUT_RING(chan, RING_3D(NV34TCL_VB_VERTEX_BATCH, 1));
-			OUT_RING  (chan, ((nr - 1) << 24) | start);
-			start += nr;
+			if (unlikely(info->index_bias != nvfx->base_vertex))
+			{
+				nvfx->base_vertex = info->index_bias;
+				nvfx->dirty |= NVFX_NEW_ARRAYS;
+			}
 		}
-
-		nr = vc >> 8;
-		while (nr) {
-			unsigned push = nr > 2047 ? 2047 : nr;
-
-			nr -= push;
-
-			OUT_RING(chan, RING_3D_NI(NV34TCL_VB_VERTEX_BATCH, push));
-			while (push--) {
-				OUT_RING(chan, ((0x100 - 1) << 24) | start);
-				start += 0x100;
+		else
+		{
+			if (unlikely(info->start < nvfx->base_vertex && nvfx->base_vertex))
+			{
+				nvfx->base_vertex = 0;
+				nvfx->dirty |= NVFX_NEW_ARRAYS;
 			}
 		}
-
-		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
-		OUT_RING  (chan, 0);
-
-		count -= vc;
-		start = restart;
 	}
 
-	pipe->flush(pipe, 0, NULL);
+	if (nvfx->screen->force_swtnl || !nvfx_state_validate(nvfx))
+		nvfx_draw_vbo_swtnl(pipe, info);
+	else
+		nvfx_push_vbo(pipe, info);
 }
 
-static INLINE void
-nvfx_draw_elements_u08(struct nvfx_context *nvfx, void *ib,
-		       unsigned mode, unsigned start, unsigned count)
+boolean
+nvfx_vbo_validate(struct nvfx_context *nvfx)
 {
-	struct nvfx_screen *screen = nvfx->screen;
-	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	int i;
+	int elements = MAX2(nvfx->vtxelt->num_elements, nvfx->hw_vtxelt_nr);
+	unsigned vb_flags = nvfx->screen->vertex_buffer_reloc_flags | NOUVEAU_BO_RD;
 
-	while (count) {
-		uint8_t *elts = (uint8_t *)ib + start;
-		unsigned vc, push, restart = 0, avail;
+	if (!elements)
+		return TRUE;
 
-		nvfx_state_emit(nvfx);
+	MARK_RING(chan, (5 + 2) * 16 + 2 + 11, 16 + 2);
+	for(unsigned i = 0; i < nvfx->vtxelt->num_constant; ++i)
+	{
+		struct nvfx_low_frequency_element *ve = &nvfx->vtxelt->constant[i];
+		struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->vertex_buffer_index];
+		struct nvfx_buffer* buffer = nvfx_buffer(vb->buffer);
+		float v[4];
+		ve->fetch_rgba_float(v, buffer->data + vb->buffer_offset + ve->src_offset, 0, 0);
+		nvfx_emit_vtx_attr(chan, ve->idx, v, ve->ncomp);
+	}
 
-		avail = AVAIL_RING(chan);
-		avail -= 16 + (avail >> 10); /* for the BEGIN_RING_NIs, conservatively assuming one every 1024, plus 16 for safety */
 
-		vc = nouveau_vbuf_split(avail, 6, 2,
-					mode, start, count, &restart);
-		if (vc == 0) {
-			FIRE_RING(chan);
-			continue;
-		}
-		count -= vc;
+	OUT_RING(chan, RING_3D(NV34TCL_VTXFMT(0), elements));
+	if(nvfx->use_vertex_buffers)
+	{
+		unsigned idx = 0;
+		for (i = 0; i < nvfx->vtxelt->num_per_vertex; i++) {
+			struct nvfx_per_vertex_element *ve = &nvfx->vtxelt->per_vertex[i];
+			struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->vertex_buffer_index];
 
-		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
-		OUT_RING  (chan, nvgl_primitive(mode));
+			if(idx != ve->idx)
+			{
+				assert(idx < ve->idx);
+				OUT_RINGp(chan, &nvfx->vtxelt->vtxfmt[idx], ve->idx - idx);
+				idx = ve->idx;
+			}
 
-		if (vc & 1) {
-			OUT_RING(chan, RING_3D(NV34TCL_VB_ELEMENT_U32, 1));
-			OUT_RING  (chan, elts[0]);
-			elts++; vc--;
+			OUT_RING(chan, nvfx->vtxelt->vtxfmt[idx] | (vb->stride << NV34TCL_VTXFMT_STRIDE_SHIFT));
+			++idx;
 		}
+		if(idx != nvfx->vtxelt->num_elements)
+			OUT_RINGp(chan, &nvfx->vtxelt->vtxfmt[idx], nvfx->vtxelt->num_elements - idx);
+	}
+	else
+		OUT_RINGp(chan, nvfx->vtxelt->vtxfmt, nvfx->vtxelt->num_elements);
 
-		while (vc) {
-			unsigned i;
-
-			push = MIN2(vc, 2047 * 2);
-
-			OUT_RING(chan, RING_3D_NI(NV34TCL_VB_ELEMENT_U16, push >> 1));
-			for (i = 0; i < push; i+=2)
-				OUT_RING(chan, (elts[i+1] << 16) | elts[i]);
+	for(i = nvfx->vtxelt->num_elements; i < elements; ++i)
+		OUT_RING(chan, NV34TCL_VTXFMT_TYPE_32_FLOAT);
 
-			vc -= push;
-			elts += push;
+	if(nvfx->is_nv4x) {
+		unsigned i;
+		/* seems to be some kind of cache flushing */
+		for(i = 0; i < 3; ++i) {
+			OUT_RING(chan, RING_3D(0x1718, 1));
+			OUT_RING(chan, 0);
 		}
-
-		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
-		OUT_RING  (chan, 0);
-
-		start = restart;
 	}
-}
-
-static INLINE void
-nvfx_draw_elements_u16(struct nvfx_context *nvfx, void *ib,
-		       unsigned mode, unsigned start, unsigned count)
-{
-	struct nvfx_screen *screen = nvfx->screen;
-	struct nouveau_channel *chan = screen->base.channel;
-
-	while (count) {
-		uint16_t *elts = (uint16_t *)ib + start;
-		unsigned vc, push, restart = 0, avail;
-
-		nvfx_state_emit(nvfx);
-
-		avail = AVAIL_RING(chan);
-		avail -= 16 + (avail >> 10); /* for the BEGIN_RING_NIs, conservatively assuming one every 1024, plus 16 for safety */
 
-		vc = nouveau_vbuf_split(avail, 6, 2,
-					mode, start, count, &restart);
-		if (vc == 0) {
-			FIRE_RING(chan);
-			continue;
-		}
-		count -= vc;
+	OUT_RING(chan, RING_3D(NV34TCL_VTXBUF_ADDRESS(0), elements));
+	if(nvfx->use_vertex_buffers)
+	{
+		unsigned idx = 0;
+		for (i = 0; i < nvfx->vtxelt->num_per_vertex; i++) {
+			struct nvfx_per_vertex_element *ve = &nvfx->vtxelt->per_vertex[i];
+			struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->vertex_buffer_index];
+			struct nouveau_bo* bo = nvfx_resource(vb->buffer)->bo;
 
-		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
-		OUT_RING  (chan, nvgl_primitive(mode));
+			for(; idx < ve->idx; ++idx)
+				OUT_RING(chan, 0);
 
-		if (vc & 1) {
-			OUT_RING(chan, RING_3D(NV34TCL_VB_ELEMENT_U32, 1));
-			OUT_RING  (chan, elts[0]);
-			elts++; vc--;
+			OUT_RELOC(chan, bo,
+					vb->buffer_offset + ve->src_offset + nvfx->base_vertex * vb->stride,
+					vb_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_OR,
+					0, NV34TCL_VTXBUF_ADDRESS_DMA1);
+			++idx;
 		}
 
-		while (vc) {
-			unsigned i;
-
-			push = MIN2(vc, 2047 * 2);
-
-			OUT_RING(chan, RING_3D_NI(NV34TCL_VB_ELEMENT_U16, push >> 1));
-			for (i = 0; i < push; i+=2)
-				OUT_RING(chan, (elts[i+1] << 16) | elts[i]);
-
-			vc -= push;
-			elts += push;
-		}
+		for(; idx < elements; ++idx)
+			OUT_RING(chan, 0);
+	}
+	else
+	{
+		for (i = 0; i < elements; i++)
+			OUT_RING(chan, 0);
+	}
 
-		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
-		OUT_RING  (chan, 0);
+	OUT_RING(chan, RING_3D(0x1710, 1));
+	OUT_RING(chan, 0);
 
-		start = restart;
-	}
+	nvfx->hw_vtxelt_nr = nvfx->vtxelt->num_elements;
+	nvfx->relocs_needed &=~ NVFX_RELOCATE_VTXBUF;
+	return TRUE;
 }
 
-static INLINE void
-nvfx_draw_elements_u32(struct nvfx_context *nvfx, void *ib,
-		       unsigned mode, unsigned start, unsigned count)
+void
+nvfx_vbo_relocate(struct nvfx_context *nvfx)
 {
-	struct nvfx_screen *screen = nvfx->screen;
-	struct nouveau_channel *chan = screen->base.channel;
-
-	while (count) {
-		uint32_t *elts = (uint32_t *)ib + start;
-		unsigned vc, push, restart = 0, avail;
-
-		nvfx_state_emit(nvfx);
-
-		avail = AVAIL_RING(chan);
-		avail -= 16 + (avail >> 10); /* for the BEGIN_RING_NIs, conservatively assuming one every 1024, plus 16 for safety */
-
-		vc = nouveau_vbuf_split(avail, 5, 1,
-					mode, start, count, &restart);
-		if (vc == 0) {
-			FIRE_RING(chan);
-			continue;
-		}
-		count -= vc;
-
-		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
-		OUT_RING  (chan, nvgl_primitive(mode));
-
-		while (vc) {
-			push = MIN2(vc, 2047);
-
-			OUT_RING(chan, RING_3D_NI(NV34TCL_VB_ELEMENT_U32, push));
-			OUT_RINGp    (chan, elts, push);
+	struct nouveau_channel* chan;
+	unsigned vb_flags;
+	int i;
 
-			vc -= push;
-			elts += push;
-		}
+        if(!nvfx->use_vertex_buffers)
+                return;
 
-		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
-		OUT_RING  (chan, 0);
+	chan = nvfx->screen->base.channel;
+	vb_flags = nvfx->screen->vertex_buffer_reloc_flags | NOUVEAU_BO_RD | NOUVEAU_BO_DUMMY;
 
-		start = restart;
+	MARK_RING(chan, 2 * 16 + 3, 2 * 16 + 3);
+        for (i = 0; i < nvfx->vtxelt->num_per_vertex; i++) {
+                struct nvfx_per_vertex_element *ve = &nvfx->vtxelt->per_vertex[i];
+                struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->vertex_buffer_index];
+                struct nouveau_bo* bo = nvfx_resource(vb->buffer)->bo;
+
+                OUT_RELOC(chan, bo, RING_3D(NV34TCL_VTXBUF_ADDRESS(ve->idx), 1),
+				vb_flags, 0, 0);
+                OUT_RELOC(chan, bo, vb->buffer_offset + ve->src_offset + nvfx->base_vertex * vb->stride,
+				vb_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_OR,
+				0, NV34TCL_VTXBUF_ADDRESS_DMA1);
 	}
+        nvfx->relocs_needed &=~ NVFX_RELOCATE_VTXBUF;
 }
 
 static void
-nvfx_draw_elements_inline(struct pipe_context *pipe,
-			  struct pipe_resource *ib,
-			  unsigned ib_size, int ib_bias,
-			  unsigned mode, unsigned start, unsigned count)
+nvfx_idxbuf_emit(struct nvfx_context* nvfx, unsigned ib_flags)
 {
-	struct nvfx_context *nvfx = nvfx_context(pipe);
-	struct pipe_transfer *transfer;
-	void *map;
-
-	map = pipe_buffer_map(pipe, ib, PIPE_TRANSFER_READ, &transfer);
-	if (!ib) {
-		NOUVEAU_ERR("failed mapping ib\n");
-		return;
-	}
+	struct nouveau_channel* chan = nvfx->screen->base.channel;
+	unsigned ib_format = (nvfx->idxbuf.index_size == 2) ? NV34TCL_IDXBUF_FORMAT_TYPE_U16 : NV34TCL_IDXBUF_FORMAT_TYPE_U32;
+	struct nouveau_bo* bo = nvfx_resource(nvfx->idxbuf.buffer)->bo;
+	ib_flags |= nvfx->screen->index_buffer_reloc_flags | NOUVEAU_BO_RD;
 
-	assert(ib_bias == 0);
-
-	switch (ib_size) {
-	case 1:
-		nvfx_draw_elements_u08(nvfx, map, mode, start, count);
-		break;
-	case 2:
-		nvfx_draw_elements_u16(nvfx, map, mode, start, count);
-		break;
-	case 4:
-		nvfx_draw_elements_u32(nvfx, map, mode, start, count);
-		break;
-	default:
-		NOUVEAU_ERR("invalid idxbuf fmt %d\n", ib_size);
-		break;
-	}
+	assert(nvfx->screen->index_buffer_reloc_flags);
 
-	pipe_buffer_unmap(pipe, ib, transfer);
+	MARK_RING(chan, 3, 3);
+	if(ib_flags & NOUVEAU_BO_DUMMY)
+		OUT_RELOC(chan, bo, RING_3D(NV34TCL_IDXBUF_ADDRESS, 2), ib_flags, 0, 0);
+	else
+		OUT_RING(chan, RING_3D(NV34TCL_IDXBUF_ADDRESS, 2));
+	OUT_RELOC(chan, bo, nvfx->idxbuf.offset + 1, ib_flags | NOUVEAU_BO_LOW, 0, 0);
+	OUT_RELOC(chan, bo, ib_format, ib_flags | NOUVEAU_BO_OR,
+			0, NV34TCL_IDXBUF_FORMAT_DMA1);
+	nvfx->relocs_needed &=~ NVFX_RELOCATE_IDXBUF;
 }
 
-static void
-nvfx_draw_elements_vbo(struct pipe_context *pipe,
-		       unsigned mode, unsigned start, unsigned count)
+void
+nvfx_idxbuf_validate(struct nvfx_context* nvfx)
 {
-	struct nvfx_context *nvfx = nvfx_context(pipe);
-	struct nvfx_screen *screen = nvfx->screen;
-	struct nouveau_channel *chan = screen->base.channel;
-	unsigned restart = 0;
+	nvfx_idxbuf_emit(nvfx, 0);
+}
 
-	while (count) {
-		unsigned nr, vc, avail;
+void
+nvfx_idxbuf_relocate(struct nvfx_context* nvfx)
+{
+	nvfx_idxbuf_emit(nvfx, NOUVEAU_BO_DUMMY);
+}
 
-		nvfx_state_emit(nvfx);
+unsigned nvfx_vertex_formats[PIPE_FORMAT_COUNT] =
+{
+	[PIPE_FORMAT_R32_FLOAT] = NV34TCL_VTXFMT_TYPE_32_FLOAT,
+	[PIPE_FORMAT_R32G32_FLOAT] = NV34TCL_VTXFMT_TYPE_32_FLOAT,
+	[PIPE_FORMAT_R32G32B32A32_FLOAT] = NV34TCL_VTXFMT_TYPE_32_FLOAT,
+	[PIPE_FORMAT_R32G32B32_FLOAT] = NV34TCL_VTXFMT_TYPE_32_FLOAT,
+	[PIPE_FORMAT_R16_FLOAT] = NV34TCL_VTXFMT_TYPE_16_FLOAT,
+	[PIPE_FORMAT_R16G16_FLOAT] = NV34TCL_VTXFMT_TYPE_16_FLOAT,
+	[PIPE_FORMAT_R16G16B16_FLOAT] = NV34TCL_VTXFMT_TYPE_16_FLOAT,
+	[PIPE_FORMAT_R16G16B16A16_FLOAT] = NV34TCL_VTXFMT_TYPE_16_FLOAT,
+	[PIPE_FORMAT_R8_UNORM] = NV34TCL_VTXFMT_TYPE_8_UNORM,
+	[PIPE_FORMAT_R8G8_UNORM] = NV34TCL_VTXFMT_TYPE_8_UNORM,
+	[PIPE_FORMAT_R8G8B8_UNORM] = NV34TCL_VTXFMT_TYPE_8_UNORM,
+	[PIPE_FORMAT_R8G8B8A8_UNORM] = NV34TCL_VTXFMT_TYPE_8_UNORM,
+	[PIPE_FORMAT_R8G8B8A8_USCALED] = NV34TCL_VTXFMT_TYPE_8_USCALED,
+	[PIPE_FORMAT_R16_SNORM] = NV34TCL_VTXFMT_TYPE_16_SNORM,
+	[PIPE_FORMAT_R16G16_SNORM] = NV34TCL_VTXFMT_TYPE_16_SNORM,
+	[PIPE_FORMAT_R16G16B16_SNORM] = NV34TCL_VTXFMT_TYPE_16_SNORM,
+	[PIPE_FORMAT_R16G16B16A16_SNORM] = NV34TCL_VTXFMT_TYPE_16_SNORM,
+	[PIPE_FORMAT_R16_SSCALED] = NV34TCL_VTXFMT_TYPE_16_SSCALED,
+	[PIPE_FORMAT_R16G16_SSCALED] = NV34TCL_VTXFMT_TYPE_16_SSCALED,
+	[PIPE_FORMAT_R16G16B16_SSCALED] = NV34TCL_VTXFMT_TYPE_16_SSCALED,
+	[PIPE_FORMAT_R16G16B16A16_SSCALED] = NV34TCL_VTXFMT_TYPE_16_SSCALED,
+};
+
+static void *
+nvfx_vtxelts_state_create(struct pipe_context *pipe,
+			  unsigned num_elements,
+			  const struct pipe_vertex_element *elements)
+{
+	struct nvfx_vtxelt_state *cso = CALLOC_STRUCT(nvfx_vtxelt_state);
+	struct translate_key transkey;
+	unsigned per_vertex_size[16];
+	unsigned vb_compacted_index[16];
 
-		avail = AVAIL_RING(chan);
-		avail -= 16 + (avail >> 10); /* for the BEGIN_RING_NIs, conservatively assuming one every 1024, plus 16 for safety */
+	if(num_elements > 16)
+	{
+		_debug_printf("Error: application attempted to use %u vertex elements, but only 16 are supported: ignoring the rest\n", num_elements);
+		num_elements = 16;
+	}
 
-		vc = nouveau_vbuf_split(avail, 6, 256,
-					mode, start, count, &restart);
-		if (!vc) {
-			FIRE_RING(chan);
-			continue;
-		}
+	memset(per_vertex_size, 0, sizeof(per_vertex_size));
+	memcpy(cso->pipe, elements, num_elements * sizeof(elements[0]));
+	cso->num_elements = num_elements;
+	cso->needs_translate = FALSE;
+
+	transkey.nr_elements = 0;
+	transkey.output_stride = 0;
+
+	for(unsigned i = 0; i < num_elements; ++i)
+        {
+		const struct pipe_vertex_element* ve = &elements[i];
+		if(!ve->instance_divisor)
+                        per_vertex_size[ve->vertex_buffer_index] += util_format_get_stride(ve->src_format, 1);
+        }
+
+        for(unsigned i = 0; i < 16; ++i)
+        {
+                if(per_vertex_size[i])
+                {
+                        unsigned idx = cso->num_per_vertex_buffer_infos++;
+                        cso->per_vertex_buffer_info[idx].vertex_buffer_index = i;
+                        cso->per_vertex_buffer_info[idx].per_vertex_size = per_vertex_size[i];
+                        vb_compacted_index[i] = idx;
+                }
+        }
+
+	for(unsigned i = 0; i < num_elements; ++i)
+	{
+		const struct pipe_vertex_element* ve = &elements[i];
+		unsigned type = nvfx_vertex_formats[ve->src_format];
+		unsigned ncomp = util_format_get_nr_components(ve->src_format);
 
-		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
-		OUT_RING  (chan, nvgl_primitive(mode));
+		//if(ve->frequency != PIPE_ELEMENT_FREQUENCY_PER_VERTEX)
+		if(ve->instance_divisor)
+		{
+			struct nvfx_low_frequency_element* lfve;
+			cso->vtxfmt[i] = NV34TCL_VTXFMT_TYPE_32_FLOAT;
+
+			//if(ve->frequency == PIPE_ELEMENT_FREQUENCY_CONSTANT)
+			if(0)
+				lfve = &cso->constant[cso->num_constant++];
+			else
+			{
+				lfve = &cso->per_instance[cso->num_per_instance++].base;
+				((struct nvfx_per_instance_element*)lfve)->instance_divisor = ve->instance_divisor;
+			}
 
-		nr = (vc & 0xff);
-		if (nr) {
-			OUT_RING(chan, RING_3D(NV34TCL_VB_INDEX_BATCH, 1));
-			OUT_RING  (chan, ((nr - 1) << 24) | start);
-			start += nr;
+                        lfve->idx = i;
+                        lfve->vertex_buffer_index = ve->vertex_buffer_index;
+                        lfve->src_offset = ve->src_offset;
+                        lfve->fetch_rgba_float = util_format_description(ve->src_format)->fetch_rgba_float;
+                        lfve->ncomp = ncomp;
 		}
-
-		nr = vc >> 8;
-		while (nr) {
-			unsigned push = nr > 2047 ? 2047 : nr;
-
-			nr -= push;
-
-			OUT_RING(chan, RING_3D_NI(NV34TCL_VB_INDEX_BATCH, push));
-			while (push--) {
-				OUT_RING(chan, ((0x100 - 1) << 24) | start);
-				start += 0x100;
+		else
+		{
+			unsigned idx;
+
+			idx = cso->num_per_vertex++;
+			cso->per_vertex[idx].idx = i;
+			cso->per_vertex[idx].vertex_buffer_index = ve->vertex_buffer_index;
+			cso->per_vertex[idx].src_offset = ve->src_offset;
+
+			idx = transkey.nr_elements++;
+			transkey.element[idx].input_format = ve->src_format;
+			transkey.element[idx].input_buffer = vb_compacted_index[ve->vertex_buffer_index];
+			transkey.element[idx].input_offset = ve->src_offset;
+			transkey.element[idx].instance_divisor = 0;
+			transkey.element[idx].type = TRANSLATE_ELEMENT_NORMAL;
+			if(type)
+			{
+				transkey.element[idx].output_format = ve->src_format;
+				cso->vtxfmt[i] = (ncomp << NV34TCL_VTXFMT_SIZE_SHIFT) | type;
+			}
+			else
+			{
+				unsigned float32[4] = {PIPE_FORMAT_R32_FLOAT, PIPE_FORMAT_R32G32_FLOAT, PIPE_FORMAT_R32G32B32_FLOAT, PIPE_FORMAT_R32G32B32A32_FLOAT};
+				transkey.element[idx].output_format = float32[ncomp - 1];
+				cso->needs_translate = TRUE;
+				cso->vtxfmt[i] = (ncomp << NV34TCL_VTXFMT_SIZE_SHIFT) | NV34TCL_VTXFMT_TYPE_32_FLOAT;
 			}
+			transkey.element[idx].output_offset = transkey.output_stride;
+			transkey.output_stride += (util_format_get_stride(transkey.element[idx].output_format, 1) + 3) & ~3;
 		}
+	}
 
-		OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
-		OUT_RING  (chan, 0);
+	cso->translate = translate_create(&transkey);
+	cso->vertex_length = transkey.output_stride >> 2; /* output stride is a multiple of 4; store it divided by 4 */
+	cso->max_vertices_per_packet = 2047 / MAX2(cso->vertex_length, 1); /* vertex_length is 0 when no element is per-vertex */
 
-		count -= vc;
-		start = restart;
-	}
+	return (void *)cso;
 }
 
 static void
-nvfx_draw_elements(struct pipe_context *pipe,
-		   struct pipe_resource *indexBuffer,
-		   unsigned indexSize, int indexBias,
-		   unsigned mode, unsigned start, unsigned count)
+nvfx_vtxelts_state_delete(struct pipe_context *pipe, void *hwcso)
 {
-	struct nvfx_context *nvfx = nvfx_context(pipe);
-	boolean idxbuf;
-
-	idxbuf = nvfx_vbo_set_idxbuf(nvfx, indexBuffer, indexSize);
-	if (nvfx->screen->force_swtnl || !nvfx_state_validate(nvfx)) {
-		nvfx_draw_elements_swtnl(pipe,
-		                         indexBuffer, indexSize, indexBias,
-		                         mode, start, count);
-		return;
-	}
-
-	if (idxbuf) {
-		nvfx_draw_elements_vbo(pipe, mode, start, count);
-	} else {
-		nvfx_draw_elements_inline(pipe,
-		                          indexBuffer, indexSize, indexBias,
-					  mode, start, count);
-	}
-
-	pipe->flush(pipe, 0, NULL);
+	FREE(hwcso);
 }
 
-void
-nvfx_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
+static void
+nvfx_vtxelts_state_bind(struct pipe_context *pipe, void *hwcso)
 {
 	struct nvfx_context *nvfx = nvfx_context(pipe);
 
-	if (info->indexed && nvfx->idxbuf.buffer) {
-		unsigned offset;
-
-		assert(nvfx->idxbuf.offset % nvfx->idxbuf.index_size == 0);
-		offset = nvfx->idxbuf.offset / nvfx->idxbuf.index_size;
-
-		nvfx_draw_elements(pipe,
-				   nvfx->idxbuf.buffer,
-				   nvfx->idxbuf.index_size,
-				   info->index_bias,
-				   info->mode,
-				   info->start + offset,
-				   info->count);
-	}
-	else {
-		nvfx_draw_arrays(pipe,
-				info->mode,
-				info->start,
-				info->count);
-	}
+	nvfx->vtxelt = hwcso;
+	nvfx->use_vertex_buffers = -1;
+	nvfx->draw_dirty |= NVFX_NEW_ARRAYS;
 }
 
-boolean
-nvfx_vbo_validate(struct nvfx_context *nvfx)
+static void
+nvfx_set_vertex_buffers(struct pipe_context *pipe, unsigned count,
+			const struct pipe_vertex_buffer *vb)
 {
-	struct nouveau_channel* chan = nvfx->screen->base.channel;
-	struct pipe_resource *ib = nvfx->idxbuf_buffer;
-	unsigned ib_format = nvfx->idxbuf_format;
-	int i;
-	int elements = MAX2(nvfx->vtxelt->num_elements, nvfx->hw_vtxelt_nr);
-	uint32_t vtxfmt[16];
-	unsigned vb_flags = nvfx->screen->vertex_buffer_reloc_flags | NOUVEAU_BO_RD;
-
-	if (!elements)
-		return TRUE;
-
-	nvfx->vbo_bo = 0;
-
-	MARK_RING(chan, (5 + 2) * 16 + 2 + 11, 16 + 2);
-	for (i = 0; i < nvfx->vtxelt->num_elements; i++) {
-		struct pipe_vertex_element *ve;
-		struct pipe_vertex_buffer *vb;
-		unsigned type, ncomp;
-
-		ve = &nvfx->vtxelt->pipe[i];
-		vb = &nvfx->vtxbuf[ve->vertex_buffer_index];
-
-		if (nvfx_vbo_format_to_hw(ve->src_format, &type, &ncomp)) {
-			MARK_UNDO(chan);
-			nvfx->fallback_swtnl |= NVFX_NEW_ARRAYS;
-			return FALSE;
-		}
+	struct nvfx_context *nvfx = nvfx_context(pipe);
 
-		if (!vb->stride && type == NV34TCL_VTXFMT_TYPE_FLOAT) {
-			nvfx_vbo_static_attrib(nvfx, i, ve, vb, ncomp);
-			vtxfmt[i] = type;
-		} else {
-			vtxfmt[i] = ((vb->stride << NV34TCL_VTXFMT_STRIDE_SHIFT) |
-				(ncomp << NV34TCL_VTXFMT_SIZE_SHIFT) | type);
-			nvfx->vbo_bo |= (1 << i);
-		}
+	for(unsigned i = 0; i < count; ++i)
+	{
+		pipe_resource_reference(&nvfx->vtxbuf[i].buffer, vb[i].buffer);
+		nvfx->vtxbuf[i].buffer_offset = vb[i].buffer_offset;
+		nvfx->vtxbuf[i].max_index = vb[i].max_index;
+		nvfx->vtxbuf[i].stride = vb[i].stride;
 	}
 
-	for(; i < elements; ++i)
-		vtxfmt[i] = NV34TCL_VTXFMT_TYPE_FLOAT;
+	for(unsigned i = count; i < nvfx->vtxbuf_nr; ++i)
+		pipe_resource_reference(&nvfx->vtxbuf[i].buffer, 0);
 
-	OUT_RING(chan, RING_3D(NV34TCL_VTXFMT(0), elements));
-	OUT_RINGp(chan, vtxfmt, elements);
-
-	if(nvfx->is_nv4x) {
-		unsigned i;
-		/* seems to be some kind of cache flushing */
-		for(i = 0; i < 3; ++i) {
-			OUT_RING(chan, RING_3D(0x1718, 1));
-			OUT_RING(chan, 0);
-		}
-	}
-
-	OUT_RING(chan, RING_3D(NV34TCL_VTXBUF_ADDRESS(0), elements));
-	for (i = 0; i < nvfx->vtxelt->num_elements; i++) {
-		struct pipe_vertex_element *ve;
-		struct pipe_vertex_buffer *vb;
+	nvfx->vtxbuf_nr = count;
+	nvfx->use_vertex_buffers = -1;
+	nvfx->draw_dirty |= NVFX_NEW_ARRAYS;
+}
 
-		ve = &nvfx->vtxelt->pipe[i];
-		vb = &nvfx->vtxbuf[ve->vertex_buffer_index];
+static void
+nvfx_set_index_buffer(struct pipe_context *pipe,
+		      const struct pipe_index_buffer *ib)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
 
-		if (!(nvfx->vbo_bo & (1 << i)))
-			OUT_RING(chan, 0);
-		else
-		{
-			struct nouveau_bo* bo = nvfx_resource(vb->buffer)->bo;
-			OUT_RELOC(chan, bo,
-				 vb->buffer_offset + ve->src_offset,
-				 vb_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_OR,
-				 0, NV34TCL_VTXBUF_ADDRESS_DMA1);
-		}
+	if(ib)
+	{
+		pipe_resource_reference(&nvfx->idxbuf.buffer, ib->buffer);
+		nvfx->idxbuf.index_size = ib->index_size;
+		nvfx->idxbuf.offset = ib->offset;
 	}
-
-        for (; i < elements; i++)
-		OUT_RING(chan, 0);
-
-	OUT_RING(chan, RING_3D(0x1710, 1));
-	OUT_RING(chan, 0);
-
-	if (ib) {
-		unsigned ib_flags = nvfx->screen->index_buffer_reloc_flags | NOUVEAU_BO_RD;
-		struct nouveau_bo* bo = nvfx_resource(ib)->bo;
-
-		assert(nvfx->screen->index_buffer_reloc_flags);
-
-		OUT_RING(chan, RING_3D(NV34TCL_IDXBUF_ADDRESS, 2));
-		OUT_RELOC(chan, bo, 0, ib_flags | NOUVEAU_BO_LOW, 0, 0);
-		OUT_RELOC(chan, bo, ib_format, ib_flags | NOUVEAU_BO_OR,
-				  0, NV34TCL_IDXBUF_FORMAT_DMA1);
+	else
+	{
+		pipe_resource_reference(&nvfx->idxbuf.buffer, 0);
+		nvfx->idxbuf.index_size = 0;
+		nvfx->idxbuf.offset = 0;
 	}
 
-	nvfx->hw_vtxelt_nr = nvfx->vtxelt->num_elements;
-	return TRUE;
+	nvfx->dirty |= NVFX_NEW_INDEX;
+	nvfx->draw_dirty |= NVFX_NEW_INDEX;
 }
 
 void
-nvfx_vbo_relocate(struct nvfx_context *nvfx)
+nvfx_init_vbo_functions(struct nvfx_context *nvfx)
 {
-	struct nouveau_channel* chan = nvfx->screen->base.channel;
-	unsigned vb_flags = nvfx->screen->vertex_buffer_reloc_flags | NOUVEAU_BO_RD | NOUVEAU_BO_DUMMY;
-	int i;
+	nvfx->pipe.set_vertex_buffers = nvfx_set_vertex_buffers;
+	nvfx->pipe.set_index_buffer = nvfx_set_index_buffer;
 
-	MARK_RING(chan, 2 * 16 + 3, 2 * 16 + 3);
-	for(i = 0; i < nvfx->vtxelt->num_elements; ++i) {
-		if(nvfx->vbo_bo & (1 << i)) {
-			struct pipe_vertex_element *ve = &nvfx->vtxelt->pipe[i];
-			struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->vertex_buffer_index];
-			struct nouveau_bo* bo = nvfx_resource(vb->buffer)->bo;
-			OUT_RELOC(chan, bo, RING_3D(NV34TCL_VTXBUF_ADDRESS(i), 1),
-					vb_flags, 0, 0);
-			OUT_RELOC(chan, bo, vb->buffer_offset + ve->src_offset,
-					vb_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_OR,
-					0, NV34TCL_VTXBUF_ADDRESS_DMA1);
-		}
-	}
-
-	if(nvfx->idxbuf_buffer)
-	{
-		unsigned ib_flags = nvfx->screen->index_buffer_reloc_flags | NOUVEAU_BO_RD | NOUVEAU_BO_DUMMY;
-		struct nouveau_bo* bo = nvfx_resource(nvfx->idxbuf_buffer)->bo;
-
-		assert(nvfx->screen->index_buffer_reloc_flags);
-
-		OUT_RELOC(chan, bo, RING_3D(NV34TCL_IDXBUF_ADDRESS, 2),
-				ib_flags, 0, 0);
-		OUT_RELOC(chan, bo, 0,
-				ib_flags | NOUVEAU_BO_LOW, 0, 0);
-		OUT_RELOC(chan, bo, nvfx->idxbuf_format,
-				ib_flags | NOUVEAU_BO_OR,
-				0, NV34TCL_IDXBUF_FORMAT_DMA1);
-	}
+	nvfx->pipe.create_vertex_elements_state = nvfx_vtxelts_state_create;
+	nvfx->pipe.delete_vertex_elements_state = nvfx_vtxelts_state_delete;
+	nvfx->pipe.bind_vertex_elements_state = nvfx_vtxelts_state_bind;
 }
diff --git a/src/gallium/drivers/nvfx/nvfx_vertprog.c b/src/gallium/drivers/nvfx/nvfx_vertprog.c
index 24d9846310..ea7e88c561 100644
--- a/src/gallium/drivers/nvfx/nvfx_vertprog.c
+++ b/src/gallium/drivers/nvfx/nvfx_vertprog.c
@@ -1,15 +1,19 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_state.h"
-#include "util/u_inlines.h"
+#include "util/u_linkage.h"
+#include "util/u_debug.h"
 
 #include "pipe/p_shader_tokens.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_dump.h"
 #include "tgsi/tgsi_util.h"
 
+#include "draw/draw_context.h"
+
 #include "nvfx_context.h"
 #include "nvfx_state.h"
+#include "nvfx_resource.h"
 
 /* TODO (at least...):
  *  1. Indexed consts  + ARL
@@ -25,26 +29,34 @@
 #include "nv30_vertprog.h"
 #include "nv40_vertprog.h"
 
-#define NVFX_VP_INST_DEST_CLIP(n) ((~0 - 6) + (n))
+struct nvfx_loop_entry /* one open loop; pushed on vpc->loop_stack at BGNLOOP, popped at ENDLOOP */
+{
+	unsigned brk_target; /* relocation target used by BRK: the BGNLOOP's Label.Label + 1 */
+	unsigned cont_target; /* relocation target used by CONT and ENDLOOP: the idx seen at BGNLOOP */
+};
 
 struct nvfx_vpc {
+	struct nvfx_context* nvfx;
 	struct nvfx_vertex_program *vp;
 
 	struct nvfx_vertex_program_exec *vpi;
 
 	unsigned r_temps;
 	unsigned r_temps_discard;
-	struct nvfx_sreg r_result[PIPE_MAX_SHADER_OUTPUTS];
-	struct nvfx_sreg *r_address;
-	struct nvfx_sreg *r_temp;
+	struct nvfx_reg r_result[PIPE_MAX_SHADER_OUTPUTS];
+	struct nvfx_reg *r_address;
+	struct nvfx_reg *r_temp;
 
-	struct nvfx_sreg *imm;
+	struct nvfx_reg *imm;
 	unsigned nr_imm;
 
 	unsigned hpos_idx;
+
+	struct util_dynarray label_relocs;
+	struct util_dynarray loop_stack;
 };
 
-static struct nvfx_sreg
+static struct nvfx_reg
 temp(struct nvfx_vpc *vpc)
 {
 	int idx = ffs(~vpc->r_temps) - 1;
@@ -52,22 +64,22 @@ temp(struct nvfx_vpc *vpc)
 	if (idx < 0) {
 		NOUVEAU_ERR("out of temps!!\n");
 		assert(0);
-		return nvfx_sr(NVFXSR_TEMP, 0);
+		return nvfx_reg(NVFXSR_TEMP, 0);
 	}
 
 	vpc->r_temps |= (1 << idx);
 	vpc->r_temps_discard |= (1 << idx);
-	return nvfx_sr(NVFXSR_TEMP, idx);
+	return nvfx_reg(NVFXSR_TEMP, idx);
 }
 
-static INLINE void
+static inline void
 release_temps(struct nvfx_vpc *vpc)
 {
 	vpc->r_temps &= ~vpc->r_temps_discard;
 	vpc->r_temps_discard = 0;
 }
 
-static struct nvfx_sreg
+static struct nvfx_reg
 constant(struct nvfx_vpc *vpc, int pipe, float x, float y, float z, float w)
 {
 	struct nvfx_vertex_program *vp = vpc->vp;
@@ -77,7 +89,7 @@ constant(struct nvfx_vpc *vpc, int pipe, float x, float y, float z, float w)
 	if (pipe >= 0) {
 		for (idx = 0; idx < vp->nr_consts; idx++) {
 			if (vp->consts[idx].index == pipe)
-				return nvfx_sr(NVFXSR_CONST, idx);
+				return nvfx_reg(NVFXSR_CONST, idx);
 		}
 	}
 
@@ -90,35 +102,36 @@ constant(struct nvfx_vpc *vpc, int pipe, float x, float y, float z, float w)
 	vpd->value[1] = y;
 	vpd->value[2] = z;
 	vpd->value[3] = w;
-	return nvfx_sr(NVFXSR_CONST, idx);
+	return nvfx_reg(NVFXSR_CONST, idx);
 }
 
-#define arith(cc,s,o,d,m,s0,s1,s2) \
-	nvfx_vp_arith(nvfx, (cc), NVFX_VP_INST_SLOT_##s, NVFX_VP_INST_##s##_OP_##o, (d), (m), (s0), (s1), (s2))
+#define arith(s,o,d,m,s0,s1,s2) \
+	nvfx_insn(0, (NVFX_VP_INST_SLOT_##s << 7) | NVFX_VP_INST_##s##_OP_##o, -1, (d), (m), (s0), (s1), (s2))
 
 static void
-emit_src(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int pos, struct nvfx_sreg src)
+emit_src(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int pos, struct nvfx_src src)
 {
 	struct nvfx_vertex_program *vp = vpc->vp;
 	uint32_t sr = 0;
+	struct nvfx_relocation reloc;
 
-	switch (src.type) {
+	switch (src.reg.type) {
 	case NVFXSR_TEMP:
 		sr |= (NVFX_VP(SRC_REG_TYPE_TEMP) << NVFX_VP(SRC_REG_TYPE_SHIFT));
-		sr |= (src.index << NVFX_VP(SRC_TEMP_SRC_SHIFT));
+		sr |= (src.reg.index << NVFX_VP(SRC_TEMP_SRC_SHIFT));
 		break;
 	case NVFXSR_INPUT:
 		sr |= (NVFX_VP(SRC_REG_TYPE_INPUT) <<
 		       NVFX_VP(SRC_REG_TYPE_SHIFT));
-		vp->ir |= (1 << src.index);
-		hw[1] |= (src.index << NVFX_VP(INST_INPUT_SRC_SHIFT));
+		vp->ir |= (1 << src.reg.index);
+		hw[1] |= (src.reg.index << NVFX_VP(INST_INPUT_SRC_SHIFT));
 		break;
 	case NVFXSR_CONST:
 		sr |= (NVFX_VP(SRC_REG_TYPE_CONST) <<
 		       NVFX_VP(SRC_REG_TYPE_SHIFT));
-		assert(vpc->vpi->const_index == -1 ||
-		       vpc->vpi->const_index == src.index);
-		vpc->vpi->const_index = src.index;
+		reloc.location = vp->nr_insns - 1;
+		reloc.target = src.reg.index;
+		util_dynarray_append(&vp->const_relocs, struct nvfx_relocation, reloc);
 		break;
 	case NVFXSR_NONE:
 		sr |= (NVFX_VP(SRC_REG_TYPE_INPUT) <<
@@ -161,100 +174,67 @@ emit_src(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int pos,
 }
 
 static void
-emit_dst(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int slot, struct nvfx_sreg dst)
+emit_dst(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int slot, struct nvfx_reg dst)
 {
 	struct nvfx_vertex_program *vp = vpc->vp;
 
 	switch (dst.type) {
+	case NVFXSR_NONE:
+		if(!nvfx->is_nv4x)
+			hw[0] |= NV30_VP_INST_DEST_TEMP_ID_MASK;
+		else {
+			hw[3] |= NV40_VP_INST_DEST_MASK;
+			if (slot == 0)
+				hw[0] |= NV40_VP_INST_VEC_DEST_TEMP_MASK;
+			else
+				hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK;
+		}
+		break;
 	case NVFXSR_TEMP:
 		if(!nvfx->is_nv4x)
 			hw[0] |= (dst.index << NV30_VP_INST_DEST_TEMP_ID_SHIFT);
 		else {
 			hw[3] |= NV40_VP_INST_DEST_MASK;
-			if (slot == 0) {
-				hw[0] |= (dst.index <<
-					  NV40_VP_INST_VEC_DEST_TEMP_SHIFT);
-			} else {
-				hw[3] |= (dst.index <<
-					  NV40_VP_INST_SCA_DEST_TEMP_SHIFT);
-			}
+			if (slot == 0)
+				hw[0] |= (dst.index << NV40_VP_INST_VEC_DEST_TEMP_SHIFT);
+			else
+				hw[3] |= (dst.index << NV40_VP_INST_SCA_DEST_TEMP_SHIFT);
 		}
 		break;
 	case NVFXSR_OUTPUT:
 		/* TODO: this may be wrong because on nv30 COL0 and BFC0 are swapped */
-		switch (dst.index) {
-		case NVFX_VP_INST_DEST_CLIP(0):
-			vp->or |= (1 << 6);
-			vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE0;
-			dst.index = NVFX_VP(INST_DEST_FOGC);
-			break;
-		case NVFX_VP_INST_DEST_CLIP(1):
-			vp->or |= (1 << 7);
-			vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE1;
-			dst.index = NVFX_VP(INST_DEST_FOGC);
-			break;
-		case NVFX_VP_INST_DEST_CLIP(2):
-			vp->or |= (1 << 8);
-			vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE2;
-			dst.index = NVFX_VP(INST_DEST_FOGC);
-			break;
-		case NVFX_VP_INST_DEST_CLIP(3):
-			vp->or |= (1 << 9);
-			vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE3;
-			dst.index = NVFX_VP(INST_DEST_PSZ);
-			break;
-		case NVFX_VP_INST_DEST_CLIP(4):
-			vp->or |= (1 << 10);
-			vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE4;
-			dst.index = NVFX_VP(INST_DEST_PSZ);
-			break;
-		case NVFX_VP_INST_DEST_CLIP(5):
-			vp->or |= (1 << 11);
-			vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE5;
-			dst.index = NVFX_VP(INST_DEST_PSZ);
-			break;
-		default:
-			if(!nvfx->is_nv4x) {
-				switch (dst.index) {
-				case NV30_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break;
-				case NV30_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break;
-				case NV30_VP_INST_DEST_BFC0 : vp->or |= (1 << 2); break;
-				case NV30_VP_INST_DEST_BFC1 : vp->or |= (1 << 3); break;
-				case NV30_VP_INST_DEST_FOGC: vp->or |= (1 << 4); break;
-				case NV30_VP_INST_DEST_PSZ  : vp->or |= (1 << 5); break;
-				case NV30_VP_INST_DEST_TC(0): vp->or |= (1 << 14); break;
-				case NV30_VP_INST_DEST_TC(1): vp->or |= (1 << 15); break;
-				case NV30_VP_INST_DEST_TC(2): vp->or |= (1 << 16); break;
-				case NV30_VP_INST_DEST_TC(3): vp->or |= (1 << 17); break;
-				case NV30_VP_INST_DEST_TC(4): vp->or |= (1 << 18); break;
-				case NV30_VP_INST_DEST_TC(5): vp->or |= (1 << 19); break;
-				case NV30_VP_INST_DEST_TC(6): vp->or |= (1 << 20); break;
-				case NV30_VP_INST_DEST_TC(7): vp->or |= (1 << 21); break;
-				}
-			} else {
-				switch (dst.index) {
-				case NV40_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break;
-				case NV40_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break;
-				case NV40_VP_INST_DEST_BFC0 : vp->or |= (1 << 2); break;
-				case NV40_VP_INST_DEST_BFC1 : vp->or |= (1 << 3); break;
-				case NV40_VP_INST_DEST_FOGC: vp->or |= (1 << 4); break;
-				case NV40_VP_INST_DEST_PSZ  : vp->or |= (1 << 5); break;
-				case NV40_VP_INST_DEST_TC(0): vp->or |= (1 << 14); break;
-				case NV40_VP_INST_DEST_TC(1): vp->or |= (1 << 15); break;
-				case NV40_VP_INST_DEST_TC(2): vp->or |= (1 << 16); break;
-				case NV40_VP_INST_DEST_TC(3): vp->or |= (1 << 17); break;
-				case NV40_VP_INST_DEST_TC(4): vp->or |= (1 << 18); break;
-				case NV40_VP_INST_DEST_TC(5): vp->or |= (1 << 19); break;
-				case NV40_VP_INST_DEST_TC(6): vp->or |= (1 << 20); break;
-				case NV40_VP_INST_DEST_TC(7): vp->or |= (1 << 21); break;
-				}
+		if(nvfx->is_nv4x) {
+			switch (dst.index) {
+			case NV30_VP_INST_DEST_CLP(0):
+				dst.index = NVFX_VP(INST_DEST_FOGC);
+				break;
+			case NV30_VP_INST_DEST_CLP(1):
+				dst.index = NVFX_VP(INST_DEST_FOGC);
+				break;
+			case NV30_VP_INST_DEST_CLP(2):
+				dst.index = NVFX_VP(INST_DEST_FOGC);
+				break;
+			case NV30_VP_INST_DEST_CLP(3):
+				dst.index = NVFX_VP(INST_DEST_PSZ);
+				break;
+			case NV30_VP_INST_DEST_CLP(4):
+				dst.index = NVFX_VP(INST_DEST_PSZ);
+				break;
+			case NV30_VP_INST_DEST_CLP(5):
+				dst.index = NVFX_VP(INST_DEST_PSZ);
+				break;
+			case NV40_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break;
+			case NV40_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break;
+			case NV40_VP_INST_DEST_BFC0 : vp->or |= (1 << 2); break;
+			case NV40_VP_INST_DEST_BFC1 : vp->or |= (1 << 3); break;
+			case NV40_VP_INST_DEST_FOGC: vp->or |= (1 << 4); break;
+			case NV40_VP_INST_DEST_PSZ  : vp->or |= (1 << 5); break;
 			}
-			break;
 		}
 
 		if(!nvfx->is_nv4x) {
 			hw[3] |= (dst.index << NV30_VP_INST_DEST_SHIFT);
-			hw[0] |= NV30_VP_INST_VEC_DEST_TEMP_MASK | (1<<20);
+			hw[0] |= NV30_VP_INST_VEC_DEST_TEMP_MASK;
 
 			/*XXX: no way this is entirely correct, someone needs to
 			 *     figure out what exactly it is.
@@ -264,7 +244,7 @@ emit_dst(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int slot
 			hw[3] |= (dst.index << NV40_VP_INST_DEST_SHIFT);
 			if (slot == 0) {
 				hw[0] |= NV40_VP_INST_VEC_RESULT;
-				hw[0] |= NV40_VP_INST_VEC_DEST_TEMP_MASK | (1<<20);
+				hw[0] |= NV40_VP_INST_VEC_DEST_TEMP_MASK;
 			} else {
 				hw[3] |= NV40_VP_INST_SCA_RESULT;
 				hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK;
@@ -277,26 +257,27 @@ emit_dst(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int slot
 }
 
 static void
-nvfx_vp_arith(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, int slot, int op,
-	      struct nvfx_sreg dst, int mask,
-	      struct nvfx_sreg s0, struct nvfx_sreg s1,
-	      struct nvfx_sreg s2)
+nvfx_vp_emit(struct nvfx_vpc *vpc, struct nvfx_insn insn)
 {
+	struct nvfx_context* nvfx = vpc->nvfx;
 	struct nvfx_vertex_program *vp = vpc->vp;
+	unsigned slot = insn.op >> 7;
+	unsigned op = insn.op & 0x7f;
 	uint32_t *hw;
 
 	vp->insns = realloc(vp->insns, ++vp->nr_insns * sizeof(*vpc->vpi));
 	vpc->vpi = &vp->insns[vp->nr_insns - 1];
 	memset(vpc->vpi, 0, sizeof(*vpc->vpi));
-	vpc->vpi->const_index = -1;
 
 	hw = vpc->vpi->data;
 
-	hw[0] |= (NVFX_COND_TR << NVFX_VP(INST_COND_SHIFT));
-	hw[0] |= ((0 << NVFX_VP(INST_COND_SWZ_X_SHIFT)) |
-		  (1 << NVFX_VP(INST_COND_SWZ_Y_SHIFT)) |
-		  (2 << NVFX_VP(INST_COND_SWZ_Z_SHIFT)) |
-		  (3 << NVFX_VP(INST_COND_SWZ_W_SHIFT)));
+	hw[0] |= (insn.cc_test << NVFX_VP(INST_COND_SHIFT));
+	hw[0] |= ((insn.cc_swz[0] << NVFX_VP(INST_COND_SWZ_X_SHIFT)) |
+		  (insn.cc_swz[1] << NVFX_VP(INST_COND_SWZ_Y_SHIFT)) |
+		  (insn.cc_swz[2] << NVFX_VP(INST_COND_SWZ_Z_SHIFT)) |
+		  (insn.cc_swz[3] << NVFX_VP(INST_COND_SWZ_W_SHIFT)));
+	if(insn.cc_update)
+		hw[0] |= NVFX_VP(INST_COND_UPDATE_ENABLE);
 
 	if(!nvfx->is_nv4x) {
 		if(slot == 0)
@@ -309,54 +290,56 @@ nvfx_vp_arith(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, int slot, int op,
 //		hw[3] |= NVFX_VP(INST_SCA_DEST_TEMP_MASK);
 //		hw[3] |= (mask << NVFX_VP(INST_VEC_WRITEMASK_SHIFT));
 
-		if (dst.type == NVFXSR_OUTPUT) {
+		if (insn.dst.type == NVFXSR_OUTPUT) {
 			if (slot)
-				hw[3] |= (mask << NV30_VP_INST_SDEST_WRITEMASK_SHIFT);
+				hw[3] |= (insn.mask << NV30_VP_INST_SDEST_WRITEMASK_SHIFT);
 			else
-				hw[3] |= (mask << NV30_VP_INST_VDEST_WRITEMASK_SHIFT);
+				hw[3] |= (insn.mask << NV30_VP_INST_VDEST_WRITEMASK_SHIFT);
 		} else {
 			if (slot)
-				hw[3] |= (mask << NV30_VP_INST_STEMP_WRITEMASK_SHIFT);
+				hw[3] |= (insn.mask << NV30_VP_INST_STEMP_WRITEMASK_SHIFT);
 			else
-				hw[3] |= (mask << NV30_VP_INST_VTEMP_WRITEMASK_SHIFT);
+				hw[3] |= (insn.mask << NV30_VP_INST_VTEMP_WRITEMASK_SHIFT);
 		}
 	 } else {
 		if (slot == 0) {
 			hw[1] |= (op << NV40_VP_INST_VEC_OPCODE_SHIFT);
 			hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK;
-			hw[3] |= (mask << NV40_VP_INST_VEC_WRITEMASK_SHIFT);
+			hw[3] |= (insn.mask << NV40_VP_INST_VEC_WRITEMASK_SHIFT);
 	    } else {
 			hw[1] |= (op << NV40_VP_INST_SCA_OPCODE_SHIFT);
-			hw[0] |= (NV40_VP_INST_VEC_DEST_TEMP_MASK | (1 << 20));
-			hw[3] |= (mask << NV40_VP_INST_SCA_WRITEMASK_SHIFT);
+			hw[0] |= NV40_VP_INST_VEC_DEST_TEMP_MASK ;
+			hw[3] |= (insn.mask << NV40_VP_INST_SCA_WRITEMASK_SHIFT);
 		}
 	}
 
-	emit_dst(nvfx, vpc, hw, slot, dst);
-	emit_src(nvfx, vpc, hw, 0, s0);
-	emit_src(nvfx, vpc, hw, 1, s1);
-	emit_src(nvfx, vpc, hw, 2, s2);
+	emit_dst(nvfx, vpc, hw, slot, insn.dst);
+	emit_src(nvfx, vpc, hw, 0, insn.src[0]);
+	emit_src(nvfx, vpc, hw, 1, insn.src[1]);
+	emit_src(nvfx, vpc, hw, 2, insn.src[2]);
 }
 
-static INLINE struct nvfx_sreg
+static inline struct nvfx_src
 tgsi_src(struct nvfx_vpc *vpc, const struct tgsi_full_src_register *fsrc) {
-	struct nvfx_sreg src = { 0 };
+	struct nvfx_src src;
 
 	switch (fsrc->Register.File) {
 	case TGSI_FILE_INPUT:
-		src = nvfx_sr(NVFXSR_INPUT, fsrc->Register.Index);
+		src.reg = nvfx_reg(NVFXSR_INPUT, fsrc->Register.Index);
 		break;
 	case TGSI_FILE_CONSTANT:
-		src = constant(vpc, fsrc->Register.Index, 0, 0, 0, 0);
+		src.reg = constant(vpc, fsrc->Register.Index, 0, 0, 0, 0);
 		break;
 	case TGSI_FILE_IMMEDIATE:
-		src = vpc->imm[fsrc->Register.Index];
+		src.reg = vpc->imm[fsrc->Register.Index];
 		break;
 	case TGSI_FILE_TEMPORARY:
-		src = vpc->r_temp[fsrc->Register.Index];
+		src.reg = vpc->r_temp[fsrc->Register.Index];
 		break;
 	default:
 		NOUVEAU_ERR("bad src file\n");
+		src.reg.index = 0;
+		src.reg.type = 0;
 		break;
 	}
 
@@ -369,11 +352,14 @@ tgsi_src(struct nvfx_vpc *vpc, const struct tgsi_full_src_register *fsrc) {
 	return src;
 }
 
-static INLINE struct nvfx_sreg
+static INLINE struct nvfx_reg
 tgsi_dst(struct nvfx_vpc *vpc, const struct tgsi_full_dst_register *fdst) {
-	struct nvfx_sreg dst = { 0 };
+	struct nvfx_reg dst;
 
 	switch (fdst->Register.File) {
+	case TGSI_FILE_NULL:
+		dst = nvfx_reg(NVFXSR_NONE, 0);
+		break;
 	case TGSI_FILE_OUTPUT:
 		dst = vpc->r_result[fdst->Register.Index];
 		break;
@@ -384,14 +370,16 @@ tgsi_dst(struct nvfx_vpc *vpc, const struct tgsi_full_dst_register *fdst) {
 		dst = vpc->r_address[fdst->Register.Index];
 		break;
 	default:
-		NOUVEAU_ERR("bad dst file\n");
+		NOUVEAU_ERR("bad dst file %i\n", fdst->Register.File);
+		dst.index = 0;
+		dst.type = 0;
 		break;
 	}
 
 	return dst;
 }
 
-static INLINE int
+static inline int
 tgsi_mask(uint tgsi)
 {
 	int mask = 0;
@@ -405,10 +393,14 @@ tgsi_mask(uint tgsi)
 
 static boolean
 nvfx_vertprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_vpc *vpc,
-				const struct tgsi_full_instruction *finst)
+				unsigned idx, const struct tgsi_full_instruction *finst)
 {
-	struct nvfx_sreg src[3], dst, tmp;
-	struct nvfx_sreg none = nvfx_sr(NVFXSR_NONE, 0);
+	struct nvfx_src src[3], tmp;
+	struct nvfx_reg dst;
+	struct nvfx_src none = nvfx_src(nvfx_reg(NVFXSR_NONE, 0));
+	struct nvfx_insn insn;
+	struct nvfx_relocation reloc;
+	struct nvfx_loop_entry loop;
 	int mask;
 	int ai = -1, ci = -1, ii = -1;
 	int i;
@@ -436,9 +428,8 @@ nvfx_vertprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_vpc *vpc,
 				ai = fsrc->Register.Index;
 				src[i] = tgsi_src(vpc, fsrc);
 			} else {
-				src[i] = temp(vpc);
-				arith(vpc, VEC, MOV, src[i], NVFX_VP_MASK_ALL,
-				      tgsi_src(vpc, fsrc), none, none);
+				src[i] = nvfx_src(temp(vpc));
+				nvfx_vp_emit(vpc, arith(VEC, MOV, src[i].reg, NVFX_VP_MASK_ALL, tgsi_src(vpc, fsrc), none, none));
 			}
 			break;
 		case TGSI_FILE_CONSTANT:
@@ -447,9 +438,8 @@ nvfx_vertprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_vpc *vpc,
 				ci = fsrc->Register.Index;
 				src[i] = tgsi_src(vpc, fsrc);
 			} else {
-				src[i] = temp(vpc);
-				arith(vpc, VEC, MOV, src[i], NVFX_VP_MASK_ALL,
-				      tgsi_src(vpc, fsrc), none, none);
+				src[i] = nvfx_src(temp(vpc));
+				nvfx_vp_emit(vpc, arith(VEC, MOV, src[i].reg, NVFX_VP_MASK_ALL, tgsi_src(vpc, fsrc), none, none));
 			}
 			break;
 		case TGSI_FILE_IMMEDIATE:
@@ -458,9 +448,8 @@ nvfx_vertprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_vpc *vpc,
 				ii = fsrc->Register.Index;
 				src[i] = tgsi_src(vpc, fsrc);
 			} else {
-				src[i] = temp(vpc);
-				arith(vpc, VEC, MOV, src[i], NVFX_VP_MASK_ALL,
-				      tgsi_src(vpc, fsrc), none, none);
+				src[i] = nvfx_src(temp(vpc));
+				nvfx_vp_emit(vpc, arith(VEC, MOV, src[i].reg, NVFX_VP_MASK_ALL, tgsi_src(vpc, fsrc), none, none));
 			}
 			break;
 		case TGSI_FILE_TEMPORARY:
@@ -477,128 +466,231 @@ nvfx_vertprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_vpc *vpc,
 
 	switch (finst->Instruction.Opcode) {
 	case TGSI_OPCODE_ABS:
-		arith(vpc, VEC, MOV, dst, mask, abs(src[0]), none, none);
+		nvfx_vp_emit(vpc, arith(VEC, MOV, dst, mask, abs(src[0]), none, none));
 		break;
 	case TGSI_OPCODE_ADD:
-		arith(vpc, VEC, ADD, dst, mask, src[0], none, src[1]);
+		nvfx_vp_emit(vpc, arith(VEC, ADD, dst, mask, src[0], none, src[1]));
 		break;
 	case TGSI_OPCODE_ARL:
-		arith(vpc, VEC, ARL, dst, mask, src[0], none, none);
+		nvfx_vp_emit(vpc, arith(VEC, ARL, dst, mask, src[0], none, none));
+		break;
+	case TGSI_OPCODE_CMP:
+		insn = arith(VEC, MOV, none.reg, mask, src[0], none, none);
+		insn.cc_update = 1;
+		nvfx_vp_emit(vpc, insn);
+
+		insn = arith(VEC, MOV, dst, mask, src[2], none, none);
+		insn.cc_test = NVFX_COND_GE;
+		nvfx_vp_emit(vpc, insn);
+
+		insn = arith(VEC, MOV, dst, mask, src[1], none, none);
+		insn.cc_test = NVFX_COND_LT;
+		nvfx_vp_emit(vpc, insn);
 		break;
 	case TGSI_OPCODE_COS:
-		arith(vpc, SCA, COS, dst, mask, none, none, src[0]);
+		nvfx_vp_emit(vpc, arith(SCA, COS, dst, mask, none, none, src[0]));
 		break;
+        case TGSI_OPCODE_DP2:
+                tmp = nvfx_src(temp(vpc));
+                nvfx_vp_emit(vpc, arith(VEC, MUL, tmp.reg, NVFX_VP_MASK_X | NVFX_VP_MASK_Y, src[0], src[1], none));
+                nvfx_vp_emit(vpc, arith(VEC, ADD, dst, mask, swz(tmp, X, X, X, X), swz(tmp, Y, Y, Y, Y), none));
+                break;
 	case TGSI_OPCODE_DP3:
-		arith(vpc, VEC, DP3, dst, mask, src[0], src[1], none);
+		nvfx_vp_emit(vpc, arith(VEC, DP3, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_DP4:
-		arith(vpc, VEC, DP4, dst, mask, src[0], src[1], none);
+		nvfx_vp_emit(vpc, arith(VEC, DP4, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_DPH:
-		arith(vpc, VEC, DPH, dst, mask, src[0], src[1], none);
+		nvfx_vp_emit(vpc, arith(VEC, DPH, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_DST:
-		arith(vpc, VEC, DST, dst, mask, src[0], src[1], none);
+		nvfx_vp_emit(vpc, arith(VEC, DST, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_EX2:
-		arith(vpc, SCA, EX2, dst, mask, none, none, src[0]);
+		nvfx_vp_emit(vpc, arith(SCA, EX2, dst, mask, none, none, src[0]));
 		break;
 	case TGSI_OPCODE_EXP:
-		arith(vpc, SCA, EXP, dst, mask, none, none, src[0]);
+		nvfx_vp_emit(vpc, arith(SCA, EXP, dst, mask, none, none, src[0]));
 		break;
 	case TGSI_OPCODE_FLR:
-		arith(vpc, VEC, FLR, dst, mask, src[0], none, none);
+		nvfx_vp_emit(vpc, arith(VEC, FLR, dst, mask, src[0], none, none));
 		break;
 	case TGSI_OPCODE_FRC:
-		arith(vpc, VEC, FRC, dst, mask, src[0], none, none);
+		nvfx_vp_emit(vpc, arith(VEC, FRC, dst, mask, src[0], none, none));
 		break;
 	case TGSI_OPCODE_LG2:
-		arith(vpc, SCA, LG2, dst, mask, none, none, src[0]);
+		nvfx_vp_emit(vpc, arith(SCA, LG2, dst, mask, none, none, src[0]));
 		break;
 	case TGSI_OPCODE_LIT:
-		arith(vpc, SCA, LIT, dst, mask, none, none, src[0]);
+		nvfx_vp_emit(vpc, arith(SCA, LIT, dst, mask, none, none, src[0]));
 		break;
 	case TGSI_OPCODE_LOG:
-		arith(vpc, SCA, LOG, dst, mask, none, none, src[0]);
+		nvfx_vp_emit(vpc, arith(SCA, LOG, dst, mask, none, none, src[0]));
 		break;
 	case TGSI_OPCODE_LRP:
-		tmp = temp(vpc);
-		arith(vpc, VEC, MAD, tmp, mask, neg(src[0]), src[2], src[2]);
-		arith(vpc, VEC, MAD, dst, mask, src[0], src[1], tmp);
+		tmp = nvfx_src(temp(vpc));
+		nvfx_vp_emit(vpc, arith(VEC, MAD, tmp.reg, mask, neg(src[0]), src[2], src[2]));
+		nvfx_vp_emit(vpc, arith(VEC, MAD, dst, mask, src[0], src[1], tmp));
 		break;
 	case TGSI_OPCODE_MAD:
-		arith(vpc, VEC, MAD, dst, mask, src[0], src[1], src[2]);
+		nvfx_vp_emit(vpc, arith(VEC, MAD, dst, mask, src[0], src[1], src[2]));
 		break;
 	case TGSI_OPCODE_MAX:
-		arith(vpc, VEC, MAX, dst, mask, src[0], src[1], none);
+		nvfx_vp_emit(vpc, arith(VEC, MAX, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_MIN:
-		arith(vpc, VEC, MIN, dst, mask, src[0], src[1], none);
+		nvfx_vp_emit(vpc, arith(VEC, MIN, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_MOV:
-		arith(vpc, VEC, MOV, dst, mask, src[0], none, none);
+		nvfx_vp_emit(vpc, arith(VEC, MOV, dst, mask, src[0], none, none));
 		break;
 	case TGSI_OPCODE_MUL:
-		arith(vpc, VEC, MUL, dst, mask, src[0], src[1], none);
+		nvfx_vp_emit(vpc, arith(VEC, MUL, dst, mask, src[0], src[1], none));
+		break;
+	case TGSI_OPCODE_NOP:
 		break;
 	case TGSI_OPCODE_POW:
-		tmp = temp(vpc);
-		arith(vpc, SCA, LG2, tmp, NVFX_VP_MASK_X, none, none,
-		      swz(src[0], X, X, X, X));
-		arith(vpc, VEC, MUL, tmp, NVFX_VP_MASK_X, swz(tmp, X, X, X, X),
-		      swz(src[1], X, X, X, X), none);
-		arith(vpc, SCA, EX2, dst, mask, none, none,
-		      swz(tmp, X, X, X, X));
+		tmp = nvfx_src(temp(vpc));
+		nvfx_vp_emit(vpc, arith(SCA, LG2, tmp.reg, NVFX_VP_MASK_X, none, none, swz(src[0], X, X, X, X)));
+		nvfx_vp_emit(vpc, arith(VEC, MUL, tmp.reg, NVFX_VP_MASK_X, swz(tmp, X, X, X, X), swz(src[1], X, X, X, X), none));
+		nvfx_vp_emit(vpc, arith(SCA, EX2, dst, mask, none, none, swz(tmp, X, X, X, X)));
 		break;
 	case TGSI_OPCODE_RCP:
-		arith(vpc, SCA, RCP, dst, mask, none, none, src[0]);
-		break;
-	case TGSI_OPCODE_RET:
+		nvfx_vp_emit(vpc, arith(SCA, RCP, dst, mask, none, none, src[0]));
 		break;
 	case TGSI_OPCODE_RSQ:
-		arith(vpc, SCA, RSQ, dst, mask, none, none, abs(src[0]));
+		nvfx_vp_emit(vpc, arith(SCA, RSQ, dst, mask, none, none, abs(src[0])));
 		break;
 	case TGSI_OPCODE_SEQ:
-		arith(vpc, VEC, SEQ, dst, mask, src[0], src[1], none);
+		nvfx_vp_emit(vpc, arith(VEC, SEQ, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_SFL:
-		arith(vpc, VEC, SFL, dst, mask, src[0], src[1], none);
+		nvfx_vp_emit(vpc, arith(VEC, SFL, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_SGE:
-		arith(vpc, VEC, SGE, dst, mask, src[0], src[1], none);
+		nvfx_vp_emit(vpc, arith(VEC, SGE, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_SGT:
-		arith(vpc, VEC, SGT, dst, mask, src[0], src[1], none);
+		nvfx_vp_emit(vpc, arith(VEC, SGT, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_SIN:
-		arith(vpc, SCA, SIN, dst, mask, none, none, src[0]);
+		nvfx_vp_emit(vpc, arith(SCA, SIN, dst, mask, none, none, src[0]));
 		break;
 	case TGSI_OPCODE_SLE:
-		arith(vpc, VEC, SLE, dst, mask, src[0], src[1], none);
+		nvfx_vp_emit(vpc, arith(VEC, SLE, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_SLT:
-		arith(vpc, VEC, SLT, dst, mask, src[0], src[1], none);
+		nvfx_vp_emit(vpc, arith(VEC, SLT, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_SNE:
-		arith(vpc, VEC, SNE, dst, mask, src[0], src[1], none);
+		nvfx_vp_emit(vpc, arith(VEC, SNE, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_SSG:
-		arith(vpc, VEC, SSG, dst, mask, src[0], src[1], none);
+		nvfx_vp_emit(vpc, arith(VEC, SSG, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_STR:
-		arith(vpc, VEC, STR, dst, mask, src[0], src[1], none);
+		nvfx_vp_emit(vpc, arith(VEC, STR, dst, mask, src[0], src[1], none));
 		break;
 	case TGSI_OPCODE_SUB:
-		arith(vpc, VEC, ADD, dst, mask, src[0], none, neg(src[1]));
+		nvfx_vp_emit(vpc, arith(VEC, ADD, dst, mask, src[0], none, neg(src[1])));
 		break;
+        case TGSI_OPCODE_TRUNC:
+                tmp = nvfx_src(temp(vpc));
+                insn = arith(VEC, MOV, none.reg, mask, src[0], none, none);
+                insn.cc_update = 1;
+                nvfx_vp_emit(vpc, insn);
+
+                nvfx_vp_emit(vpc, arith(VEC, FLR, tmp.reg, mask, abs(src[0]), none, none));
+                nvfx_vp_emit(vpc, arith(VEC, MOV, dst, mask, tmp, none, none));
+
+                insn = arith(VEC, MOV, dst, mask, neg(tmp), none, none);
+                insn.cc_test = NVFX_COND_LT;
+                nvfx_vp_emit(vpc, insn);
+                break;
 	case TGSI_OPCODE_XPD:
-		tmp = temp(vpc);
-		arith(vpc, VEC, MUL, tmp, mask,
-		      swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none);
-		arith(vpc, VEC, MAD, dst, (mask & ~NVFX_VP_MASK_W),
-		      swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y),
-		      neg(tmp));
+		tmp = nvfx_src(temp(vpc));
+		nvfx_vp_emit(vpc, arith(VEC, MUL, tmp.reg, mask, swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none));
+		nvfx_vp_emit(vpc, arith(VEC, MAD, dst, (mask & ~NVFX_VP_MASK_W), swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y), neg(tmp)));
+		break;
+
+	case TGSI_OPCODE_IF:
+		insn = arith(VEC, MOV, none.reg, NVFX_VP_MASK_X, src[0], none, none);
+		insn.cc_update = 1;
+		nvfx_vp_emit(vpc, insn);
+
+		reloc.location = vpc->vp->nr_insns;
+		reloc.target = finst->Label.Label + 1;
+		util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc);
+
+		insn = arith(SCA, BRA, none.reg, 0, none, none, none);
+		insn.cc_test = NVFX_COND_EQ;
+		insn.cc_swz[0] = insn.cc_swz[1] = insn.cc_swz[2] = insn.cc_swz[3] = 0;
+		nvfx_vp_emit(vpc, insn);
 		break;
+
+	case TGSI_OPCODE_ELSE:
+	case TGSI_OPCODE_BRA:
+	case TGSI_OPCODE_CAL:
+		reloc.location = vpc->vp->nr_insns;
+		reloc.target = finst->Label.Label;
+		util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc);
+
+		if(finst->Instruction.Opcode == TGSI_OPCODE_CAL)
+			insn = arith(SCA, CAL, none.reg, 0, none, none, none);
+		else
+			insn = arith(SCA, BRA, none.reg, 0, none, none, none);
+		nvfx_vp_emit(vpc, insn);
+		break;
+
+	case TGSI_OPCODE_RET:
+		tmp = none;
+		tmp.swz[0] = tmp.swz[1] = tmp.swz[2] = tmp.swz[3] = 0;
+		nvfx_vp_emit(vpc, arith(SCA, RET, none.reg, 0, none, none, tmp));
+		break;
+
+	case TGSI_OPCODE_BGNSUB:
+	case TGSI_OPCODE_ENDSUB:
+	case TGSI_OPCODE_ENDIF:
+		/* nothing to do here */
+		break;
+
+	case TGSI_OPCODE_BGNLOOP:
+		loop.cont_target = idx;
+		loop.brk_target = finst->Label.Label + 1;
+		util_dynarray_append(&vpc->loop_stack, struct nvfx_loop_entry, loop);
+		break;
+
+	case TGSI_OPCODE_ENDLOOP:
+		loop = util_dynarray_pop(&vpc->loop_stack, struct nvfx_loop_entry);
+
+		reloc.location = vpc->vp->nr_insns;
+		reloc.target = loop.cont_target;
+		util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc);
+
+		nvfx_vp_emit(vpc, arith(SCA, BRA, none.reg, 0, none, none, none));
+		break;
+
+	case TGSI_OPCODE_CONT:
+		loop = util_dynarray_top(&vpc->loop_stack, struct nvfx_loop_entry);
+
+		reloc.location = vpc->vp->nr_insns;
+		reloc.target = loop.cont_target;
+		util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc);
+
+		nvfx_vp_emit(vpc, arith(SCA, BRA, none.reg, 0, none, none, none));
+		break;
+
+	case TGSI_OPCODE_BRK:
+		loop = util_dynarray_top(&vpc->loop_stack, struct nvfx_loop_entry);
+
+		reloc.location = vpc->vp->nr_insns;
+		reloc.target = loop.brk_target;
+		util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc);
+
+		nvfx_vp_emit(vpc, arith(SCA, BRA, none.reg, 0, none, none, none));
+		break;
+
 	default:
 		NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
 		return FALSE;
@@ -649,12 +741,8 @@ nvfx_vertprog_parse_decl_output(struct nvfx_context* nvfx, struct nvfx_vpc *vpc,
 		hw = NVFX_VP(INST_DEST_PSZ);
 		break;
 	case TGSI_SEMANTIC_GENERIC:
-		if (fdec->Semantic.Index <= 7) {
-			hw = NVFX_VP(INST_DEST_TC(fdec->Semantic.Index));
-		} else {
-			NOUVEAU_ERR("bad generic semantic index\n");
-			return FALSE;
-		}
+		hw = (vpc->vp->generic_to_fp_input[fdec->Semantic.Index] & 0xf)
+			+ NVFX_VP(INST_DEST_TC(0)) - NVFX_FP_OP_INPUT_SRC_TC(0);
 		break;
 	case TGSI_SEMANTIC_EDGEFLAG:
 		/* not really an error just a fallback */
@@ -665,7 +753,7 @@ nvfx_vertprog_parse_decl_output(struct nvfx_context* nvfx, struct nvfx_vpc *vpc,
 		return FALSE;
 	}
 
-	vpc->r_result[idx] = nvfx_sr(NVFXSR_OUTPUT, hw);
+	vpc->r_result[idx] = nvfx_reg(NVFXSR_OUTPUT, hw);
 	return TRUE;
 }
 
@@ -674,6 +762,36 @@ nvfx_vertprog_prepare(struct nvfx_context* nvfx, struct nvfx_vpc *vpc)
 {
 	struct tgsi_parse_context p;
 	int high_temp = -1, high_addr = -1, nr_imm = 0, i;
+	struct util_semantic_set set;
+	unsigned char sem_layout[8];
+	unsigned num_outputs;
+
+	num_outputs = util_semantic_set_from_program_file(&set, vpc->vp->pipe.tokens, TGSI_FILE_OUTPUT);
+
+	if(num_outputs > 8) {
+		NOUVEAU_ERR("too many vertex program outputs: %i\n", num_outputs);
+		return FALSE;
+	}
+	util_semantic_layout_from_set(sem_layout, &set, 8, 8);
+
+	/* hope 0xf is (0, 0, 0, 1) initialized; otherwise, we are _probably_ not required to do this */
+	memset(vpc->vp->generic_to_fp_input, 0x0f, sizeof(vpc->vp->generic_to_fp_input));
+	for(int i = 0; i < 8; ++i) {
+		if(sem_layout[i] == 0xff)
+			continue;
+		//printf("vp: GENERIC[%i] to fpreg %i\n", sem_layout[i], NVFX_FP_OP_INPUT_SRC_TC(0) + i);
+		vpc->vp->generic_to_fp_input[sem_layout[i]] = 0xf0 | NVFX_FP_OP_INPUT_SRC_TC(i);
+	}
+
+	vpc->vp->sprite_fp_input = -1;
+	for(int i = 0; i < 8; ++i)
+	{
+		if(sem_layout[i] == 0xff)
+		{
+			vpc->vp->sprite_fp_input = NVFX_FP_OP_INPUT_SRC_TC(i);
+			break;
+		}
+	}
 
 	tgsi_parse_init(&p, vpc->vp->pipe.tokens);
 	while (!tgsi_parse_end_of_tokens(&p)) {
@@ -737,18 +855,18 @@ nvfx_vertprog_prepare(struct nvfx_context* nvfx, struct nvfx_vpc *vpc)
 	tgsi_parse_free(&p);
 
 	if (nr_imm) {
-		vpc->imm = CALLOC(nr_imm, sizeof(struct nvfx_sreg));
+		vpc->imm = CALLOC(nr_imm, sizeof(struct nvfx_reg));
 		assert(vpc->imm);
 	}
 
 	if (++high_temp) {
-		vpc->r_temp = CALLOC(high_temp, sizeof(struct nvfx_sreg));
+		vpc->r_temp = CALLOC(high_temp, sizeof(struct nvfx_reg));
 		for (i = 0; i < high_temp; i++)
 			vpc->r_temp[i] = temp(vpc);
 	}
 
 	if (++high_addr) {
-		vpc->r_address = CALLOC(high_addr, sizeof(struct nvfx_sreg));
+		vpc->r_address = CALLOC(high_addr, sizeof(struct nvfx_reg));
 		for (i = 0; i < high_addr; i++)
 			vpc->r_address[i] = temp(vpc);
 	}
@@ -757,20 +875,31 @@ nvfx_vertprog_prepare(struct nvfx_context* nvfx, struct nvfx_vpc *vpc)
 	return TRUE;
 }
 
+DEBUG_GET_ONCE_BOOL_OPTION(nvfx_dump_vp, "NVFX_DUMP_VP", FALSE)
+
 static void
 nvfx_vertprog_translate(struct nvfx_context *nvfx,
 			struct nvfx_vertex_program *vp)
 {
 	struct tgsi_parse_context parse;
 	struct nvfx_vpc *vpc = NULL;
-	struct nvfx_sreg none = nvfx_sr(NVFXSR_NONE, 0);
+	struct nvfx_src none = nvfx_src(nvfx_reg(NVFXSR_NONE, 0));
+	struct util_dynarray insns;
 	int i;
 
 	vpc = CALLOC(1, sizeof(struct nvfx_vpc));
 	if (!vpc)
 		return;
+	vpc->nvfx = nvfx;
 	vpc->vp = vp;
 
+	/* reserve space for ucps */
+	if(nvfx->use_vp_clipping)
+	{
+		for(i = 0; i < 6; ++i)
+			constant(vpc, -1, 0, 0, 0, 0);
+	}
+
 	if (!nvfx_vertprog_prepare(nvfx, vpc)) {
 		FREE(vpc);
 		return;
@@ -780,13 +909,15 @@ nvfx_vertprog_translate(struct nvfx_context *nvfx,
 	 * planes are enabled.  We need to append code to the vtxprog
 	 * to handle clip planes later.
 	 */
-	if (vp->ucp.nr)  {
+	/* TODO: maybe support patching this depending on whether there are ucps; not sure it really matters much */
+	if (nvfx->use_vp_clipping)  {
 		vpc->r_result[vpc->hpos_idx] = temp(vpc);
 		vpc->r_temps_discard = 0;
 	}
 
 	tgsi_parse_init(&parse, vp->pipe.tokens);
 
+	util_dynarray_init(&insns);
 	while (!tgsi_parse_end_of_tokens(&parse)) {
 		tgsi_parse_token(&parse);
 
@@ -809,8 +940,10 @@ nvfx_vertprog_translate(struct nvfx_context *nvfx,
 		case TGSI_TOKEN_TYPE_INSTRUCTION:
 		{
 			const struct tgsi_full_instruction *finst;
+			unsigned idx = insns.size >> 2;
+			util_dynarray_append(&insns, unsigned, vp->nr_insns);
 			finst = &parse.FullToken.FullInstruction;
-			if (!nvfx_vertprog_parse_instruction(nvfx, vpc, finst))
+			if (!nvfx_vertprog_parse_instruction(nvfx, vpc, idx, finst))
 				goto out_err;
 		}
 			break;
@@ -819,43 +952,87 @@ nvfx_vertprog_translate(struct nvfx_context *nvfx,
 		}
 	}
 
+	util_dynarray_append(&insns, unsigned, vp->nr_insns);
+
+	for(unsigned i = 0; i < vpc->label_relocs.size; i += sizeof(struct nvfx_relocation))
+	{
+		struct nvfx_relocation* label_reloc = (struct nvfx_relocation*)((char*)vpc->label_relocs.data + i);
+		struct nvfx_relocation hw_reloc;
+
+		hw_reloc.location = label_reloc->location;
+		hw_reloc.target = ((unsigned*)insns.data)[label_reloc->target];
+
+		//debug_printf("hw %u -> tgsi %u = hw %u\n", hw_reloc.location, label_reloc->target, hw_reloc.target);
+
+		util_dynarray_append(&vp->branch_relocs, struct nvfx_relocation, hw_reloc);
+	}
+	util_dynarray_fini(&insns);
+	util_dynarray_trim(&vp->branch_relocs);
+
+	/* XXX: what if we add a RET before?!  make sure we jump here...*/
+
 	/* Write out HPOS if it was redirected to a temp earlier */
 	if (vpc->r_result[vpc->hpos_idx].type != NVFXSR_OUTPUT) {
-		struct nvfx_sreg hpos = nvfx_sr(NVFXSR_OUTPUT,
+		struct nvfx_reg hpos = nvfx_reg(NVFXSR_OUTPUT,
 						NVFX_VP(INST_DEST_POS));
-		struct nvfx_sreg htmp = vpc->r_result[vpc->hpos_idx];
+		struct nvfx_src htmp = nvfx_src(vpc->r_result[vpc->hpos_idx]);
 
-		arith(vpc, VEC, MOV, hpos, NVFX_VP_MASK_ALL, htmp, none, none);
+		nvfx_vp_emit(vpc, arith(VEC, MOV, hpos, NVFX_VP_MASK_ALL, htmp, none, none));
 	}
 
 	/* Insert code to handle user clip planes */
-	for (i = 0; i < vp->ucp.nr; i++) {
-		struct nvfx_sreg cdst = nvfx_sr(NVFXSR_OUTPUT,
-						NVFX_VP_INST_DEST_CLIP(i));
-		struct nvfx_sreg ceqn = constant(vpc, -1,
-						 nvfx->clip.ucp[i][0],
-						 nvfx->clip.ucp[i][1],
-						 nvfx->clip.ucp[i][2],
-						 nvfx->clip.ucp[i][3]);
-		struct nvfx_sreg htmp = vpc->r_result[vpc->hpos_idx];
-		unsigned mask;
-
-		switch (i) {
-		case 0: case 3: mask = NVFX_VP_MASK_Y; break;
-		case 1: case 4: mask = NVFX_VP_MASK_Z; break;
-		case 2: case 5: mask = NVFX_VP_MASK_W; break;
-		default:
-			NOUVEAU_ERR("invalid clip dist #%d\n", i);
-			goto out_err;
+	if(nvfx->use_vp_clipping)
+	{
+		for (i = 0; i < 6; i++) {
+			struct nvfx_reg cdst = nvfx_reg(NVFXSR_OUTPUT, NV30_VP_INST_DEST_CLP(i));
+			struct nvfx_src ceqn = nvfx_src(nvfx_reg(NVFXSR_CONST, i));
+			struct nvfx_src htmp = nvfx_src(vpc->r_result[vpc->hpos_idx]);
+			unsigned mask;
+
+			if(nvfx->is_nv4x)
+			{
+				switch (i) {
+				case 0: case 3: mask = NVFX_VP_MASK_Y; break;
+				case 1: case 4: mask = NVFX_VP_MASK_Z; break;
+				case 2: case 5: mask = NVFX_VP_MASK_W; break;
+				default:
+					NOUVEAU_ERR("invalid clip dist #%d\n", i);
+					goto out_err;
+				}
+			}
+			else
+				mask = NVFX_VP_MASK_X;
+
+			nvfx_vp_emit(vpc, arith(VEC, DP4, cdst, mask, htmp, ceqn, none));
 		}
+	}
+	else
+	{
+		if(vp->nr_insns)
+			vp->insns[vp->nr_insns - 1].data[3] |= NVFX_VP_INST_LAST;
 
-		arith(vpc, VEC, DP4, cdst, mask, htmp, ceqn, none);
+		nvfx_vp_emit(vpc, arith(VEC, NOP, none.reg, 0, none, none, none));
+		vp->insns[vp->nr_insns - 1].data[3] |= NVFX_VP_INST_LAST;
 	}
 
-	vp->insns[vp->nr_insns - 1].data[3] |= NVFX_VP_INST_LAST;
+	if(debug_get_option_nvfx_dump_vp())
+	{
+		debug_printf("\n");
+		tgsi_dump(vp->pipe.tokens, 0);
+
+		debug_printf("\n%s vertex program:\n", nvfx->is_nv4x ? "nv4x" : "nv3x");
+		for (i = 0; i < vp->nr_insns; i++)
+			debug_printf("%3u: %08x %08x %08x %08x\n", i, vp->insns[i].data[0], vp->insns[i].data[1], vp->insns[i].data[2], vp->insns[i].data[3]);
+		debug_printf("\n");
+	}
+
+	vp->clip_nr = -1;
+	vp->exec_start = -1;
 	vp->translated = TRUE;
 out_err:
 	tgsi_parse_free(&parse);
+	util_dynarray_fini(&vpc->label_relocs);
+	util_dynarray_fini(&vpc->loop_stack);
 	if (vpc->r_temp)
 		FREE(vpc->r_temp);
 	if (vpc->r_address)
@@ -868,26 +1045,17 @@ out_err:
 boolean
 nvfx_vertprog_validate(struct nvfx_context *nvfx)
 {
-	struct pipe_context *pipe = &nvfx->pipe;
 	struct nvfx_screen *screen = nvfx->screen;
 	struct nouveau_channel *chan = screen->base.channel;
 	struct nouveau_grobj *eng3d = screen->eng3d;
 	struct nvfx_vertex_program *vp;
 	struct pipe_resource *constbuf;
-	struct pipe_transfer *transfer = NULL;
 	boolean upload_code = FALSE, upload_data = FALSE;
 	int i;
 
 	if (nvfx->render_mode == HW) {
 		vp = nvfx->vertprog;
 		constbuf = nvfx->constbuf[PIPE_SHADER_VERTEX];
-
-		// TODO: ouch! can't we just use constant slots for these?!
-		if ((nvfx->dirty & NVFX_NEW_UCP) ||
-		    memcmp(&nvfx->clip, &vp->ucp, sizeof(vp->ucp))) {
-			nvfx_vertprog_destroy(nvfx, vp);
-			memcpy(&vp->ucp, &nvfx->clip, sizeof(vp->ucp));
-		}
 	} else {
 		vp = nvfx->swtnl.vertprog;
 		constbuf = NULL;
@@ -918,7 +1086,11 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx)
 			}
 
 			if (nouveau_resource_alloc(heap, vplen, vp, &vp->exec))
-				assert(0);
+			{
+				debug_printf("Vertex shader too long: %u instructions\n", vplen);
+				nvfx->fallback_swtnl |= NVFX_NEW_VERTPROG;
+				return FALSE;
+			}
 		}
 
 		upload_code = TRUE;
@@ -937,7 +1109,11 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx)
 			}
 
 			if (nouveau_resource_alloc(heap, vp->nr_consts, vp, &vp->data))
-				assert(0);
+                        {
+                                debug_printf("Vertex shader uses too many constants: %u constants\n", vp->nr_consts);
+                                nvfx->fallback_swtnl |= NVFX_NEW_VERTPROG;
+                                return FALSE;
+                        }
 		}
 
 		/*XXX: handle this some day */
@@ -952,44 +1128,57 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx)
 	 * fixup offsets and register IDs.
 	 */
 	if (vp->exec_start != vp->exec->start) {
-		for (i = 0; i < vp->nr_insns; i++) {
-			struct nvfx_vertex_program_exec *vpi = &vp->insns[i];
+		//printf("vp_relocs %u -> %u\n", vp->exec_start, vp->exec->start);
+		for(unsigned i = 0; i < vp->branch_relocs.size; i += sizeof(struct nvfx_relocation))
+		{
+			struct nvfx_relocation* reloc = (struct nvfx_relocation*)((char*)vp->branch_relocs.data + i);
+			uint32_t* hw = vp->insns[reloc->location].data;
+			unsigned target = vp->exec->start + reloc->target;
+
+			//debug_printf("vp_reloc hw %u -> hw %u\n", reloc->location, target);
 
-			if (vpi->has_branch_offset) {
-				assert(0);
+			if(!nvfx->is_nv4x)
+			{
+				hw[2] &=~ NV30_VP_INST_IADDR_MASK;
+				hw[2] |= (target & 0x1ff) << NV30_VP_INST_IADDR_SHIFT;
+			}
+			else
+			{
+				hw[3] &=~ NV40_VP_INST_IADDRL_MASK;
+				hw[3] |= (target & 7) << NV40_VP_INST_IADDRL_SHIFT;
+
+				hw[2] &=~ NV40_VP_INST_IADDRH_MASK;
+				hw[2] |= ((target >> 3) & 0x3f) << NV40_VP_INST_IADDRH_SHIFT;
 			}
 		}
 
 		vp->exec_start = vp->exec->start;
 	}
 
-	if (vp->nr_consts && vp->data_start != vp->data->start) {
-		for (i = 0; i < vp->nr_insns; i++) {
-			struct nvfx_vertex_program_exec *vpi = &vp->insns[i];
+	if (vp->data_start != vp->data->start) {
+		for(unsigned i = 0; i < vp->const_relocs.size; i += sizeof(struct nvfx_relocation))
+		{
+			struct nvfx_relocation* reloc = (struct nvfx_relocation*)((char*)vp->const_relocs.data + i);
+			struct nvfx_vertex_program_exec *vpi = &vp->insns[reloc->location];
 
-			if (vpi->const_index >= 0) {
-				vpi->data[1] &= ~NVFX_VP(INST_CONST_SRC_MASK);
-				vpi->data[1] |=
-					(vpi->const_index + vp->data->start) <<
+			vpi->data[1] &= ~NVFX_VP(INST_CONST_SRC_MASK);
+			vpi->data[1] |=
+					(reloc->target + vp->data->start) <<
 					NVFX_VP(INST_CONST_SRC_SHIFT);
-
-			}
 		}
 
 		vp->data_start = vp->data->start;
+		upload_code = TRUE;
 	}
 
 	/* Update + Upload constant values */
 	if (vp->nr_consts) {
 		float *map = NULL;
 
-		if (constbuf) {
-			map = pipe_buffer_map(pipe, constbuf,
-					      PIPE_TRANSFER_READ,
-					      &transfer);
-		}
+		if (constbuf)
+			map = (float*)nvfx_buffer(constbuf)->data;
 
-		for (i = 0; i < vp->nr_consts; i++) {
+		for (i = nvfx->use_vp_clipping ? 6 : 0; i < vp->nr_consts; i++) {
 			struct nvfx_vertex_program_data *vpd = &vp->consts[i];
 
 			if (vpd->index >= 0) {
@@ -1005,41 +1194,28 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx)
 			OUT_RING  (chan, i + vp->data->start);
 			OUT_RINGp (chan, (uint32_t *)vpd->value, 4);
 		}
-
-		if (constbuf)
-			pipe_buffer_unmap(pipe, constbuf, transfer);
 	}
 
 	/* Upload vtxprog */
 	if (upload_code) {
-#if 0
-		for (i = 0; i < vp->nr_insns; i++) {
-			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[0]);
-			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[1]);
-			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[2]);
-			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[3]);
-		}
-#endif
 		BEGIN_RING(chan, eng3d, NV34TCL_VP_UPLOAD_FROM_ID, 1);
 		OUT_RING  (chan, vp->exec->start);
 		for (i = 0; i < vp->nr_insns; i++) {
 			BEGIN_RING(chan, eng3d, NV34TCL_VP_UPLOAD_INST(0), 4);
 			OUT_RINGp (chan, vp->insns[i].data, 4);
 		}
+		vp->clip_nr = -1;
 	}
 
-	if(nvfx->dirty & (NVFX_NEW_VERTPROG | NVFX_NEW_UCP))
+	if(nvfx->dirty & (NVFX_NEW_VERTPROG))
 	{
-		WAIT_RING(chan, 7);
+		WAIT_RING(chan, 6);
 		OUT_RING(chan, RING_3D(NV34TCL_VP_START_FROM_ID, 1));
 		OUT_RING(chan, vp->exec->start);
 		if(nvfx->is_nv4x) {
-			OUT_RING(chan, RING_3D(NV40TCL_VP_ATTRIB_EN, 2));
+			OUT_RING(chan, RING_3D(NV40TCL_VP_ATTRIB_EN, 1));
 			OUT_RING(chan, vp->ir);
-			OUT_RING(chan, vp->or);
 		}
-		OUT_RING(chan, RING_3D(NV34TCL_VP_CLIP_PLANES_ENABLE, 1));
-		OUT_RING(chan, vp->clip_ctrl);
 	}
 
 	return TRUE;
@@ -1048,25 +1224,63 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx)
 void
 nvfx_vertprog_destroy(struct nvfx_context *nvfx, struct nvfx_vertex_program *vp)
 {
-	vp->translated = FALSE;
-
-	if (vp->nr_insns) {
+	if (vp->nr_insns)
 		FREE(vp->insns);
-		vp->insns = NULL;
-		vp->nr_insns = 0;
-	}
 
-	if (vp->nr_consts) {
+	if (vp->nr_consts)
 		FREE(vp->consts);
-		vp->consts = NULL;
-		vp->nr_consts = 0;
-	}
 
 	nouveau_resource_free(&vp->exec);
-	vp->exec_start = 0;
 	nouveau_resource_free(&vp->data);
-	vp->data_start = 0;
-	vp->data_start_min = 0;
 
-	vp->ir = vp->or = vp->clip_ctrl = 0;
+	util_dynarray_fini(&vp->branch_relocs);
+	util_dynarray_fini(&vp->const_relocs);
+}
+
+/* Create a vertex-program CSO holding a private copy of cso->tokens; returns NULL if CALLOC fails. */
+static void *
+nvfx_vp_state_create(struct pipe_context *pipe,
+                     const struct pipe_shader_state *cso)
+{
+        struct nvfx_context *nvfx = nvfx_context(pipe);
+        struct nvfx_vertex_program *vp;
+        static unsigned long long id = 0; // TODO: use a 64-bit atomic here!
+
+        vp = CALLOC(1, sizeof(struct nvfx_vertex_program));
+        if (!vp)
+                return NULL; /* out of memory: nothing allocated yet, nothing to undo */
+        vp->pipe.tokens = tgsi_dup_tokens(cso->tokens);
+        vp->draw = draw_create_vertex_shader(nvfx->draw, &vp->pipe);
+        vp->id = ++id; /* function-static counter, so the first program gets id 1 */
+        return (void *)vp;
+}
+
+static void
+nvfx_vp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+        struct nvfx_context *nvfx = nvfx_context(pipe);
+        /* Record hwcso as the current vertex program and set NVFX_NEW_VERTPROG in both dirty masks. */
+        nvfx->vertprog = hwcso;
+        nvfx->dirty |= NVFX_NEW_VERTPROG;
+        nvfx->draw_dirty |= NVFX_NEW_VERTPROG;
+}
+
+static void
+nvfx_vp_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+        struct nvfx_context *nvfx = nvfx_context(pipe);
+        struct nvfx_vertex_program *vp = hwcso;
+        /* Tear down in order: the draw-module shader, the program's driver-side state, the token array, then the CSO. */
+        draw_delete_vertex_shader(nvfx->draw, vp->draw);
+        nvfx_vertprog_destroy(nvfx, vp);
+        FREE((void*)vp->pipe.tokens);
+        FREE(vp);
+}
+
+void
+nvfx_init_vertprog_functions(struct nvfx_context *nvfx)
+{
+        nvfx->pipe.create_vs_state = nvfx_vp_state_create; /* install the vertex-shader CSO create/bind/delete callbacks */
+        nvfx->pipe.bind_vs_state = nvfx_vp_state_bind;
+        nvfx->pipe.delete_vs_state = nvfx_vp_state_delete;
 }
diff --git a/src/gallium/drivers/r300/r300_context.c b/src/gallium/drivers/r300/r300_context.c
index e8b6c4f7af..624dadd07d 100644
--- a/src/gallium/drivers/r300/r300_context.c
+++ b/src/gallium/drivers/r300/r300_context.c
@@ -65,7 +65,7 @@ static void r300_release_referenced_objects(struct r300_context *r300)
     unsigned i;
 
     /* Framebuffer state. */
-    util_assign_framebuffer_state(fb, NULL);
+    util_unreference_framebuffer_state(fb);
 
     /* Textures. */
     for (i = 0; i < textures->sampler_view_count; i++)
@@ -99,8 +99,10 @@ static void r300_destroy_context(struct pipe_context* context)
     struct r300_context* r300 = r300_context(context);
     struct r300_atom *atom;
 
-    util_blitter_destroy(r300->blitter);
-    draw_destroy(r300->draw);
+    if (r300->blitter)
+        util_blitter_destroy(r300->blitter);
+    if (r300->draw)
+        draw_destroy(r300->draw);
 
     /* Print stats, if enabled. */
     if (SCREEN_DBG_ON(r300->screen, DBG_STATS)) {
@@ -112,40 +114,48 @@ static void r300_destroy_context(struct pipe_context* context)
         }
     }
 
-    u_upload_destroy(r300->upload_vb);
-    u_upload_destroy(r300->upload_ib);
+    if (r300->upload_vb)
+        u_upload_destroy(r300->upload_vb);
+    if (r300->upload_ib)
+        u_upload_destroy(r300->upload_ib);
 
-    /* setup hyper-z mm */
-    if (r300->rws->get_value(r300->rws, R300_CAN_HYPERZ))
-        r300_hyperz_destroy_mm(r300);
-
-    translate_cache_destroy(r300->tran.translate_cache);
+    if (r300->tran.translate_cache)
+        translate_cache_destroy(r300->tran.translate_cache);
 
+    /* XXX: This function assumes r300->query_list was initialized */
     r300_release_referenced_objects(r300);
 
-    r300->rws->cs_destroy(r300->cs);
+    if (r300->zmask_mm)
+        r300_hyperz_destroy_mm(r300);
+
+    if (r300->cs)
+        r300->rws->cs_destroy(r300->cs);
 
+    /* XXX: No way to tell if this was initialized or not? */
     util_mempool_destroy(&r300->pool_transfers);
 
     r300_update_num_contexts(r300->screen, -1);
 
-    FREE(r300->aa_state.state);
-    FREE(r300->blend_color_state.state);
-    FREE(r300->clip_state.state);
-    FREE(r300->fb_state.state);
-    FREE(r300->gpu_flush.state);
-    FREE(r300->hyperz_state.state);
-    FREE(r300->invariant_state.state);
-    FREE(r300->rs_block_state.state);
-    FREE(r300->scissor_state.state);
-    FREE(r300->textures_state.state);
-    FREE(r300->vap_invariant_state.state);
-    FREE(r300->viewport_state.state);
-    FREE(r300->ztop_state.state);
-    FREE(r300->fs_constants.state);
-    FREE(r300->vs_constants.state);
-    if (!r300->screen->caps.has_tcl) {
-        FREE(r300->vertex_stream_state.state);
+    /* Free the structs allocated in r300_setup_atoms() */
+    if (r300->aa_state.state) {
+        FREE(r300->aa_state.state);
+        FREE(r300->blend_color_state.state);
+        FREE(r300->clip_state.state);
+        FREE(r300->fb_state.state);
+        FREE(r300->gpu_flush.state);
+        FREE(r300->hyperz_state.state);
+        FREE(r300->invariant_state.state);
+        FREE(r300->rs_block_state.state);
+        FREE(r300->scissor_state.state);
+        FREE(r300->textures_state.state);
+        FREE(r300->vap_invariant_state.state);
+        FREE(r300->viewport_state.state);
+        FREE(r300->ztop_state.state);
+        FREE(r300->fs_constants.state);
+        FREE(r300->vs_constants.state);
+        if (!r300->screen->caps.has_tcl) {
+            FREE(r300->vertex_stream_state.state);
+        }
     }
     FREE(r300);
 }
@@ -158,12 +168,14 @@ void r300_flush_cb(void *data)
 }
 
 #define R300_INIT_ATOM(atomname, atomsize) \
+ do { \
     r300->atomname.name = #atomname; \
     r300->atomname.state = NULL; \
     r300->atomname.size = atomsize; \
     r300->atomname.emit = r300_emit_##atomname; \
     r300->atomname.dirty = FALSE; \
-    insert_at_tail(&r300->atom_list, &r300->atomname);
+    insert_at_tail(&r300->atom_list, &r300->atomname); \
+ } while (0)
 
 static void r300_setup_atoms(struct r300_context* r300)
 {
@@ -404,19 +416,21 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
 
     r300->context.destroy = r300_destroy_context;
 
-    r300->cs = rws->cs_create(rws);
+    make_empty_list(&r300->query_list);
 
     util_mempool_create(&r300->pool_transfers,
                         sizeof(struct pipe_transfer), 64,
                         UTIL_MEMPOOL_SINGLETHREADED);
 
+    r300->cs = rws->cs_create(rws);
+    if (r300->cs == NULL)
+        goto fail;
+
     if (!r300screen->caps.has_tcl) {
         /* Create a Draw. This is used for SW TCL. */
         r300->draw = draw_create(&r300->context);
         /* Enable our renderer. */
         draw_set_rasterize_stage(r300->draw, r300_draw_stage(r300));
-        /* Enable Draw's clipping. */
-        draw_set_driver_clipping(r300->draw, FALSE);
         /* Disable converting points/lines to triangles. */
         draw_wide_line_threshold(r300->draw, 10000000.f);
         draw_wide_point_threshold(r300->draw, 10000000.f);
@@ -424,8 +438,6 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
 
     r300_setup_atoms(r300);
 
-    make_empty_list(&r300->query_list);
-
     r300_init_blit_functions(r300);
     r300_init_flush_functions(r300);
     r300_init_query_functions(r300);
@@ -433,6 +445,8 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
     r300_init_resource_functions(r300);
 
     r300->blitter = util_blitter_create(&r300->context);
+    if (r300->blitter == NULL)
+        goto fail;
 
     /* Render functions must be initialized after blitter. */
     r300_init_render_functions(r300);
@@ -441,22 +455,25 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
 
     /* setup hyper-z mm */
     if (r300->rws->get_value(r300->rws, R300_CAN_HYPERZ))
-        r300_hyperz_init_mm(r300);
+        if (!r300_hyperz_init_mm(r300))
+            goto fail;
 
     r300->upload_ib = u_upload_create(&r300->context,
 				      32 * 1024, 16,
 				      PIPE_BIND_INDEX_BUFFER);
 
     if (r300->upload_ib == NULL)
-        goto no_upload_ib;
+        goto fail;
 
     r300->upload_vb = u_upload_create(&r300->context,
 				      128 * 1024, 16,
 				      PIPE_BIND_VERTEX_BUFFER);
     if (r300->upload_vb == NULL)
-        goto no_upload_vb;
+        goto fail;
 
     r300->tran.translate_cache = translate_cache_create();
+    if (r300->tran.translate_cache == NULL)
+        goto fail;
 
     r300_init_states(&r300->context);
 
@@ -486,10 +503,8 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
 
     return &r300->context;
 
- no_upload_ib:
-    u_upload_destroy(r300->upload_ib);
- no_upload_vb:
-    FREE(r300);
+ fail:
+    r300_destroy_context(&r300->context);
     return NULL;
 }
 
diff --git a/src/gallium/drivers/r300/r300_context.h b/src/gallium/drivers/r300/r300_context.h
index 6fa7f470f9..8f0e86fd37 100644
--- a/src/gallium/drivers/r300/r300_context.h
+++ b/src/gallium/drivers/r300/r300_context.h
@@ -254,8 +254,8 @@ struct r300_ztop_state {
 struct r300_constant_buffer {
     /* Buffer of constants */
     uint32_t *ptr;
-    /* Total number of vec4s */
-    unsigned count;
+    /* Remapping table. */
+    unsigned *remap_table;
 };
 
 /* Query object.
@@ -449,6 +449,7 @@ struct r300_context {
     struct r300_screen *screen;
     /* Draw module. Used mostly for SW TCL. */
     struct draw_context* draw;
+    size_t draw_vbo_size;
     /* Accelerated blit support. */
     struct blitter_context* blitter;
     /* Stencil two-sided reference value fallback. */
@@ -649,6 +650,11 @@ void r300_translate_index_buffer(struct r300_context *r300,
 /* r300_render_stencilref.c */
 void r300_plug_in_stencil_ref_fallback(struct r300_context *r300);
 
+/* r300_render.c */
+void r300_draw_flush_vbuf(struct r300_context *r300);
+boolean r500_index_bias_supported(struct r300_context *r300);
+void r500_emit_index_bias(struct r300_context *r300, int index_bias);
+
 /* r300_state.c */
 enum r300_fb_state_change {
     R300_CHANGED_FB_STATE = 0,
diff --git a/src/gallium/drivers/r300/r300_emit.c b/src/gallium/drivers/r300/r300_emit.c
index d0fd45349e..232259e21d 100644
--- a/src/gallium/drivers/r300/r300_emit.c
+++ b/src/gallium/drivers/r300/r300_emit.c
@@ -180,9 +180,18 @@ void r300_emit_fs_constants(struct r300_context* r300, unsigned size, void *stat
 
     BEGIN_CS(size);
     OUT_CS_REG_SEQ(R300_PFS_PARAM_0_X, count * 4);
-    for (i = 0; i < count; i++)
-        for (j = 0; j < 4; j++)
-            OUT_CS(pack_float24(*(float*)&buf->ptr[i*4+j]));
+    if (buf->remap_table){
+        for (i = 0; i < count; i++) {
+            float *data = (float*)&buf->ptr[buf->remap_table[i]*4];
+            for (j = 0; j < 4; j++)
+                OUT_CS(pack_float24(data[j]));
+        }
+    } else {
+        for (i = 0; i < count; i++)
+            for (j = 0; j < 4; j++)
+                OUT_CS(pack_float24(*(float*)&buf->ptr[i*4+j]));
+    }
+
     END_CS;
 }
 
@@ -226,7 +235,7 @@ void r500_emit_fs_constants(struct r300_context* r300, unsigned size, void *stat
 {
     struct r300_fragment_shader *fs = r300_fs(r300);
     struct r300_constant_buffer *buf = (struct r300_constant_buffer*)state;
-    unsigned count = fs->shader->externals_count * 4;
+    unsigned count = fs->shader->externals_count;
     CS_LOCALS(r300);
 
     if (count == 0)
@@ -234,8 +243,15 @@ void r500_emit_fs_constants(struct r300_context* r300, unsigned size, void *stat
 
     BEGIN_CS(size);
     OUT_CS_REG(R500_GA_US_VECTOR_INDEX, R500_GA_US_VECTOR_INDEX_TYPE_CONST);
-    OUT_CS_ONE_REG(R500_GA_US_VECTOR_DATA, count);
-    OUT_CS_TABLE(buf->ptr, count);
+    OUT_CS_ONE_REG(R500_GA_US_VECTOR_DATA, count * 4);
+    if (buf->remap_table){
+        for (unsigned i = 0; i < count; i++) {
+            uint32_t *data = &buf->ptr[buf->remap_table[i]*4];
+            OUT_CS_TABLE(data, 4);
+        }
+    } else {
+        OUT_CS_TABLE(buf->ptr, count * 4);
+    }
     END_CS;
 }
 
@@ -893,7 +909,7 @@ void r300_emit_vs_state(struct r300_context* r300, unsigned size, void* state)
 
     unsigned pvs_num_slots = MIN3(vtx_mem_size / input_count,
                                   vtx_mem_size / output_count, 10);
-    unsigned pvs_num_controllers = MIN2(vtx_mem_size / temp_count, 6);
+    unsigned pvs_num_controllers = MIN2(vtx_mem_size / temp_count, 5);
 
     unsigned imm_first = vs->externals_count;
     unsigned imm_end = vs->code.constants.Count;
@@ -961,6 +977,7 @@ void r300_emit_vs_constants(struct r300_context* r300,
     unsigned count =
         ((struct r300_vertex_shader*)r300->vs_state.state)->externals_count;
     struct r300_constant_buffer *buf = (struct r300_constant_buffer*)state;
+    unsigned i;
     CS_LOCALS(r300);
 
     if (!count)
@@ -971,7 +988,14 @@ void r300_emit_vs_constants(struct r300_context* r300,
                (r300->screen->caps.is_r500 ?
                R500_PVS_CONST_START : R300_PVS_CONST_START));
     OUT_CS_ONE_REG(R300_VAP_PVS_UPLOAD_DATA, count * 4);
-    OUT_CS_TABLE(buf->ptr, count * 4);
+    if (buf->remap_table){
+        for (i = 0; i < count; i++) {
+            uint32_t *data = &buf->ptr[buf->remap_table[i]*4];
+            OUT_CS_TABLE(data, 4);
+        }
+    } else {
+        OUT_CS_TABLE(buf->ptr, count * 4);
+    }
     END_CS;
 }
 
@@ -1219,6 +1243,8 @@ unsigned r300_get_num_cs_end_dwords(struct r300_context *r300)
     /* Emitted in flush. */
     dwords += 26; /* emit_query_end */
     dwords += r300->hyperz_state.size + 2; /* emit_hyperz_end + zcache flush */
+    if (r500_index_bias_supported(r300))
+        dwords += 2;
 
     return dwords;
 }
diff --git a/src/gallium/drivers/r300/r300_flush.c b/src/gallium/drivers/r300/r300_flush.c
index fe182b6615..2b5d2e42ba 100644
--- a/src/gallium/drivers/r300/r300_flush.c
+++ b/src/gallium/drivers/r300/r300_flush.c
@@ -43,9 +43,14 @@ static void r300_flush(struct pipe_context* pipe,
     u_upload_flush(r300->upload_vb);
     u_upload_flush(r300->upload_ib);
 
+    if (r300->draw)
+	r300_draw_flush_vbuf(r300);
+
     if (r300->dirty_hw) {
         r300_emit_hyperz_end(r300);
         r300_emit_query_end(r300);
+        if (r500_index_bias_supported(r300))
+            r500_emit_index_bias(r300, 0);
 
         r300->flush_counter++;
         r300->rws->cs_flush(r300->cs);
diff --git a/src/gallium/drivers/r300/r300_fs.c b/src/gallium/drivers/r300/r300_fs.c
index 2a0c30620a..9845e54610 100644
--- a/src/gallium/drivers/r300/r300_fs.c
+++ b/src/gallium/drivers/r300/r300_fs.c
@@ -257,17 +257,17 @@ static void r300_emit_fs_code_to_buffer(
         shader->cb_code_size = 19 +
                                ((code->inst_end + 1) * 6) +
                                imm_count * 7 +
-			       code->int_constant_count * 2;
+                               code->int_constant_count * 2;
 
         NEW_CB(shader->cb_code, shader->cb_code_size);
         OUT_CB_REG(R500_US_CONFIG, R500_ZERO_TIMES_ANYTHING_EQUALS_ZERO);
         OUT_CB_REG(R500_US_PIXSIZE, code->max_temp_idx);
         OUT_CB_REG(R500_US_FC_CTRL, code->us_fc_ctrl);
         for(i = 0; i < code->int_constant_count; i++){
-		OUT_CB_REG(R500_US_FC_INT_CONST_0 + (i * 4),
-						code->int_constants[i]);
-	}
-	OUT_CB_REG(R500_US_CODE_RANGE,
+                OUT_CB_REG(R500_US_FC_INT_CONST_0 + (i * 4),
+                                                code->int_constants[i]);
+        }
+        OUT_CB_REG(R500_US_CODE_RANGE,
                    R500_US_CODE_RANGE_ADDR(0) | R500_US_CODE_RANGE_SIZE(code->inst_end));
         OUT_CB_REG(R500_US_CODE_OFFSET, 0);
         OUT_CB_REG(R500_US_CODE_ADDR,
@@ -386,6 +386,7 @@ static void r300_translate_fragment_shader(
     compiler.state = shader->compare_state;
     compiler.Base.is_r500 = r300->screen->caps.is_r500;
     compiler.Base.max_temp_regs = compiler.Base.is_r500 ? 128 : 32;
+    compiler.Base.remove_unused_constants = TRUE;
     compiler.AllocateHwInputs = &allocate_hardware_inputs;
     compiler.UserData = &shader->inputs;
 
@@ -431,9 +432,8 @@ static void r300_translate_fragment_shader(
     }
 
     if (compiler.Base.Error) {
-        DBG(r300, DBG_FP, "r300 FP: Compiler Error:\n%sUsing a dummy shader"
-                " instead.\nIf there's an 'unknown opcode' message, please"
-                " file a bug report and attach this log.\n", compiler.Base.ErrorMsg);
+        fprintf(stderr, "r300 FP: Compiler Error:\n%sUsing a dummy shader"
+                " instead.\n", compiler.Base.ErrorMsg);
 
         if (shader->dummy) {
             fprintf(stderr, "r300 FP: Cannot compile the dummy shader! "
@@ -447,7 +447,12 @@ static void r300_translate_fragment_shader(
     }
 
     /* Initialize numbers of constants for each type. */
-    shader->externals_count = ttr.immediate_offset;
+    shader->externals_count = 0;
+    for (i = 0;
+         i < shader->code.constants.Count &&
+         shader->code.constants.Constants[i].Type == RC_CONSTANT_EXTERNAL; i++) {
+        shader->externals_count = i+1;
+    }
     shader->immediates_count = 0;
     shader->rc_state_count = 0;
 
diff --git a/src/gallium/drivers/r300/r300_hyperz.c b/src/gallium/drivers/r300/r300_hyperz.c
index 811b5646e1..eb5b0c36f8 100644
--- a/src/gallium/drivers/r300/r300_hyperz.c
+++ b/src/gallium/drivers/r300/r300_hyperz.c
@@ -354,7 +354,12 @@ void r300_zmask_alloc_block(struct r300_context *r300, struct r300_surface *surf
     /* We currently don't handle decompression for 3D textures and cubemaps
      * correctly. */
     if (tex->desc.b.b.target != PIPE_TEXTURE_1D &&
-        tex->desc.b.b.target != PIPE_TEXTURE_2D)
+        tex->desc.b.b.target != PIPE_TEXTURE_2D &&
+        tex->desc.b.b.target != PIPE_TEXTURE_RECT)
+        return;
+
+    /* Cannot flush zmask of 16-bit zbuffers. */
+    if (util_format_get_blocksizebits(tex->desc.b.b.format) == 16)
         return;
 
     if (tex->zmask_mem[level])
@@ -373,23 +378,36 @@ void r300_zmask_alloc_block(struct r300_context *r300, struct r300_surface *surf
     return;
 }
 
-void r300_hyperz_init_mm(struct r300_context *r300)
+boolean r300_hyperz_init_mm(struct r300_context *r300)
 {
     struct r300_screen* r300screen = r300->screen;
     int frag_pipes = r300screen->caps.num_frag_pipes;
 
-    if (r300screen->caps.hiz_ram)
+    r300->zmask_mm = u_mmInit(0, r300screen->caps.zmask_ram * frag_pipes);
+    if (!r300->zmask_mm) /* the zmask manager could not be created */
+      return FALSE;
+
+    if (r300screen->caps.hiz_ram) { /* a HiZ manager is only created when hiz_ram is non-zero */
       r300->hiz_mm = u_mmInit(0, r300screen->caps.hiz_ram * frag_pipes);
+      if (!r300->hiz_mm) {
+        u_mmDestroy(r300->zmask_mm); /* roll back the zmask manager created above */
+        r300->zmask_mm = NULL;
+        return FALSE;
+      }
+    }
 
-    r300->zmask_mm = u_mmInit(0, r300screen->caps.zmask_ram * frag_pipes);
+    return TRUE; /* zmask_mm exists, and hiz_mm too when hiz_ram is non-zero */
 }
 
 void r300_hyperz_destroy_mm(struct r300_context *r300)
 {
     struct r300_screen* r300screen = r300->screen;
 
-    if (r300screen->caps.hiz_ram)
+    if (r300screen->caps.hiz_ram) {
       u_mmDestroy(r300->hiz_mm);
+      r300->hiz_mm = NULL; /* do not keep a pointer to the destroyed manager */
+    }
 
     u_mmDestroy(r300->zmask_mm);
+    r300->zmask_mm = NULL; /* do not keep a pointer to the destroyed manager */
 }
diff --git a/src/gallium/drivers/r300/r300_hyperz.h b/src/gallium/drivers/r300/r300_hyperz.h
index 09e1ff6625..30a23ec649 100644
--- a/src/gallium/drivers/r300/r300_hyperz.h
+++ b/src/gallium/drivers/r300/r300_hyperz.h
@@ -30,6 +30,6 @@ void r300_update_hyperz_state(struct r300_context* r300);
 void r300_hiz_alloc_block(struct r300_context *r300, struct r300_surface *surf);
 void r300_zmask_alloc_block(struct r300_context *r300, struct r300_surface *surf, int compress);
 
-void r300_hyperz_init_mm(struct r300_context *r300);
+boolean r300_hyperz_init_mm(struct r300_context *r300);
 void r300_hyperz_destroy_mm(struct r300_context *r300);
 #endif
diff --git a/src/gallium/drivers/r300/r300_reg.h b/src/gallium/drivers/r300/r300_reg.h
index 60d3b600cb..6bea783f69 100644
--- a/src/gallium/drivers/r300/r300_reg.h
+++ b/src/gallium/drivers/r300/r300_reg.h
@@ -1607,6 +1607,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #	define R300_TX_FORMAT_3D		   (1 << 25)
 #	define R300_TX_FORMAT_CUBIC_MAP		   (2 << 25)
+#	define R300_TX_FORMAT_TEX_COORD_TYPE_MASK  (0x3 << 25)
 
 	/* alpha modes, convenience mostly */
 	/* if you have alpha, pick constant appropriate to the
diff --git a/src/gallium/drivers/r300/r300_render.c b/src/gallium/drivers/r300/r300_render.c
index 86b11ca045..20bad2c56f 100644
--- a/src/gallium/drivers/r300/r300_render.c
+++ b/src/gallium/drivers/r300/r300_render.c
@@ -118,13 +118,13 @@ static uint32_t r300_provoking_vertex_fixes(struct r300_context *r300,
     return color_control;
 }
 
-static boolean index_bias_supported(struct r300_context *r300)
+boolean r500_index_bias_supported(struct r300_context *r300)
 {
     return r300->screen->caps.is_r500 &&
            r300->rws->get_value(r300->rws, R300_VID_DRM_2_3_0);
 }
 
-static void r500_emit_index_bias(struct r300_context *r300, int index_bias)
+void r500_emit_index_bias(struct r300_context *r300, int index_bias)
 {
     CS_LOCALS(r300);
 
@@ -199,7 +199,7 @@ static void r300_prepare_for_rendering(struct r300_context *r300,
     boolean emit_aos       = flags & PREP_EMIT_AOS;
     boolean emit_aos_swtcl = flags & PREP_EMIT_AOS_SWTCL;
     boolean indexed        = flags & PREP_INDEXED;
-    boolean hw_index_bias  = index_bias_supported(r300);
+    boolean hw_index_bias  = r500_index_bias_supported(r300);
 
     /* Add dirty state, index offset, and AOS. */
     if (first_draw) {
@@ -506,7 +506,7 @@ static void r300_draw_range_elements(struct pipe_context* pipe,
         translate = TRUE;
     }
 
-    if (indexBias && !index_bias_supported(r300)) {
+    if (indexBias && !r500_index_bias_supported(r300)) {
         r300_split_index_bias(r300, indexBias, &buffer_offset, &index_offset);
     }
 
@@ -680,18 +680,11 @@ static void r300_swtcl_draw_vbo(struct pipe_context* pipe,
     if (info->indexed && r300->index_buffer.buffer) {
         indices = pipe_buffer_map(pipe, r300->index_buffer.buffer,
                                   PIPE_TRANSFER_READ, &ib_transfer);
-        if (indices)
-            indices = (void *) ((char *) indices + r300->index_buffer.offset);
     }
 
-    draw_set_mapped_element_buffer_range(r300->draw, (indices) ?
-                                         r300->index_buffer.index_size : 0,
-                                         info->index_bias,
-                                         info->min_index,
-                                         info->max_index,
-                                         indices);
+    draw_set_mapped_index_buffer(r300->draw, indices);
 
-    draw_arrays(r300->draw, info->mode, info->start, count);
+    draw_vbo(r300->draw, info);
 
     /* XXX Not sure whether this is the best fix.
      * It prevents CS from being rejected and weird assertion failures. */
@@ -707,8 +700,7 @@ static void r300_swtcl_draw_vbo(struct pipe_context* pipe,
 
     if (ib_transfer) {
         pipe_buffer_unmap(pipe, r300->index_buffer.buffer, ib_transfer);
-        draw_set_mapped_element_buffer_range(r300->draw, 0, 0, info->start,
-                info->start + count - 1, NULL);
+        draw_set_mapped_index_buffer(r300->draw, NULL);
     }
 }
 
@@ -726,8 +718,6 @@ struct r300_render {
     unsigned hwprim;
 
     /* VBO */
-    struct pipe_resource* vbo;
-    size_t vbo_size;
     size_t vbo_offset;
     size_t vbo_max_used;
     void * vbo_ptr;
@@ -759,31 +749,31 @@ static boolean r300_render_allocate_vertices(struct vbuf_render* render,
     struct pipe_screen* screen = r300->context.screen;
     size_t size = (size_t)vertex_size * (size_t)count;
 
-    if (size + r300render->vbo_offset > r300render->vbo_size)
+    if (size + r300render->vbo_offset > r300->draw_vbo_size)
     {
-        pipe_resource_reference(&r300->vbo, NULL);
-        r300render->vbo = pipe_buffer_create(screen,
-                                             PIPE_BIND_VERTEX_BUFFER,
-                                             R300_MAX_DRAW_VBO_SIZE);
+	pipe_resource_reference(&r300->vbo, NULL);
+        r300->vbo = pipe_buffer_create(screen,
+				       PIPE_BIND_VERTEX_BUFFER,
+				       R300_MAX_DRAW_VBO_SIZE);
         r300render->vbo_offset = 0;
-        r300render->vbo_size = R300_MAX_DRAW_VBO_SIZE;
+        r300->draw_vbo_size = R300_MAX_DRAW_VBO_SIZE;
     }
 
     r300render->vertex_size = vertex_size;
-    r300->vbo = r300render->vbo;
     r300->vbo_offset = r300render->vbo_offset;
 
-    return (r300render->vbo) ? TRUE : FALSE;
+    return (r300->vbo) ? TRUE : FALSE;
 }
 
 static void* r300_render_map_vertices(struct vbuf_render* render)
 {
     struct r300_render* r300render = r300_render(render);
+    struct r300_context* r300 = r300render->r300;
 
     assert(!r300render->vbo_transfer);
 
     r300render->vbo_ptr = pipe_buffer_map(&r300render->r300->context,
-					  r300render->vbo,
+					  r300->vbo,
                                           PIPE_TRANSFER_WRITE,
 					  &r300render->vbo_transfer);
 
@@ -798,12 +788,13 @@ static void r300_render_unmap_vertices(struct vbuf_render* render,
 {
     struct r300_render* r300render = r300_render(render);
     struct pipe_context* context = &r300render->r300->context;
+    struct r300_context* r300 = r300render->r300;
 
     assert(r300render->vbo_transfer);
 
     r300render->vbo_max_used = MAX2(r300render->vbo_max_used,
                                     r300render->vertex_size * (max + 1));
-    pipe_buffer_unmap(context, r300render->vbo, r300render->vbo_transfer);
+    pipe_buffer_unmap(context, r300->vbo, r300render->vbo_transfer);
 
     r300render->vbo_transfer = NULL;
 }
@@ -880,7 +871,7 @@ static void r300_render_draw_elements(struct vbuf_render* render,
     struct r300_context* r300 = r300render->r300;
     int i;
     unsigned end_cs_dwords;
-    unsigned max_index = (r300render->vbo_size - r300render->vbo_offset) /
+    unsigned max_index = (r300->draw_vbo_size - r300render->vbo_offset) /
                          (r300render->r300->vertex_info.size * 4) - 1;
     unsigned short_count;
     unsigned free_dwords;
@@ -956,8 +947,6 @@ static struct vbuf_render* r300_render_create(struct r300_context* r300)
     r300render->base.release_vertices = r300_render_release_vertices;
     r300render->base.destroy = r300_render_destroy;
 
-    r300render->vbo = NULL;
-    r300render->vbo_size = 0;
     r300render->vbo_offset = 0;
 
     return &r300render->base;
@@ -986,6 +975,12 @@ struct draw_stage* r300_draw_stage(struct r300_context* r300)
     return stage;
 }
 
+void r300_draw_flush_vbuf(struct r300_context *r300)
+{
+    pipe_resource_reference(&r300->vbo, NULL); /* drop the context's reference to the SW TCL vertex buffer */
+    r300->draw_vbo_size = 0; /* with size 0, the size check in r300_render_allocate_vertices fails for any non-empty request, so a new buffer is created */
+}
+
 /****************************************************************************
  *                         End of SW TCL functions                          *
  ***************************************************************************/
diff --git a/src/gallium/drivers/r300/r300_state.c b/src/gallium/drivers/r300/r300_state.c
index 239edd98e3..8ccb63964e 100644
--- a/src/gallium/drivers/r300/r300_state.c
+++ b/src/gallium/drivers/r300/r300_state.c
@@ -23,7 +23,7 @@
 
 #include "draw/draw_context.h"
 
-#include "util/u_blitter.h"
+#include "util/u_framebuffer.h"
 #include "util/u_math.h"
 #include "util/u_mm.h"
 #include "util/u_memory.h"
@@ -748,7 +748,7 @@ static void
     /* The tiling flags are dependent on the surface miplevel, unfortunately. */
     r300_fb_set_tiling_flags(r300, state);
 
-    util_assign_framebuffer_state(r300->fb_state.state, state);
+    util_copy_framebuffer_state(r300->fb_state.state, state);
 
     r300_mark_fb_state_dirty(r300, R300_CHANGED_FB_STATE);
 
@@ -865,6 +865,9 @@ void r300_mark_fs_code_dirty(struct r300_context *r300)
         r300->fs_rc_constant_state.size = fs->shader->rc_state_count * 5;
         r300->fs_constants.size = fs->shader->externals_count * 4 + 1;
     }
+
+    ((struct r300_constant_buffer*)r300->fs_constants.state)->remap_table =
+            fs->shader->code.constants_remap_table;
 }
 
 /* Bind fragment shader state. */
@@ -937,9 +940,9 @@ static void* r300_create_rs_state(struct pipe_context* pipe,
     uint32_t stuffing_enable;       /* R300_GB_ENABLE: 0x4008 */
 
     /* Point sprites texture coordinates, 0: lower left, 1: upper right */
-    float point_texcoord_left;      /* R300_GA_POINT_S0: 0x4200 */
+    float point_texcoord_left = 0;  /* R300_GA_POINT_S0: 0x4200 */
     float point_texcoord_bottom = 0;/* R300_GA_POINT_T0: 0x4204 */
-    float point_texcoord_right;     /* R300_GA_POINT_S1: 0x4208 */
+    float point_texcoord_right = 1; /* R300_GA_POINT_S1: 0x4208 */
     float point_texcoord_top = 0;   /* R300_GA_POINT_T1: 0x420c */
     CB_LOCALS;
 
@@ -947,6 +950,11 @@ static void* r300_create_rs_state(struct pipe_context* pipe,
     rs->rs = *state;
     rs->rs_draw = *state;
 
+    /* Generate point sprite texture coordinates in GENERIC0
+     * if point_quad_rasterization is TRUE. */
+    rs->rs.sprite_coord_enable = state->point_quad_rasterization *
+                                 (state->sprite_coord_enable | 1);
+
     /* Override some states for Draw. */
     rs->rs_draw.sprite_coord_enable = 0; /* We can do this in HW. */
 
@@ -1048,16 +1056,13 @@ static void* r300_create_rs_state(struct pipe_context* pipe,
 
     /* Point sprites */
     stuffing_enable = 0;
-    if (state->sprite_coord_enable) {
+    if (rs->rs.sprite_coord_enable) {
         stuffing_enable = R300_GB_POINT_STUFF_ENABLE;
-	for (i = 0; i < 8; i++) {
-	    if (state->sprite_coord_enable & (1 << i))
+        for (i = 0; i < 8; i++) {
+            if (rs->rs.sprite_coord_enable & (1 << i))
                 stuffing_enable |=
                     R300_GB_TEX_ST << (R300_GB_TEX0_SOURCE_SHIFT + (i*2));
-	}
-
-        point_texcoord_left = 0.0f;
-        point_texcoord_right = 1.0f;
+        }
 
         switch (state->sprite_coord_mode) {
             case PIPE_SPRITE_COORD_UPPER_LEFT:
@@ -1208,8 +1213,8 @@ static void*
 
     /* Unfortunately, r300-r500 don't support floating-point mipmap lods. */
     /* We must pass these to the merge function to clamp them properly. */
-    sampler->min_lod = MAX2((unsigned)state->min_lod, 0);
-    sampler->max_lod = MAX2((unsigned)ceilf(state->max_lod), 0);
+    sampler->min_lod = (unsigned)MAX2(state->min_lod, 0);
+    sampler->max_lod = (unsigned)MAX2(ceilf(state->max_lod), 0);
 
     lod_bias = CLAMP((int)(state->lod_bias * 32 + 1), -(1 << 9), (1 << 9) - 1);
 
@@ -1548,7 +1553,12 @@ static void r300_set_index_buffer(struct pipe_context* pipe,
         memset(&r300->index_buffer, 0, sizeof(r300->index_buffer));
     }
 
-    /* TODO make this more like a state */
+    if (r300->screen->caps.has_tcl) {
+       /* TODO make this more like a state */
+    }
+    else {
+       draw_set_index_buffer(r300->draw, ib);
+    }
 }
 
 /* Initialize the PSC tables. */
@@ -1765,6 +1775,9 @@ static void r300_bind_vs_state(struct pipe_context* pipe, void* shader)
             r300->vs_constants.size = 0;
         }
 
+        ((struct r300_constant_buffer*)r300->vs_constants.state)->remap_table =
+                vs->code.constants_remap_table;
+
         r300->pvs_flush.dirty = TRUE;
     } else {
         draw_bind_vertex_shader(r300->draw,
@@ -1779,6 +1792,8 @@ static void r300_delete_vs_state(struct pipe_context* pipe, void* shader)
 
     if (r300->screen->caps.has_tcl) {
         rc_constants_destroy(&vs->code.constants);
+        if (vs->code.constants_remap_table)
+            FREE(vs->code.constants_remap_table);
     } else {
         draw_delete_vertex_shader(r300->draw,
                 (struct draw_vertex_shader*)vs->draw_vs);
@@ -1795,47 +1810,28 @@ static void r300_set_constant_buffer(struct pipe_context *pipe,
     struct r300_context* r300 = r300_context(pipe);
     struct r300_constant_buffer *cbuf;
     uint32_t *mapped = r300_buffer(buf)->user_buffer;
-    int max_size = 0, max_size_bytes = 0, clamped_size = 0;
 
     switch (shader) {
         case PIPE_SHADER_VERTEX:
             cbuf = (struct r300_constant_buffer*)r300->vs_constants.state;
-            max_size = 256;
             break;
         case PIPE_SHADER_FRAGMENT:
             cbuf = (struct r300_constant_buffer*)r300->fs_constants.state;
-            if (r300->screen->caps.is_r500) {
-                max_size = 256;
-            } else {
-                max_size = 32;
-            }
             break;
         default:
             assert(0);
             return;
     }
-    max_size_bytes = max_size * 4 * sizeof(float);
 
     if (buf == NULL || buf->width0 == 0 ||
         (mapped = r300_buffer(buf)->constant_buffer) == NULL) {
-        cbuf->count = 0;
         return;
     }
 
     if (shader == PIPE_SHADER_FRAGMENT ||
         (shader == PIPE_SHADER_VERTEX && r300->screen->caps.has_tcl)) {
         assert((buf->width0 % (4 * sizeof(float))) == 0);
-
-        /* Check the size of the constant buffer. */
-        /* XXX Subtract immediates and RC_STATE_* variables. */
-        if (buf->width0 > max_size_bytes) {
-            fprintf(stderr, "r300: Max size of the constant buffer is "
-                          "%i*4 floats.\n", max_size);
-        }
-
-        clamped_size = MIN2(buf->width0, max_size_bytes);
-        cbuf->count = clamped_size / (4 * sizeof(float));
-        cbuf->ptr = mapped;
+        cbuf->ptr = mapped + index*4;
     }
 
     if (shader == PIPE_SHADER_VERTEX) {
diff --git a/src/gallium/drivers/r300/r300_state_derived.c b/src/gallium/drivers/r300/r300_state_derived.c
index 4a63ed7fc1..960dfdbaf0 100644
--- a/src/gallium/drivers/r300/r300_state_derived.c
+++ b/src/gallium/drivers/r300/r300_state_derived.c
@@ -211,7 +211,7 @@ static void r300_rs_col(struct r300_rs_block* rs, int id, int ptr,
 static void r300_rs_col_write(struct r300_rs_block* rs, int id, int fp_offset,
                               enum r300_rs_col_write_type type)
 {
-    assert(type != WRITE_COLOR);
+    assert(type == WRITE_COLOR); /* WRITE_COLOR is the only type this helper accepts */
     rs->inst[id] |= R300_RS_INST_COL_CN_WRITE |
                     R300_RS_INST_COL_ADDR(fp_offset);
 }
@@ -592,6 +592,25 @@ static void r300_merge_textures_and_samplers(struct r300_context* r300)
             texstate->filter1 = sampler->filter1;
             texstate->border_color = sampler->border_color;
 
+            /* determine min/max levels */
+            max_level = MIN3(sampler->max_lod + view->base.first_level,
+                             tex->desc.b.b.last_level, view->base.last_level);
+            min_level = MIN2(sampler->min_lod + view->base.first_level,
+                             max_level);
+
+            if (tex->desc.is_npot && min_level > 0) {
+                /* Even though we do not implement mipmapping for NPOT
+                 * textures, we should at least honor the minimum level
+                 * which is allowed to be displayed. We do this by setting up
+                 * an i-th mipmap level as the zero level. */
+                r300_texture_setup_format_state(r300->screen, &tex->desc,
+                                                min_level,
+                                                &texstate->format);
+                texstate->format.tile_config |=
+                        tex->desc.offset_in_bytes[min_level] & 0xffffffe0;
+                assert((tex->desc.offset_in_bytes[min_level] & 0x1f) == 0);
+            }
+
             /* Assign a texture cache region. */
             texstate->format.format1 |= view->texcache_region;
 
@@ -654,12 +673,7 @@ static void r300_merge_textures_and_samplers(struct r300_context* r300)
                     texstate->filter0 |= R300_TX_WRAP_T(R300_TX_CLAMP_TO_EDGE);
                 }
             } else {
-                /* determine min/max levels */
                 /* the MAX_MIP level is the largest (finest) one */
-                max_level = MIN3(sampler->max_lod + view->base.first_level,
-                                 tex->desc.b.b.last_level, view->base.last_level);
-                min_level = MIN2(sampler->min_lod + view->base.first_level,
-                                 max_level);
                 texstate->format.format0 |= R300_TX_NUM_LEVELS(max_level);
                 texstate->filter0 |= R300_TX_MAX_MIP_LEVEL(min_level);
             }
diff --git a/src/gallium/drivers/r300/r300_texture.c b/src/gallium/drivers/r300/r300_texture.c
index da8eadd3b5..66f6d80bd0 100644
--- a/src/gallium/drivers/r300/r300_texture.c
+++ b/src/gallium/drivers/r300/r300_texture.c
@@ -541,48 +541,55 @@ boolean r300_is_sampler_format_supported(enum pipe_format format)
     return r300_translate_texformat(format, 0, TRUE) != ~0;
 }
 
-static void r300_texture_setup_immutable_state(struct r300_screen* screen,
-                                               struct r300_texture* tex)
+void r300_texture_setup_format_state(struct r300_screen *screen,
+                                     struct r300_texture_desc *desc,
+                                     unsigned level, /* mip level whose size, stride and macrotiling are encoded */
+                                     struct r300_texture_format_state *out) /* updated in place */
 {
-    struct r300_texture_format_state* f = &tex->tx_format;
-    struct pipe_resource *pt = &tex->desc.b.b;
+    struct pipe_resource *pt = &desc->b.b;
     boolean is_r500 = screen->caps.is_r500;
 
+    /* Mask out all the fields we change; other bits of format1/format2 are kept. */
+    out->format0 = 0;
+    out->format1 &= ~R300_TX_FORMAT_TEX_COORD_TYPE_MASK;
+    out->format2 &= R500_TXFORMAT_MSB;
+    out->tile_config = 0;
+
     /* Set sampler state. */
-    f->format0 = R300_TX_WIDTH((pt->width0 - 1) & 0x7ff) |
-                 R300_TX_HEIGHT((pt->height0 - 1) & 0x7ff);
+    out->format0 = R300_TX_WIDTH((u_minify(pt->width0, level) - 1) & 0x7ff) |
+                   R300_TX_HEIGHT((u_minify(pt->height0, level) - 1) & 0x7ff);
 
-    if (tex->desc.uses_stride_addressing) {
+    if (desc->uses_stride_addressing) {
         /* rectangles love this */
-        f->format0 |= R300_TX_PITCH_EN;
-        f->format2 = (tex->desc.stride_in_pixels[0] - 1) & 0x1fff;
+        out->format0 |= R300_TX_PITCH_EN;
+        out->format2 = (desc->stride_in_pixels[level] - 1) & 0x1fff;
     } else {
         /* Power of two textures (3D, mipmaps, and no pitch),
          * also NPOT textures with a width being POT. */
-        f->format0 |= R300_TX_DEPTH(util_logbase2(pt->depth0) & 0xf);
+        out->format0 |=
+            R300_TX_DEPTH(util_logbase2(u_minify(pt->depth0, level)) & 0xf);
     }
 
-    f->format1 = 0;
     if (pt->target == PIPE_TEXTURE_CUBE) {
-        f->format1 |= R300_TX_FORMAT_CUBIC_MAP;
+        out->format1 |= R300_TX_FORMAT_CUBIC_MAP;
     }
     if (pt->target == PIPE_TEXTURE_3D) {
-        f->format1 |= R300_TX_FORMAT_3D;
+        out->format1 |= R300_TX_FORMAT_3D;
     }
 
     /* large textures on r500 */
     if (is_r500)
     {
-        if (pt->width0 > 2048) {
-            f->format2 |= R500_TXWIDTH_BIT11;
+        if (u_minify(pt->width0, level) > 2048) { /* test the same minified width that format0 encodes */
+            out->format2 |= R500_TXWIDTH_BIT11;
         }
-        if (pt->height0 > 2048) {
-            f->format2 |= R500_TXHEIGHT_BIT11;
+        if (u_minify(pt->height0, level) > 2048) { /* test the same minified height that format0 encodes */
+            out->format2 |= R500_TXHEIGHT_BIT11;
         }
     }
 
-    f->tile_config = R300_TXO_MACRO_TILE(tex->desc.macrotile[0]) |
-                     R300_TXO_MICRO_TILE(tex->desc.microtile);
+    out->tile_config = R300_TXO_MACRO_TILE(desc->macrotile[level]) |
+                       R300_TXO_MICRO_TILE(desc->microtile);
 }
 
 static void r300_texture_setup_fb_state(struct r300_screen* screen,
@@ -716,7 +723,7 @@ r300_texture_create_object(struct r300_screen *rscreen,
         return NULL;
     }
     /* Initialize the hardware state. */
-    r300_texture_setup_immutable_state(rscreen, tex);
+    r300_texture_setup_format_state(rscreen, &tex->desc, 0, &tex->tx_format);
     r300_texture_setup_fb_state(rscreen, tex);
 
     tex->desc.b.vtbl = &r300_texture_vtbl;
@@ -754,7 +761,8 @@ struct pipe_resource *r300_texture_create(struct pipe_screen *screen,
     /* Refuse to create a texture with size 0. */
     if (!base->width0 ||
         (!base->height0 && (base->target == PIPE_TEXTURE_2D ||
-                            base->target == PIPE_TEXTURE_CUBE)) ||
+                            base->target == PIPE_TEXTURE_CUBE ||
+                            base->target == PIPE_TEXTURE_RECT)) ||
         (!base->depth0 && base->target == PIPE_TEXTURE_3D)) {
         fprintf(stderr, "r300: texture_create: "
                 "Got invalid texture dimensions: %ix%ix%i\n",
@@ -787,7 +795,8 @@ struct pipe_resource *r300_texture_from_handle(struct pipe_screen *screen,
     unsigned stride, size;
 
     /* Support only 2D textures without mipmaps */
-    if (base->target != PIPE_TEXTURE_2D ||
+    if ((base->target != PIPE_TEXTURE_2D &&
+          base->target != PIPE_TEXTURE_RECT) ||
         base->depth0 != 1 ||
         base->last_level != 0) {
         return NULL;
diff --git a/src/gallium/drivers/r300/r300_texture.h b/src/gallium/drivers/r300/r300_texture.h
index a4524320fd..c4588a0c90 100644
--- a/src/gallium/drivers/r300/r300_texture.h
+++ b/src/gallium/drivers/r300/r300_texture.h
@@ -23,11 +23,14 @@
 #ifndef R300_TEXTURE_H
 #define R300_TEXTURE_H
 
+#include "pipe/p_compiler.h"
 #include "pipe/p_format.h"
 
 struct pipe_screen;
 struct pipe_resource;
 struct winsys_handle;
+struct r300_texture_format_state;
+struct r300_texture_desc;
 struct r300_texture;
 struct r300_screen;
 
@@ -50,6 +53,10 @@ boolean r300_is_zs_format_supported(enum pipe_format format);
 
 boolean r300_is_sampler_format_supported(enum pipe_format format);
 
+void r300_texture_setup_format_state(struct r300_screen *screen,
+                                     struct r300_texture_desc *desc,
+                                     unsigned level,
+                                     struct r300_texture_format_state *out);
 
 struct pipe_resource*
 r300_texture_from_handle(struct pipe_screen* screen,
diff --git a/src/gallium/drivers/r300/r300_texture_desc.c b/src/gallium/drivers/r300/r300_texture_desc.c
index 5d690e8c33..2fe5d72188 100644
--- a/src/gallium/drivers/r300/r300_texture_desc.c
+++ b/src/gallium/drivers/r300/r300_texture_desc.c
@@ -184,7 +184,8 @@ static unsigned r300_texture_get_nblocksy(struct r300_texture_desc *desc,
 
         /* This is needed for the kernel checker, unfortunately. */
         if ((desc->b.b.target != PIPE_TEXTURE_1D &&
-             desc->b.b.target != PIPE_TEXTURE_2D) ||
+             desc->b.b.target != PIPE_TEXTURE_2D &&
+             desc->b.b.target != PIPE_TEXTURE_RECT) ||
             desc->b.b.last_level != 0) {
             height = util_next_power_of_two(height);
         }
@@ -202,7 +203,8 @@ static unsigned r300_texture_get_nblocksy(struct r300_texture_desc *desc,
                  * Do so for 3 or more macrotiles in the Y direction. */
                 if (level == 0 && desc->b.b.last_level == 0 &&
                     (desc->b.b.target == PIPE_TEXTURE_1D ||
-                     desc->b.b.target == PIPE_TEXTURE_2D) &&
+                     desc->b.b.target == PIPE_TEXTURE_2D ||
+                     desc->b.b.target == PIPE_TEXTURE_RECT) &&
                     height >= tile_height * 3) {
                     height = align(height, tile_height * 2);
                 }
diff --git a/src/gallium/drivers/r300/r300_texture_desc.h b/src/gallium/drivers/r300/r300_texture_desc.h
index 95de66f654..3d7fe1fb47 100644
--- a/src/gallium/drivers/r300/r300_texture_desc.h
+++ b/src/gallium/drivers/r300/r300_texture_desc.h
@@ -24,6 +24,7 @@
 #ifndef R300_TEXTURE_DESC_H
 #define R300_TEXTURE_DESC_H
 
+#include "pipe/p_format.h"
 #include "r300_defines.h"
 
 struct pipe_resource;
diff --git a/src/gallium/drivers/r300/r300_tgsi_to_rc.c b/src/gallium/drivers/r300/r300_tgsi_to_rc.c
index dd697b9c37..a4911b9a2a 100644
--- a/src/gallium/drivers/r300/r300_tgsi_to_rc.c
+++ b/src/gallium/drivers/r300/r300_tgsi_to_rc.c
@@ -97,13 +97,13 @@ static unsigned translate_opcode(unsigned opcode)
      /* case TGSI_OPCODE_BRA: return RC_OPCODE_BRA; */
      /* case TGSI_OPCODE_CAL: return RC_OPCODE_CAL; */
      /* case TGSI_OPCODE_RET: return RC_OPCODE_RET; */
-     /* case TGSI_OPCODE_SSG: return RC_OPCODE_SSG; */
+        case TGSI_OPCODE_SSG: return RC_OPCODE_SSG;
         case TGSI_OPCODE_CMP: return RC_OPCODE_CMP;
         case TGSI_OPCODE_SCS: return RC_OPCODE_SCS;
         case TGSI_OPCODE_TXB: return RC_OPCODE_TXB;
      /* case TGSI_OPCODE_NRM: return RC_OPCODE_NRM; */
      /* case TGSI_OPCODE_DIV: return RC_OPCODE_DIV; */
-     /* case TGSI_OPCODE_DP2: return RC_OPCODE_DP2; */
+        case TGSI_OPCODE_DP2: return RC_OPCODE_DP2;
         case TGSI_OPCODE_TXL: return RC_OPCODE_TXL;
         case TGSI_OPCODE_BRK: return RC_OPCODE_BRK;
         case TGSI_OPCODE_IF: return RC_OPCODE_IF;
diff --git a/src/gallium/drivers/r300/r300_vs.c b/src/gallium/drivers/r300/r300_vs.c
index 54c8de1241..5f8dbb28d0 100644
--- a/src/gallium/drivers/r300/r300_vs.c
+++ b/src/gallium/drivers/r300/r300_vs.c
@@ -196,6 +196,7 @@ void r300_translate_vertex_shader(struct r300_context *r300,
 {
     struct r300_vertex_program_compiler compiler;
     struct tgsi_to_rc ttr;
+    unsigned i;
 
     /* Setup the compiler */
     rc_init(&compiler.Base);
@@ -205,6 +206,7 @@ void r300_translate_vertex_shader(struct r300_context *r300,
     compiler.UserData = vs;
     compiler.Base.is_r500 = r300->screen->caps.is_r500;
     compiler.Base.max_temp_regs = 32;
+    compiler.Base.remove_unused_constants = TRUE;
 
     if (compiler.Base.Debug) {
         DBG(r300, DBG_VP, "r300: Initial vertex program\n");
@@ -227,9 +229,8 @@ void r300_translate_vertex_shader(struct r300_context *r300,
     /* Invoke the compiler */
     r3xx_compile_vertex_program(&compiler);
     if (compiler.Base.Error) {
-        DBG(r300, DBG_VP, "r300 VP: Compiler error:\n%sUsing a dummy shader"
-                " instead.\nIf there's an 'unknown opcode' message, please"
-                " file a bug report and attach this log.\n", compiler.Base.ErrorMsg);
+        fprintf(stderr, "r300 VP: Compiler error:\n%sUsing a dummy shader"
+                " instead.\n", compiler.Base.ErrorMsg);
 
         if (vs->dummy) {
             fprintf(stderr, "r300 VP: Cannot compile the dummy shader! "
@@ -243,7 +244,15 @@ void r300_translate_vertex_shader(struct r300_context *r300,
     }
 
     /* Initialize numbers of constants for each type. */
-    vs->externals_count = ttr.immediate_offset;
+    vs->externals_count = 0;
+    for (i = 0;
+         i < vs->code.constants.Count &&
+         vs->code.constants.Constants[i].Type == RC_CONSTANT_EXTERNAL; i++) {
+        vs->externals_count = i+1;
+    }
+    for (; i < vs->code.constants.Count; i++) {
+        assert(vs->code.constants.Constants[i].Type == RC_CONSTANT_IMMEDIATE);
+    }
     vs->immediates_count = vs->code.constants.Count - vs->externals_count;
 
     /* And, finally... */
diff --git a/src/gallium/drivers/r300/r300_winsys.h b/src/gallium/drivers/r300/r300_winsys.h
index 187780750f..4597332399 100644
--- a/src/gallium/drivers/r300/r300_winsys.h
+++ b/src/gallium/drivers/r300/r300_winsys.h
@@ -33,6 +33,7 @@
 
 #include "r300_defines.h"
 
+struct winsys_handle;
 struct r300_winsys_screen;
 
 struct r300_winsys_buffer;
diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c
index 9ea9d4354d..6483dac703 100644
--- a/src/gallium/drivers/r600/r600_asm.c
+++ b/src/gallium/drivers/r600/r600_asm.c
@@ -76,6 +76,27 @@ int r600_bc_init(struct r600_bc *bc, enum radeon_family family)
 {
 	LIST_INITHEAD(&bc->cf);
 	bc->family = family;
+	switch (bc->family) { /* derive bc->chiprev from the chip family */
+	case CHIP_R600:
+	case CHIP_RV610:
+	case CHIP_RV630:
+	case CHIP_RV670:
+	case CHIP_RV620:
+	case CHIP_RV635:
+	case CHIP_RS780:
+	case CHIP_RS880:
+		bc->chiprev = 0; /* all families listed above share chiprev 0 */
+		break;
+	case CHIP_RV770:
+	case CHIP_RV730:
+	case CHIP_RV710:
+	case CHIP_RV740:
+		bc->chiprev = 1; /* the RV7xx families listed above use chiprev 1 */
+		break;
+	default: /* any family not listed above is rejected */
+		R600_ERR("unknown family %d\n", bc->family);
+		return -EINVAL;
+	}
 	return 0;
 }
 
@@ -107,7 +128,7 @@ int r600_bc_add_output(struct r600_bc *bc, const struct r600_bc_output *output)
 	return 0;
 }
 
-int r600_bc_add_alu(struct r600_bc *bc, const struct r600_bc_alu *alu)
+int r600_bc_add_alu_type(struct r600_bc *bc, const struct r600_bc_alu *alu, int type)
 {
 	struct r600_bc_alu *nalu = r600_bc_alu();
 	struct r600_bc_alu *lalu;
@@ -119,7 +140,7 @@ int r600_bc_add_alu(struct r600_bc *bc, const struct r600_bc_alu *alu)
 	nalu->nliteral = 0;
 
 	/* cf can contains only alu or only vtx or only tex */
-	if (bc->cf_last == NULL || bc->cf_last->inst != (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3) ||
+	if (bc->cf_last == NULL || bc->cf_last->inst != (type << 3) ||
 		bc->force_add_cf) {
 		/* at most 128 slots, one add alu can add 4 slots + 4 constant worst case */
 		r = r600_bc_add_cf(bc);
@@ -127,7 +148,7 @@ int r600_bc_add_alu(struct r600_bc *bc, const struct r600_bc_alu *alu)
 			free(nalu);
 			return r;
 		}
-		bc->cf_last->inst = V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3;
+		bc->cf_last->inst = (type << 3);
 	}
 	if (alu->last && (bc->cf_last->ndw >> 1) >= 124) {
 		bc->force_add_cf = 1;
@@ -162,6 +183,11 @@ int r600_bc_add_alu(struct r600_bc *bc, const struct r600_bc_alu *alu)
 	return 0;
 }
 
+int r600_bc_add_alu(struct r600_bc *bc, const struct r600_bc_alu *alu)
+{
+	return r600_bc_add_alu_type(bc, alu, V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU); /* fixed type: the plain ALU CF instruction */
+}
+
 int r600_bc_add_literal(struct r600_bc *bc, const u32 *value)
 {
 	struct r600_bc_alu *alu;
@@ -172,7 +198,17 @@ int r600_bc_add_literal(struct r600_bc *bc, const u32 *value)
 	if (bc->cf_last->inst == V_SQ_CF_WORD1_SQ_CF_INST_TEX) {
 		return 0;
 	}
-	if (bc->cf_last->inst != (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3) ||
+	if (bc->cf_last->inst == V_SQ_CF_WORD1_SQ_CF_INST_JUMP ||
+	    bc->cf_last->inst == V_SQ_CF_WORD1_SQ_CF_INST_ELSE ||
+	    bc->cf_last->inst == V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL ||
+	    bc->cf_last->inst == V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK ||
+	    bc->cf_last->inst == V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE ||
+	    bc->cf_last->inst == V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END ||
+	    bc->cf_last->inst == V_SQ_CF_WORD1_SQ_CF_INST_POP) {
+		return 0;
+	}
+	if (((bc->cf_last->inst != (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3)) &&
+	     (bc->cf_last->inst != (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3))) ||
 		LIST_IS_EMPTY(&bc->cf_last->alu)) {
 		R600_ERR("last CF is not ALU (%p)\n", bc->cf_last);
 		return -EINVAL;
@@ -241,6 +277,18 @@ int r600_bc_add_tex(struct r600_bc *bc, const struct r600_bc_tex *tex)
 	return 0;
 }
 
+int r600_bc_add_cfinst(struct r600_bc *bc, int inst)
+{
+	/* open a new CF entry; hand its error code back on failure */
+	int ret = r600_bc_add_cf(bc);
+
+	if (ret != 0)
+		return ret;
+	bc->cf_last->inst = inst;
+	bc->cf_last->cond = V_SQ_CF_COND_ACTIVE;
+	return 0;
+}
+
 static int r600_bc_vtx_build(struct r600_bc *bc, struct r600_bc_vtx *vtx, unsigned id)
 {
 	bc->bytecode[id++] = S_SQ_VTX_WORD0_BUFFER_ID(vtx->buffer_id) |
@@ -292,38 +340,44 @@ static int r600_bc_alu_build(struct r600_bc *bc, struct r600_bc_alu *alu, unsign
 	unsigned i;
 
 	/* don't replace gpr by pv or ps for destination register */
+	bc->bytecode[id++] = S_SQ_ALU_WORD0_SRC0_SEL(alu->src[0].sel) |
+				S_SQ_ALU_WORD0_SRC0_REL(alu->src[0].rel) |
+				S_SQ_ALU_WORD0_SRC0_CHAN(alu->src[0].chan) |
+				S_SQ_ALU_WORD0_SRC0_NEG(alu->src[0].neg) |
+				S_SQ_ALU_WORD0_SRC1_SEL(alu->src[1].sel) |
+				S_SQ_ALU_WORD0_SRC1_REL(alu->src[1].rel) |
+				S_SQ_ALU_WORD0_SRC1_CHAN(alu->src[1].chan) |
+				S_SQ_ALU_WORD0_SRC1_NEG(alu->src[1].neg) |
+				S_SQ_ALU_WORD0_LAST(alu->last);
+
 	if (alu->is_op3) {
-		bc->bytecode[id++] = S_SQ_ALU_WORD0_SRC0_SEL(alu->src[0].sel) |
-					S_SQ_ALU_WORD0_SRC0_CHAN(alu->src[0].chan) |
-					S_SQ_ALU_WORD0_SRC1_SEL(alu->src[1].sel) |
-					S_SQ_ALU_WORD0_SRC1_CHAN(alu->src[1].chan) |
-					S_SQ_ALU_WORD0_LAST(alu->last);
 		bc->bytecode[id++] = S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) |
 					S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) |
+					S_SQ_ALU_WORD1_DST_REL(alu->dst.rel) |
 					S_SQ_ALU_WORD1_CLAMP(alu->dst.clamp) |
 					S_SQ_ALU_WORD1_OP3_SRC2_SEL(alu->src[2].sel) |
+					S_SQ_ALU_WORD1_OP3_SRC2_REL(alu->src[2].rel) |
 					S_SQ_ALU_WORD1_OP3_SRC2_CHAN(alu->src[2].chan) |
 					S_SQ_ALU_WORD1_OP3_SRC2_NEG(alu->src[2].neg) |
 					S_SQ_ALU_WORD1_OP3_ALU_INST(alu->inst) |
 					S_SQ_ALU_WORD1_BANK_SWIZZLE(0);
 	} else {
-		bc->bytecode[id++] = S_SQ_ALU_WORD0_SRC0_SEL(alu->src[0].sel) |
-					S_SQ_ALU_WORD0_SRC0_CHAN(alu->src[0].chan) |
-					S_SQ_ALU_WORD0_SRC0_NEG(alu->src[0].neg) |
-					S_SQ_ALU_WORD0_SRC1_SEL(alu->src[1].sel) |
-					S_SQ_ALU_WORD0_SRC1_CHAN(alu->src[1].chan) |
-					S_SQ_ALU_WORD0_SRC1_NEG(alu->src[1].neg) |
-					S_SQ_ALU_WORD0_LAST(alu->last);
 		bc->bytecode[id++] = S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) |
 					S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) |
+					S_SQ_ALU_WORD1_DST_REL(alu->dst.rel) |
 					S_SQ_ALU_WORD1_CLAMP(alu->dst.clamp) |
 					S_SQ_ALU_WORD1_OP2_SRC0_ABS(alu->src[0].abs) |
 					S_SQ_ALU_WORD1_OP2_SRC1_ABS(alu->src[1].abs) |
 					S_SQ_ALU_WORD1_OP2_WRITE_MASK(alu->dst.write) |
 					S_SQ_ALU_WORD1_OP2_ALU_INST(alu->inst) |
-					S_SQ_ALU_WORD1_BANK_SWIZZLE(0);
+					S_SQ_ALU_WORD1_BANK_SWIZZLE(0) |
+			                S_SQ_ALU_WORD1_OP2_UPDATE_EXECUTE_MASK(alu->predicate) |
+		 	                S_SQ_ALU_WORD1_OP2_UPDATE_PRED(alu->predicate);
 	}
 	if (alu->last) {
+		if (alu->nliteral && !alu->literal_added) { /* literals counted but never marked as added */
+			R600_ERR("Bug in ALU processing for instruction 0x%08x, literal not added correctly\n", alu->inst);
+		}
 		for (i = 0; i < alu->nliteral; i++) {
 			bc->bytecode[id++] = alu->value[i];
 		}
@@ -337,6 +391,7 @@ static int r600_bc_cf_build(struct r600_bc *bc, struct r600_bc_cf *cf)
 
 	switch (cf->inst) {
 	case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3):
+	case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3):
 		bc->bytecode[id++] = S_SQ_CF_ALU_WORD0_ADDR(cf->addr >> 1);
 		bc->bytecode[id++] = S_SQ_CF_ALU_WORD1_CF_INST(cf->inst >> 3) |
 					S_SQ_CF_ALU_WORD1_BARRIER(1) |
@@ -364,6 +419,20 @@ static int r600_bc_cf_build(struct r600_bc *bc, struct r600_bc_cf *cf)
 			S_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->output.inst) |
 			S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->output.end_of_program);
 		break;
+	case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
+	case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
+	case V_SQ_CF_WORD1_SQ_CF_INST_POP:
+	case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
+	case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
+	case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
+	case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
+		bc->bytecode[id++] = S_SQ_CF_WORD0_ADDR(cf->cf_addr >> 1);
+		bc->bytecode[id++] = S_SQ_CF_WORD1_CF_INST(cf->inst) |
+					S_SQ_CF_WORD1_BARRIER(1) |
+			                S_SQ_CF_WORD1_COND(cf->cond) |
+			                S_SQ_CF_WORD1_POP_COUNT(cf->pop_count);
+
+		break;
 	default:
 		R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
 		return -EINVAL;
@@ -380,6 +449,8 @@ int r600_bc_build(struct r600_bc *bc)
 	unsigned addr;
 	int r;
 
+	if (bc->callstack[0].max > 0)
+	    bc->nstack = ((bc->callstack[0].max + 3) >> 2) + 2;
 
 	/* first path compute addr of each CF block */
 	/* addr start after all the CF instructions */
@@ -387,6 +458,7 @@ int r600_bc_build(struct r600_bc *bc)
 	LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
 		switch (cf->inst) {
 		case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3):
+		case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3):
 			break;
 		case V_SQ_CF_WORD1_SQ_CF_INST_TEX:
 		case V_SQ_CF_WORD1_SQ_CF_INST_VTX:
@@ -398,6 +470,14 @@ int r600_bc_build(struct r600_bc *bc)
 		case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
 		case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
 			break;
+		case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
+		case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
+		case V_SQ_CF_WORD1_SQ_CF_INST_POP:
+		case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
+		case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
+		case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
+		case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
+			break;
 		default:
 			R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
 			return -EINVAL;
@@ -417,22 +497,13 @@ int r600_bc_build(struct r600_bc *bc)
 			return r;
 		switch (cf->inst) {
 		case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3):
+		case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3):
 			LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) {
-				switch (bc->family) {
-				case CHIP_R600:
-				case CHIP_RV610:
-				case CHIP_RV630:
-				case CHIP_RV670:
-				case CHIP_RV620:
-				case CHIP_RV635:
-				case CHIP_RS780:
-				case CHIP_RS880:
+				switch(bc->chiprev) {
+				case 0:
 					r = r600_bc_alu_build(bc, alu, addr);
 					break;
-				case CHIP_RV770:
-				case CHIP_RV730:
-				case CHIP_RV710:
-				case CHIP_RV740:
+				case 1:
 					r = r700_bc_alu_build(bc, alu, addr);
 					break;
 				default:
@@ -466,6 +537,13 @@ int r600_bc_build(struct r600_bc *bc)
 			break;
 		case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
 		case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
+		case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
+		case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
+		case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
+		case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
+		case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
+		case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
+		case V_SQ_CF_WORD1_SQ_CF_INST_POP:
 			break;
 		default:
 			R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
diff --git a/src/gallium/drivers/r600/r600_asm.h b/src/gallium/drivers/r600/r600_asm.h
index 10d98afaf0..9e65fcdd4f 100644
--- a/src/gallium/drivers/r600/r600_asm.h
+++ b/src/gallium/drivers/r600/r600_asm.h
@@ -31,6 +31,7 @@ struct r600_bc_alu_src {
 	unsigned			chan;
 	unsigned			neg;
 	unsigned			abs;
+	unsigned			rel;
 };
 
 struct r600_bc_alu_dst {
@@ -38,6 +39,7 @@ struct r600_bc_alu_dst {
 	unsigned			chan;
 	unsigned			clamp;
 	unsigned			write;
+	unsigned			rel;
 };
 
 struct r600_bc_alu {
@@ -47,6 +49,7 @@ struct r600_bc_alu {
 	unsigned			inst;
 	unsigned			last;
 	unsigned			is_op3;
+	unsigned                        predicate;
 	unsigned			nliteral;
 	unsigned			literal_added;
 	u32				value[4];
@@ -114,22 +117,55 @@ struct r600_bc_cf {
 	unsigned			addr;
 	unsigned			ndw;
 	unsigned			id;
+	unsigned                        cond;
+	unsigned                        pop_count;
+	unsigned                        cf_addr; /* control flow addr */
 	struct list_head		alu;
 	struct list_head		tex;
 	struct list_head		vtx;
 	struct r600_bc_output		output;
 };
 
+#define FC_NONE 0
+#define FC_IF 1
+#define FC_LOOP 2
+#define FC_REP 3
+#define FC_PUSH_VPM 4
+#define FC_PUSH_WQM 5
+
+struct r600_cf_stack_entry {
+	int type; /* presumably one of the FC_* values defined above - TODO confirm */
+	struct r600_bc_cf *start;
+	struct r600_bc_cf **mid; /* used to store the else point */
+	int num_mid; /* presumably the number of entries in mid - TODO confirm */
+};
+
+#define SQ_MAX_CALL_DEPTH 0x00000020
+struct r600_cf_callstack {
+	unsigned fc_sp_before_entry;
+	int sub_desc_index;
+	int current;
+	int max; /* r600_bc_build() derives bc->nstack from callstack[0].max */
+};
+	
 struct r600_bc {
 	enum radeon_family		family;
+	int chiprev; /* 0 - r600, 1 - r700, 2 - evergreen */
 	struct list_head		cf;
 	struct r600_bc_cf		*cf_last;
 	unsigned			ndw;
 	unsigned			ncf;
 	unsigned			ngpr;
+	unsigned                        nstack;
 	unsigned			nresource;
 	unsigned			force_add_cf;
 	u32				*bytecode;
+
+	u32 fc_sp;
+	struct r600_cf_stack_entry fc_stack[32];
+
+	unsigned call_sp;
+	struct r600_cf_callstack callstack[SQ_MAX_CALL_DEPTH];
 };
 
 int r600_bc_init(struct r600_bc *bc, enum radeon_family family);
@@ -139,5 +175,6 @@ int r600_bc_add_vtx(struct r600_bc *bc, const struct r600_bc_vtx *vtx);
 int r600_bc_add_tex(struct r600_bc *bc, const struct r600_bc_tex *tex);
 int r600_bc_add_output(struct r600_bc *bc, const struct r600_bc_output *output);
 int r600_bc_build(struct r600_bc *bc);
-
+int r600_bc_add_cfinst(struct r600_bc *bc, int inst);
+int r600_bc_add_alu_type(struct r600_bc *bc, const struct r600_bc_alu *alu, int type);
 #endif
diff --git a/src/gallium/drivers/r600/r600_blit.c b/src/gallium/drivers/r600/r600_blit.c
index f4eedfe4cb..e6ded342e5 100644
--- a/src/gallium/drivers/r600/r600_blit.c
+++ b/src/gallium/drivers/r600/r600_blit.c
@@ -24,6 +24,7 @@
  *      Jerome Glisse
  *      Marek Olšák
  */
+#include <errno.h>
 #include <pipe/p_screen.h>
 #include <util/u_blitter.h>
 #include <util/u_inlines.h>
@@ -31,9 +32,12 @@
 #include "util/u_surface.h"
 #include "r600_screen.h"
 #include "r600_context.h"
+#include "r600d.h"
 
-static void r600_blitter_save_states(struct r600_context *rctx)
+static void r600_blitter_save_states(struct pipe_context *ctx)
 {
+	struct r600_context *rctx = r600_context(ctx);
+
 	util_blitter_save_blend(rctx->blitter, rctx->blend);
 	util_blitter_save_depth_stencil_alpha(rctx->blitter, rctx->dsa);
 	if (rctx->stencil_ref) {
@@ -47,48 +51,58 @@ static void r600_blitter_save_states(struct r600_context *rctx)
 	if (rctx->viewport) {
 		util_blitter_save_viewport(rctx->blitter, &rctx->viewport->state.viewport);
 	}
-	/* XXX util_blitter_save_clip(rctx->blitter, &rctx->clip); */
+	if (rctx->clip) {
+		util_blitter_save_clip(rctx->blitter, &rctx->clip->state.clip);
+	}
 	util_blitter_save_vertex_buffers(rctx->blitter, rctx->nvertex_buffer,
 					rctx->vertex_buffer);
 
 	/* remove ptr so they don't get deleted */
 	rctx->blend = NULL;
+	rctx->clip = NULL;
 	rctx->vs_shader = NULL;
 	rctx->ps_shader = NULL;
 	rctx->rasterizer = NULL;
 	rctx->dsa = NULL;
 	rctx->vertex_elements = NULL;
+
+	/* suspend queries */
+	r600_queries_suspend(ctx);
 }
 
 static void r600_clear(struct pipe_context *ctx, unsigned buffers,
-		       const float *rgba, double depth, unsigned stencil)
+			const float *rgba, double depth, unsigned stencil)
 {
 	struct r600_context *rctx = r600_context(ctx);
 	struct pipe_framebuffer_state *fb = &rctx->framebuffer->state.framebuffer;
 
-	r600_blitter_save_states(rctx);
+	r600_blitter_save_states(ctx);
 	util_blitter_clear(rctx->blitter, fb->width, fb->height,
 				fb->nr_cbufs, buffers, rgba, depth,
 				stencil);
+	/* resume queries */
+	r600_queries_resume(ctx);
 }
 
-static void r600_clear_render_target(struct pipe_context *pipe,
+static void r600_clear_render_target(struct pipe_context *ctx,
 				     struct pipe_surface *dst,
 				     const float *rgba,
 				     unsigned dstx, unsigned dsty,
 				     unsigned width, unsigned height)
 {
-	struct r600_context *rctx = r600_context(pipe);
+	struct r600_context *rctx = r600_context(ctx);
 	struct pipe_framebuffer_state *fb = &rctx->framebuffer->state.framebuffer;
 
-	r600_blitter_save_states(rctx);
+	r600_blitter_save_states(ctx);
 	util_blitter_save_framebuffer(rctx->blitter, fb);
 
 	util_blitter_clear_render_target(rctx->blitter, dst, rgba,
 					 dstx, dsty, width, height);
+	/* resume queries */
+	r600_queries_resume(ctx);
 }
 
-static void r600_clear_depth_stencil(struct pipe_context *pipe,
+static void r600_clear_depth_stencil(struct pipe_context *ctx,
 				     struct pipe_surface *dst,
 				     unsigned clear_flags,
 				     double depth,
@@ -96,17 +110,20 @@ static void r600_clear_depth_stencil(struct pipe_context *pipe,
 				     unsigned dstx, unsigned dsty,
 				     unsigned width, unsigned height)
 {
-	struct r600_context *rctx = r600_context(pipe);
+	struct r600_context *rctx = r600_context(ctx);
 	struct pipe_framebuffer_state *fb = &rctx->framebuffer->state.framebuffer;
 
-	r600_blitter_save_states(rctx);
+	r600_blitter_save_states(ctx);
 	util_blitter_save_framebuffer(rctx->blitter, fb);
 
 	util_blitter_clear_depth_stencil(rctx->blitter, dst, clear_flags, depth, stencil,
 					 dstx, dsty, width, height);
+	/* resume queries */
+	r600_queries_resume(ctx);
 }
 
-static void r600_resource_copy_region(struct pipe_context *pipe,
+
+static void r600_resource_copy_region(struct pipe_context *ctx,
 				      struct pipe_resource *dst,
 				      struct pipe_subresource subdst,
 				      unsigned dstx, unsigned dsty, unsigned dstz,
@@ -115,7 +132,7 @@ static void r600_resource_copy_region(struct pipe_context *pipe,
 				      unsigned srcx, unsigned srcy, unsigned srcz,
 				      unsigned width, unsigned height)
 {
-	util_resource_copy_region(pipe, dst, subdst, dstx, dsty, dstz,
+	util_resource_copy_region(ctx, dst, subdst, dstx, dsty, dstz,
 				  src, subsrc, srcx, srcy, srcz, width, height);
 }
 
@@ -126,3 +143,446 @@ void r600_init_blit_functions(struct r600_context *rctx)
 	rctx->context.clear_depth_stencil = r600_clear_depth_stencil;
 	rctx->context.resource_copy_region = r600_resource_copy_region;
 }
+
+
+struct r600_blit_states { /* states r600_blit_states_init() builds for the r600_blit_uncompress_depth() draw */
+	struct radeon_state	rasterizer;
+	struct radeon_state	dsa;
+	struct radeon_state	blend;
+	struct radeon_state	cb_cntl;
+	struct radeon_state	vgt;
+	struct radeon_state	draw;
+	struct radeon_state	vs_constant0;
+	struct radeon_state	vs_constant1;
+	struct radeon_state	vs_constant2;
+	struct radeon_state	vs_constant3;
+	struct radeon_state	ps_shader;
+	struct radeon_state	vs_shader;
+	struct radeon_state	vs_resource0;
+	struct radeon_state	vs_resource1;
+};
+
+/* Set up vs_resource0 and vs_resource1 on top of one shared 128-byte vertex
+ * buffer object. Returns 0 on success and -ENOMEM on any failure. */
+static int r600_blit_state_vs_resources(struct r600_screen *rscreen, struct r600_blit_states *bstates)
+{
+	struct radeon_state *rstate;
+	struct radeon_bo *bo;
+	u32 vbo[] = {
+		0xBF800000, 0xBF800000, 0x3F800000, 0x3F800000,
+		0x3F000000, 0x3F000000, 0x3F000000, 0x00000000,
+		0x3F800000, 0xBF800000, 0x3F800000, 0x3F800000,
+		0x3F000000, 0x3F000000, 0x3F000000, 0x00000000,
+		0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000,
+		0x3F000000, 0x3F000000, 0x3F000000, 0x00000000,
+		0xBF800000, 0x3F800000, 0x3F800000, 0x3F800000,
+		0x3F000000, 0x3F000000, 0x3F000000, 0x00000000
+	};
+	/* vertex buffer (not a shader): upload the table above */
+	bo = radeon_bo(rscreen->rw, 0, 128, 4096, NULL);
+	if (bo == NULL) {
+		return -ENOMEM;
+	}
+	if (radeon_bo_map(rscreen->rw, bo)) {
+		radeon_bo_decref(rscreen->rw, bo);
+		return -ENOMEM;
+	}
+	memcpy(bo->data, vbo, sizeof(vbo));
+	radeon_bo_unmap(rscreen->rw, bo);
+
+	rstate = &bstates->vs_resource0;
+	radeon_state_init(rstate, rscreen->rw, R600_STATE_RESOURCE, 0, R600_SHADER_VS);
+	/* Registers are written explicitly here, zeros included.
+	 * NOTE(review): radeon_state_init() presumably zeroes them already - confirm.
+	 */
+	rstate->states[R600_VS_RESOURCE__RESOURCE160_WORD0] = 0x00000000;
+	rstate->states[R600_VS_RESOURCE__RESOURCE160_WORD1] = 0x00000080;
+	rstate->states[R600_VS_RESOURCE__RESOURCE160_WORD2] = 0x02302000;
+	rstate->states[R600_VS_RESOURCE__RESOURCE160_WORD3] = 0x00000000;
+	rstate->states[R600_VS_RESOURCE__RESOURCE160_WORD4] = 0x00000000;
+	rstate->states[R600_VS_RESOURCE__RESOURCE160_WORD5] = 0x00000000;
+	rstate->states[R600_VS_RESOURCE__RESOURCE160_WORD6] = 0xC0000000;
+	rstate->bo[0] = bo;
+	rstate->nbo = 1;
+	rstate->placement[0] = RADEON_GEM_DOMAIN_GTT;
+	if (radeon_state_pm4(rstate)) {
+		radeon_state_fini(rstate);
+		return -ENOMEM;
+	}
+
+	rstate = &bstates->vs_resource1;
+	radeon_state_init(rstate, rscreen->rw, R600_STATE_RESOURCE, 1, R600_SHADER_VS);
+	rstate->states[R600_VS_RESOURCE__RESOURCE160_WORD0] = 0x00000010;
+	rstate->states[R600_VS_RESOURCE__RESOURCE160_WORD1] = 0x00000070;
+	rstate->states[R600_VS_RESOURCE__RESOURCE160_WORD2] = 0x02302000;
+	rstate->states[R600_VS_RESOURCE__RESOURCE160_WORD3] = 0x00000000;
+	rstate->states[R600_VS_RESOURCE__RESOURCE160_WORD4] = 0x00000000;
+	rstate->states[R600_VS_RESOURCE__RESOURCE160_WORD5] = 0x00000000;
+	rstate->states[R600_VS_RESOURCE__RESOURCE160_WORD6] = 0xC0000000;
+	rstate->bo[0] = radeon_bo_incref(rscreen->rw, bo);
+	rstate->nbo = 1;
+	rstate->placement[0] = RADEON_GEM_DOMAIN_GTT;
+	if (radeon_state_pm4(rstate)) {
+		radeon_state_fini(rstate);
+		radeon_state_fini(&bstates->vs_resource0); /* don't leave the first state (and its bo reference) behind */
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+static void r600_blit_state_vs_shader(struct r600_screen *rscreen, struct radeon_state *rstate)
+{
+	struct radeon_bo *bo;
+	u32 shader_bc_r600[] = {
+		0x00000004, 0x81000400,
+		0x00000008, 0xA01C0000,
+		0xC001A03C, 0x94000688,
+		0xC0024000, 0x94200688,
+		0x7C000000, 0x002D1001,
+		0x00080000, 0x00000000,
+		0x7C000100, 0x002D1002,
+		0x00080000, 0x00000000,
+		0x00000001, 0x00601910,
+		0x00000401, 0x20601910,
+		0x00000801, 0x40601910,
+		0x80000C01, 0x60601910,
+		0x00000002, 0x00801910,
+		0x00000402, 0x20801910,
+		0x00000802, 0x40801910,
+		0x80000C02, 0x60801910
+	};
+	u32 shader_bc_r700[] = {
+		0x00000004, 0x81000400,
+		0x00000008, 0xA01C0000,
+		0xC001A03C, 0x94000688,
+		0xC0024000, 0x94200688,
+		0x7C000000, 0x002D1001,
+		0x00080000, 0x00000000,
+		0x7C000100, 0x002D1002,
+		0x00080000, 0x00000000,
+		0x00000001, 0x00600C90,
+		0x00000401, 0x20600C90,
+		0x00000801, 0x40600C90,
+		0x80000C01, 0x60600C90,
+		0x00000002, 0x00800C90,
+		0x00000402, 0x20800C90,
+		0x00000802, 0x40800C90,
+		0x80000C02, 0x60800C90
+	};
+
+	/* 128-byte buffer object that receives the bytecode matching chip_class */
+	bo = radeon_bo(rscreen->rw, 0, 128, 4096, NULL);
+	if (bo == NULL) {
+		return;
+	}
+	if (radeon_bo_map(rscreen->rw, bo)) {
+		radeon_bo_decref(rscreen->rw, bo);
+		return;
+	}
+	switch (rscreen->chip_class) {
+	case R600:
+		memcpy(bo->data, shader_bc_r600, 128);
+		break;
+	case R700:
+		memcpy(bo->data, shader_bc_r700, 128);
+		break;
+	default:
+		R600_ERR("unsupported chip family\n");
+		radeon_bo_unmap(rscreen->rw, bo);
+		radeon_bo_decref(rscreen->rw, bo);
+		return;
+	}
+	radeon_bo_unmap(rscreen->rw, bo);
+
+	radeon_state_init(rstate, rscreen->rw, R600_STATE_SHADER, 0, R600_SHADER_VS);
+
+	/* Only the registers that need a non-zero value are written; the
+	 * rest are assumed to be zeroed already (presumably by radeon_state_init()).
+	 */
+	rstate->states[R600_VS_SHADER__SPI_VS_OUT_ID_0] = 0x03020100;
+	rstate->states[R600_VS_SHADER__SPI_VS_OUT_ID_1] = 0x07060504;
+	rstate->states[R600_VS_SHADER__SQ_PGM_RESOURCES_VS] = 0x00000005;
+
+	rstate->bo[0] = bo;
+	rstate->bo[1] = radeon_bo_incref(rscreen->rw, bo);
+	rstate->nbo = 2;
+	rstate->placement[0] = RADEON_GEM_DOMAIN_GTT;
+	rstate->placement[2] = RADEON_GEM_DOMAIN_GTT;
+
+	radeon_state_pm4(rstate); /* NOTE(review): return value is not checked */
+}
+
+/* Build the blit pixel shader state; returns before touching rstate if the bytecode buffer cannot be set up. */
+static void r600_blit_state_ps_shader(struct r600_screen *rscreen, struct radeon_state *rstate)
+{
+	struct radeon_bo *bo;
+	u32 shader_bc_r600[] = {
+		0x00000002, 0xA00C0000,
+		0xC0008000, 0x94200688,
+		0x00000000, 0x00201910,
+		0x00000400, 0x20201910,
+		0x00000800, 0x40201910,
+		0x80000C00, 0x60201910
+	};
+	u32 shader_bc_r700[] = {
+		0x00000002, 0xA00C0000,
+		0xC0008000, 0x94200688,
+		0x00000000, 0x00200C90,
+		0x00000400, 0x20200C90,
+		0x00000800, 0x40200C90,
+		0x80000C00, 0x60200C90
+	};
+	/* buffer object that receives the bytecode matching chip_class */
+	bo = radeon_bo(rscreen->rw, 0, 128, 4096, NULL);
+	if (bo == NULL) {
+		return;
+	}
+	if (radeon_bo_map(rscreen->rw, bo)) {
+		radeon_bo_decref(rscreen->rw, bo);
+		return;
+	}
+	switch (rscreen->chip_class) {
+	case R600:
+		memcpy(bo->data, shader_bc_r600, sizeof(shader_bc_r600));
+		break;
+	case R700:
+		memcpy(bo->data, shader_bc_r700, sizeof(shader_bc_r700));
+		break;
+	default:
+		R600_ERR("unsupported chip family\n");
+		radeon_bo_unmap(rscreen->rw, bo);
+		radeon_bo_decref(rscreen->rw, bo);
+		return;
+	}
+	radeon_bo_unmap(rscreen->rw, bo);
+
+	radeon_state_init(rstate, rscreen->rw, R600_STATE_SHADER, 0, R600_SHADER_PS);
+
+	/* Only the registers that need a non-zero value are written; the
+	 * rest are assumed to be zeroed already (presumably by radeon_state_init()).
+	 */
+	rstate->states[R600_PS_SHADER__SPI_PS_INPUT_CNTL_0] = 0x00000C00;
+	rstate->states[R600_PS_SHADER__SPI_PS_IN_CONTROL_0] = 0x10000001;
+	rstate->states[R600_PS_SHADER__SQ_PGM_EXPORTS_PS] = 0x00000002;
+	rstate->states[R600_PS_SHADER__SQ_PGM_RESOURCES_PS] = 0x00000002;
+
+	rstate->bo[0] = bo;
+	rstate->nbo = 1;
+	rstate->placement[0] = RADEON_GEM_DOMAIN_GTT;
+
+	radeon_state_pm4(rstate);
+}
+
+static void r600_blit_state_vgt(struct r600_screen *rscreen, struct radeon_state *rstate)
+{
+	radeon_state_init(rstate, rscreen->rw, R600_STATE_VGT, 0, 0);
+
+	/* Only the registers that need a non-zero value are written; the
+	 * rest are assumed to be zeroed already (presumably by radeon_state_init()).
+	 */
+	rstate->states[R600_VGT__VGT_DMA_NUM_INSTANCES] = 0x00000001;
+	rstate->states[R600_VGT__VGT_MAX_VTX_INDX] = 0x00FFFFFF;
+	rstate->states[R600_VGT__VGT_PRIMITIVE_TYPE] = 0x00000005;
+
+	radeon_state_pm4(rstate);
+}
+
+static void r600_blit_state_draw(struct r600_screen *rscreen, struct radeon_state *rstate)
+{
+	radeon_state_init(rstate, rscreen->rw, R600_STATE_DRAW, 0, 0);
+
+	/* Only the registers that need a non-zero value are written; the
+	 * rest are assumed to be zeroed already (presumably by radeon_state_init()).
+	 */
+	rstate->states[R600_DRAW__VGT_DRAW_INITIATOR] = 0x00000002;
+	rstate->states[R600_DRAW__VGT_NUM_INDICES] = 0x00000004;
+
+	radeon_state_pm4(rstate);
+}
+
+static void r600_blit_state_vs_constant(struct r600_screen *rscreen, struct radeon_state *rstate,
+					unsigned id, float c0, float c1, float c2, float c3)
+{
+	radeon_state_init(rstate, rscreen->rw, R600_STATE_CONSTANT, id, R600_SHADER_VS);
+
+	/* Store the four float components of constant 'id' through fui(),
+	 * presumably as their raw 32-bit patterns - confirm against fui().
+	 */
+	rstate->states[R600_VS_CONSTANT__SQ_ALU_CONSTANT0_256] = fui(c0);
+	rstate->states[R600_VS_CONSTANT__SQ_ALU_CONSTANT1_256] = fui(c1);
+	rstate->states[R600_VS_CONSTANT__SQ_ALU_CONSTANT2_256] = fui(c2);
+	rstate->states[R600_VS_CONSTANT__SQ_ALU_CONSTANT3_256] = fui(c3);
+
+	radeon_state_pm4(rstate);
+}
+
+static void r600_blit_state_rasterizer(struct r600_screen *rscreen, struct radeon_state *rstate)
+{
+	radeon_state_init(rstate, rscreen->rw, R600_STATE_RASTERIZER, 0, 0);
+
+	/* Only the registers that need a non-zero value are written; the
+	 * rest are assumed to be zeroed already (presumably by radeon_state_init()).
+	 */
+	rstate->states[R600_RASTERIZER__PA_CL_GB_HORZ_CLIP_ADJ] = 0x3F800000;
+	rstate->states[R600_RASTERIZER__PA_CL_GB_HORZ_DISC_ADJ] = 0x3F800000;
+	rstate->states[R600_RASTERIZER__PA_CL_GB_VERT_CLIP_ADJ] = 0x3F800000;
+	rstate->states[R600_RASTERIZER__PA_CL_GB_VERT_DISC_ADJ] = 0x3F800000;
+	rstate->states[R600_RASTERIZER__PA_SC_LINE_CNTL] = 0x00000400;
+	rstate->states[R600_RASTERIZER__PA_SC_LINE_STIPPLE] = 0x00000005;
+	rstate->states[R600_RASTERIZER__PA_SU_LINE_CNTL] = 0x00000008;
+	rstate->states[R600_RASTERIZER__PA_SU_POINT_MINMAX] = 0x80000000;
+	rstate->states[R600_RASTERIZER__PA_SU_SC_MODE_CNTL] = 0x00080004;
+	rstate->states[R600_RASTERIZER__SPI_INTERP_CONTROL_0] = 0x00000001;
+
+	radeon_state_pm4(rstate);
+}
+
+static void r600_blit_state_dsa(struct r600_screen *rscreen, struct radeon_state *rstate)
+{
+	radeon_state_init(rstate, rscreen->rw, R600_STATE_DSA, 0, 0);
+
+	/* Only the registers that need a non-zero value are written; the
+	 * rest are assumed to be zeroed already (presumably by radeon_state_init()).
+	 */
+	rstate->states[R600_DSA__DB_ALPHA_TO_MASK] = 0x0000AA00;
+	rstate->states[R600_DSA__DB_DEPTH_CLEAR] = 0x3F800000;
+	rstate->states[R600_DSA__DB_RENDER_CONTROL] = 0x00000060; /* r600_blit_uncompress_depth() overrides this with 0x8C */
+	rstate->states[R600_DSA__DB_RENDER_OVERRIDE] = 0x0000002A;
+	rstate->states[R600_DSA__DB_SHADER_CONTROL] = 0x00000210;
+
+	radeon_state_pm4(rstate);
+}
+
+static void r600_blit_state_blend(struct r600_screen *rscreen, struct radeon_state *rstate)
+{
+	radeon_state_init(rstate, rscreen->rw, R600_STATE_BLEND, 0, 0); /* no blend register is overridden afterwards */
+	radeon_state_pm4(rstate);
+}
+
+static void r600_blit_state_cb_cntl(struct r600_screen *rscreen, struct radeon_state *rstate)
+{
+	radeon_state_init(rstate, rscreen->rw, R600_STATE_CB_CNTL, 0, 0);
+	rstate->states[R600_CB_CNTL__CB_CLRCMP_CONTROL] = 0x01000000;
+	rstate->states[R600_CB_CNTL__CB_CLRCMP_DST] = 0x000000FF;
+	rstate->states[R600_CB_CNTL__CB_CLRCMP_MSK] = 0xFFFFFFFF;
+	rstate->states[R600_CB_CNTL__CB_COLOR_CONTROL] = 0x00CC0080;
+	rstate->states[R600_CB_CNTL__CB_SHADER_MASK] = 0x0000000F;
+	rstate->states[R600_CB_CNTL__CB_TARGET_MASK] = 0x0000000F; /* r600_blit_uncompress_depth() overrides this with 0x1 */
+	rstate->states[R600_CB_CNTL__PA_SC_AA_MASK] = 0xFFFFFFFF;
+	radeon_state_pm4(rstate);
+}
+
+/* Build every blit state in bstates; returns 0 or the error reported by the vertex resource setup. */
+static int r600_blit_states_init(struct pipe_context *ctx, struct r600_blit_states *bstates)
+{
+	struct r600_screen *rscreen = r600_screen(ctx->screen);
+	r600_blit_state_ps_shader(rscreen, &bstates->ps_shader);
+	r600_blit_state_vs_shader(rscreen, &bstates->vs_shader);
+	r600_blit_state_vgt(rscreen, &bstates->vgt);
+	r600_blit_state_draw(rscreen, &bstates->draw);
+	r600_blit_state_vs_constant(rscreen, &bstates->vs_constant0, 0, 1.0, 0.0, 0.0, 0.0);
+	r600_blit_state_vs_constant(rscreen, &bstates->vs_constant1, 1, 0.0, 1.0, 0.0, 0.0);
+	r600_blit_state_vs_constant(rscreen, &bstates->vs_constant2, 2, 0.0, 0.0, -0.00199900055, 0.0);
+	r600_blit_state_vs_constant(rscreen, &bstates->vs_constant3, 3, 0.0, 0.0, -0.99900049, 1.0);
+	r600_blit_state_rasterizer(rscreen, &bstates->rasterizer);
+	r600_blit_state_dsa(rscreen, &bstates->dsa);
+	r600_blit_state_blend(rscreen, &bstates->blend);
+	r600_blit_state_cb_cntl(rscreen, &bstates->cb_cntl);
+	/* only this step reports failure; states built above are not released here if it fails */
+	return r600_blit_state_vs_resources(rscreen, bstates);
+}
+
+static void r600_blit_states_destroy(struct pipe_context *ctx, struct r600_blit_states *bstates)
+{
+	radeon_state_fini(&bstates->ps_shader); /* only the four states that were given a buffer object are finalized */
+	radeon_state_fini(&bstates->vs_shader);
+	radeon_state_fini(&bstates->vs_resource0);
+	radeon_state_fini(&bstates->vs_resource1);
+}
+
+/* Schedule the depth-uncompress draw for one level of rtexture; returns 0 on success, non-zero on failure. */
+int r600_blit_uncompress_depth(struct pipe_context *ctx, struct r600_resource_texture *rtexture, unsigned level)
+{
+	struct r600_screen *rscreen = r600_screen(ctx->screen);
+	struct r600_context *rctx = r600_context(ctx);
+	struct radeon_draw draw;
+	struct r600_blit_states bstates;
+	int r;
+
+	r = r600_texture_scissor(ctx, rtexture, level);
+	if (r) {
+		return r;
+	}
+	r = r600_texture_cb(ctx, rtexture, 0, level);
+	if (r) {
+		return r;
+	}
+	r = r600_texture_db(ctx, rtexture, level);
+	if (r) {
+		return r;
+	}
+	r = r600_texture_viewport(ctx, rtexture, level);
+	if (r) {
+		return r;
+	}
+	r = r600_blit_states_init(ctx, &bstates);
+	if (r) {
+		return r;
+	}
+	bstates.dsa.states[R600_DSA__DB_RENDER_CONTROL] = 0x0000008C;
+	bstates.cb_cntl.states[R600_CB_CNTL__CB_TARGET_MASK] = 0x00000001;
+	/* force rebuild */
+	bstates.dsa.cpm4 = bstates.cb_cntl.cpm4 = 0;
+	r = radeon_state_pm4(&bstates.dsa);
+	if (r) {
+		goto out;
+	}
+	r = radeon_state_pm4(&bstates.cb_cntl);
+	if (r) {
+		goto out;
+	}
+
+	r = radeon_draw_init(&draw, rscreen->rw);
+	if (r) {
+		R600_ERR("failed creating draw for uncompressing textures\n");
+		goto out;
+	}
+
+	radeon_draw_bind(&draw, &bstates.vs_shader);
+	radeon_draw_bind(&draw, &bstates.ps_shader);
+	radeon_draw_bind(&draw, &bstates.rasterizer);
+	radeon_draw_bind(&draw, &bstates.dsa);
+	radeon_draw_bind(&draw, &bstates.blend);
+	radeon_draw_bind(&draw, &bstates.cb_cntl);
+	radeon_draw_bind(&draw, &rctx->config);
+	radeon_draw_bind(&draw, &bstates.vgt);
+	radeon_draw_bind(&draw, &bstates.draw);
+	radeon_draw_bind(&draw, &bstates.vs_resource0);
+	radeon_draw_bind(&draw, &bstates.vs_resource1);
+	radeon_draw_bind(&draw, &bstates.vs_constant0);
+	radeon_draw_bind(&draw, &bstates.vs_constant1);
+	radeon_draw_bind(&draw, &bstates.vs_constant2);
+	radeon_draw_bind(&draw, &bstates.vs_constant3);
+	radeon_draw_bind(&draw, &rtexture->viewport[level]);
+	radeon_draw_bind(&draw, &rtexture->scissor[level]);
+	radeon_draw_bind(&draw, &rtexture->cb[0][level]);
+	radeon_draw_bind(&draw, &rtexture->db[level]);
+
+	/* suspend queries */
+	r600_queries_suspend(ctx);
+
+	/* schedule draw */
+	r = radeon_ctx_set_draw(&rctx->ctx, &draw);
+	if (r == -EBUSY) {
+		r600_flush(ctx, 0, NULL);
+		r = radeon_ctx_set_draw(&rctx->ctx, &draw);
+	}
+
+	/* resume queries unconditionally so that a failed draw does not
+	 * leave them suspended; r still holds the scheduling result */
+	r600_queries_resume(ctx);
+
+out:
+	r600_blit_states_destroy(ctx, &bstates);
+	return r;
+}
diff --git a/src/gallium/drivers/r600/r600_context.c b/src/gallium/drivers/r600/r600_context.c
index edde80c660..7a0e5b4049 100644
--- a/src/gallium/drivers/r600/r600_context.c
+++ b/src/gallium/drivers/r600/r600_context.c
@@ -34,10 +34,26 @@
 #include "r600_resource.h"
 #include "r600d.h"
 
+
 static void r600_destroy_context(struct pipe_context *context)
 {
 	struct r600_context *rctx = r600_context(context);
 
+	rctx->rasterizer = r600_context_state_decref(rctx->rasterizer);
+	rctx->poly_stipple = r600_context_state_decref(rctx->poly_stipple);
+	rctx->scissor = r600_context_state_decref(rctx->scissor);
+	rctx->clip = r600_context_state_decref(rctx->clip);
+	rctx->ps_shader = r600_context_state_decref(rctx->ps_shader);
+	rctx->vs_shader = r600_context_state_decref(rctx->vs_shader);
+	rctx->depth = r600_context_state_decref(rctx->depth);
+	rctx->stencil = r600_context_state_decref(rctx->stencil);
+	rctx->alpha = r600_context_state_decref(rctx->alpha);
+	rctx->dsa = r600_context_state_decref(rctx->dsa);
+	rctx->blend = r600_context_state_decref(rctx->blend);
+	rctx->stencil_ref = r600_context_state_decref(rctx->stencil_ref);
+	rctx->viewport = r600_context_state_decref(rctx->viewport);
+	rctx->framebuffer = r600_context_state_decref(rctx->framebuffer);
+	radeon_ctx_fini(&rctx->ctx);
 	FREE(rctx);
 }
 
@@ -45,27 +61,35 @@ void r600_flush(struct pipe_context *ctx, unsigned flags,
 			struct pipe_fence_handle **fence)
 {
 	struct r600_context *rctx = r600_context(ctx);
-	struct r600_screen *rscreen = rctx->screen;
+	struct r600_query *rquery;
 	static int dc = 0;
 	char dname[256];
 
-	if (radeon_ctx_pm4(rctx->ctx))
-		return;
+	/* suspend queries */
+	r600_queries_suspend(ctx);
 	/* FIXME dumping should be removed once shader support instructions
 	 * without throwing bad code
 	 */
-	if (!rctx->ctx->cpm4)
+	if (!rctx->ctx.cdwords)
 		goto out;
+#if 0
 	sprintf(dname, "gallium-%08d.bof", dc);
-	if (dc < 1)
-		radeon_ctx_dump_bof(rctx->ctx, dname);
+	if (dc < 2) {
+		radeon_ctx_dump_bof(&rctx->ctx, dname);
+		R600_ERR("dumped %s\n", dname);
+	}
+#endif
 #if 1
-	radeon_ctx_submit(rctx->ctx);
+	radeon_ctx_submit(&rctx->ctx);
 #endif
+	LIST_FOR_EACH_ENTRY(rquery, &rctx->query_list, list) {
+		rquery->flushed = true;
+	}
 	dc++;
 out:
-	rctx->ctx = radeon_ctx_decref(rctx->ctx);
-	rctx->ctx = radeon_ctx(rscreen->rw);
+	radeon_ctx_clear(&rctx->ctx);
+	/* resume queries */
+	r600_queries_resume(ctx);
 }
 
 static void r600_init_config(struct r600_context *rctx)
@@ -207,9 +231,9 @@ static void r600_init_config(struct r600_context *rctx)
 		num_es_stack_entries = 0;
 		break;
 	}
-	rctx->hw_states.config = radeon_state(rctx->rw, R600_CONFIG_TYPE, R600_CONFIG);
+	radeon_state_init(&rctx->config, rctx->rw, R600_STATE_CONFIG, 0, 0);
 
-	rctx->hw_states.config->states[R600_CONFIG__SQ_CONFIG] = 0x00000000;
+	rctx->config.states[R600_CONFIG__SQ_CONFIG] = 0x00000000;
 	switch (family) {
 	case CHIP_RV610:
 	case CHIP_RV620:
@@ -218,75 +242,85 @@ static void r600_init_config(struct r600_context *rctx)
 	case CHIP_RV710:
 		break;
 	default:
-		rctx->hw_states.config->states[R600_CONFIG__SQ_CONFIG] |= S_008C00_VC_ENABLE(1);
+		rctx->config.states[R600_CONFIG__SQ_CONFIG] |= S_008C00_VC_ENABLE(1);
 		break;
 	}
-	rctx->hw_states.config->states[R600_CONFIG__SQ_CONFIG] |= S_008C00_DX9_CONSTS(1);
-	rctx->hw_states.config->states[R600_CONFIG__SQ_CONFIG] |= S_008C00_ALU_INST_PREFER_VECTOR(1);
-	rctx->hw_states.config->states[R600_CONFIG__SQ_CONFIG] |= S_008C00_PS_PRIO(ps_prio);
-	rctx->hw_states.config->states[R600_CONFIG__SQ_CONFIG] |= S_008C00_VS_PRIO(vs_prio);
-	rctx->hw_states.config->states[R600_CONFIG__SQ_CONFIG] |= S_008C00_GS_PRIO(gs_prio);
-	rctx->hw_states.config->states[R600_CONFIG__SQ_CONFIG] |= S_008C00_ES_PRIO(es_prio);
+	rctx->config.states[R600_CONFIG__SQ_CONFIG] |= S_008C00_DX9_CONSTS(1);
+	rctx->config.states[R600_CONFIG__SQ_CONFIG] |= S_008C00_ALU_INST_PREFER_VECTOR(1);
+	rctx->config.states[R600_CONFIG__SQ_CONFIG] |= S_008C00_PS_PRIO(ps_prio);
+	rctx->config.states[R600_CONFIG__SQ_CONFIG] |= S_008C00_VS_PRIO(vs_prio);
+	rctx->config.states[R600_CONFIG__SQ_CONFIG] |= S_008C00_GS_PRIO(gs_prio);
+	rctx->config.states[R600_CONFIG__SQ_CONFIG] |= S_008C00_ES_PRIO(es_prio);
+
+	rctx->config.states[R600_CONFIG__SQ_GPR_RESOURCE_MGMT_1] = 0;
+	rctx->config.states[R600_CONFIG__SQ_GPR_RESOURCE_MGMT_1] |= S_008C04_NUM_PS_GPRS(num_ps_gprs);
+	rctx->config.states[R600_CONFIG__SQ_GPR_RESOURCE_MGMT_1] |= S_008C04_NUM_VS_GPRS(num_vs_gprs);
+	rctx->config.states[R600_CONFIG__SQ_GPR_RESOURCE_MGMT_1] |= S_008C04_NUM_CLAUSE_TEMP_GPRS(num_temp_gprs);
 
-	rctx->hw_states.config->states[R600_CONFIG__SQ_GPR_RESOURCE_MGMT_1] = 0;
-	rctx->hw_states.config->states[R600_CONFIG__SQ_GPR_RESOURCE_MGMT_1] |= S_008C04_NUM_PS_GPRS(num_ps_gprs);
-	rctx->hw_states.config->states[R600_CONFIG__SQ_GPR_RESOURCE_MGMT_1] |= S_008C04_NUM_VS_GPRS(num_vs_gprs);
-	rctx->hw_states.config->states[R600_CONFIG__SQ_GPR_RESOURCE_MGMT_1] |= S_008C04_NUM_CLAUSE_TEMP_GPRS(num_temp_gprs);
+	rctx->config.states[R600_CONFIG__SQ_GPR_RESOURCE_MGMT_2] = 0;	/* one entry carries both the GS and the ES GPR count */
+	rctx->config.states[R600_CONFIG__SQ_GPR_RESOURCE_MGMT_2] |= S_008C08_NUM_GS_GPRS(num_gs_gprs);
+	rctx->config.states[R600_CONFIG__SQ_GPR_RESOURCE_MGMT_2] |= S_008C08_NUM_ES_GPRS(num_es_gprs);
 
-	rctx->hw_states.config->states[R600_CONFIG__SQ_GPR_RESOURCE_MGMT_2] = 0;
-	rctx->hw_states.config->states[R600_CONFIG__SQ_GPR_RESOURCE_MGMT_2] |= S_008C08_NUM_GS_GPRS(num_gs_gprs);
-	rctx->hw_states.config->states[R600_CONFIG__SQ_GPR_RESOURCE_MGMT_2] |= S_008C08_NUM_GS_GPRS(num_es_gprs);
+	rctx->config.states[R600_CONFIG__SQ_THREAD_RESOURCE_MGMT] = 0;
+	rctx->config.states[R600_CONFIG__SQ_THREAD_RESOURCE_MGMT] |= S_008C0C_NUM_PS_THREADS(num_ps_threads);
+	rctx->config.states[R600_CONFIG__SQ_THREAD_RESOURCE_MGMT] |= S_008C0C_NUM_VS_THREADS(num_vs_threads);
+	rctx->config.states[R600_CONFIG__SQ_THREAD_RESOURCE_MGMT] |= S_008C0C_NUM_GS_THREADS(num_gs_threads);
+	rctx->config.states[R600_CONFIG__SQ_THREAD_RESOURCE_MGMT] |= S_008C0C_NUM_ES_THREADS(num_es_threads);
 
-	rctx->hw_states.config->states[R600_CONFIG__SQ_THREAD_RESOURCE_MGMT] = 0;
-	rctx->hw_states.config->states[R600_CONFIG__SQ_THREAD_RESOURCE_MGMT] |= S_008C0C_NUM_PS_THREADS(num_ps_threads);
-	rctx->hw_states.config->states[R600_CONFIG__SQ_THREAD_RESOURCE_MGMT] |= S_008C0C_NUM_VS_THREADS(num_vs_threads);
-	rctx->hw_states.config->states[R600_CONFIG__SQ_THREAD_RESOURCE_MGMT] |= S_008C0C_NUM_GS_THREADS(num_gs_threads);
-	rctx->hw_states.config->states[R600_CONFIG__SQ_THREAD_RESOURCE_MGMT] |= S_008C0C_NUM_ES_THREADS(num_es_threads);
+	rctx->config.states[R600_CONFIG__SQ_STACK_RESOURCE_MGMT_1] = 0;
+	rctx->config.states[R600_CONFIG__SQ_STACK_RESOURCE_MGMT_1] |= S_008C10_NUM_PS_STACK_ENTRIES(num_ps_stack_entries);
+	rctx->config.states[R600_CONFIG__SQ_STACK_RESOURCE_MGMT_1] |= S_008C10_NUM_VS_STACK_ENTRIES(num_vs_stack_entries);
 
-	rctx->hw_states.config->states[R600_CONFIG__SQ_STACK_RESOURCE_MGMT_1] = 0;
-	rctx->hw_states.config->states[R600_CONFIG__SQ_STACK_RESOURCE_MGMT_1] |= S_008C10_NUM_PS_STACK_ENTRIES(num_ps_stack_entries);
-	rctx->hw_states.config->states[R600_CONFIG__SQ_STACK_RESOURCE_MGMT_1] |= S_008C10_NUM_VS_STACK_ENTRIES(num_vs_stack_entries);
+	rctx->config.states[R600_CONFIG__SQ_STACK_RESOURCE_MGMT_2] = 0;
+	rctx->config.states[R600_CONFIG__SQ_STACK_RESOURCE_MGMT_2] |= S_008C14_NUM_GS_STACK_ENTRIES(num_gs_stack_entries);
+	rctx->config.states[R600_CONFIG__SQ_STACK_RESOURCE_MGMT_2] |= S_008C14_NUM_ES_STACK_ENTRIES(num_es_stack_entries);
 
-	rctx->hw_states.config->states[R600_CONFIG__SQ_STACK_RESOURCE_MGMT_2] = 0;
-	rctx->hw_states.config->states[R600_CONFIG__SQ_STACK_RESOURCE_MGMT_2] |= S_008C14_NUM_GS_STACK_ENTRIES(num_gs_stack_entries);
-	rctx->hw_states.config->states[R600_CONFIG__SQ_STACK_RESOURCE_MGMT_2] |= S_008C14_NUM_ES_STACK_ENTRIES(num_es_stack_entries);
+	rctx->config.states[R600_CONFIG__VC_ENHANCE] = 0x00000000;
+	rctx->config.states[R600_CONFIG__SX_MISC] = 0x00000000;
 
-	rctx->hw_states.config->states[R600_CONFIG__SQ_DYN_GPR_CNTL_PS_FLUSH_REQ] = 0x00004000;
-	rctx->hw_states.config->states[R600_CONFIG__TA_CNTL_AUX] = 0x07000002;
-	rctx->hw_states.config->states[R600_CONFIG__VC_ENHANCE] = 0x00000000;
-	rctx->hw_states.config->states[R600_CONFIG__DB_DEBUG] = 0x00000000;
-	rctx->hw_states.config->states[R600_CONFIG__DB_WATERMARKS] = 0x00420204;
-	rctx->hw_states.config->states[R600_CONFIG__SX_MISC] = 0x00000000;
-	rctx->hw_states.config->states[R600_CONFIG__SPI_THREAD_GROUPING] = 0x00000001;
-	rctx->hw_states.config->states[R600_CONFIG__CB_SHADER_CONTROL] = 0x00000003;
-	rctx->hw_states.config->states[R600_CONFIG__SQ_ESGS_RING_ITEMSIZE] = 0x00000000;
-	rctx->hw_states.config->states[R600_CONFIG__SQ_GSVS_RING_ITEMSIZE] = 0x00000000;
-	rctx->hw_states.config->states[R600_CONFIG__SQ_ESTMP_RING_ITEMSIZE] = 0x00000000;
-	rctx->hw_states.config->states[R600_CONFIG__SQ_GSTMP_RING_ITEMSIZE] = 0x00000000;
-	rctx->hw_states.config->states[R600_CONFIG__SQ_VSTMP_RING_ITEMSIZE] = 0x00000000;
-	rctx->hw_states.config->states[R600_CONFIG__SQ_PSTMP_RING_ITEMSIZE] = 0x00000000;
-	rctx->hw_states.config->states[R600_CONFIG__SQ_FBUF_RING_ITEMSIZE] = 0x00000000;
-	rctx->hw_states.config->states[R600_CONFIG__SQ_REDUC_RING_ITEMSIZE] = 0x00000000;
-	rctx->hw_states.config->states[R600_CONFIG__SQ_GS_VERT_ITEMSIZE] = 0x00000000;
-	rctx->hw_states.config->states[R600_CONFIG__VGT_OUTPUT_PATH_CNTL] = 0x00000000;
-	rctx->hw_states.config->states[R600_CONFIG__VGT_HOS_CNTL] = 0x00000000;
-	rctx->hw_states.config->states[R600_CONFIG__VGT_HOS_MAX_TESS_LEVEL] = 0x00000000;
-	rctx->hw_states.config->states[R600_CONFIG__VGT_HOS_MIN_TESS_LEVEL] = 0x00000000;
-	rctx->hw_states.config->states[R600_CONFIG__VGT_HOS_REUSE_DEPTH] = 0x00000000;
-	rctx->hw_states.config->states[R600_CONFIG__VGT_GROUP_PRIM_TYPE] = 0x00000000;
-	rctx->hw_states.config->states[R600_CONFIG__VGT_GROUP_FIRST_DECR] = 0x00000000;
-	rctx->hw_states.config->states[R600_CONFIG__VGT_GROUP_DECR] = 0x00000000;
-	rctx->hw_states.config->states[R600_CONFIG__VGT_GROUP_VECT_0_CNTL] = 0x00000000;
-	rctx->hw_states.config->states[R600_CONFIG__VGT_GROUP_VECT_1_CNTL] = 0x00000000;
-	rctx->hw_states.config->states[R600_CONFIG__VGT_GROUP_VECT_0_FMT_CNTL] = 0x00000000;
-	rctx->hw_states.config->states[R600_CONFIG__VGT_GROUP_VECT_1_FMT_CNTL] = 0x00000000;
-	rctx->hw_states.config->states[R600_CONFIG__VGT_GS_MODE] = 0x00000000;
-	rctx->hw_states.config->states[R600_CONFIG__PA_SC_MODE_CNTL] = 0x00514000;
-	rctx->hw_states.config->states[R600_CONFIG__VGT_STRMOUT_EN] = 0x00000000;
-	rctx->hw_states.config->states[R600_CONFIG__VGT_REUSE_OFF] = 0x00000001;
-	rctx->hw_states.config->states[R600_CONFIG__VGT_VTX_CNT_EN] = 0x00000000;
-	rctx->hw_states.config->states[R600_CONFIG__VGT_STRMOUT_BUFFER_EN] = 0x00000000;
-	radeon_state_pm4(rctx->hw_states.config);
+	if (family >= CHIP_RV770) {
+		rctx->config.states[R600_CONFIG__SQ_DYN_GPR_CNTL_PS_FLUSH_REQ] = 0x00004000;
+		rctx->config.states[R600_CONFIG__TA_CNTL_AUX] = 0x07000002;
+		rctx->config.states[R600_CONFIG__DB_DEBUG] = 0x00000000;
+		rctx->config.states[R600_CONFIG__DB_WATERMARKS] = 0x00420204;
+		rctx->config.states[R600_CONFIG__SPI_THREAD_GROUPING] = 0x00000000;
+		rctx->config.states[R600_CONFIG__PA_SC_MODE_CNTL] = 0x00514000;
+	} else {
+		rctx->config.states[R600_CONFIG__SQ_DYN_GPR_CNTL_PS_FLUSH_REQ] = 0x00000000;
+		rctx->config.states[R600_CONFIG__TA_CNTL_AUX] = 0x07000003;
+		rctx->config.states[R600_CONFIG__DB_DEBUG] = 0x82000000;
+		rctx->config.states[R600_CONFIG__DB_WATERMARKS] = 0x01020204;
+		rctx->config.states[R600_CONFIG__SPI_THREAD_GROUPING] = 0x00000001;
+		rctx->config.states[R600_CONFIG__PA_SC_MODE_CNTL] = 0x00004010;
+	}
+	rctx->config.states[R600_CONFIG__CB_SHADER_CONTROL] = 0x00000003;
+	rctx->config.states[R600_CONFIG__SQ_ESGS_RING_ITEMSIZE] = 0x00000000;
+	rctx->config.states[R600_CONFIG__SQ_GSVS_RING_ITEMSIZE] = 0x00000000;
+	rctx->config.states[R600_CONFIG__SQ_ESTMP_RING_ITEMSIZE] = 0x00000000;
+	rctx->config.states[R600_CONFIG__SQ_GSTMP_RING_ITEMSIZE] = 0x00000000;
+	rctx->config.states[R600_CONFIG__SQ_VSTMP_RING_ITEMSIZE] = 0x00000000;
+	rctx->config.states[R600_CONFIG__SQ_PSTMP_RING_ITEMSIZE] = 0x00000000;
+	rctx->config.states[R600_CONFIG__SQ_FBUF_RING_ITEMSIZE] = 0x00000000;
+	rctx->config.states[R600_CONFIG__SQ_REDUC_RING_ITEMSIZE] = 0x00000000;
+	rctx->config.states[R600_CONFIG__SQ_GS_VERT_ITEMSIZE] = 0x00000000;
+	rctx->config.states[R600_CONFIG__VGT_OUTPUT_PATH_CNTL] = 0x00000000;
+	rctx->config.states[R600_CONFIG__VGT_HOS_CNTL] = 0x00000000;
+	rctx->config.states[R600_CONFIG__VGT_HOS_MAX_TESS_LEVEL] = 0x00000000;
+	rctx->config.states[R600_CONFIG__VGT_HOS_MIN_TESS_LEVEL] = 0x00000000;
+	rctx->config.states[R600_CONFIG__VGT_HOS_REUSE_DEPTH] = 0x00000000;
+	rctx->config.states[R600_CONFIG__VGT_GROUP_PRIM_TYPE] = 0x00000000;
+	rctx->config.states[R600_CONFIG__VGT_GROUP_FIRST_DECR] = 0x00000000;
+	rctx->config.states[R600_CONFIG__VGT_GROUP_DECR] = 0x00000000;
+	rctx->config.states[R600_CONFIG__VGT_GROUP_VECT_0_CNTL] = 0x00000000;
+	rctx->config.states[R600_CONFIG__VGT_GROUP_VECT_1_CNTL] = 0x00000000;
+	rctx->config.states[R600_CONFIG__VGT_GROUP_VECT_0_FMT_CNTL] = 0x00000000;
+	rctx->config.states[R600_CONFIG__VGT_GROUP_VECT_1_FMT_CNTL] = 0x00000000;
+	rctx->config.states[R600_CONFIG__VGT_GS_MODE] = 0x00000000;
+	rctx->config.states[R600_CONFIG__VGT_STRMOUT_EN] = 0x00000000;
+	rctx->config.states[R600_CONFIG__VGT_REUSE_OFF] = 0x00000001;
+	rctx->config.states[R600_CONFIG__VGT_VTX_CNT_EN] = 0x00000000;
+	rctx->config.states[R600_CONFIG__VGT_STRMOUT_BUFFER_EN] = 0x00000000;
+	radeon_state_pm4(&rctx->config);
 }
 
 struct pipe_context *r600_create_context(struct pipe_screen *screen, void *priv)
@@ -320,7 +354,7 @@ struct pipe_context *r600_create_context(struct pipe_screen *screen, void *priv)
 
 	r600_init_config(rctx);
 
-	rctx->ctx = radeon_ctx(rscreen->rw);
-	rctx->draw = radeon_draw(rscreen->rw);
+	radeon_ctx_init(&rctx->ctx, rscreen->rw);
+	radeon_draw_init(&rctx->draw, rscreen->rw);
 	return &rctx->context;
 }
diff --git a/src/gallium/drivers/r600/r600_context.h b/src/gallium/drivers/r600/r600_context.h
index 76d5de8653..cea0813054 100644
--- a/src/gallium/drivers/r600/r600_context.h
+++ b/src/gallium/drivers/r600/r600_context.h
@@ -30,9 +30,32 @@
 #include <tgsi/tgsi_parse.h>
 #include <tgsi/tgsi_util.h>
 #include <util/u_blitter.h>
+#include <util/u_double_list.h>
 #include "radeon.h"
 #include "r600_shader.h"
 
+#define R600_QUERY_STATE_STARTED	(1 << 0)
+#define R600_QUERY_STATE_ENDED		(1 << 1)
+#define R600_QUERY_STATE_SUSPENDED	(1 << 2)
+
+struct r600_query {
+	u64					result;		/* running sum of (end - start) pairs; zeroed by r600_get_query_result() */
+	/* The kind of query. Currently only OQ is supported. */
+	unsigned				type;
+	/* Fill level of the result buffer: r600_query_suspend() adds 16 per begin/end pair, r600_query_result() resets it.
+	 * NOTE(review): originally documented as dwords, yet read back as a dword index stepping by 4 -- confirm the unit. */
+	unsigned				num_results;
+	/* TRUE once r600_flush() has submitted the commands of this query; cleared when the query is (re)started. */
+	boolean					flushed;
+	unsigned				state;		/* bitmask of R600_QUERY_STATE_* */
+	/* The buffer where query results are stored. */
+	struct radeon_bo			*buffer;
+	unsigned				buffer_size;	/* set to 4096 at creation; presumably bytes -- TODO confirm */
+	/* Node in r600_context::query_list. */
+	struct list_head			list;
+	struct radeon_state			rstate;		/* re-initialised as QUERY_BEGIN or QUERY_END state each time */
+};
+
 /* XXX move this to a more appropriate place */
 union pipe_states {
 	struct pipe_rasterizer_state		rasterizer;
@@ -72,13 +95,16 @@ enum pipe_state_type {
 	pipe_type_count
 };
 
+#define R600_MAX_RSTATE		16
+
 struct r600_context_state {
 	union pipe_states		state;
 	unsigned			refcount;
 	unsigned			type;
-	struct radeon_state		*rstate;
+	struct radeon_state		rstate[R600_MAX_RSTATE];
 	struct r600_shader		shader;
 	struct radeon_bo		*bo;
+	unsigned			nrstate;
 };
 
 struct r600_vertex_element
@@ -89,28 +115,25 @@ struct r600_vertex_element
 };
 
 struct r600_context_hw_states {
-	struct radeon_state	*rasterizer;
-	struct radeon_state	*scissor;
-	struct radeon_state	*dsa;
-	struct radeon_state	*blend;
-	struct radeon_state	*viewport;
-	struct radeon_state	*cb[8];
-	struct radeon_state	*config;
-	struct radeon_state	*cb_cntl;
-	struct radeon_state	*db;
-	unsigned		ps_nresource;
-	unsigned		ps_nsampler;
-	struct radeon_state	*ps_resource[160];
-	struct radeon_state	*ps_sampler[16];
+	struct radeon_state	rasterizer;
+	struct radeon_state	scissor;
+	struct radeon_state	dsa;
+	struct radeon_state	cb_cntl;
 };
 
 struct r600_context {
 	struct pipe_context		context;
 	struct r600_screen		*screen;
 	struct radeon			*rw;
-	struct radeon_ctx		*ctx;
+	struct radeon_ctx		ctx;
 	struct blitter_context		*blitter;
-	struct radeon_draw		*draw;
+	struct radeon_draw		draw;
+	struct radeon_state		config;
+	/* FIXME get rid of those vs_resource,vs/ps_constant */
+	struct radeon_state		vs_resource[160];
+	unsigned			vs_nresource;
+	struct radeon_state		vs_constant[256];
+	struct radeon_state		ps_constant[256];
 	/* hw states */
 	struct r600_context_hw_states	hw_states;
 	/* pipe states */
@@ -134,14 +157,15 @@ struct r600_context {
 	struct r600_context_state	*stencil_ref;
 	struct r600_context_state	*viewport;
 	struct r600_context_state	*framebuffer;
-	struct r600_context_state	*ps_sampler[PIPE_MAX_ATTRIBS];
-	struct r600_context_state	*vs_sampler[PIPE_MAX_ATTRIBS];
-	struct r600_context_state	*ps_sampler_view[PIPE_MAX_ATTRIBS];
-	struct r600_context_state	*vs_sampler_view[PIPE_MAX_ATTRIBS];
+	struct radeon_state		*ps_sampler[PIPE_MAX_ATTRIBS];
+	struct radeon_state		*vs_sampler[PIPE_MAX_ATTRIBS];
+	struct radeon_state		*ps_sampler_view[PIPE_MAX_ATTRIBS];
+	struct radeon_state		*vs_sampler_view[PIPE_MAX_ATTRIBS];
 	struct r600_vertex_element	*vertex_elements;
 	struct pipe_vertex_buffer	vertex_buffer[PIPE_MAX_ATTRIBS];
 	struct pipe_index_buffer	index_buffer;
-	struct pipe_blend_color         blend_color;
+	struct pipe_blend_color		blend_color;
+	struct list_head		query_list;
 };
 
 /* Convenience cast wrapper. */
@@ -150,13 +174,18 @@ static INLINE struct r600_context *r600_context(struct pipe_context *pipe)
     return (struct r600_context*)pipe;
 }
 
+static INLINE struct r600_query* r600_query(struct pipe_query* q)
+{
+    return (struct r600_query*)q;
+}
+
 struct r600_context_state *r600_context_state(struct r600_context *rctx, unsigned type, const void *state);
 struct r600_context_state *r600_context_state_incref(struct r600_context_state *rstate);
 struct r600_context_state *r600_context_state_decref(struct r600_context_state *rstate);
 void r600_flush(struct pipe_context *ctx, unsigned flags,
 			struct pipe_fence_handle **fence);
 
-int r600_context_hw_states(struct r600_context *rctx);
+int r600_context_hw_states(struct pipe_context *ctx);
 
 void r600_draw_vbo(struct pipe_context *ctx,
                    const struct pipe_draw_info *info);
@@ -178,4 +207,10 @@ extern int r600_pipe_shader_update(struct pipe_context *ctx,
 uint32_t r600_translate_texformat(enum pipe_format format,
 				  const unsigned char *swizzle_view, 
 				  uint32_t *word4_p, uint32_t *yuv_format_p);
+
+/* query */
+extern void r600_queries_resume(struct pipe_context *ctx);
+extern void r600_queries_suspend(struct pipe_context *ctx);
+
+
 #endif
diff --git a/src/gallium/drivers/r600/r600_draw.c b/src/gallium/drivers/r600/r600_draw.c
index f058455162..fabd337d23 100644
--- a/src/gallium/drivers/r600/r600_draw.c
+++ b/src/gallium/drivers/r600/r600_draw.c
@@ -31,6 +31,7 @@
 #include <util/u_math.h>
 #include <util/u_inlines.h>
 #include <util/u_memory.h>
+#include "radeon.h"
 #include "r600_screen.h"
 #include "r600_context.h"
 #include "r600_resource.h"
@@ -38,8 +39,8 @@
 
 struct r600_draw {
 	struct pipe_context	*ctx;
-	struct radeon_state	*draw;
-	struct radeon_state	*vgt;
+	struct radeon_state	draw;
+	struct radeon_state	vgt;
 	unsigned		mode;
 	unsigned		start;
 	unsigned		count;
@@ -51,6 +52,7 @@ static int r600_draw_common(struct r600_draw *draw)
 {
 	struct r600_context *rctx = r600_context(draw->ctx);
 	struct r600_screen *rscreen = rctx->screen;
+	/* FIXME vs_resource */
 	struct radeon_state *vs_resource;
 	struct r600_resource *rbuffer;
 	unsigned i, j, offset, format, prim;
@@ -58,7 +60,7 @@ static int r600_draw_common(struct r600_draw *draw)
 	struct pipe_vertex_buffer *vertex_buffer;
 	int r;
 
-	r = r600_context_hw_states(rctx);
+	r = r600_context_hw_states(draw->ctx);
 	if (r)
 		return r;
 	switch (draw->index_size) {
@@ -81,6 +83,7 @@ static int r600_draw_common(struct r600_draw *draw)
 	r = r600_conv_pipe_prim(draw->mode, &prim);
 	if (r)
 		return r;
+
 	/* rebuild vertex shader if input format changed */
 	r = r600_pipe_shader_update(draw->ctx, rctx->vs_shader);
 	if (r)
@@ -88,26 +91,24 @@ static int r600_draw_common(struct r600_draw *draw)
 	r = r600_pipe_shader_update(draw->ctx, rctx->ps_shader);
 	if (r)
 		return r;
-	r = radeon_draw_set(rctx->draw, rctx->vs_shader->rstate);
-	if (r)
-		return r;
-	r = radeon_draw_set(rctx->draw, rctx->ps_shader->rstate);
-	if (r)
-		return r;
+	radeon_draw_bind(&rctx->draw, &rctx->vs_shader->rstate[0]);
+	radeon_draw_bind(&rctx->draw, &rctx->ps_shader->rstate[0]);
 
+	for (i = 0 ; i < rctx->vs_nresource; i++) {
+		radeon_state_fini(&rctx->vs_resource[i]);
+	}
 	for (i = 0 ; i < rctx->vertex_elements->count; i++) {
+		vs_resource = &rctx->vs_resource[i];
 		j = rctx->vertex_elements->elements[i].vertex_buffer_index;
 		vertex_buffer = &rctx->vertex_buffer[j];
 		rbuffer = (struct r600_resource*)vertex_buffer->buffer;
 		offset = rctx->vertex_elements->elements[i].src_offset + vertex_buffer->buffer_offset;
 		format = r600_translate_colorformat(rctx->vertex_elements->elements[i].src_format);
-		vs_resource = radeon_state(rscreen->rw, R600_VS_RESOURCE_TYPE, R600_VS_RESOURCE + i);
-		if (vs_resource == NULL)
-			return -ENOMEM;
+		radeon_state_init(vs_resource, rscreen->rw, R600_STATE_RESOURCE, i, R600_SHADER_VS);
 		vs_resource->bo[0] = radeon_bo_incref(rscreen->rw, rbuffer->bo);
 		vs_resource->nbo = 1;
 		vs_resource->states[R600_PS_RESOURCE__RESOURCE0_WORD0] = offset;
-		vs_resource->states[R600_PS_RESOURCE__RESOURCE0_WORD1] = rbuffer->bo->size - offset;
+		vs_resource->states[R600_PS_RESOURCE__RESOURCE0_WORD1] = rbuffer->bo->size - offset - 1;
 		vs_resource->states[R600_PS_RESOURCE__RESOURCE0_WORD2] = S_038008_STRIDE(vertex_buffer->stride) |
 								S_038008_DATA_FORMAT(format);
 		vs_resource->states[R600_PS_RESOURCE__RESOURCE0_WORD3] = 0x00000000;
@@ -116,59 +117,61 @@ static int r600_draw_common(struct r600_draw *draw)
 		vs_resource->states[R600_PS_RESOURCE__RESOURCE0_WORD6] = 0xC0000000;
 		vs_resource->placement[0] = RADEON_GEM_DOMAIN_GTT;
 		vs_resource->placement[1] = RADEON_GEM_DOMAIN_GTT;
-		r = radeon_draw_set_new(rctx->draw, vs_resource);
-		if (r)
+		r = radeon_state_pm4(vs_resource);
+		if (r) {
 			return r;
+		}
+		radeon_draw_bind(&rctx->draw, vs_resource);
 	}
+	rctx->vs_nresource = rctx->vertex_elements->count;
 	/* FIXME start need to change winsys */
-	draw->draw = radeon_state(rscreen->rw, R600_DRAW_TYPE, R600_DRAW);
-	if (draw->draw == NULL)
-		return -ENOMEM;
-	draw->draw->states[R600_DRAW__VGT_NUM_INDICES] = draw->count;
-	draw->draw->states[R600_DRAW__VGT_DRAW_INITIATOR] = vgt_draw_initiator;
+	radeon_state_init(&draw->draw, rscreen->rw, R600_STATE_DRAW, 0, 0);
+	draw->draw.states[R600_DRAW__VGT_NUM_INDICES] = draw->count;
+	draw->draw.states[R600_DRAW__VGT_DRAW_INITIATOR] = vgt_draw_initiator;
 	if (draw->index_buffer) {
 		rbuffer = (struct r600_resource*)draw->index_buffer;
-		draw->draw->bo[0] = radeon_bo_incref(rscreen->rw, rbuffer->bo);
-		draw->draw->placement[0] = RADEON_GEM_DOMAIN_GTT;
-		draw->draw->placement[1] = RADEON_GEM_DOMAIN_GTT;
-		draw->draw->nbo = 1;
+		draw->draw.bo[0] = radeon_bo_incref(rscreen->rw, rbuffer->bo);
+		draw->draw.placement[0] = RADEON_GEM_DOMAIN_GTT;
+		draw->draw.placement[1] = RADEON_GEM_DOMAIN_GTT;
+		draw->draw.nbo = 1;
 	}
-	r = radeon_draw_set_new(rctx->draw, draw->draw);
-	if (r)
+	r = radeon_state_pm4(&draw->draw);
+	if (r) {
 		return r;
-	draw->vgt = radeon_state(rscreen->rw, R600_VGT_TYPE, R600_VGT);
-	if (draw->vgt == NULL)
-		return -ENOMEM;
-	draw->vgt->states[R600_VGT__VGT_PRIMITIVE_TYPE] = prim;
-	draw->vgt->states[R600_VGT__VGT_MAX_VTX_INDX] = 0x00FFFFFF;
-	draw->vgt->states[R600_VGT__VGT_MIN_VTX_INDX] = 0x00000000;
-	draw->vgt->states[R600_VGT__VGT_INDX_OFFSET] = draw->start;
-	draw->vgt->states[R600_VGT__VGT_MULTI_PRIM_IB_RESET_INDX] = 0x00000000;
-	draw->vgt->states[R600_VGT__VGT_DMA_INDEX_TYPE] = vgt_dma_index_type;
-	draw->vgt->states[R600_VGT__VGT_PRIMITIVEID_EN] = 0x00000000;
-	draw->vgt->states[R600_VGT__VGT_DMA_NUM_INSTANCES] = 0x00000001;
-	draw->vgt->states[R600_VGT__VGT_MULTI_PRIM_IB_RESET_EN] = 0x00000000;
-	draw->vgt->states[R600_VGT__VGT_INSTANCE_STEP_RATE_0] = 0x00000000;
-	draw->vgt->states[R600_VGT__VGT_INSTANCE_STEP_RATE_1] = 0x00000000;
-	r = radeon_draw_set_new(rctx->draw, draw->vgt);
-	if (r)
+	}
+	radeon_draw_bind(&rctx->draw, &draw->draw);
+
+	radeon_state_init(&draw->vgt, rscreen->rw, R600_STATE_VGT, 0, 0);
+	draw->vgt.states[R600_VGT__VGT_PRIMITIVE_TYPE] = prim;
+	draw->vgt.states[R600_VGT__VGT_MAX_VTX_INDX] = 0x00FFFFFF;
+	draw->vgt.states[R600_VGT__VGT_MIN_VTX_INDX] = 0x00000000;
+	draw->vgt.states[R600_VGT__VGT_INDX_OFFSET] = draw->start;
+	draw->vgt.states[R600_VGT__VGT_MULTI_PRIM_IB_RESET_INDX] = 0x00000000;
+	draw->vgt.states[R600_VGT__VGT_DMA_INDEX_TYPE] = vgt_dma_index_type;
+	draw->vgt.states[R600_VGT__VGT_PRIMITIVEID_EN] = 0x00000000;
+	draw->vgt.states[R600_VGT__VGT_DMA_NUM_INSTANCES] = 0x00000001;
+	draw->vgt.states[R600_VGT__VGT_MULTI_PRIM_IB_RESET_EN] = 0x00000000;
+	draw->vgt.states[R600_VGT__VGT_INSTANCE_STEP_RATE_0] = 0x00000000;
+	draw->vgt.states[R600_VGT__VGT_INSTANCE_STEP_RATE_1] = 0x00000000;
+	r = radeon_state_pm4(&draw->vgt);
+	if (r) {
 		return r;
-	/* FIXME */
-	r = radeon_ctx_set_draw_new(rctx->ctx, rctx->draw);
+	}
+	radeon_draw_bind(&rctx->draw, &draw->vgt);
+
+	r = radeon_ctx_set_draw(&rctx->ctx, &rctx->draw);
 	if (r == -EBUSY) {
 		r600_flush(draw->ctx, 0, NULL);
-		r = radeon_ctx_set_draw_new(rctx->ctx, rctx->draw);
+		r = radeon_ctx_set_draw(&rctx->ctx, &rctx->draw);
 	}
-	if (r)
-		return r;
-	rctx->draw = radeon_draw_duplicate(rctx->draw);
-	return 0;
+	return r;
 }
 
 void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 {
 	struct r600_context *rctx = r600_context(ctx);
 	struct r600_draw draw;
+	int r;
 
 	assert(info->index_bias == 0);
 
@@ -189,5 +192,7 @@ void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 		draw.index_size = 0;
 		draw.index_buffer = NULL;
 	}
-	r600_draw_common(&draw);
+	r = r600_draw_common(&draw);
+	if (r)
+	  fprintf(stderr,"draw common failed %d\n", r);
 }
diff --git a/src/gallium/drivers/r600/r600_query.c b/src/gallium/drivers/r600/r600_query.c
index 9b02ae680e..530940ed84 100644
--- a/src/gallium/drivers/r600/r600_query.c
+++ b/src/gallium/drivers/r600/r600_query.c
@@ -24,39 +24,225 @@
  *      Jerome Glisse
  *      Corbin Simpson
  */
+#include <errno.h>
 #include <util/u_inlines.h>
 #include <util/u_format.h>
 #include <util/u_memory.h>
 #include "r600_screen.h"
 #include "r600_context.h"
 
-static struct pipe_query *r600_create_query(struct pipe_context *pipe, unsigned query_type)
+static void r600_query_begin(struct r600_context *rctx, struct r600_query *rquery)
 {
-	return NULL;
+	struct r600_screen *rscreen = rctx->screen;
+	struct radeon_state *rstate = &rquery->rstate;
+
+	radeon_state_fini(rstate);
+	radeon_state_init(rstate, rscreen->rw, R600_STATE_QUERY_BEGIN, 0, 0);
+	rstate->states[R600_QUERY__OFFSET] = rquery->num_results;
+	rstate->bo[0] = radeon_bo_incref(rscreen->rw, rquery->buffer);
+	rstate->nbo = 1;
+	rstate->placement[0] = RADEON_GEM_DOMAIN_GTT;
+	if (radeon_state_pm4(rstate)) {
+		radeon_state_fini(rstate);
+	}
+}
+
+static void r600_query_end(struct r600_context *rctx, struct r600_query *rquery)
+{
+	struct r600_screen *rscreen = rctx->screen;
+	struct radeon_state *rstate = &rquery->rstate;
+
+	radeon_state_fini(rstate);
+	radeon_state_init(rstate, rscreen->rw, R600_STATE_QUERY_END, 0, 0);
+	rstate->states[R600_QUERY__OFFSET] = rquery->num_results + 8;
+	rstate->bo[0] = radeon_bo_incref(rscreen->rw, rquery->buffer);
+	rstate->nbo = 1;
+	rstate->placement[0] = RADEON_GEM_DOMAIN_GTT;
+	if (radeon_state_pm4(rstate)) {
+		radeon_state_fini(rstate);
+	}
 }
 
-static void r600_destroy_query(struct pipe_context *pipe, struct pipe_query *query)
+static struct pipe_query *r600_create_query(struct pipe_context *ctx, unsigned query_type)
 {
+	struct r600_screen *rscreen = r600_screen(ctx->screen);
+	struct r600_context *rctx = r600_context(ctx);
+	struct r600_query *q;
+	/* Allocate an occlusion query and its result buffer; NULL on any other type or on allocation failure. */
+	if (query_type != PIPE_QUERY_OCCLUSION_COUNTER)
+		return NULL;
+
+	q = CALLOC_STRUCT(r600_query);
+	if (!q)
+		return NULL;
+
+	q->type = query_type;
+	q->buffer_size = 4096;
+	q->buffer = radeon_bo(rscreen->rw, 0, q->buffer_size, 1, NULL);
+	if (!q->buffer) {
+		FREE(q);
+		return NULL;
+	}
+	/* Link only after setup succeeded: the error path above must never free a query that is on query_list. */
+	LIST_ADDTAIL(&q->list, &rctx->query_list);
+	return (struct pipe_query *)q;
+}
+
+static void r600_destroy_query(struct pipe_context *ctx,
+			       struct pipe_query *query)
+{
+	struct r600_screen *rscreen = r600_screen(ctx->screen);
+	struct r600_query *q = r600_query(query);
+
+	radeon_bo_decref(rscreen->rw, q->buffer);
+	LIST_DEL(&q->list);
 	FREE(query);
 }
 
-static void r600_begin_query(struct pipe_context *pipe, struct pipe_query *query)
+static void r600_query_result(struct pipe_context *ctx, struct r600_query *rquery)
 {
+	struct r600_screen *rscreen = r600_screen(ctx->screen);
+	u64 start, end;
+	u32 *results;
+	int i;
+	/* Fold every recorded (start, end) counter pair into rquery->result, then rewind the buffer fill level. */
+	radeon_bo_wait(rscreen->rw, rquery->buffer);
+	radeon_bo_map(rscreen->rw, rquery->buffer);
+	results = rquery->buffer->data;
+	for (i = 0; i < rquery->num_results; i += 4) {	/* NOTE(review): 4 dwords per pair here, but r600_query_suspend() adds 16 per pair -- confirm units */
+		start = (u64)results[i] | (u64)results[i + 1] << 32;
+		end = (u64)results[i + 2] | (u64)results[i + 3] << 32;
+		if ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL)) {	/* bit 63 on both; presumably a "value written" marker -- TODO confirm */
+			rquery->result += end - start;
+		}
+	}
+	radeon_bo_unmap(rscreen->rw, rquery->buffer);
+	rquery->num_results = 0;
 }
 
-static void r600_end_query(struct pipe_context *pipe, struct pipe_query *query)
+static void r600_query_resume(struct pipe_context *ctx, struct r600_query *rquery)
 {
+	struct r600_context *rctx = r600_context(ctx);
+
+	if (rquery->num_results >= ((rquery->buffer_size >> 2) - 2)) {
+		/* running out of space */
+		if (!rquery->flushed) {
+			ctx->flush(ctx, 0, NULL);
+		}
+		r600_query_result(ctx, rquery);
+	}
+	r600_query_begin(rctx, rquery);
+	rquery->flushed = false;
+}
+
+static void r600_query_suspend(struct pipe_context *ctx, struct r600_query *rquery)
+{
+	struct r600_context *rctx = r600_context(ctx);
+
+	r600_query_end(rctx, rquery);
+	rquery->num_results += 16;
 }
 
-static boolean r600_get_query_result(struct pipe_context *pipe,
+static void r600_begin_query(struct pipe_context *ctx, struct pipe_query *query)
+{
+	struct r600_context *rctx = r600_context(ctx);
+	struct r600_query *rquery = r600_query(query);
+	int r;
+	/* Restart the query's bookkeeping, build its begin state and hand it to the context. */
+	rquery->state = R600_QUERY_STATE_STARTED;
+	rquery->num_results = 0;
+	rquery->flushed = false;
+	r600_query_resume(ctx, rquery);
+	r = radeon_ctx_set_query_state(&rctx->ctx, &rquery->rstate);
+	if (r == -EBUSY) {
+		/* this shouldn't happen */
+		R600_ERR("had to flush while emitting begin query\n");
+		ctx->flush(ctx, 0, NULL);
+		r = radeon_ctx_set_query_state(&rctx->ctx, &rquery->rstate);	/* NOTE(review): retry result is not checked */
+	}
+}
+
+static void r600_end_query(struct pipe_context *ctx, struct pipe_query *query)
+{
+	struct r600_context *rctx = r600_context(ctx);
+	struct r600_query *rquery = r600_query(query);
+	int r;
+	/* Mark the query ended, build its end state and hand it to the context. */
+	rquery->state &= ~R600_QUERY_STATE_STARTED;
+	rquery->state |= R600_QUERY_STATE_ENDED;
+	r600_query_suspend(ctx, rquery);
+	r = radeon_ctx_set_query_state(&rctx->ctx, &rquery->rstate);
+	if (r == -EBUSY) {
+		/* this shouldn't happen */
+		R600_ERR("had to flush while emitting end query\n");
+		ctx->flush(ctx, 0, NULL);
+		r = radeon_ctx_set_query_state(&rctx->ctx, &rquery->rstate);	/* NOTE(review): retry result is not checked */
+	}
+}
+
+void r600_queries_suspend(struct pipe_context *ctx)
+{
+	struct r600_context *rctx = r600_context(ctx);
+	struct r600_query *rquery;
+	int r;
+	/* Emit an end state for every started query and set the SUSPENDED flag on all queries of the context. */
+	LIST_FOR_EACH_ENTRY(rquery, &rctx->query_list, list) {
+		if (rquery->state & R600_QUERY_STATE_STARTED) {
+			r600_query_suspend(ctx, rquery);
+			r = radeon_ctx_set_query_state(&rctx->ctx, &rquery->rstate);
+			if (r == -EBUSY) {
+				/* this shouldn't happen */
+				R600_ERR("had to flush while emitting end query\n");
+				ctx->flush(ctx, 0, NULL);
+				r = radeon_ctx_set_query_state(&rctx->ctx, &rquery->rstate);	/* NOTE(review): retry result is not checked */
+			}
+		}
+		rquery->state |= R600_QUERY_STATE_SUSPENDED;	/* set even for queries that were not started */
+	}
+}
+
+void r600_queries_resume(struct pipe_context *ctx)
+{
+	struct r600_context *rctx = r600_context(ctx);
+	struct r600_query *rquery;
+	int r;
+	/* Re-emit a begin state for every started query and clear the SUSPENDED flag on all queries of the context. */
+	LIST_FOR_EACH_ENTRY(rquery, &rctx->query_list, list) {
+		if (rquery->state & R600_QUERY_STATE_STARTED) {
+			r600_query_resume(ctx, rquery);
+			r = radeon_ctx_set_query_state(&rctx->ctx, &rquery->rstate);
+			if (r == -EBUSY) {
+				/* this shouldn't happen */
+				R600_ERR("had to flush while emitting begin query\n");
+				ctx->flush(ctx, 0, NULL);
+				r = radeon_ctx_set_query_state(&rctx->ctx, &rquery->rstate);	/* NOTE(review): retry result is not checked */
+			}
+		}
+		rquery->state &= ~R600_QUERY_STATE_SUSPENDED;
+	}
+}
+
+static boolean r600_get_query_result(struct pipe_context *ctx,
 					struct pipe_query *query,
-					boolean wait, void *result)
+					boolean wait, void *vresult)
 {
+	struct r600_query *rquery = r600_query(query);
+	uint64_t *result = (uint64_t*)vresult;
+	/* NOTE(review): `wait` is never read; this always flushes if needed and calls radeon_bo_wait() via r600_query_result(). */
+	if (!rquery->flushed) {
+		ctx->flush(ctx, 0, NULL);
+		rquery->flushed = true;
+	}
+	r600_query_result(ctx, rquery);
+	*result = rquery->result;	/* vresult is written as a 64-bit count */
+	rquery->result = 0;	/* the accumulated value is consumed by this read */
 	return TRUE;
 }
 
 void r600_init_query_functions(struct r600_context* rctx)
 {
+	LIST_INITHEAD(&rctx->query_list);
+
 	rctx->context.create_query = r600_create_query;
 	rctx->context.destroy_query = r600_destroy_query;
 	rctx->context.begin_query = r600_begin_query;
diff --git a/src/gallium/drivers/r600/r600_resource.h b/src/gallium/drivers/r600/r600_resource.h
index bb90e76fb7..129667ad20 100644
--- a/src/gallium/drivers/r600/r600_resource.h
+++ b/src/gallium/drivers/r600/r600_resource.h
@@ -44,10 +44,22 @@ struct r600_resource_texture {
 	struct r600_resource		resource;
 	unsigned long			offset[PIPE_MAX_TEXTURE_LEVELS];
 	unsigned long			pitch[PIPE_MAX_TEXTURE_LEVELS];
+	unsigned long			width[PIPE_MAX_TEXTURE_LEVELS];
+	unsigned long			height[PIPE_MAX_TEXTURE_LEVELS];
 	unsigned long			layer_size[PIPE_MAX_TEXTURE_LEVELS];
 	unsigned long			pitch_override;
 	unsigned long			bpt;
 	unsigned long			size;
+	unsigned			tilled;
+	unsigned			array_mode;
+	unsigned			tile_type;
+	unsigned			depth;
+	unsigned			dirty;
+	struct radeon_bo		*uncompressed;
+	struct radeon_state		scissor[PIPE_MAX_TEXTURE_LEVELS];
+	struct radeon_state		cb[8][PIPE_MAX_TEXTURE_LEVELS];
+	struct radeon_state		db[PIPE_MAX_TEXTURE_LEVELS];
+	struct radeon_state		viewport[PIPE_MAX_TEXTURE_LEVELS];
 };
 
 void r600_init_context_resource_functions(struct r600_context *r600);
diff --git a/src/gallium/drivers/r600/r600_screen.c b/src/gallium/drivers/r600/r600_screen.c
index cdaca9ed7d..a047a49a6c 100644
--- a/src/gallium/drivers/r600/r600_screen.c
+++ b/src/gallium/drivers/r600/r600_screen.c
@@ -69,6 +69,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_TEXTURE_SWIZZLE:
 	case PIPE_CAP_INDEP_BLEND_ENABLE:
 	case PIPE_CAP_DEPTHSTENCIL_CLEAR_SEPARATE:
+	case PIPE_CAP_DEPTH_CLAMP:
 		return 1;
 
 	/* Unsupported features (boolean caps). */
@@ -77,7 +78,6 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_STREAM_OUTPUT:
 	case PIPE_CAP_INDEP_BLEND_FUNC: /* FIXME allow this */
 	case PIPE_CAP_GEOMETRY_SHADER4:
-	case PIPE_CAP_DEPTH_CLAMP: /* FIXME allow this */
 		return 0;
 
 	/* Texturing. */
@@ -234,11 +234,34 @@ static void r600_destroy_screen(struct pipe_screen* pscreen)
 struct pipe_screen *r600_screen_create(struct radeon *rw)
 {
 	struct r600_screen* rscreen;
+	enum radeon_family family = radeon_get_family(rw);
 
 	rscreen = CALLOC_STRUCT(r600_screen);
 	if (rscreen == NULL) {
 		return NULL;
 	}
+
+	switch (family) {
+	case CHIP_R600:
+	case CHIP_RV610:
+	case CHIP_RV630:
+	case CHIP_RV670:
+	case CHIP_RV620:
+	case CHIP_RV635:
+	case CHIP_RS780:
+	case CHIP_RS880:
+		rscreen->chip_class = R600;
+		break;
+	case CHIP_RV770:
+	case CHIP_RV730:
+	case CHIP_RV710:
+	case CHIP_RV740:
+		rscreen->chip_class = R700;
+		break;
+	default:
+		FREE(rscreen);
+		return NULL;
+	}
 	rscreen->rw = rw;
 	rscreen->screen.winsys = (struct pipe_winsys*)rw;
 	rscreen->screen.destroy = r600_destroy_screen;
diff --git a/src/gallium/drivers/r600/r600_screen.h b/src/gallium/drivers/r600/r600_screen.h
index 53b560c617..b9938f117a 100644
--- a/src/gallium/drivers/r600/r600_screen.h
+++ b/src/gallium/drivers/r600/r600_screen.h
@@ -30,6 +30,7 @@
 #include <radeon_drm.h>
 #include "radeon.h"
 #include "util/u_transfer.h"
+#include "r600_resource.h"
 
 /* Texture transfer. */
 struct r600_transfer {
@@ -38,11 +39,19 @@ struct r600_transfer {
 	/* Buffer transfer. */
 	struct pipe_transfer		*buffer_transfer;
 	unsigned			offset;
+	struct pipe_resource		*linear_texture;
+};
+
+enum chip_class {
+	R600,
+	R700,
+	EVERGREEN,
 };
 
 struct r600_screen {
 	struct pipe_screen		screen;
 	struct radeon			*rw;
+	enum chip_class			chip_class;
 };
 
 static INLINE struct r600_screen *r600_screen(struct pipe_screen *screen)
@@ -62,7 +71,7 @@ unsigned r600_buffer_is_referenced_by_cs(struct pipe_context *context,
 struct pipe_resource *r600_buffer_from_handle(struct pipe_screen *screen,
 					      struct winsys_handle *whandle);
 
-/* Texture transfer functions. */
+/* r600_texture.c texture transfer functions. */
 struct pipe_transfer* r600_texture_get_transfer(struct pipe_context *ctx,
 						struct pipe_resource *texture,
 						struct pipe_subresource sr,
@@ -74,7 +83,14 @@ void* r600_texture_transfer_map(struct pipe_context *ctx,
 				struct pipe_transfer* transfer);
 void r600_texture_transfer_unmap(struct pipe_context *ctx,
 				 struct pipe_transfer* transfer);
+int r600_texture_scissor(struct pipe_context *ctx, struct r600_resource_texture *rtexture, unsigned level);
+int r600_texture_cb(struct pipe_context *ctx, struct r600_resource_texture *rtexture, unsigned cb, unsigned level);
+int r600_texture_db(struct pipe_context *ctx, struct r600_resource_texture *rtexture, unsigned level);
+int r600_texture_from_depth(struct pipe_context *ctx, struct r600_resource_texture *rtexture, unsigned level);
+int r600_texture_viewport(struct pipe_context *ctx, struct r600_resource_texture *rtexture, unsigned level);
 
+/* r600_blit.c */
+int r600_blit_uncompress_depth(struct pipe_context *ctx, struct r600_resource_texture *rtexture, unsigned level);
 
 /* helpers */
 int r600_conv_pipe_format(unsigned pformat, unsigned *format);
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index 956c7e7930..0ba26a2311 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -48,6 +48,9 @@ struct r600_shader_ctx {
 	struct r600_bc				*bc;
 	struct r600_shader			*shader;
 	u32					value[4];
+	u32					*literals;
+	u32					nliterals;
+	u32                                     max_driver_temp_used;
 };
 
 struct r600_shader_tgsi_instruction {
@@ -105,8 +108,8 @@ int r600_pipe_shader_create(struct pipe_context *ctx,
 	struct r600_screen *rscreen = r600_screen(ctx->screen);
 	int r;
 
-fprintf(stderr, "--------------------------------------------------------------\n");
-tgsi_dump(tokens, 0);
+//fprintf(stderr, "--------------------------------------------------------------\n");
+//tgsi_dump(tokens, 0);
 	if (rpshader == NULL)
 		return -ENOMEM;
 	rpshader->shader.family = radeon_get_family(rscreen->rw);
@@ -120,7 +123,7 @@ tgsi_dump(tokens, 0);
 		R600_ERR("building bytecode failed !\n");
 		return r;
 	}
-fprintf(stderr, "______________________________________________________________\n");
+//fprintf(stderr, "______________________________________________________________\n");
 	return 0;
 }
 
@@ -131,10 +134,9 @@ static int r600_pipe_shader_vs(struct pipe_context *ctx, struct r600_context_sta
 	struct radeon_state *state;
 	unsigned i, tmp;
 
-	rpshader->rstate = radeon_state_decref(rpshader->rstate);
-	state = radeon_state(rscreen->rw, R600_VS_SHADER_TYPE, R600_VS_SHADER);
-	if (state == NULL)
-		return -ENOMEM;
+	state = &rpshader->rstate[0];
+	radeon_state_fini(&rpshader->rstate[0]);
+	radeon_state_init(state, rscreen->rw, R600_STATE_SHADER, 0, R600_SHADER_VS);
 	for (i = 0; i < 10; i++) {
 		state->states[R600_VS_SHADER__SPI_VS_OUT_ID_0 + i] = 0;
 	}
@@ -144,12 +146,13 @@ static int r600_pipe_shader_vs(struct pipe_context *ctx, struct r600_context_sta
 		state->states[R600_VS_SHADER__SPI_VS_OUT_ID_0 + i / 4] |= tmp;
 	}
 	state->states[R600_VS_SHADER__SPI_VS_OUT_CONFIG] = S_0286C4_VS_EXPORT_COUNT(rshader->noutput - 2);
-	state->states[R600_VS_SHADER__SQ_PGM_RESOURCES_VS] = S_028868_NUM_GPRS(rshader->bc.ngpr);
-	rpshader->rstate = state;
-	rpshader->rstate->bo[0] = radeon_bo_incref(rscreen->rw, rpshader->bo);
-	rpshader->rstate->bo[1] = radeon_bo_incref(rscreen->rw, rpshader->bo);
-	rpshader->rstate->nbo = 2;
-	rpshader->rstate->placement[0] = RADEON_GEM_DOMAIN_GTT;
+	state->states[R600_VS_SHADER__SQ_PGM_RESOURCES_VS] = S_028868_NUM_GPRS(rshader->bc.ngpr) |
+		S_028868_STACK_SIZE(rshader->bc.nstack);
+	state->bo[0] = radeon_bo_incref(rscreen->rw, rpshader->bo);
+	state->bo[1] = radeon_bo_incref(rscreen->rw, rpshader->bo);
+	state->nbo = 2;
+	state->placement[0] = RADEON_GEM_DOMAIN_GTT;
+	state->placement[2] = RADEON_GEM_DOMAIN_GTT;
 	return radeon_state_pm4(state);
 }
 
@@ -161,17 +164,20 @@ static int r600_pipe_shader_ps(struct pipe_context *ctx, struct r600_context_sta
 	struct r600_context *rctx = r600_context(ctx);
 	struct radeon_state *state;
 	unsigned i, tmp, exports_ps, num_cout;
+	boolean have_pos = FALSE;
 
+	state = &rpshader->rstate[0];
 	rasterizer = &rctx->rasterizer->state.rasterizer;
-	rpshader->rstate = radeon_state_decref(rpshader->rstate);
-	state = radeon_state(rscreen->rw, R600_PS_SHADER_TYPE, R600_PS_SHADER);
-	if (state == NULL)
-		return -ENOMEM;
+	radeon_state_fini(state);
+	radeon_state_init(state, rscreen->rw, R600_STATE_SHADER, 0, R600_SHADER_PS);
 	for (i = 0; i < rshader->ninput; i++) {
 		tmp = S_028644_SEMANTIC(i);
 		tmp |= S_028644_SEL_CENTROID(1);
+		if (rshader->input[i].name == TGSI_SEMANTIC_POSITION)
+			have_pos = TRUE;
 		if (rshader->input[i].name == TGSI_SEMANTIC_COLOR ||
-			rshader->input[i].name == TGSI_SEMANTIC_BCOLOR) {
+		    rshader->input[i].name == TGSI_SEMANTIC_BCOLOR ||
+		    rshader->input[i].name == TGSI_SEMANTIC_POSITION) {
 			tmp |= S_028644_FLAT_SHADE(rshader->flat_shade);
 		}
 		if (rasterizer->sprite_coord_enable & (1 << i)) {
@@ -190,15 +196,24 @@ static int r600_pipe_shader_ps(struct pipe_context *ctx, struct r600_context_sta
 			num_cout++;
 		}
 	}
+	if (!exports_ps) {
+		/* always at least export 1 component per pixel */
+		exports_ps = 2;
+	}
 	state->states[R600_PS_SHADER__SPI_PS_IN_CONTROL_0] = S_0286CC_NUM_INTERP(rshader->ninput) |
 							S_0286CC_PERSP_GRADIENT_ENA(1);
+	if (have_pos) {
+		state->states[R600_PS_SHADER__SPI_PS_IN_CONTROL_0] |=  S_0286CC_POSITION_ENA(1) |
+		                                                       S_0286CC_BARYC_SAMPLE_CNTL(1);
+		state->states[R600_PS_SHADER__SPI_INPUT_Z] |= 1;
+	}
 	state->states[R600_PS_SHADER__SPI_PS_IN_CONTROL_1] = 0x00000000;
-	state->states[R600_PS_SHADER__SQ_PGM_RESOURCES_PS] = S_028868_NUM_GPRS(rshader->bc.ngpr);
+	state->states[R600_PS_SHADER__SQ_PGM_RESOURCES_PS] = S_028868_NUM_GPRS(rshader->bc.ngpr) |
+		S_028868_STACK_SIZE(rshader->bc.nstack);
 	state->states[R600_PS_SHADER__SQ_PGM_EXPORTS_PS] = exports_ps;
-	rpshader->rstate = state;
-	rpshader->rstate->bo[0] = radeon_bo_incref(rscreen->rw, rpshader->bo);
-	rpshader->rstate->nbo = 1;
-	rpshader->rstate->placement[0] = RADEON_GEM_DOMAIN_GTT;
+	state->bo[0] = radeon_bo_incref(rscreen->rw, rpshader->bo);
+	state->nbo = 1;
+	state->placement[0] = RADEON_GEM_DOMAIN_GTT;
 	return radeon_state_pm4(state);
 }
 
@@ -268,21 +283,24 @@ static int tgsi_is_supported(struct r600_shader_ctx *ctx)
 		R600_ERR("predicate unsupported\n");
 		return -EINVAL;
 	}
+#if 0
 	if (i->Instruction.Label) {
 		R600_ERR("label unsupported\n");
 		return -EINVAL;
 	}
+#endif
 	for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
-		if (i->Src[j].Register.Indirect ||
-			i->Src[j].Register.Dimension ||
+		if (i->Src[j].Register.Dimension ||
 			i->Src[j].Register.Absolute) {
-			R600_ERR("unsupported src (indirect|dimension|absolute)\n");
+			R600_ERR("unsupported src %d (dimension %d|absolute %d)\n", j,
+				 i->Src[j].Register.Dimension,
+				 i->Src[j].Register.Absolute);
 			return -EINVAL;
 		}
 	}
 	for (j = 0; j < i->Instruction.NumDstRegs; j++) {
-		if (i->Dst[j].Register.Indirect || i->Dst[j].Register.Dimension) {
-			R600_ERR("unsupported dst (indirect|dimension)\n");
+		if (i->Dst[j].Register.Dimension) {
+			R600_ERR("unsupported dst (dimension)\n");
 			return -EINVAL;
 		}
 	}
@@ -333,6 +351,7 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx)
 	case TGSI_FILE_CONSTANT:
 	case TGSI_FILE_TEMPORARY:
 	case TGSI_FILE_SAMPLER:
+	case TGSI_FILE_ADDRESS:
 		break;
 	default:
 		R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
@@ -341,6 +360,11 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx)
 	return 0;
 }
 
+static int r600_get_temp(struct r600_shader_ctx *ctx) /* hand out the next driver temp index */
+{
+	return ctx->temp_reg + ctx->max_driver_temp_used++; /* post-increment: with the counter at 0 this yields temp_reg itself */
+}
+
 int r600_shader_from_tgsi(const struct tgsi_token *tokens, struct r600_shader *shader)
 {
 	struct tgsi_full_immediate *immediate;
@@ -362,9 +386,15 @@ int r600_shader_from_tgsi(const struct tgsi_token *tokens, struct r600_shader *s
 	shader->processor_type = ctx.type;
 
 	/* register allocations */
-	/* Values [0,127] correspond to GPR[0..127]. 
-	 * Values [256,511] correspond to cfile constants c[0..255]. 
+	/* Values [0,127] correspond to GPR[0..127].
+	 * Values [128,159] correspond to constant buffer bank 0
+	 * Values [160,191] correspond to constant buffer bank 1
+	 * Values [256,511] correspond to cfile constants c[0..255].
 	 * Other special values are shown in the list below.
+	 * 244  ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
+	 * 245  ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
+	 * 246  ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
+	 * 247  ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
 	 * 248	SQ_ALU_SRC_0: special constant 0.0.
 	 * 249	SQ_ALU_SRC_1: special constant 1.0 float.
 	 * 250	SQ_ALU_SRC_1_INT: special constant 1 integer.
@@ -389,15 +419,24 @@ int r600_shader_from_tgsi(const struct tgsi_token *tokens, struct r600_shader *s
 	ctx.temp_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] +
 			ctx.info.file_count[TGSI_FILE_TEMPORARY];
 
+	ctx.nliterals = 0;
+	ctx.literals = NULL;
+
 	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
 		tgsi_parse_token(&ctx.parse);
 		switch (ctx.parse.FullToken.Token.Type) {
 		case TGSI_TOKEN_TYPE_IMMEDIATE:
 			immediate = &ctx.parse.FullToken.FullImmediate;
-			ctx.value[0] = immediate->u[0].Uint;
-			ctx.value[1] = immediate->u[1].Uint;
-			ctx.value[2] = immediate->u[2].Uint;
-			ctx.value[3] = immediate->u[3].Uint;
+			ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);
+			if(ctx.literals == NULL) {
+				r = -ENOMEM;
+				goto out_err;
+			}
+			ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint;
+			ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint;
+			ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint;
+			ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint;
+			ctx.nliterals++;
 			break;
 		case TGSI_TOKEN_TYPE_DECLARATION:
 			r = tgsi_declaration(&ctx);
@@ -408,6 +447,9 @@ int r600_shader_from_tgsi(const struct tgsi_token *tokens, struct r600_shader *s
 			r = tgsi_is_supported(&ctx);
 			if (r)
 				goto out_err;
+			ctx.max_driver_temp_used = 0;
+			/* reserve first tmp for everyone */
+			r600_get_temp(&ctx);
 			opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
 			ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
 			r = ctx.inst_info->process(&ctx);
@@ -458,6 +500,8 @@ int r600_shader_from_tgsi(const struct tgsi_token *tokens, struct r600_shader *s
 				output[i].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
 			} else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
 				output[i].array_base = 61;
+				output[i].swizzle_x = 2;
+				output[i].swizzle_y = output[i].swizzle_z = output[i].swizzle_w = 7;
 				output[i].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
 			} else {
 				R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
@@ -504,7 +548,7 @@ int r600_shader_from_tgsi(const struct tgsi_token *tokens, struct r600_shader *s
 		output[0].swizzle_z = 7;
 		output[0].swizzle_w = 7;
 		output[0].barrier = 1;
-		output[0].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
+		output[0].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
 		output[0].array_base = 0;
 		output[0].inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT;
 		noutput++;
@@ -525,9 +569,11 @@ int r600_shader_from_tgsi(const struct tgsi_token *tokens, struct r600_shader *s
 		if (r)
 			goto out_err;
 	}
+	free(ctx.literals);
 	tgsi_parse_free(&ctx.parse);
 	return 0;
 out_err:
+	free(ctx.literals);
 	tgsi_parse_free(&ctx.parse);
 	return r;
 }
@@ -547,11 +593,19 @@ static int tgsi_src(struct r600_shader_ctx *ctx,
 			const struct tgsi_full_src_register *tgsi_src,
 			struct r600_bc_alu_src *r600_src)
 {
+	int index;
 	memset(r600_src, 0, sizeof(struct r600_bc_alu_src));
 	r600_src->sel = tgsi_src->Register.Index;
 	if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
 		r600_src->sel = 0;
+		index = tgsi_src->Register.Index;
+		ctx->value[0] = ctx->literals[index * 4 + 0];
+		ctx->value[1] = ctx->literals[index * 4 + 1];
+		ctx->value[2] = ctx->literals[index * 4 + 2];
+		ctx->value[3] = ctx->literals[index * 4 + 3];
 	}
+	if (tgsi_src->Register.Indirect)
+		r600_src->rel = V_SQ_REL_RELATIVE;
 	r600_src->neg = tgsi_src->Register.Negate;
 	r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
 	return 0;
@@ -568,6 +622,8 @@ static int tgsi_dst(struct r600_shader_ctx *ctx,
 	r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
 	r600_dst->chan = swizzle;
 	r600_dst->write = 1;
+	if (tgsi_dst->Register.Indirect)
+		r600_dst->rel = V_SQ_REL_RELATIVE;
 	if (inst->Instruction.Saturate) {
 		r600_dst->clamp = 1;
 	}
@@ -607,12 +663,13 @@ static int tgsi_split_constant(struct r600_shader_ctx *ctx, struct r600_bc_alu_s
 	}
 	for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
 		if (inst->Src[j].Register.File == TGSI_FILE_CONSTANT && j > 0) {
+			int treg = r600_get_temp(ctx);
 			for (k = 0; k < 4; k++) {
 				memset(&alu, 0, sizeof(struct r600_bc_alu));
 				alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV;
-				alu.src[0].sel = r600_src[0].sel;
+				alu.src[0].sel = r600_src[j].sel;
 				alu.src[0].chan = k;
-				alu.dst.sel = ctx->temp_reg + j;
+				alu.dst.sel = treg;
 				alu.dst.chan = k;
 				alu.dst.write = 1;
 				if (k == 3)
@@ -621,37 +678,90 @@ static int tgsi_split_constant(struct r600_shader_ctx *ctx, struct r600_bc_alu_s
 				if (r)
 					return r;
 			}
-			r600_src[0].sel = ctx->temp_reg + j;
+			r600_src[j].sel = treg;
 			j--;
 		}
 	}
 	return 0;
 }
 
-static int tgsi_op2(struct r600_shader_ctx *ctx)
+/*
+ * Move every TGSI_FILE_IMMEDIATE source into a fresh driver temp and retarget
+ * r600_src[] at it - for trig functions, which use literals for PI stuff.
+ * Returns 0 on success or the error from the bytecode builder calls.
+ */
+static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx, struct r600_bc_alu_src r600_src[3])
+{
+	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+	struct r600_bc_alu alu;
+	int i, k, r;
+
+	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
+		/* test source i itself so an immediate after a non-immediate is not missed */
+		if (inst->Src[i].Register.File == TGSI_FILE_IMMEDIATE) {
+			int treg = r600_get_temp(ctx);
+			for (k = 0; k < 4; k++) {
+				memset(&alu, 0, sizeof(struct r600_bc_alu));
+				alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV;
+				alu.src[0].sel = r600_src[i].sel;
+				alu.src[0].chan = k;
+				alu.dst.sel = treg;
+				alu.dst.chan = k;
+				alu.dst.write = 1;
+				if (k == 3)
+					alu.last = 1;
+				r = r600_bc_add_alu(ctx->bc, &alu);
+				if (r)
+					return r;
+			}
+			/* emit this source's own literal, not whichever one ctx->value last held */
+			r = r600_bc_add_literal(ctx->bc, &ctx->literals[inst->Src[i].Register.Index * 4]);
+			if (r)
+				return r;
+			r600_src[i].sel = treg;
+		}
+	}
+	return 0;
+}
+
+static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap)
 {
 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
 	struct r600_bc_alu_src r600_src[3];
 	struct r600_bc_alu alu;
 	int i, j, r;
+	int lasti = 0;
+
+	for (i = 0; i < 4; i++) {
+		if (inst->Dst[0].Register.WriteMask & (1 << i)) {
+			lasti = i;
+		}
+	}
 
 	r = tgsi_split_constant(ctx, r600_src);
 	if (r)
 		return r;
-	for (i = 0; i < 4; i++) {
+	for (i = 0; i < lasti + 1; i++) {
+		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
+			continue;
+
 		memset(&alu, 0, sizeof(struct r600_bc_alu));
-		if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
-			alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP;
-			alu.dst.chan = i;
-		} else {
-			alu.inst = ctx->inst_info->r600_opcode;
+		r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
+		if (r)
+			return r;
+		
+		alu.inst = ctx->inst_info->r600_opcode;
+		if (!swap) {
 			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
 				alu.src[j] = r600_src[j];
 				alu.src[j].chan = tgsi_chan(&inst->Src[j], i);
 			}
-			r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
-			if (r)
-				return r;
+		} else {
+			alu.src[0] = r600_src[1];
+			alu.src[0].chan = tgsi_chan(&inst->Src[1], i);
+
+			alu.src[1] = r600_src[0];
+			alu.src[1].chan = tgsi_chan(&inst->Src[0], i);
 		}
 		/* handle some special cases */
 		switch (ctx->inst_info->tgsi_opcode) {
@@ -664,7 +774,7 @@ static int tgsi_op2(struct r600_shader_ctx *ctx)
 		default:
 			break;
 		}
-		if (i == 3) {
+		if (i == lasti) {
 			alu.last = 1;
 		}
 		r = r600_bc_add_alu(ctx->bc, &alu);
@@ -674,24 +784,154 @@ static int tgsi_op2(struct r600_shader_ctx *ctx)
 	return 0;
 }
 
-static int tgsi_kill(struct r600_shader_ctx *ctx)
+static int tgsi_op2(struct r600_shader_ctx *ctx) /* tgsi_op2_s with sources kept in TGSI order */
+{
+	return tgsi_op2_s(ctx, 0); /* swap = 0 */
+}
+
+static int tgsi_op2_swap(struct r600_shader_ctx *ctx) /* tgsi_op2_s with TGSI sources 0 and 1 exchanged */
+{
+	return tgsi_op2_s(ctx, 1); /* swap = 1: Src[1] feeds alu.src[0], Src[0] feeds alu.src[1] */
+}
+
+/*
+ * Range-reduce Src[0] into temp_reg.x for trig opcodes (see fdo bug 27901):
+ * r600 - fract(x / 2PI + 0.5) * 2PI - PI, i.e. wrapped to the -PI..PI range
+ * r700 - fract(x / 2PI + 0.5) - 0.5, i.e. normalized by dividing by 2PI
+ */
+static int tgsi_setup_trig(struct r600_shader_ctx *ctx,
+			   struct r600_bc_alu_src r600_src[3])
+{
+	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+	int r;
+	uint32_t lit_vals[4];
+	struct r600_bc_alu alu;
+
+	memset(lit_vals, 0, 4*4); /* 4 words of 4 bytes; only [0] and [1] are set below */
+	r = tgsi_split_constant(ctx, r600_src); /* NOTE(review): presumably fills r600_src[] from the sources - confirm */
+	if (r)
+		return r;
+
+	r = tgsi_split_literal_constant(ctx, r600_src); /* immediates go to temps; the literal slots are needed below */
+	if (r)
+		return r;
+	/* step 1: temp.x = src0 * (1 / 2PI) + 0.5 */
+	lit_vals[0] = fui(1.0 /(3.1415926535 * 2));
+	lit_vals[1] = fui(0.5f);
+
+	memset(&alu, 0, sizeof(struct r600_bc_alu));
+	alu.inst = V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD;
+	alu.is_op3 = 1;
+
+	alu.dst.chan = 0;
+	alu.dst.sel = ctx->temp_reg;
+	alu.dst.write = 1;
+
+	alu.src[0] = r600_src[0];
+	alu.src[0].chan = tgsi_chan(&inst->Src[0], 0); /* channel tgsi_chan() selects for component 0 */
+
+	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
+	alu.src[1].chan = 0; /* lit_vals[0] */
+	alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
+	alu.src[2].chan = 1; /* lit_vals[1] */
+	alu.last = 1;
+	r = r600_bc_add_alu(ctx->bc, &alu);
+	if (r)
+		return r;
+	r = r600_bc_add_literal(ctx->bc, lit_vals);
+	if (r)
+		return r;
+	/* step 2: temp.x = fract(temp.x) */
+	memset(&alu, 0, sizeof(struct r600_bc_alu));
+	alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT;
+
+	alu.dst.chan = 0;
+	alu.dst.sel = ctx->temp_reg;
+	alu.dst.write = 1;
+
+	alu.src[0].sel = ctx->temp_reg;
+	alu.src[0].chan = 0;
+	alu.last = 1;
+	r = r600_bc_add_alu(ctx->bc, &alu);
+	if (r)
+		return r;
+	/* step 3: temp.x = temp.x * scale + bias; chiprev 0 takes the -PI..PI (r600) form from the header comment */
+	if (ctx->bc->chiprev == 0) {
+		lit_vals[0] = fui(3.1415926535897f * 2.0f);
+		lit_vals[1] = fui(-3.1415926535897f);
+	} else {
+		lit_vals[0] = fui(1.0f);
+		lit_vals[1] = fui(-0.5f);
+	}
+
+	memset(&alu, 0, sizeof(struct r600_bc_alu));
+	alu.inst = V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD;
+	alu.is_op3 = 1;
+
+	alu.dst.chan = 0;
+	alu.dst.sel = ctx->temp_reg;
+	alu.dst.write = 1;
+
+	alu.src[0].sel = ctx->temp_reg;
+	alu.src[0].chan = 0;
+
+	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
+	alu.src[1].chan = 0; /* scale */
+	alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
+	alu.src[2].chan = 1; /* bias */
+	alu.last = 1;
+	r = r600_bc_add_alu(ctx->bc, &alu);
+	if (r)
+		return r;
+	r = r600_bc_add_literal(ctx->bc, lit_vals);
+	if (r)
+		return r;
+	return 0;
+}
+
+static int tgsi_trig(struct r600_shader_ctx *ctx)
 {
 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+	struct r600_bc_alu_src r600_src[3];
 	struct r600_bc_alu alu;
 	int i, r;
+	int lasti = 0;
+
+	r = tgsi_setup_trig(ctx, r600_src);
+	if (r)
+		return r;
+
+	memset(&alu, 0, sizeof(struct r600_bc_alu));
+	alu.inst = ctx->inst_info->r600_opcode;
+	alu.dst.chan = 0;
+	alu.dst.sel = ctx->temp_reg;
+	alu.dst.write = 1;
+
+	alu.src[0].sel = ctx->temp_reg;
+	alu.src[0].chan = 0;
+	alu.last = 1;
+	r = r600_bc_add_alu(ctx->bc, &alu);
+	if (r)
+		return r;
 
+	/* replicate result */
 	for (i = 0; i < 4; i++) {
+		if (inst->Dst[0].Register.WriteMask & (1 << i))
+			lasti = i;
+	}
+	for (i = 0; i < lasti + 1; i++) {
+		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
+			continue;
+
 		memset(&alu, 0, sizeof(struct r600_bc_alu));
-		alu.inst = ctx->inst_info->r600_opcode;
-		alu.dst.chan = i;
-		alu.src[0].sel = 248;
-		r = tgsi_src(ctx, &inst->Src[0], &alu.src[1]);
+		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV;
+
+		alu.src[0].sel = ctx->temp_reg;
+		r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
 		if (r)
 			return r;
-		alu.src[1].chan = tgsi_chan(&inst->Src[0], i);
-		if (i == 3) {
+		if (i == lasti)
 			alu.last = 1;
-		}
 		r = r600_bc_add_alu(ctx->bc, &alu);
 		if (r)
 			return r;
@@ -699,30 +939,70 @@ static int tgsi_kill(struct r600_shader_ctx *ctx)
 	return 0;
 }
 
-static int tgsi_slt(struct r600_shader_ctx *ctx)
+static int tgsi_scs(struct r600_shader_ctx *ctx)
 {
 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
 	struct r600_bc_alu_src r600_src[3];
 	struct r600_bc_alu alu;
-	int i, r;
+	int r;
 
-	r = tgsi_split_constant(ctx, r600_src);
+	r = tgsi_setup_trig(ctx, r600_src);
+	if (r)
+		return r;
+
+
+	/* dst.x = COS */
+	memset(&alu, 0, sizeof(struct r600_bc_alu));
+	alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS;
+	r = tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
 	if (r)
 		return r;
+
+	alu.src[0].sel = ctx->temp_reg;
+	alu.src[0].chan = 0;
+	alu.last = 1;
+	r = r600_bc_add_alu(ctx->bc, &alu);
+	if (r)
+		return r;
+
+	/* dst.y = SIN */
+	memset(&alu, 0, sizeof(struct r600_bc_alu));
+	alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN;
+	r = tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
+	if (r)
+		return r;
+
+	alu.src[0].sel = ctx->temp_reg;
+	alu.src[0].chan = 0;
+	alu.last = 1;
+	r = r600_bc_add_alu(ctx->bc, &alu);
+	if (r)
+		return r;
+	return 0;
+}
+
+static int tgsi_kill(struct r600_shader_ctx *ctx)
+{
+	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+	struct r600_bc_alu alu;
+	int i, r;
+
 	for (i = 0; i < 4; i++) {
 		memset(&alu, 0, sizeof(struct r600_bc_alu));
-		if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
-			alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP;
-			alu.dst.chan = i;
+		alu.inst = ctx->inst_info->r600_opcode;
+
+		alu.dst.chan = i;
+
+		alu.src[0].sel = V_SQ_ALU_SRC_0;
+
+		if (ctx->inst_info->tgsi_opcode == TGSI_OPCODE_KILP) {
+			alu.src[1].sel = V_SQ_ALU_SRC_1;
+			alu.src[1].neg = 1;
 		} else {
-			alu.inst = ctx->inst_info->r600_opcode;
-			alu.src[1] = r600_src[0];
-			alu.src[1].chan = tgsi_chan(&inst->Src[0], i);
-			alu.src[0] = r600_src[1];
-			alu.src[0].chan = tgsi_chan(&inst->Src[1], i);
-			r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
+			r = tgsi_src(ctx, &inst->Src[0], &alu.src[1]);
 			if (r)
 				return r;
+			alu.src[1].chan = tgsi_chan(&inst->Src[0], i);
 		}
 		if (i == 3) {
 			alu.last = 1;
@@ -731,6 +1011,13 @@ static int tgsi_slt(struct r600_shader_ctx *ctx)
 		if (r)
 			return r;
 	}
+	r = r600_bc_add_literal(ctx->bc, ctx->value);
+	if (r)
+		return r;
+
+	/* kill must be last in ALU */
+	ctx->bc->force_add_cf = 1;
+	ctx->shader->uses_kill = TRUE;
 	return 0;
 }
 
@@ -738,12 +1025,20 @@ static int tgsi_lit(struct r600_shader_ctx *ctx)
 {
 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
 	struct r600_bc_alu alu;
+	struct r600_bc_alu_src r600_src[3];
 	int r;
 
+	r = tgsi_split_constant(ctx, r600_src);
+	if (r)
+		return r;
+	r = tgsi_split_literal_constant(ctx, r600_src);
+	if (r)
+		return r;
+
 	/* dst.x, <- 1.0  */
 	memset(&alu, 0, sizeof(struct r600_bc_alu));
 	alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV;
-	alu.src[0].sel  = 249; /*1.0*/
+	alu.src[0].sel  = V_SQ_ALU_SRC_1; /*1.0*/
 	alu.src[0].chan = 0;
 	r = tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
 	if (r)
@@ -756,11 +1051,9 @@ static int tgsi_lit(struct r600_shader_ctx *ctx)
 	/* dst.y = max(src.x, 0.0) */
 	memset(&alu, 0, sizeof(struct r600_bc_alu));
 	alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX;
-	r = tgsi_src(ctx, &inst->Src[0], &alu.src[0]);
-	if (r)
-		return r;
-	alu.src[1].sel  = 248; /*0.0*/
-	alu.src[1].chan = tgsi_chan(&inst->Src[0], 0);
+	alu.src[0] = r600_src[0];
+	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
+	alu.src[1].chan = 0;
 	r = tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
 	if (r)
 		return r;
@@ -769,18 +1062,10 @@ static int tgsi_lit(struct r600_shader_ctx *ctx)
 	if (r)
 		return r;
 
-	/* dst.z = NOP - fill Z slot */
-	memset(&alu, 0, sizeof(struct r600_bc_alu));
-	alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP;
-	alu.dst.chan = 2;
-	r = r600_bc_add_alu(ctx->bc, &alu);
-	if (r)
-		return r;
-
 	/* dst.w, <- 1.0  */
 	memset(&alu, 0, sizeof(struct r600_bc_alu));
 	alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV;
-	alu.src[0].sel  = 249;
+	alu.src[0].sel  = V_SQ_ALU_SRC_1;
 	alu.src[0].chan = 0;
 	r = tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
 	if (r)
@@ -791,6 +1076,10 @@ static int tgsi_lit(struct r600_shader_ctx *ctx)
 	if (r)
 		return r;
 
+	r = r600_bc_add_literal(ctx->bc, ctx->value);
+	if (r)
+		return r;
+
 	if (inst->Dst[0].Register.WriteMask & (1 << 2))
 	{
 		int chan;
@@ -799,9 +1088,7 @@ static int tgsi_lit(struct r600_shader_ctx *ctx)
 		/* dst.z = log(src.y) */
 		memset(&alu, 0, sizeof(struct r600_bc_alu));
 		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED;
-		r = tgsi_src(ctx, &inst->Src[0], &alu.src[0]);
-		if (r)
-			return r;
+		alu.src[0] = r600_src[0];
 		alu.src[0].chan = tgsi_chan(&inst->Src[0], 1);
 		r = tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
 		if (r)
@@ -811,21 +1098,22 @@ static int tgsi_lit(struct r600_shader_ctx *ctx)
 		if (r)
 			return r;
 
+		r = r600_bc_add_literal(ctx->bc, ctx->value);
+		if (r)
+			return r;
+
 		chan = alu.dst.chan;
 		sel = alu.dst.sel;
 
 		/* tmp.x = amd MUL_LIT(src.w, dst.z, src.x ) */
 		memset(&alu, 0, sizeof(struct r600_bc_alu));
 		alu.inst = V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT;
-		r = tgsi_src(ctx, &inst->Src[0], &alu.src[0]);
-		if (r)
-			return r;
+		alu.src[0] = r600_src[0];
 		alu.src[0].chan = tgsi_chan(&inst->Src[0], 3);
 		alu.src[1].sel  = sel;
 		alu.src[1].chan = chan;
-		r = tgsi_src(ctx, &inst->Src[0], &alu.src[2]);
-		if (r)
-			return r;
+
+		alu.src[2] = r600_src[0];
 		alu.src[2].chan = tgsi_chan(&inst->Src[0], 0);
 		alu.dst.sel = ctx->temp_reg;
 		alu.dst.chan = 0;
@@ -836,6 +1124,9 @@ static int tgsi_lit(struct r600_shader_ctx *ctx)
 		if (r)
 			return r;
 
+		r = r600_bc_add_literal(ctx->bc, ctx->value);
+		if (r)
+			return r;
 		/* dst.z = exp(tmp.x) */
 		memset(&alu, 0, sizeof(struct r600_bc_alu));
 		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE;
@@ -880,19 +1171,43 @@ static int tgsi_trans(struct r600_shader_ctx *ctx)
 	return 0;
 }
 
+static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx) /* copy temp_reg channel 0 to each Dst[0] channel */
+{
+	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+	struct r600_bc_alu alu;
+	int i, r;
+
+	for (i = 0; i < 4; i++) { /* a MOV is emitted for all four channels, masked-off ones included */
+		memset(&alu, 0, sizeof(struct r600_bc_alu));
+		alu.src[0].sel = ctx->temp_reg; /* source channel stays 0 from the memset */
+		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV;
+		alu.dst.chan = i;
+		r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
+		if (r)
+			return r;
+		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1; /* overrides tgsi_dst: write only channels in the mask */
+		if (i == 3)
+			alu.last = 1; /* the fourth MOV carries the last flag */
+		r = r600_bc_add_alu(ctx->bc, &alu);
+		if (r)
+			return r;
+	}
+	return 0;
+}
+
 static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
 {
 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
 	struct r600_bc_alu alu;
-	int i, j, r;
+	int i, r;
 
 	memset(&alu, 0, sizeof(struct r600_bc_alu));
 	alu.inst = ctx->inst_info->r600_opcode;
-	for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
-		r = tgsi_src(ctx, &inst->Src[j], &alu.src[j]);
+	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
+		r = tgsi_src(ctx, &inst->Src[i], &alu.src[i]);
 		if (r)
 			return r;
-		alu.src[j].chan = tgsi_chan(&inst->Src[j], 0);
+		alu.src[i].chan = tgsi_chan(&inst->Src[i], 0);
 	}
 	alu.dst.sel = ctx->temp_reg;
 	alu.dst.write = 1;
@@ -900,16 +1215,124 @@ static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
 	r = r600_bc_add_alu(ctx->bc, &alu);
 	if (r)
 		return r;
+	r = r600_bc_add_literal(ctx->bc, ctx->value);
+	if (r)
+		return r;
 	/* replicate result */
+	return tgsi_helper_tempx_replicate(ctx);
+}
+
+static int tgsi_pow(struct r600_shader_ctx *ctx)
+{
+	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+	struct r600_bc_alu alu;
+	int r;
+	/* POW(a, b) is emitted as EXP2(b * LOG2(a)): three single-instruction groups chained through temp_reg channel 0 */
+	/* step 1: temp = LOG2(a), a being the Src[0] channel tgsi_chan() returns for component 0 */
+	memset(&alu, 0, sizeof(struct r600_bc_alu));
+	alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE;
+	r = tgsi_src(ctx, &inst->Src[0], &alu.src[0]);
+	if (r)
+		return r;
+	alu.src[0].chan = tgsi_chan(&inst->Src[0], 0);
+	alu.dst.sel = ctx->temp_reg;
+	alu.dst.write = 1;
+	alu.last = 1;
+	r = r600_bc_add_alu(ctx->bc, &alu);
+	if (r)
+		return r;
+	r = r600_bc_add_literal(ctx->bc,ctx->value);
+	if (r)
+		return r;
+	/* step 2: temp = b * temp, i.e. b * LOG2(a); b comes from Src[1] the same way a came from Src[0] */
+	memset(&alu, 0, sizeof(struct r600_bc_alu));
+	alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL_IEEE;
+	r = tgsi_src(ctx, &inst->Src[1], &alu.src[0]);
+	if (r)
+		return r;
+	alu.src[0].chan = tgsi_chan(&inst->Src[1], 0);
+	alu.src[1].sel = ctx->temp_reg;
+	alu.dst.sel = ctx->temp_reg;
+	alu.dst.write = 1;
+	alu.last = 1;
+	r = r600_bc_add_alu(ctx->bc, &alu);
+	if (r)
+		return r;
+	r = r600_bc_add_literal(ctx->bc,ctx->value);
+	if (r)
+		return r;
+	/* step 3: temp = EXP2(temp), which is POW(a, b) = EXP2(b * LOG2(a)) */
+	memset(&alu, 0, sizeof(struct r600_bc_alu));
+	alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE;
+	alu.src[0].sel = ctx->temp_reg;
+	alu.dst.sel = ctx->temp_reg;
+	alu.dst.write = 1;
+	alu.last = 1;
+	r = r600_bc_add_alu(ctx->bc, &alu);
+	if (r)
+		return r;
+	r = r600_bc_add_literal(ctx->bc,ctx->value);
+	if (r)
+		return r;
+	return tgsi_helper_tempx_replicate(ctx);	/* MOV temp channel 0 into the write-masked destination channels */
+}
+
+static int tgsi_ssg(struct r600_shader_ctx *ctx)
+{
+	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+	struct r600_bc_alu alu;
+	struct r600_bc_alu_src r600_src[3];
+	int i, r;
+
+	r = tgsi_split_constant(ctx, r600_src);
+	if (r)
+		return r;
+
+	/* tmp = (src > 0 ? 1 : src) */
 	for (i = 0; i < 4; i++) {
 		memset(&alu, 0, sizeof(struct r600_bc_alu));
-		alu.src[0].sel = ctx->temp_reg;
-		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV;
+		alu.inst = V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT;
+		alu.is_op3 = 1;
+
+		alu.dst.sel = ctx->temp_reg;
 		alu.dst.chan = i;
+
+		alu.src[0] = r600_src[0];
+		alu.src[0].chan = tgsi_chan(&inst->Src[0], i);
+
+		alu.src[1].sel = V_SQ_ALU_SRC_1;
+
+		alu.src[2] = r600_src[0];
+		alu.src[2].chan = tgsi_chan(&inst->Src[0], i);
+		if (i == 3)
+			alu.last = 1;
+		r = r600_bc_add_alu(ctx->bc, &alu);
+		if (r)
+			return r;
+	}
+	r = r600_bc_add_literal(ctx->bc, ctx->value);
+	if (r)
+		return r;
+
+	/* dst = (-tmp > 0 ? -1 : tmp) */
+	for (i = 0; i < 4; i++) {
+		memset(&alu, 0, sizeof(struct r600_bc_alu));
+		alu.inst = V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT;
+		alu.is_op3 = 1;
 		r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
 		if (r)
 			return r;
-		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
+
+		alu.src[0].sel = ctx->temp_reg;
+		alu.src[0].chan = i;
+		alu.src[0].neg = 1;
+
+		alu.src[1].sel = V_SQ_ALU_SRC_1;
+		alu.src[1].neg = 1;
+
+		alu.src[2].sel = ctx->temp_reg;
+		alu.src[2].chan = i;
+
 		if (i == 3)
 			alu.last = 1;
 		r = r600_bc_add_alu(ctx->bc, &alu);
@@ -1006,16 +1429,23 @@ static int tgsi_dp(struct r600_shader_ctx *ctx)
 		switch (ctx->inst_info->tgsi_opcode) {
 		case TGSI_OPCODE_DP2:
 			if (i > 1) {
-				alu.src[0].sel = alu.src[1].sel = 248;
+				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
 				alu.src[0].chan = alu.src[1].chan = 0;
 			}
 			break;
 		case TGSI_OPCODE_DP3:
 			if (i > 2) {
-				alu.src[0].sel = alu.src[1].sel = 248;
+				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
 				alu.src[0].chan = alu.src[1].chan = 0;
 			}
 			break;
+		case TGSI_OPCODE_DPH:
+			if (i == 3) {
+				alu.src[0].sel = V_SQ_ALU_SRC_1;
+				alu.src[0].chan = 0;
+				alu.src[0].neg = 0;
+			}
+			break;
 		default:
 			break;
 		}
@@ -1035,75 +1465,197 @@ static int tgsi_tex(struct r600_shader_ctx *ctx)
 	struct r600_bc_tex tex;
 	struct r600_bc_alu alu;
 	unsigned src_gpr;
-	int r;
+	int r, i;
+	int opcode;
+	boolean src_not_temp = inst->Src[0].Register.File != TGSI_FILE_TEMPORARY;
+	uint32_t lit_vals[4];
 
 	src_gpr = ctx->file_offset[inst->Src[0].Register.File] + inst->Src[0].Register.Index;
 
-	/* Add perspective divide */
-	memset(&alu, 0, sizeof(struct r600_bc_alu));
-	alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE;
-	alu.src[0].sel = src_gpr;
-	alu.src[0].chan = tgsi_chan(&inst->Src[0], 3);
-	alu.dst.sel = ctx->temp_reg;
-	alu.dst.chan = 3;
-	alu.last = 1;
-	alu.dst.write = 1;
-	r = r600_bc_add_alu(ctx->bc, &alu);
-	if (r)
-		return r;
+	if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
+		/* Add perspective divide */
+		memset(&alu, 0, sizeof(struct r600_bc_alu));
+		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE;
+		r = tgsi_src(ctx, &inst->Src[0], &alu.src[0]);
+		if (r)
+			return r;
 
-	memset(&alu, 0, sizeof(struct r600_bc_alu));
-	alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL;
-	alu.src[0].sel = ctx->temp_reg;
-	alu.src[0].chan = 3;
-	alu.src[1].sel = src_gpr;
-	alu.src[1].chan = tgsi_chan(&inst->Src[0], 0);
-	alu.dst.sel = ctx->temp_reg;
-	alu.dst.chan = 0;
-	alu.dst.write = 1;
-	r = r600_bc_add_alu(ctx->bc, &alu);
-	if (r)
-		return r;
-	memset(&alu, 0, sizeof(struct r600_bc_alu));
-	alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL;
-	alu.src[0].sel = ctx->temp_reg;
-	alu.src[0].chan = 3;
-	alu.src[1].sel = src_gpr;
-	alu.src[1].chan = tgsi_chan(&inst->Src[0], 1);
-	alu.dst.sel = ctx->temp_reg;
-	alu.dst.chan = 1;
-	alu.dst.write = 1;
-	r = r600_bc_add_alu(ctx->bc, &alu);
-	if (r)
-		return r;
-	memset(&alu, 0, sizeof(struct r600_bc_alu));
-	alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL;
-	alu.src[0].sel = ctx->temp_reg;
-	alu.src[0].chan = 3;
-	alu.src[1].sel = src_gpr;
-	alu.src[1].chan = tgsi_chan(&inst->Src[0], 2);
-	alu.dst.sel = ctx->temp_reg;
-	alu.dst.chan = 2;
-	alu.dst.write = 1;
-	r = r600_bc_add_alu(ctx->bc, &alu);
-	if (r)
-		return r;
-	memset(&alu, 0, sizeof(struct r600_bc_alu));
-	alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV;
-	alu.src[0].sel = 249;
-	alu.src[0].chan = 0;
-	alu.dst.sel = ctx->temp_reg;
-	alu.dst.chan = 3;
-	alu.last = 1;
-	alu.dst.write = 1;
-	r = r600_bc_add_alu(ctx->bc, &alu);
-	if (r)
-		return r;
-	src_gpr = ctx->temp_reg;
+		alu.src[0].chan = tgsi_chan(&inst->Src[0], 3);
+		alu.dst.sel = ctx->temp_reg;
+		alu.dst.chan = 3;
+		alu.last = 1;
+		alu.dst.write = 1;
+		r = r600_bc_add_alu(ctx->bc, &alu);
+		if (r)
+			return r;
+		
+		for (i = 0; i < 3; i++) {
+			memset(&alu, 0, sizeof(struct r600_bc_alu));
+			alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL;
+			alu.src[0].sel = ctx->temp_reg;
+			alu.src[0].chan = 3;
+			r = tgsi_src(ctx, &inst->Src[0], &alu.src[1]);
+			if (r)
+				return r;
+			alu.src[1].chan = tgsi_chan(&inst->Src[0], i);
+			alu.dst.sel = ctx->temp_reg;
+			alu.dst.chan = i;
+			alu.dst.write = 1;
+			r = r600_bc_add_alu(ctx->bc, &alu);
+			if (r)
+				return r;
+		}
+		memset(&alu, 0, sizeof(struct r600_bc_alu));
+		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV;
+		alu.src[0].sel = V_SQ_ALU_SRC_1;
+		alu.src[0].chan = 0;
+		alu.dst.sel = ctx->temp_reg;
+		alu.dst.chan = 3;
+		alu.last = 1;
+		alu.dst.write = 1;
+		r = r600_bc_add_alu(ctx->bc, &alu);
+		if (r)
+			return r;
+		src_not_temp = false;
+		src_gpr = ctx->temp_reg;
+	}
+
+	if (inst->Texture.Texture == TGSI_TEXTURE_CUBE) {
+		int src_chan, src2_chan;
+
+		/* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */
+		for (i = 0; i < 4; i++) {
+			memset(&alu, 0, sizeof(struct r600_bc_alu));
+			alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE;
+			switch (i) {
+			case 0:
+				src_chan = 2;
+				src2_chan = 1;
+				break;
+			case 1:
+				src_chan = 2;
+				src2_chan = 0;
+				break;
+			case 2:
+				src_chan = 0;
+				src2_chan = 2;
+				break;
+			case 3:
+				src_chan = 1;
+				src2_chan = 2;
+				break;
+			}
+			r = tgsi_src(ctx, &inst->Src[0], &alu.src[0]);
+			if (r)
+				return r;
+			alu.src[0].chan = tgsi_chan(&inst->Src[0], src_chan);
+			r = tgsi_src(ctx, &inst->Src[0], &alu.src[1]);
+			if (r)
+				return r;
+			alu.src[1].chan = tgsi_chan(&inst->Src[0], src2_chan);
+			alu.dst.sel = ctx->temp_reg;
+			alu.dst.chan = i;
+			if (i == 3)
+				alu.last = 1;
+			alu.dst.write = 1;
+			r = r600_bc_add_alu(ctx->bc, &alu);
+			if (r)
+				return r;
+		}
+
+		/* tmp1.z = RCP_e(|tmp1.z|) */
+		memset(&alu, 0, sizeof(struct r600_bc_alu));
+		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE;
+		alu.src[0].sel = ctx->temp_reg;
+		alu.src[0].chan = 2;
+		alu.src[0].abs = 1;
+		alu.dst.sel = ctx->temp_reg;
+		alu.dst.chan = 2;
+		alu.dst.write = 1;
+		alu.last = 1;
+		r = r600_bc_add_alu(ctx->bc, &alu);
+		if (r)
+			return r;
+		
+		/* MULADD R0.x,  R0.x,  PS1,  (0x3FC00000, 1.5f).x
+		 * MULADD R0.y,  R0.y,  PS1,  (0x3FC00000, 1.5f).x
+		 * muladd has no writemask, have to use another temp 
+		 */
+		memset(&alu, 0, sizeof(struct r600_bc_alu));
+		alu.inst = V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD;
+		alu.is_op3 = 1;
+
+		alu.src[0].sel = ctx->temp_reg;
+		alu.src[0].chan = 0;
+		alu.src[1].sel = ctx->temp_reg;
+		alu.src[1].chan = 2;
+		
+		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
+		alu.src[2].chan = 0;
+
+		alu.dst.sel = ctx->temp_reg;
+		alu.dst.chan = 0;
+		alu.dst.write = 1;
+
+		r = r600_bc_add_alu(ctx->bc, &alu);
+		if (r)
+			return r;
+
+		memset(&alu, 0, sizeof(struct r600_bc_alu));
+		alu.inst = V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD;
+		alu.is_op3 = 1;
+
+		alu.src[0].sel = ctx->temp_reg;
+		alu.src[0].chan = 1;
+		alu.src[1].sel = ctx->temp_reg;
+		alu.src[1].chan = 2;
+		
+		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
+		alu.src[2].chan = 0;
+
+		alu.dst.sel = ctx->temp_reg;
+		alu.dst.chan = 1;
+		alu.dst.write = 1;
+
+		alu.last = 1;
+		r = r600_bc_add_alu(ctx->bc, &alu);
+		if (r)
+			return r;
+
+		lit_vals[0] = fui(1.5f);
+
+		r = r600_bc_add_literal(ctx->bc, lit_vals);
+		if (r)
+			return r;
+		src_not_temp = false;
+		src_gpr = ctx->temp_reg;
+	}
+
+	if (src_not_temp) {
+		for (i = 0; i < 4; i++) {
+			memset(&alu, 0, sizeof(struct r600_bc_alu));
+			alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV;
+			alu.src[0].sel = src_gpr;
+			alu.src[0].chan = i;
+			alu.dst.sel = ctx->temp_reg;
+			alu.dst.chan = i;
+			if (i == 3)
+				alu.last = 1;
+			alu.dst.write = 1;
+			r = r600_bc_add_alu(ctx->bc, &alu);
+			if (r)
+				return r;
+		}
+		src_gpr = ctx->temp_reg;
+	}
+	
+	opcode = ctx->inst_info->r600_opcode;
+	if (opcode == SQ_TEX_INST_SAMPLE &&
+	    (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D || inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D))
+		opcode = SQ_TEX_INST_SAMPLE_C;
 
-	/* TODO use temp if src_gpr is not a temporary reg (File != TEMPORARY) */
 	memset(&tex, 0, sizeof(struct r600_bc_tex));
-	tex.inst = ctx->inst_info->r600_opcode;
+	tex.inst = opcode;
 	tex.resource_id = ctx->file_offset[inst->Src[1].Register.File] + inst->Src[1].Register.Index;
 	tex.sampler_id = tex.resource_id;
 	tex.src_gpr = src_gpr;
@@ -1117,13 +1669,30 @@ static int tgsi_tex(struct r600_shader_ctx *ctx)
 	tex.src_sel_z = 2;
 	tex.src_sel_w = 3;
 
+	if (inst->Texture.Texture == TGSI_TEXTURE_CUBE) {
+		tex.src_sel_x = 1;
+		tex.src_sel_y = 0;
+		tex.src_sel_z = 3;
+		tex.src_sel_w = 1;
+	}
+
 	if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {
 		tex.coord_type_x = 1;
 		tex.coord_type_y = 1;
 		tex.coord_type_z = 1;
 		tex.coord_type_w = 1;
 	}
-	return r600_bc_add_tex(ctx->bc, &tex);
+
+	if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D || inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D)
+		tex.src_sel_w = 2;
+
+	r = r600_bc_add_tex(ctx->bc, &tex);
+	if (r)
+		return r;
+
+	/* add shadow ambient support  - gallium doesn't do it yet */
+	return 0;
+	
 }
 
 static int tgsi_lrp(struct r600_shader_ctx *ctx)
@@ -1141,7 +1710,7 @@ static int tgsi_lrp(struct r600_shader_ctx *ctx)
 	for (i = 0; i < 4; i++) {
 		memset(&alu, 0, sizeof(struct r600_bc_alu));
 		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD;
-		alu.src[0].sel = 249;
+		alu.src[0].sel = V_SQ_ALU_SRC_1;
 		alu.src[0].chan = 0;
 		alu.src[1] = r600_src[0];
 		alu.src[1].chan = tgsi_chan(&inst->Src[0], i);
@@ -1205,23 +1774,654 @@ static int tgsi_lrp(struct r600_shader_ctx *ctx)
 	return tgsi_helper_copy(ctx, inst);
 }
 
+/* CMP (TGSI: dst = src0 < 0 ? src1 : src2): one CNDGE per channel with operands (Src[0], Src[2], Src[1]). */
+static int tgsi_cmp(struct r600_shader_ctx *ctx)
+{
+	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+	struct r600_bc_alu_src r600_src[3];
+	struct r600_bc_alu alu;
+	int via_temp, chan, err;
+
+	err = tgsi_split_constant(ctx, r600_src);
+	if (err)
+		return err;
+
+	/* with a partial write mask the result is built in temp_reg and copied out afterwards */
+	via_temp = (inst->Dst[0].Register.WriteMask != 0xf);
+
+	for (chan = 0; chan < 4; chan++) {
+		memset(&alu, 0, sizeof(alu));
+		alu.inst = V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE;
+		alu.is_op3 = 1;
+		alu.last = (chan == 3);
+
+		alu.src[0] = r600_src[0];
+		alu.src[0].chan = tgsi_chan(&inst->Src[0], chan);
+		/* TGSI operands 1 and 2 trade places in the hardware instruction */
+		alu.src[1] = r600_src[2];
+		alu.src[1].chan = tgsi_chan(&inst->Src[2], chan);
+		alu.src[2] = r600_src[1];
+		alu.src[2].chan = tgsi_chan(&inst->Src[1], chan);
+
+		if (!via_temp) {
+			err = tgsi_dst(ctx, &inst->Dst[0], chan, &alu.dst);
+			if (err)
+				return err;
+		} else {
+			alu.dst.sel = ctx->temp_reg;
+		}
+		alu.dst.chan = chan;
+		alu.dst.write = 1;
+
+		err = r600_bc_add_alu(ctx->bc, &alu);
+		if (err)
+			return err;
+	}
+	if (via_temp)
+		return tgsi_helper_copy(ctx, inst);
+	return 0;
+}
+
+static int tgsi_xpd(struct r600_shader_ctx *ctx)
+{
+	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+	struct r600_bc_alu_src r600_src[3];
+	struct r600_bc_alu alu;
+	uint32_t use_temp = 0;
+	int i, r;
+	/* XPD in two ALU groups: four MULs into temp_reg, then four MULADDs that add the negated temp */
+	if (inst->Dst[0].Register.WriteMask != 0xf)
+		use_temp = 1;	/* partial write mask: finish in temp_reg and hand the result to tgsi_helper_copy() */
+
+	r = tgsi_split_constant(ctx, r600_src);
+	if (r)
+		return r;
+	/* group 1: temp.xyz = src0.zxy * src1.yzx (after tgsi_chan()); channel 3 multiplies V_SQ_ALU_SRC_0 by itself */
+	for (i = 0; i < 4; i++) {
+		memset(&alu, 0, sizeof(struct r600_bc_alu));
+		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL;
+
+		alu.src[0] = r600_src[0];
+		switch (i) {
+		case 0:
+			alu.src[0].chan = tgsi_chan(&inst->Src[0], 2);
+			break;
+		case 1:
+			alu.src[0].chan = tgsi_chan(&inst->Src[0], 0);
+			break;
+		case 2:
+			alu.src[0].chan = tgsi_chan(&inst->Src[0], 1);
+			break;
+		case 3:
+			alu.src[0].sel = V_SQ_ALU_SRC_0;
+			alu.src[0].chan = i;
+		}
+
+		alu.src[1] = r600_src[1];
+		switch (i) {
+		case 0:
+			alu.src[1].chan = tgsi_chan(&inst->Src[1], 1);
+			break;
+		case 1:
+			alu.src[1].chan = tgsi_chan(&inst->Src[1], 2);
+			break;
+		case 2:
+			alu.src[1].chan = tgsi_chan(&inst->Src[1], 0);
+			break;
+		case 3:
+			alu.src[1].sel = V_SQ_ALU_SRC_0;
+			alu.src[1].chan = i;
+		}
+
+		alu.dst.sel = ctx->temp_reg;
+		alu.dst.chan = i;
+		alu.dst.write = 1;
+
+		if (i == 3)
+			alu.last = 1;
+		r = r600_bc_add_alu(ctx->bc, &alu);
+		if (r)
+			return r;
+	}
+	/* group 2: result.xyz = src0.yzx * src1.zxy - temp.xyz; channel 3 again takes V_SQ_ALU_SRC_0 for both factors */
+	for (i = 0; i < 4; i++) {
+		memset(&alu, 0, sizeof(struct r600_bc_alu));
+		alu.inst = V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD;
+
+		alu.src[0] = r600_src[0];
+		switch (i) {
+		case 0:
+			alu.src[0].chan = tgsi_chan(&inst->Src[0], 1);
+			break;
+		case 1:
+			alu.src[0].chan = tgsi_chan(&inst->Src[0], 2);
+			break;
+		case 2:
+			alu.src[0].chan = tgsi_chan(&inst->Src[0], 0);
+			break;
+		case 3:
+			alu.src[0].sel = V_SQ_ALU_SRC_0;
+			alu.src[0].chan = i;
+		}
+
+		alu.src[1] = r600_src[1];
+		switch (i) {
+		case 0:
+			alu.src[1].chan = tgsi_chan(&inst->Src[1], 2);
+			break;
+		case 1:
+			alu.src[1].chan = tgsi_chan(&inst->Src[1], 0);
+			break;
+		case 2:
+			alu.src[1].chan = tgsi_chan(&inst->Src[1], 1);
+			break;
+		case 3:
+			alu.src[1].sel = V_SQ_ALU_SRC_0;
+			alu.src[1].chan = i;
+		}
+
+		alu.src[2].sel = ctx->temp_reg;	/* third operand: the matching product from group 1 ... */
+		alu.src[2].neg = 1;	/* ... negated, so the MULADD performs the subtraction */
+		alu.src[2].chan = i;
+
+		if (use_temp)
+			alu.dst.sel = ctx->temp_reg;
+		else {
+			r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
+			if (r)
+				return r;
+		}
+		alu.dst.chan = i;
+		alu.dst.write = 1;
+		alu.is_op3 = 1;
+		if (i == 3)
+			alu.last = 1;
+		r = r600_bc_add_alu(ctx->bc, &alu);
+		if (r)
+			return r;
+	}
+	if (use_temp)
+		return tgsi_helper_copy(ctx, inst);
+	return 0;
+}
+
+/* EXP: x = 2^floor(src), y = fract(src), z = 2^src, w = 1.0 (src is the component-0 channel of Src[0]).
+ * Each write-masked channel is computed into temp_reg, then the result is handed to tgsi_helper_copy().
+ */
+static int tgsi_exp(struct r600_shader_ctx *ctx)
+{
+	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+	struct r600_bc_alu alu;
+	int r;
+
+	/* result.x = 2^floor(src); */
+	if (inst->Dst[0].Register.WriteMask & 1) {
+		memset(&alu, 0, sizeof(struct r600_bc_alu));
+
+		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR;
+		r = tgsi_src(ctx, &inst->Src[0], &alu.src[0]);
+		if (r)
+			return r;
+
+		alu.src[0].chan = tgsi_chan(&inst->Src[0], 0);
+
+		alu.dst.sel = ctx->temp_reg;
+		alu.dst.chan = 0;
+		alu.dst.write = 1;
+		alu.last = 1;
+		r = r600_bc_add_alu(ctx->bc, &alu);
+		if (r)
+			return r;
+
+		r = r600_bc_add_literal(ctx->bc, ctx->value);
+		if (r)
+			return r;
+
+		/* start from a clean ALU so nothing tgsi_src() stored in src[0] carries over to the temp read */
+		memset(&alu, 0, sizeof(struct r600_bc_alu));
+		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE;
+		alu.src[0].sel = ctx->temp_reg;
+		alu.src[0].chan = 0;
+
+		alu.dst.sel = ctx->temp_reg;
+		alu.dst.chan = 0;
+		alu.dst.write = 1;
+		alu.last = 1;
+		r = r600_bc_add_alu(ctx->bc, &alu);
+		if (r)
+			return r;
+
+		r = r600_bc_add_literal(ctx->bc, ctx->value);
+		if (r)
+			return r;
+	}
+
+	/* result.y = src - floor(src); */
+	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
+		memset(&alu, 0, sizeof(struct r600_bc_alu));
+
+		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT;
+		r = tgsi_src(ctx, &inst->Src[0], &alu.src[0]);
+		if (r)
+			return r;
+		alu.src[0].chan = tgsi_chan(&inst->Src[0], 0);
+
+		alu.dst.sel = ctx->temp_reg;
+		alu.dst.write = 1;
+		alu.dst.chan = 1;
+
+		alu.last = 1;
+
+		r = r600_bc_add_alu(ctx->bc, &alu);
+		if (r)
+			return r;
+		r = r600_bc_add_literal(ctx->bc, ctx->value);
+		if (r)
+			return r;
+	}
+
+	/* result.z = RoughApprox2ToX(src); */
+	if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
+		memset(&alu, 0, sizeof(struct r600_bc_alu));
+		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE;
+		r = tgsi_src(ctx, &inst->Src[0], &alu.src[0]);
+		if (r)
+			return r;
+		alu.src[0].chan = tgsi_chan(&inst->Src[0], 0);
+
+		alu.dst.sel = ctx->temp_reg;
+		alu.dst.write = 1;
+		alu.dst.chan = 2;
+
+		alu.last = 1;
+
+		r = r600_bc_add_alu(ctx->bc, &alu);
+		if (r)
+			return r;
+		r = r600_bc_add_literal(ctx->bc, ctx->value);
+		if (r)
+			return r;
+	}
+
+	/* result.w = 1.0; */
+	if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
+		memset(&alu, 0, sizeof(struct r600_bc_alu));
+
+		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV;
+		alu.src[0].sel = V_SQ_ALU_SRC_1;
+		alu.src[0].chan = 0;
+
+		alu.dst.sel = ctx->temp_reg;
+		alu.dst.chan = 3;
+		alu.dst.write = 1;
+		alu.last = 1;
+		r = r600_bc_add_alu(ctx->bc, &alu);
+		if (r)
+			return r;
+		r = r600_bc_add_literal(ctx->bc, ctx->value);
+		if (r)
+			return r;
+	}
+	return tgsi_helper_copy(ctx, inst);
+}
+
+/*
+ * ARL: lowered to a single MOVA_FLOOR on the component-0 channel of Src[0].
+ */
+static int tgsi_arl(struct r600_shader_ctx *ctx)
+{
+	/* TODO from r600c, ar values don't persist between clauses */
+	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+	struct r600_bc_alu mova;
+	int err;
+
+	memset(&mova, 0, sizeof(mova));
+	mova.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_FLOOR;
+	mova.last = 1;
+
+	err = tgsi_src(ctx, &inst->Src[0], &mova.src[0]);
+	if (err)
+		return err;
+	mova.src[0].chan = tgsi_chan(&inst->Src[0], 0);
+
+	/* no destination is filled in; the op is added with an explicit plain-ALU clause type */
+	return r600_bc_add_alu_type(ctx->bc, &mova, V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU);
+}
+
+/* DST: dst = (1 * 1, src0.y * src1.y, src0.z * 1, 1 * src1.w), one MUL per channel with V_SQ_ALU_SRC_1 as the 1. */
+static int tgsi_opdst(struct r600_shader_ctx *ctx)
+{
+	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+	struct r600_bc_alu alu;
+	int chan, err = 0;
+
+	for (chan = 0; chan < 4; chan++) {
+		memset(&alu, 0, sizeof(alu));
+		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL;
+		alu.last = (chan == 3);
+		err = tgsi_dst(ctx, &inst->Dst[0], chan, &alu.dst);
+		if (err)
+			return err;
+		/* first factor: the constant for channels 0 and 3, Src[0] otherwise */
+		if (chan != 0 && chan != 3) {
+			err = tgsi_src(ctx, &inst->Src[0], &alu.src[0]);
+			if (err)
+				return err;
+			alu.src[0].chan = tgsi_chan(&inst->Src[0], chan);
+		} else {
+			alu.src[0].sel = V_SQ_ALU_SRC_1;
+		}
+
+		/* second factor: the constant for channels 0 and 2, Src[1] otherwise */
+		if (chan != 0 && chan != 2) {
+			err = tgsi_src(ctx, &inst->Src[1], &alu.src[1]);
+			if (err)
+				return err;
+			alu.src[1].chan = tgsi_chan(&inst->Src[1], chan);
+		} else {
+			alu.src[1].sel = V_SQ_ALU_SRC_1;
+		}
+		err = r600_bc_add_alu(ctx->bc, &alu);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+/* Emit the predicate-setting ALU op `opcode` on Src[0] (component 0) vs. V_SQ_ALU_SRC_0, in an ALU_PUSH_BEFORE clause. */
+static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode)
+{
+	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+	struct r600_bc_alu pred;
+	int err;
+
+	memset(&pred, 0, sizeof(pred));
+	pred.inst = opcode;
+	pred.predicate = 1;
+	pred.last = 1;
+
+	err = tgsi_src(ctx, &inst->Src[0], &pred.src[0]);
+	if (err)
+		return err;
+	pred.src[0].chan = tgsi_chan(&inst->Src[0], 0);
+
+	/* second operand: the inline constant V_SQ_ALU_SRC_0 */
+	pred.src[1].sel = V_SQ_ALU_SRC_0;
+	pred.src[1].chan = 0;
+
+	/* the ALU result is also written to temp_reg channel 0 */
+	pred.dst.sel = ctx->temp_reg;
+	pred.dst.chan = 0;
+	pred.dst.write = 1;
+
+	return r600_bc_add_alu_type(ctx->bc, &pred, V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE);
+}
+
+static int pops(struct r600_shader_ctx *ctx, int pops)
+{
+	r600_bc_add_cfinst(ctx->bc, V_SQ_CF_WORD1_SQ_CF_INST_POP);	/* add a POP control-flow instruction; its status is not checked */
+	ctx->bc->cf_last->pop_count = pops;	/* inside this body `pops` is the parameter, which shadows the function name */
+	return 0;	/* always reports success */
+}
+
+/* Give back the call-stack slots that callstack_check_depth() charges for `reason`. */
+static inline void callstack_decrease_current(struct r600_shader_ctx *ctx, unsigned reason)
+{
+	int released = 0;
+	switch (reason) {
+	case FC_PUSH_VPM:
+	case FC_REP:	/* TODO : for 16 vp asic should be 2 */
+		released = 1;
+		break;
+	case FC_PUSH_WQM:
+	case FC_LOOP:
+		released = 4;
+		break;
+	}
+	ctx->bc->callstack[ctx->bc->call_sp].current -= released;
+}
+
+/* Charge a control-flow push of kind `reason` to the current call-stack entry and keep .max up to date.
+ * With check_max_only set, .current is left alone and .max is raised as if the push had happened. */
+static inline void callstack_check_depth(struct r600_shader_ctx *ctx, unsigned reason, unsigned check_max_only)
+{
+	if (check_max_only) {
+		int diff = 0;	/* reasons not listed below add nothing (this used to be read uninitialised) */
+		switch (reason) {
+		case FC_PUSH_VPM:
+			diff = 1;
+			break;
+		case FC_PUSH_WQM:
+			diff = 4;
+			break;
+		}
+		if ((ctx->bc->callstack[ctx->bc->call_sp].current + diff) >
+		    ctx->bc->callstack[ctx->bc->call_sp].max) {
+			ctx->bc->callstack[ctx->bc->call_sp].max =
+				ctx->bc->callstack[ctx->bc->call_sp].current + diff;
+		}
+		return;
+	}
+	switch (reason) {
+	case FC_PUSH_VPM:
+	case FC_REP:
+		ctx->bc->callstack[ctx->bc->call_sp].current++;
+		break;
+	case FC_PUSH_WQM:
+	case FC_LOOP:
+		ctx->bc->callstack[ctx->bc->call_sp].current += 4;
+		break;
+	}
+
+	if ((ctx->bc->callstack[ctx->bc->call_sp].current) >
+	    ctx->bc->callstack[ctx->bc->call_sp].max) {
+		ctx->bc->callstack[ctx->bc->call_sp].max =
+			ctx->bc->callstack[ctx->bc->call_sp].current;
+	}
+}
+
+static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
+{
+	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];	/* level whose mid[] list gets bc->cf_last appended */
+	void *grown = realloc(sp->mid, sizeof(*sp->mid) * (sp->num_mid + 1));	/* room for one more entry */
+	if (grown == NULL)	/* out of memory: keep the old list rather than leaking it and storing through NULL */
+		return;	/* NOTE(review): the void signature gives no way to report this to the caller */
+	sp->mid = grown;
+	sp->mid[sp->num_mid++] = ctx->bc->cf_last;
+}
+
+static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
+{
+	struct r600_cf_stack_entry *top = &ctx->bc->fc_stack[++ctx->bc->fc_sp];	/* push: advance fc_sp and use that slot */
+	top->type = type;
+	top->start = ctx->bc->cf_last;	/* the level starts at whatever CF instruction is current */
+}
+
+/* Reset the top flow-control stack entry (releasing its mid[] list) and pop it. */
+static void fc_poplevel(struct r600_shader_ctx *ctx)
+{
+	struct r600_cf_stack_entry *top = &ctx->bc->fc_stack[ctx->bc->fc_sp];
+
+	free(top->mid);	/* free(NULL) is a no-op, so no guard is needed */
+	top->mid = NULL;
+	top->num_mid = 0;
+	top->start = NULL;
+	top->type = 0;
+	ctx->bc->fc_sp--;
+}
+
+#if 0 /* compiled out: return/jump/flag helpers, several of them still empty stubs */
+static int emit_return(struct r600_shader_ctx *ctx)
+{
+	r600_bc_add_cfinst(ctx->bc, V_SQ_CF_WORD1_SQ_CF_INST_RETURN);
+	return 0;
+}
+
+static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
+{
+	/* adds a JUMP with the given pop count; `offset` is not used yet */
+	r600_bc_add_cfinst(ctx->bc, V_SQ_CF_WORD1_SQ_CF_INST_JUMP);
+	ctx->bc->cf_last->pop_count = pops;
+	/* TODO work out offset */
+	return 0;
+}
+
+static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
+{
+	return 0;	/* stub: nothing is emitted */
+}
+
+static void emit_testflag(struct r600_shader_ctx *ctx)
+{
+	/* stub: nothing is emitted */
+}
+
+static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
+{
+	emit_testflag(ctx);
+	emit_jump_to_offset(ctx, 1, 4);
+	emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
+	pops(ctx, ifidx + 1);
+	emit_return(ctx);
+}
+
+static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
+{
+	emit_testflag(ctx);
+
+	r600_bc_add_cfinst(ctx->bc, ctx->inst_info->r600_opcode);
+	ctx->bc->cf_last->pop_count = 1;
+
+	fc_set_mid(ctx, fc_sp);
+
+	pops(ctx, 1);
+}
+#endif /* 0 */
+
+/* IF: predicate on Src[0] != 0 (PRED_SETNE against constant 0), add a JUMP and open an FC_IF level for it. */
+static int tgsi_if(struct r600_shader_ctx *ctx)
+{
+	int r = emit_logic_pred(ctx, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE);
+	if (r)	/* this status used to be dropped; stop before opening a level for a predicate that was not emitted */
+		return r;
+	r600_bc_add_cfinst(ctx->bc, V_SQ_CF_WORD1_SQ_CF_INST_JUMP);
+	fc_pushlevel(ctx, FC_IF);
+	callstack_check_depth(ctx, FC_PUSH_VPM, 0);
+	return 0;
+}
+
+static int tgsi_else(struct r600_shader_ctx *ctx)
+{
+	r600_bc_add_cfinst(ctx->bc, V_SQ_CF_WORD1_SQ_CF_INST_ELSE);
+	ctx->bc->cf_last->pop_count = 1;
+	/* file cf_last (presumably the ELSE just added) as this level's mid entry, then aim the level's start instruction at it */
+	fc_set_mid(ctx, ctx->bc->fc_sp);
+	ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id;
+	return 0;
+}
+
+static int tgsi_endif(struct r600_shader_ctx *ctx)
+{
+	pops(ctx, 1);	/* ENDIF: the closing POP is emitted before the balance check below */
+	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_IF) {
+		R600_ERR("if/endif unbalanced in shader\n");
+		return -1;
+	}
+	/* no ELSE recorded (mid == NULL): the level's start instruction gets the target; otherwise mid[0] does */
+	if (ctx->bc->fc_stack[ctx->bc->fc_sp].mid == NULL) {
+		ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
+		ctx->bc->fc_stack[ctx->bc->fc_sp].start->pop_count = 1;
+	} else {
+		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[0]->cf_addr = ctx->bc->cf_last->id + 2;
+	}
+	fc_poplevel(ctx);
+	/* give back the FC_PUSH_VPM charge taken in tgsi_if() */
+	callstack_decrease_current(ctx, FC_PUSH_VPM);
+	return 0;
+}
+
+static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
+{
+	r600_bc_add_cfinst(ctx->bc, V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL);
+	/* open an FC_LOOP level whose start is the current cf_last; tgsi_endloop() patches addresses through it */
+	fc_pushlevel(ctx, FC_LOOP);
+
+	/* check stack depth */
+	callstack_check_depth(ctx, FC_LOOP, 0);
+	return 0;
+}
+
+/* ENDLOOP: add LOOP_END and cross-link it with the level's start instruction and every recorded mid entry. */
+static int tgsi_endloop(struct r600_shader_ctx *ctx)
+{
+	struct r600_cf_stack_entry *loop;
+	struct r600_bc_cf *end;
+	int n;
+
+	r600_bc_add_cfinst(ctx->bc, V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END);
+	end = ctx->bc->cf_last;
+	loop = &ctx->bc->fc_stack[ctx->bc->fc_sp];
+	if (loop->type != FC_LOOP) {
+		R600_ERR("loop/endloop in shader code are not paired.\n");
+		return -EINVAL;
+	}
+
+	/* fixup loop pointers - from r600isa:
+	 * LOOP END points to CF after LOOP START, LOOP START points to CF after LOOP END,
+	 * BRK/CONT point to the LOOP END CF itself
+	 */
+	end->cf_addr = loop->start->id + 2;
+	loop->start->cf_addr = end->id + 2;
+	for (n = 0; n < loop->num_mid; n++)
+		loop->mid[n]->cf_addr = end->id;
+	/* TODO add LOOPRET support */
+	fc_poplevel(ctx);
+	callstack_decrease_current(ctx, FC_LOOP);
+	return 0;
+}
+
+/* BRK/CONT: add the CF opcode from the instruction table and record it on the innermost enclosing loop level. */
+static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
+{
+	unsigned int level = ctx->bc->fc_sp;
+
+	/* walk outwards to the nearest FC_LOOP entry; index 0 is never accepted as a match */
+	while (level > 0 && ctx->bc->fc_stack[level].type != FC_LOOP)
+		level--;
+
+	if (level == 0) {
+		R600_ERR("Break not inside loop/endloop pair\n");
+		return -EINVAL;
+	}
+
+	r600_bc_add_cfinst(ctx->bc, ctx->inst_info->r600_opcode);
+	ctx->bc->cf_last->pop_count = 1;
+
+	/* tgsi_endloop() later points every mid entry of the loop at its LOOP_END */
+	fc_set_mid(ctx, level);
+	pops(ctx, 1);
+	/* check_max_only = 1: only the recorded maximum depth may grow here */
+	callstack_check_depth(ctx, FC_PUSH_VPM, 1);
+	return 0;
+}
+
 static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
-	{TGSI_OPCODE_ARL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
+	{TGSI_OPCODE_ARL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_arl},
 	{TGSI_OPCODE_MOV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
 	{TGSI_OPCODE_LIT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lit},
 	{TGSI_OPCODE_RCP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE, tgsi_trans_srcx_replicate},
 	{TGSI_OPCODE_RSQ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE, tgsi_trans_srcx_replicate},
-	{TGSI_OPCODE_EXP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
+	{TGSI_OPCODE_EXP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_exp},
 	{TGSI_OPCODE_LOG,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_MUL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL, tgsi_op2},
 	{TGSI_OPCODE_ADD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
 	{TGSI_OPCODE_DP3,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
 	{TGSI_OPCODE_DP4,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
-	{TGSI_OPCODE_DST,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
+	{TGSI_OPCODE_DST,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_opdst},
 	{TGSI_OPCODE_MIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN, tgsi_op2},
 	{TGSI_OPCODE_MAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX, tgsi_op2},
-	{TGSI_OPCODE_SLT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_slt},
-	{TGSI_OPCODE_SGE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
+	{TGSI_OPCODE_SLT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2_swap},
+	{TGSI_OPCODE_SGE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2},
 	{TGSI_OPCODE_MAD,	1, V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD, tgsi_op3},
 	{TGSI_OPCODE_SUB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
 	{TGSI_OPCODE_LRP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lrp},
@@ -1232,38 +2432,38 @@ static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
 	/* gap */
 	{22,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{23,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
-	{TGSI_OPCODE_FRC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
+	{TGSI_OPCODE_FRC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT, tgsi_op2},
 	{TGSI_OPCODE_CLAMP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
-	{TGSI_OPCODE_FLR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
+	{TGSI_OPCODE_FLR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR, tgsi_op2},
 	{TGSI_OPCODE_ROUND,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_EX2,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE, tgsi_trans_srcx_replicate},
-	{TGSI_OPCODE_LG2,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
-	{TGSI_OPCODE_POW,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
-	{TGSI_OPCODE_XPD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
+	{TGSI_OPCODE_LG2,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE, tgsi_trans_srcx_replicate},
+	{TGSI_OPCODE_POW,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_pow},
+	{TGSI_OPCODE_XPD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_xpd},
 	/* gap */
 	{32,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_ABS,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
 	{TGSI_OPCODE_RCC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
-	{TGSI_OPCODE_DPH,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
-	{TGSI_OPCODE_COS,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
-	{TGSI_OPCODE_DDX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
-	{TGSI_OPCODE_DDY,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
-	{TGSI_OPCODE_KILP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},  /* predicated kill */
+	{TGSI_OPCODE_DPH,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
+	{TGSI_OPCODE_COS,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS, tgsi_trig},
+	{TGSI_OPCODE_DDX,	0, SQ_TEX_INST_GET_GRADIENTS_H, tgsi_tex},
+	{TGSI_OPCODE_DDY,	0, SQ_TEX_INST_GET_GRADIENTS_V, tgsi_tex},
+	{TGSI_OPCODE_KILP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* predicated kill */
 	{TGSI_OPCODE_PK2H,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_PK2US,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_PK4B,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_PK4UB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_RFL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
-	{TGSI_OPCODE_SEQ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
+	{TGSI_OPCODE_SEQ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE, tgsi_op2},
 	{TGSI_OPCODE_SFL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
-	{TGSI_OPCODE_SGT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
-	{TGSI_OPCODE_SIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
-	{TGSI_OPCODE_SLE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
-	{TGSI_OPCODE_SNE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
+	{TGSI_OPCODE_SGT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2},
+	{TGSI_OPCODE_SIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN, tgsi_trig},
+	{TGSI_OPCODE_SLE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2_swap},
+	{TGSI_OPCODE_SNE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE, tgsi_op2},
 	{TGSI_OPCODE_STR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
-	{TGSI_OPCODE_TEX,	0, 0x10, tgsi_tex},
+	{TGSI_OPCODE_TEX,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
 	{TGSI_OPCODE_TXD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
-	{TGSI_OPCODE_TXP,	0, 0x10, tgsi_tex},
+	{TGSI_OPCODE_TXP,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
 	{TGSI_OPCODE_UP2H,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_UP2US,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_UP4B,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
@@ -1274,21 +2474,21 @@ static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
 	{TGSI_OPCODE_BRA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_CAL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_RET,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
-	{TGSI_OPCODE_SSG,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, /* SGN */
-	{TGSI_OPCODE_CMP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
-	{TGSI_OPCODE_SCS,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
-	{TGSI_OPCODE_TXB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
+	{TGSI_OPCODE_SSG,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_ssg},
+	{TGSI_OPCODE_CMP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_cmp},
+	{TGSI_OPCODE_SCS,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_scs},
+	{TGSI_OPCODE_TXB,	0, SQ_TEX_INST_SAMPLE_L, tgsi_tex},
 	{TGSI_OPCODE_NRM,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_DIV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_DP2,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
 	{TGSI_OPCODE_TXL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
-	{TGSI_OPCODE_BRK,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
-	{TGSI_OPCODE_IF,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
+	{TGSI_OPCODE_BRK,	0, V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK, tgsi_loop_brk_cont},
+	{TGSI_OPCODE_IF,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_if},
 	/* gap */
 	{75,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{76,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
-	{TGSI_OPCODE_ELSE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
-	{TGSI_OPCODE_ENDIF,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
+	{TGSI_OPCODE_ELSE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_else},
+	{TGSI_OPCODE_ENDIF,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endif},
 	/* gap */
 	{79,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{80,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
@@ -1297,7 +2497,7 @@ static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
 	{TGSI_OPCODE_CEIL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_I2F,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_NOT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
-	{TGSI_OPCODE_TRUNC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
+	{TGSI_OPCODE_TRUNC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_trans_srcx_replicate},
 	{TGSI_OPCODE_SHL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	/* gap */
 	{88,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
@@ -1308,12 +2508,12 @@ static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
 	{TGSI_OPCODE_SAD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_TXF,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_TXQ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
-	{TGSI_OPCODE_CONT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
+	{TGSI_OPCODE_CONT,	0, V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE, tgsi_loop_brk_cont},
 	{TGSI_OPCODE_EMIT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_ENDPRIM,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
-	{TGSI_OPCODE_BGNLOOP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
+	{TGSI_OPCODE_BGNLOOP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_bgnloop},
 	{TGSI_OPCODE_BGNSUB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
-	{TGSI_OPCODE_ENDLOOP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
+	{TGSI_OPCODE_ENDLOOP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endloop},
 	{TGSI_OPCODE_ENDSUB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	/* gap */
 	{103,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
diff --git a/src/gallium/drivers/r600/r600_shader.h b/src/gallium/drivers/r600/r600_shader.h
index 2ee7780ead..7c722c07cb 100644
--- a/src/gallium/drivers/r600/r600_shader.h
+++ b/src/gallium/drivers/r600/r600_shader.h
@@ -42,6 +42,7 @@ struct r600_shader {
 	struct r600_shader_io	input[32];
 	struct r600_shader_io	output[32];
 	enum radeon_family	family;
+	boolean                 uses_kill;
 };
 
 #endif
diff --git a/src/gallium/drivers/r600/r600_sq.h b/src/gallium/drivers/r600/r600_sq.h
index 002660c654..fa7a31742a 100644
--- a/src/gallium/drivers/r600/r600_sq.h
+++ b/src/gallium/drivers/r600/r600_sq.h
@@ -206,6 +206,26 @@
 #define   S_SQ_ALU_WORD0_SRC0_SEL(x)                                 (((x) & 0x1FF) << 0)
 #define   G_SQ_ALU_WORD0_SRC0_SEL(x)                                 (((x) >> 0) & 0x1FF)
 #define   C_SQ_ALU_WORD0_SRC0_SEL                                    0xFFFFFE00
+/*
+ * 244  ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
+ * 245  ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
+ * 246  ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
+ * 247  ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
+ * 248  SQ_ALU_SRC_0: special constant 0.0.
+ * 249  SQ_ALU_SRC_1: special constant 1.0 float.
+ * 250  SQ_ALU_SRC_1_INT: special constant 1 integer.
+ * 251  SQ_ALU_SRC_M_1_INT: special constant -1 integer.
+ * 252  SQ_ALU_SRC_0_5: special constant 0.5 float.
+ * 253  SQ_ALU_SRC_LITERAL: literal constant.
+ * 254  SQ_ALU_SRC_PV: previous vector result.
+ * 255  SQ_ALU_SRC_PS: previous scalar result.
+ */
+#define     V_SQ_ALU_SRC_0                                           0x000000F8
+#define     V_SQ_ALU_SRC_1                                           0x000000F9
+#define     V_SQ_ALU_SRC_1_INT                                       0x000000FA
+#define     V_SQ_ALU_SRC_M_1_INT                                     0x000000FB
+#define     V_SQ_ALU_SRC_0_5                                         0x000000FC
+#define     V_SQ_ALU_SRC_LITERAL                                     0x000000FD
 #define   S_SQ_ALU_WORD0_SRC0_REL(x)                                 (((x) & 0x1) << 9)
 #define   G_SQ_ALU_WORD0_SRC0_REL(x)                                 (((x) >> 9) & 0x1)
 #define   C_SQ_ALU_WORD0_SRC0_REL                                    0xFFFFFDFF
@@ -583,4 +603,11 @@
 #define   G_SQ_TEX_WORD2_SRC_SEL_W(x)                                (((x) >> 29) & 0x7)
 #define   C_SQ_TEX_WORD2_SRC_SEL_W                                   0x1FFFFFFF
 
+#define V_SQ_CF_COND_ACTIVE                             0x00
+#define V_SQ_CF_COND_FALSE                              0x01
+#define V_SQ_CF_COND_BOOL                               0x02
+#define V_SQ_CF_COND_NOT_BOOL                           0x03
+
+#define V_SQ_REL_ABSOLUTE 0
+#define V_SQ_REL_RELATIVE 1
 #endif
diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c
index 3efd409ae0..66cab7d7a6 100644
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -34,6 +34,17 @@
 #include "r600d.h"
 #include "r600_state_inlines.h"
 
+static void r600_blend(struct r600_context *rctx, struct radeon_state *rstate, const struct pipe_blend_state *state);
+static void r600_viewport(struct r600_context *rctx, struct radeon_state *rstate, const struct pipe_viewport_state *state);
+static void r600_ucp(struct r600_context *rctx, struct radeon_state *rstate, const struct pipe_clip_state *state);
+static void r600_sampler(struct r600_context *rctx, struct radeon_state *rstate, const struct pipe_sampler_state *state, unsigned id);
+static void r600_resource(struct pipe_context *ctx, struct radeon_state *rstate, const struct pipe_sampler_view *view, unsigned id);
+static void r600_cb(struct r600_context *rctx, struct radeon_state *rstate,
+			const struct pipe_framebuffer_state *state, int cb);
+static void r600_db(struct r600_context *rctx, struct radeon_state *rstate,
+			const struct pipe_framebuffer_state *state);
+
+
 static void *r600_create_blend_state(struct pipe_context *ctx,
 					const struct pipe_blend_state *state)
 {
@@ -81,11 +92,12 @@ static struct pipe_sampler_view *r600_create_sampler_view(struct pipe_context *c
 	struct r600_context *rctx = r600_context(ctx);
 	struct r600_context_state *rstate;
 
-	rstate = r600_context_state(rctx, pipe_sampler_type, state);
+	rstate = r600_context_state(rctx, pipe_sampler_view_type, state);
 	pipe_reference(NULL, &texture->reference);
 	rstate->state.sampler_view.texture = texture;
 	rstate->state.sampler_view.reference.count = 1;
 	rstate->state.sampler_view.context = ctx;
+	r600_resource(ctx, &rstate->rstate[0], &rstate->state.sampler_view, 0);
 	return &rstate->state.sampler_view;
 }
 
@@ -223,12 +235,24 @@ static void r600_bind_ps_sampler(struct pipe_context *ctx,
 	struct r600_context_state *rstate;
 	unsigned i;
 
-	for (i = 0; i < rctx->ps_nsampler; i++) {
-		rctx->ps_sampler[i] = r600_context_state_decref(rctx->ps_sampler[i]);
+	for (i = 0; i < count; i++) {
+		rstate = (struct r600_context_state *)states[i];
+		if (rstate) {
+			rstate->nrstate = 0;
+		}
 	}
 	for (i = 0; i < count; i++) {
 		rstate = (struct r600_context_state *)states[i];
-		rctx->ps_sampler[i] = r600_context_state_incref(rstate);
+		if (rstate) {
+			if (rstate->nrstate >= R600_MAX_RSTATE)
+				continue;
+			if (rstate->nrstate) {
+				memcpy(&rstate->rstate[rstate->nrstate], &rstate->rstate[0], sizeof(struct radeon_state));
+			}
+			radeon_state_convert(&rstate->rstate[rstate->nrstate], R600_STATE_SAMPLER, i, R600_SHADER_PS);
+			rctx->ps_sampler[i] = &rstate->rstate[rstate->nrstate];
+			rstate->nrstate++;
+		}
 	}
 	rctx->ps_nsampler = count;
 }
@@ -240,12 +264,24 @@ static void r600_bind_vs_sampler(struct pipe_context *ctx,
 	struct r600_context_state *rstate;
 	unsigned i;
 
-	for (i = 0; i < rctx->vs_nsampler; i++) {
-		rctx->vs_sampler[i] = r600_context_state_decref(rctx->vs_sampler[i]);
+	for (i = 0; i < count; i++) {
+		rstate = (struct r600_context_state *)states[i];
+		if (rstate) {
+			rstate->nrstate = 0;
+		}
 	}
 	for (i = 0; i < count; i++) {
 		rstate = (struct r600_context_state *)states[i];
-		rctx->vs_sampler[i] = r600_context_state_incref(rstate);
+		if (rstate) {
+			if (rstate->nrstate >= R600_MAX_RSTATE)
+				continue;
+			if (rstate->nrstate) {
+				memcpy(&rstate->rstate[rstate->nrstate], &rstate->rstate[0], sizeof(struct radeon_state));
+			}
+			radeon_state_convert(&rstate->rstate[rstate->nrstate], R600_STATE_SAMPLER, i, R600_SHADER_VS);
+			rctx->vs_sampler[i] = &rstate->rstate[rstate->nrstate];
+			rstate->nrstate++;
+		}
 	}
 	rctx->vs_nsampler = count;
 }
@@ -268,6 +304,13 @@ static void r600_set_blend_color(struct pipe_context *ctx,
 static void r600_set_clip_state(struct pipe_context *ctx,
 				const struct pipe_clip_state *state)
 {
+	struct r600_context *rctx = r600_context(ctx);
+	struct r600_context_state *rstate;
+
+	rstate = r600_context_state(rctx, pipe_clip_type, state);
+	r600_bind_state(ctx, rstate);
+	/* refcount is taken care of by this */
+	r600_delete_state(ctx, rstate);
 }
 
 static void r600_set_constant_buffer(struct pipe_context *ctx,
@@ -276,19 +319,21 @@ static void r600_set_constant_buffer(struct pipe_context *ctx,
 {
 	struct r600_screen *rscreen = r600_screen(ctx->screen);
 	struct r600_context *rctx = r600_context(ctx);
-	unsigned nconstant = 0, i, type, id;
-	struct radeon_state *rstate;
+	unsigned nconstant = 0, i, type, shader_class;
+	struct radeon_state *rstate, *rstates;
 	struct pipe_transfer *transfer;
 	u32 *ptr;
 
+	type = R600_STATE_CONSTANT;
+
 	switch (shader) {
 	case PIPE_SHADER_VERTEX:
-		id = R600_VS_CONSTANT;
-		type = R600_VS_CONSTANT_TYPE;
+		shader_class = R600_SHADER_VS;
+		rstates = rctx->vs_constant;
 		break;
 	case PIPE_SHADER_FRAGMENT:
-		id = R600_PS_CONSTANT;
-		type = R600_PS_CONSTANT_TYPE;
+		shader_class = R600_SHADER_PS;
+		rstates = rctx->ps_constant;
 		break;
 	default:
 		R600_ERR("unsupported %d\n", shader);
@@ -300,17 +345,15 @@ static void r600_set_constant_buffer(struct pipe_context *ctx,
 		if (ptr == NULL)
 			return;
 		for (i = 0; i < nconstant; i++) {
-			rstate = radeon_state(rscreen->rw, type, id + i);
-			if (rstate == NULL)
-				return;
+			rstate = &rstates[i];
+			radeon_state_init(rstate, rscreen->rw, type, i, shader_class);
 			rstate->states[R600_PS_CONSTANT__SQ_ALU_CONSTANT0_0] = ptr[i * 4 + 0];
 			rstate->states[R600_PS_CONSTANT__SQ_ALU_CONSTANT1_0] = ptr[i * 4 + 1];
 			rstate->states[R600_PS_CONSTANT__SQ_ALU_CONSTANT2_0] = ptr[i * 4 + 2];
 			rstate->states[R600_PS_CONSTANT__SQ_ALU_CONSTANT3_0] = ptr[i * 4 + 3];
 			if (radeon_state_pm4(rstate))
 				return;
-			if (radeon_draw_set_new(rctx->draw, rstate))
-				return;
+			radeon_draw_bind(&rctx->draw, rstate);
 		}
 		pipe_buffer_unmap(ctx, buffer, transfer);
 	}
@@ -324,12 +367,24 @@ static void r600_set_ps_sampler_view(struct pipe_context *ctx,
 	struct r600_context_state *rstate;
 	unsigned i;
 
-	for (i = 0; i < rctx->ps_nsampler_view; i++) {
-		rctx->ps_sampler_view[i] = r600_context_state_decref(rctx->ps_sampler_view[i]);
+	for (i = 0; i < count; i++) {
+		rstate = (struct r600_context_state *)views[i];
+		if (rstate) {
+			rstate->nrstate = 0;
+		}
 	}
 	for (i = 0; i < count; i++) {
 		rstate = (struct r600_context_state *)views[i];
-		rctx->ps_sampler_view[i] = r600_context_state_incref(rstate);
+		if (rstate) {
+			if (rstate->nrstate >= R600_MAX_RSTATE)
+				continue;
+			if (rstate->nrstate) {
+				memcpy(&rstate->rstate[rstate->nrstate], &rstate->rstate[0], sizeof(struct radeon_state));
+			}
+			radeon_state_convert(&rstate->rstate[rstate->nrstate], R600_STATE_RESOURCE, i, R600_SHADER_PS);
+			rctx->ps_sampler_view[i] = &rstate->rstate[rstate->nrstate];
+			rstate->nrstate++;
+		}
 	}
 	rctx->ps_nsampler_view = count;
 }
@@ -342,12 +397,24 @@ static void r600_set_vs_sampler_view(struct pipe_context *ctx,
 	struct r600_context_state *rstate;
 	unsigned i;
 
-	for (i = 0; i < rctx->vs_nsampler_view; i++) {
-		rctx->vs_sampler_view[i] = r600_context_state_decref(rctx->vs_sampler_view[i]);
+	for (i = 0; i < count; i++) {
+		rstate = (struct r600_context_state *)views[i];
+		if (rstate) {
+			rstate->nrstate = 0;
+		}
 	}
 	for (i = 0; i < count; i++) {
 		rstate = (struct r600_context_state *)views[i];
-		rctx->vs_sampler_view[i] = r600_context_state_incref(rstate);
+		if (rstate) {
+			if (rstate->nrstate >= R600_MAX_RSTATE)
+				continue;
+			if (rstate->nrstate) {
+				memcpy(&rstate->rstate[rstate->nrstate], &rstate->rstate[0], sizeof(struct radeon_state));
+			}
+			radeon_state_convert(&rstate->rstate[rstate->nrstate], R600_STATE_RESOURCE, i, R600_SHADER_VS);
+			rctx->vs_sampler_view[i] = &rstate->rstate[rstate->nrstate];
+			rstate->nrstate++;
+		}
 	}
 	rctx->vs_nsampler_view = count;
 }
@@ -360,6 +427,12 @@ static void r600_set_framebuffer_state(struct pipe_context *ctx,
 
 	rstate = r600_context_state(rctx, pipe_framebuffer_type, state);
 	r600_bind_state(ctx, rstate);
+	for (int i = 0; i < state->nr_cbufs; i++) {
+		r600_cb(rctx, &rstate->rstate[i+1], state, i);
+	}
+	if (state->zsbuf) {
+		r600_db(rctx, &rstate->rstate[0], state);
+	}
 }
 
 static void r600_set_polygon_stipple(struct pipe_context *ctx,
@@ -525,7 +598,7 @@ struct r600_context_state *r600_context_state_decref(struct r600_context_state *
 		R600_ERR("invalid type %d\n", rstate->type);
 		return NULL;
 	}
-	radeon_state_decref(rstate->rstate);
+	radeon_state_fini(&rstate->rstate[0]);
 	FREE(rstate);
 	return NULL;
 }
@@ -558,6 +631,7 @@ struct r600_context_state *r600_context_state(struct r600_context *rctx, unsigne
 		break;
 	case pipe_viewport_type:
 		rstate->state.viewport = (*states).viewport;
+		r600_viewport(rctx, &rstate->rstate[0], &rstate->state.viewport);
 		break;
 	case pipe_depth_type:
 		rstate->state.depth = (*states).depth;
@@ -573,6 +647,7 @@ struct r600_context_state *r600_context_state(struct r600_context *rctx, unsigne
 		break;
 	case pipe_clip_type:
 		rstate->state.clip = (*states).clip;
+		r600_ucp(rctx, &rstate->rstate[0], &rstate->state.clip);
 		break;
 	case pipe_stencil_type:
 		rstate->state.stencil = (*states).stencil;
@@ -585,6 +660,7 @@ struct r600_context_state *r600_context_state(struct r600_context *rctx, unsigne
 		break;
 	case pipe_blend_type:
 		rstate->state.blend = (*states).blend;
+		r600_blend(rctx, &rstate->rstate[0], &rstate->state.blend);
 		break;
 	case pipe_stencil_ref_type:
 		rstate->state.stencil_ref = (*states).stencil_ref;
@@ -599,6 +675,7 @@ struct r600_context_state *r600_context_state(struct r600_context *rctx, unsigne
 		break;
 	case pipe_sampler_type:
 		rstate->state.sampler = (*states).sampler;
+		r600_sampler(rctx, &rstate->rstate[0], &rstate->state.sampler, 0);
 		break;
 	default:
 		R600_ERR("invalid type %d\n", rstate->type);
@@ -608,16 +685,12 @@ struct r600_context_state *r600_context_state(struct r600_context *rctx, unsigne
 	return rstate;
 }
 
-static struct radeon_state *r600_blend(struct r600_context *rctx)
+static void r600_blend(struct r600_context *rctx, struct radeon_state *rstate, const struct pipe_blend_state *state)
 {
 	struct r600_screen *rscreen = rctx->screen;
-	struct radeon_state *rstate;
-	const struct pipe_blend_state *state = &rctx->blend->state.blend;
 	int i;
 
-	rstate = radeon_state(rscreen->rw, R600_BLEND_TYPE, R600_BLEND);
-	if (rstate == NULL)
-		return NULL;
+	radeon_state_init(rstate, rscreen->rw, R600_STATE_BLEND, 0, 0);
 	rstate->states[R600_BLEND__CB_BLEND_RED] = fui(rctx->blend_color.color[0]);
 	rstate->states[R600_BLEND__CB_BLEND_GREEN] = fui(rctx->blend_color.color[1]);
 	rstate->states[R600_BLEND__CB_BLEND_BLUE] = fui(rctx->blend_color.color[2]);
@@ -661,29 +734,38 @@ static struct radeon_state *r600_blend(struct r600_context *rctx)
 			rstate->states[R600_BLEND__CB_BLEND_CONTROL] = bc;
 	}
 
-	if (radeon_state_pm4(rstate)) {
-		radeon_state_decref(rstate);
-		return NULL;
+	radeon_state_pm4(rstate);
+}
+
+static void r600_ucp(struct r600_context *rctx, struct radeon_state *rstate,
+			const struct pipe_clip_state *state)
+{
+	struct r600_screen *rscreen = rctx->screen;
+
+	radeon_state_init(rstate, rscreen->rw, R600_STATE_UCP, 0, 0);
+
+	for (int i = 0; i < state->nr; i++) {
+		rstate->states[i * 4 + 0] = fui(state->ucp[i][0]);
+		rstate->states[i * 4 + 1] = fui(state->ucp[i][1]);
+		rstate->states[i * 4 + 2] = fui(state->ucp[i][2]);
+		rstate->states[i * 4 + 3] = fui(state->ucp[i][3]);
 	}
-	return rstate;
+	radeon_state_pm4(rstate);
 }
 
-static struct radeon_state *r600_cb(struct r600_context *rctx, int cb)
+static void r600_cb(struct r600_context *rctx, struct radeon_state *rstate,
+			const struct pipe_framebuffer_state *state, int cb)
 {
 	struct r600_screen *rscreen = rctx->screen;
 	struct r600_resource_texture *rtex;
 	struct r600_resource *rbuffer;
-	struct radeon_state *rstate;
-	const struct pipe_framebuffer_state *state = &rctx->framebuffer->state.framebuffer;
 	unsigned level = state->cbufs[cb]->level;
 	unsigned pitch, slice;
 	unsigned color_info;
 	unsigned format, swap, ntype;
 	const struct util_format_description *desc;
 
-	rstate = radeon_state(rscreen->rw, R600_CB0_TYPE + cb, R600_CB0 + cb);
-	if (rstate == NULL)
-		return NULL;
+	radeon_state_init(rstate, rscreen->rw, R600_STATE_CB0 + cb, 0, 0);
 	rtex = (struct r600_resource_texture*)state->cbufs[cb]->texture;
 	rbuffer = &rtex->resource;
 	rstate->bo[0] = radeon_bo_incref(rscreen->rw, rbuffer->bo);
@@ -710,7 +792,7 @@ static struct radeon_state *r600_cb(struct r600_context *rctx, int cb)
 		S_0280A0_SOURCE_FORMAT(1) |
 		S_0280A0_NUMBER_TYPE(ntype);
 
-	rstate->states[R600_CB0__CB_COLOR0_BASE] = 0x00000000;
+	rstate->states[R600_CB0__CB_COLOR0_BASE] = state->cbufs[cb]->offset >> 8;
 	rstate->states[R600_CB0__CB_COLOR0_INFO] = color_info;
 	rstate->states[R600_CB0__CB_COLOR0_SIZE] = S_028060_PITCH_TILE_MAX(pitch) |
 						S_028060_SLICE_TILE_MAX(slice);
@@ -718,32 +800,29 @@ static struct radeon_state *r600_cb(struct r600_context *rctx, int cb)
 	rstate->states[R600_CB0__CB_COLOR0_FRAG] = 0x00000000;
 	rstate->states[R600_CB0__CB_COLOR0_TILE] = 0x00000000;
 	rstate->states[R600_CB0__CB_COLOR0_MASK] = 0x00000000;
-	if (radeon_state_pm4(rstate)) {
-		radeon_state_decref(rstate);
-		return NULL;
-	}
-	return rstate;
+	radeon_state_pm4(rstate);
 }
 
-static struct radeon_state *r600_db(struct r600_context *rctx)
+static void r600_db(struct r600_context *rctx, struct radeon_state *rstate,
+			const struct pipe_framebuffer_state *state)
 {
 	struct r600_screen *rscreen = rctx->screen;
 	struct r600_resource_texture *rtex;
 	struct r600_resource *rbuffer;
-	struct radeon_state *rstate;
-	const struct pipe_framebuffer_state *state = &rctx->framebuffer->state.framebuffer;
 	unsigned level;
 	unsigned pitch, slice, format;
 
+	radeon_state_init(rstate, rscreen->rw, R600_STATE_DB, 0, 0);
 	if (state->zsbuf == NULL)
-		return NULL;
-
-	rstate = radeon_state(rscreen->rw, R600_DB_TYPE, R600_DB);
-	if (rstate == NULL)
-		return NULL;
+		return;
 
 	rtex = (struct r600_resource_texture*)state->zsbuf->texture;
+	rtex->tilled = 1;
+	rtex->array_mode = 2;
+	rtex->tile_type = 1;
+	rtex->depth = 1;
 	rbuffer = &rtex->resource;
+
 	rstate->bo[0] = radeon_bo_incref(rscreen->rw, rbuffer->bo);
 	rstate->nbo = 1;
 	rstate->placement[0] = RADEON_GEM_DOMAIN_VRAM;
@@ -751,31 +830,30 @@ static struct radeon_state *r600_db(struct r600_context *rctx)
 	pitch = (rtex->pitch[level] / rtex->bpt) / 8 - 1;
 	slice = (rtex->pitch[level] / rtex->bpt) * state->zsbuf->height / 64 - 1;
 	format = r600_translate_dbformat(state->zsbuf->texture->format);
-	rstate->states[R600_DB__DB_DEPTH_BASE] = 0x00000000;
-	rstate->states[R600_DB__DB_DEPTH_INFO] = 0x00010000 |
+	rstate->states[R600_DB__DB_DEPTH_BASE] = state->zsbuf->offset >> 8;
+	rstate->states[R600_DB__DB_DEPTH_INFO] = S_028010_ARRAY_MODE(rtex->array_mode) |
 					S_028010_FORMAT(format);
 	rstate->states[R600_DB__DB_DEPTH_VIEW] = 0x00000000;
 	rstate->states[R600_DB__DB_PREFETCH_LIMIT] = (state->zsbuf->height / 8) -1;
 	rstate->states[R600_DB__DB_DEPTH_SIZE] = S_028000_PITCH_TILE_MAX(pitch) |
 						S_028000_SLICE_TILE_MAX(slice);
-	if (radeon_state_pm4(rstate)) {
-		radeon_state_decref(rstate);
-		return NULL;
-	}
-	return rstate;
+	radeon_state_pm4(rstate);
 }
 
-static struct radeon_state *r600_rasterizer(struct r600_context *rctx)
+static void r600_rasterizer(struct r600_context *rctx, struct radeon_state *rstate)
 {
 	const struct pipe_rasterizer_state *state = &rctx->rasterizer->state.rasterizer;
 	const struct pipe_framebuffer_state *fb = &rctx->framebuffer->state.framebuffer;
+	const struct pipe_clip_state *clip = NULL;
 	struct r600_screen *rscreen = rctx->screen;
-	struct radeon_state *rstate;
 	float offset_units = 0, offset_scale = 0;
 	char depth = 0;
 	unsigned offset_db_fmt_cntl = 0;
 	unsigned tmp;
 	unsigned prov_vtx = 1;
+
+	if (rctx->clip)
+		clip = &rctx->clip->state.clip;
 	if (fb->zsbuf) {
 		offset_units = state->offset_units;
 		offset_scale = state->offset_scale * 12.0f;
@@ -796,7 +874,7 @@ static struct radeon_state *r600_rasterizer(struct r600_context *rctx)
 			break;
 		default:
 			R600_ERR("unsupported %d\n", fb->zsbuf->texture->format);
-			return NULL;
+			return;
 		}
 	}
 	offset_db_fmt_cntl |= S_028DF8_POLY_OFFSET_NEG_NUM_DB_BITS(depth);
@@ -805,9 +883,7 @@ static struct radeon_state *r600_rasterizer(struct r600_context *rctx)
 		prov_vtx = 0;
 
 	rctx->flat_shade = state->flatshade;
-	rstate = radeon_state(rscreen->rw, R600_RASTERIZER_TYPE, R600_RASTERIZER);
-	if (rstate == NULL)
-		return NULL;
+	radeon_state_init(rstate, rscreen->rw, R600_STATE_RASTERIZER, 0, 0);
 	rstate->states[R600_RASTERIZER__SPI_INTERP_CONTROL_0] = 0x00000001;
 	if (state->sprite_coord_enable) {
 		rstate->states[R600_RASTERIZER__SPI_INTERP_CONTROL_0] |=
@@ -821,7 +897,12 @@ static struct radeon_state *r600_rasterizer(struct r600_context *rctx)
 					S_0286D4_PNT_SPRITE_TOP_1(1);
 		}
 	}
-	rstate->states[R600_RASTERIZER__PA_CL_CLIP_CNTL] = 0x00000000;
+	rstate->states[R600_RASTERIZER__PA_CL_CLIP_CNTL] = 0;
+	if (clip) {
+		rstate->states[R600_RASTERIZER__PA_CL_CLIP_CNTL] = S_028810_PS_UCP_MODE(3) | ((1 << clip->nr) - 1);
+		rstate->states[R600_RASTERIZER__PA_CL_CLIP_CNTL] |= S_028810_ZCLIP_NEAR_DISABLE(clip->depth_clamp);
+		rstate->states[R600_RASTERIZER__PA_CL_CLIP_CNTL] |= S_028810_ZCLIP_FAR_DISABLE(clip->depth_clamp);
+	}
 	rstate->states[R600_RASTERIZER__PA_SU_SC_MODE_CNTL] =
 		S_028814_PROVOKING_VTX_LAST(prov_vtx) |
 		S_028814_CULL_FRONT((state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) |
@@ -835,7 +916,7 @@ static struct radeon_state *r600_rasterizer(struct r600_context *rctx)
 			S_02881C_VS_OUT_MISC_VEC_ENA(state->point_size_per_vertex);
 	rstate->states[R600_RASTERIZER__PA_CL_NANINF_CNTL] = 0x00000000;
 	/* point size 12.4 fixed point */
-	tmp = (unsigned)(state->point_size * 8.0 / 2.0);
+	tmp = (unsigned)(state->point_size * 8.0);
 	rstate->states[R600_RASTERIZER__PA_SU_POINT_SIZE] = S_028A00_HEIGHT(tmp) | S_028A00_WIDTH(tmp);
 	rstate->states[R600_RASTERIZER__PA_SU_POINT_MINMAX] = 0x80000000;
 	rstate->states[R600_RASTERIZER__PA_SU_LINE_CNTL] = 0x00000008;
@@ -852,19 +933,14 @@ static struct radeon_state *r600_rasterizer(struct r600_context *rctx)
 	rstate->states[R600_RASTERIZER__PA_SU_POLY_OFFSET_FRONT_OFFSET] = fui(offset_units);
 	rstate->states[R600_RASTERIZER__PA_SU_POLY_OFFSET_BACK_SCALE] = fui(offset_scale);
 	rstate->states[R600_RASTERIZER__PA_SU_POLY_OFFSET_BACK_OFFSET] = fui(offset_units);
-	if (radeon_state_pm4(rstate)) {
-		radeon_state_decref(rstate);
-		return NULL;
-	}
-	return rstate;
+	radeon_state_pm4(rstate);
 }
 
-static struct radeon_state *r600_scissor(struct r600_context *rctx)
+static void r600_scissor(struct r600_context *rctx, struct radeon_state *rstate)
 {
 	const struct pipe_scissor_state *state = &rctx->scissor->state.scissor;
 	const struct pipe_framebuffer_state *fb = &rctx->framebuffer->state.framebuffer;
 	struct r600_screen *rscreen = rctx->screen;
-	struct radeon_state *rstate;
 	unsigned minx, maxx, miny, maxy;
 	u32 tl, br;
 
@@ -881,9 +957,7 @@ static struct radeon_state *r600_scissor(struct r600_context *rctx)
 	}
 	tl = S_028240_TL_X(minx) | S_028240_TL_Y(miny) | S_028240_WINDOW_OFFSET_DISABLE(1);
 	br = S_028244_BR_X(maxx) | S_028244_BR_Y(maxy);
-	rstate = radeon_state(rscreen->rw, R600_SCISSOR_TYPE, R600_SCISSOR);
-	if (rstate == NULL)
-		return NULL;
+	radeon_state_init(rstate, rscreen->rw, R600_STATE_SCISSOR, 0, 0);
 	rstate->states[R600_SCISSOR__PA_SC_SCREEN_SCISSOR_TL] = tl;
 	rstate->states[R600_SCISSOR__PA_SC_SCREEN_SCISSOR_BR] = br;
 	rstate->states[R600_SCISSOR__PA_SC_WINDOW_OFFSET] = 0x00000000;
@@ -903,22 +977,14 @@ static struct radeon_state *r600_scissor(struct r600_context *rctx)
 	rstate->states[R600_SCISSOR__PA_SC_GENERIC_SCISSOR_BR] = br;
 	rstate->states[R600_SCISSOR__PA_SC_VPORT_SCISSOR_0_TL] = tl;
 	rstate->states[R600_SCISSOR__PA_SC_VPORT_SCISSOR_0_BR] = br;
-	if (radeon_state_pm4(rstate)) {
-		radeon_state_decref(rstate);
-		return NULL;
-	}
-	return rstate;
+	radeon_state_pm4(rstate);
 }
 
-static struct radeon_state *r600_viewport(struct r600_context *rctx)
+static void r600_viewport(struct r600_context *rctx, struct radeon_state *rstate, const struct pipe_viewport_state *state)
 {
-	const struct pipe_viewport_state *state = &rctx->viewport->state.viewport;
 	struct r600_screen *rscreen = rctx->screen;
-	struct radeon_state *rstate;
 
-	rstate = radeon_state(rscreen->rw, R600_VIEWPORT_TYPE, R600_VIEWPORT);
-	if (rstate == NULL)
-		return NULL;
+	radeon_state_init(rstate, rscreen->rw, R600_STATE_VIEWPORT, 0, 0);
 	rstate->states[R600_VIEWPORT__PA_SC_VPORT_ZMIN_0] = 0x00000000;
 	rstate->states[R600_VIEWPORT__PA_SC_VPORT_ZMAX_0] = 0x3F800000;
 	rstate->states[R600_VIEWPORT__PA_CL_VPORT_XSCALE_0] = fui(state->scale[0]);
@@ -928,29 +994,28 @@ static struct radeon_state *r600_viewport(struct r600_context *rctx)
 	rstate->states[R600_VIEWPORT__PA_CL_VPORT_YOFFSET_0] = fui(state->translate[1]);
 	rstate->states[R600_VIEWPORT__PA_CL_VPORT_ZOFFSET_0] = fui(state->translate[2]);
 	rstate->states[R600_VIEWPORT__PA_CL_VTE_CNTL] = 0x0000043F;
-	if (radeon_state_pm4(rstate)) {
-		radeon_state_decref(rstate);
-		return NULL;
-	}
-	return rstate;
+	radeon_state_pm4(rstate);
 }
 
-static struct radeon_state *r600_dsa(struct r600_context *rctx)
+static void r600_dsa(struct r600_context *rctx, struct radeon_state *rstate)
 {
 	const struct pipe_depth_stencil_alpha_state *state = &rctx->dsa->state.dsa;
 	const struct pipe_stencil_ref *stencil_ref = &rctx->stencil_ref->state.stencil_ref;
 	struct r600_screen *rscreen = rctx->screen;
 	unsigned db_depth_control, alpha_test_control, alpha_ref, db_shader_control;
 	unsigned stencil_ref_mask, stencil_ref_mask_bf;
-	struct r600_shader *rshader = &rctx->ps_shader->shader;
-	struct radeon_state *rstate;
+	struct r600_shader *rshader;
 	int i;
 
-	rstate = radeon_state(rscreen->rw, R600_DSA_TYPE, R600_DSA);
-	if (rstate == NULL)
-		return NULL;
+	if (rctx->ps_shader == NULL) {
+		return;
+	}
+	radeon_state_init(rstate, rscreen->rw, R600_STATE_DSA, 0, 0);
 
 	db_shader_control = 0x210;
+	rshader = &rctx->ps_shader->shader;
+	if (rshader->uses_kill)
+		db_shader_control |= (1 << 6);
 	for (i = 0; i < rshader->noutput; i++) {
 		if (rshader->output[i].name == TGSI_SEMANTIC_POSITION)
 			db_shader_control |= 1;
@@ -1008,11 +1073,7 @@ static struct radeon_state *r600_dsa(struct r600_context *rctx)
 	rstate->states[R600_DSA__DB_SRESULTS_COMPARE_STATE1] = 0x00000000;
 	rstate->states[R600_DSA__DB_PRELOAD_CONTROL] = 0x00000000;
 	rstate->states[R600_DSA__DB_ALPHA_TO_MASK] = 0x0000AA00;
-	if (radeon_state_pm4(rstate)) {
-		radeon_state_decref(rstate);
-		return NULL;
-	}
-	return rstate;
+	radeon_state_pm4(rstate);
 }
 
 static inline unsigned r600_tex_wrap(unsigned wrap)
@@ -1090,16 +1151,12 @@ static INLINE u32 S_FIXED(float value, u32 frac_bits)
 	return value * (1 << frac_bits);
 }
 
-static struct radeon_state *r600_sampler(struct r600_context *rctx,
-				const struct pipe_sampler_state *state,
-				unsigned id)
+static void r600_sampler(struct r600_context *rctx, struct radeon_state *rstate,
+			const struct pipe_sampler_state *state, unsigned id)
 {
 	struct r600_screen *rscreen = rctx->screen;
-	struct radeon_state *rstate;
 
-	rstate = radeon_state(rscreen->rw, R600_PS_SAMPLER_TYPE, id);
-	if (rstate == NULL)
-		return NULL;
+	radeon_state_init(rstate, rscreen->rw, R600_STATE_SAMPLER, id, R600_SHADER_PS);
 	rstate->states[R600_PS_SAMPLER__SQ_TEX_SAMPLER_WORD0_0] =
 			S_03C000_CLAMP_X(r600_tex_wrap(state->wrap_s)) |
 			S_03C000_CLAMP_Y(r600_tex_wrap(state->wrap_t)) |
@@ -1114,11 +1171,7 @@ static struct radeon_state *r600_sampler(struct r600_context *rctx,
 			S_03C004_MAX_LOD(S_FIXED(CLAMP(state->max_lod, 0, 15), 6)) |
 			S_03C004_LOD_BIAS(S_FIXED(CLAMP(state->lod_bias, -16, 16), 6));
 	rstate->states[R600_PS_SAMPLER__SQ_TEX_SAMPLER_WORD2_0] = S_03C008_TYPE(1);
-	if (radeon_state_pm4(rstate)) {
-		radeon_state_decref(rstate);
-		return NULL;
-	}
-	return rstate;
+	radeon_state_pm4(rstate);
 }
 
 static inline unsigned r600_tex_swizzle(unsigned swizzle)
@@ -1160,6 +1213,7 @@ static inline unsigned r600_tex_dim(unsigned dim)
 	case PIPE_TEXTURE_1D:
 		return V_038000_SQ_TEX_DIM_1D;
 	case PIPE_TEXTURE_2D:
+	case PIPE_TEXTURE_RECT:
 		return V_038000_SQ_TEX_DIM_2D;
 	case PIPE_TEXTURE_3D:
 		return V_038000_SQ_TEX_DIM_3D;
@@ -1168,19 +1222,20 @@ static inline unsigned r600_tex_dim(unsigned dim)
 	}
 }
 
-static struct radeon_state *r600_resource(struct r600_context *rctx,
-					const struct pipe_sampler_view *view,
-					unsigned id)
+static void r600_resource(struct pipe_context *ctx, struct radeon_state *rstate,
+			const struct pipe_sampler_view *view, unsigned id)
 {
+	struct r600_context *rctx = r600_context(ctx);
 	struct r600_screen *rscreen = rctx->screen;
 	const struct util_format_description *desc;
 	struct r600_resource_texture *tmp;
 	struct r600_resource *rbuffer;
-	struct radeon_state *rstate;
 	unsigned format;
-	uint32_t word4 = 0, yuv_format = 0;
-	unsigned char swizzle[4];
+	uint32_t word4 = 0, yuv_format = 0, pitch = 0;
+	unsigned char swizzle[4], array_mode = 0, tile_type = 0;
+	int r;
 
+	rstate->cpm4 = 0;
 	swizzle[0] = view->swizzle_r;
 	swizzle[1] = view->swizzle_g;
 	swizzle[2] = view->swizzle_b;
@@ -1188,37 +1243,49 @@ static struct radeon_state *r600_resource(struct r600_context *rctx,
 	format = r600_translate_texformat(view->texture->format,
 					  swizzle,
 					  &word4, &yuv_format);
-	if (format == ~0)
-		return NULL;
+	if (format == ~0) {
+		return;
+	}
 	desc = util_format_description(view->texture->format);
 	if (desc == NULL) {
 		R600_ERR("unknow format %d\n", view->texture->format);
-		return NULL;
-	}
-	rstate = radeon_state(rscreen->rw, R600_PS_RESOURCE_TYPE, id);
-	if (rstate == NULL) {
-		return NULL;
+		return;
 	}
+	radeon_state_init(rstate, rscreen->rw, R600_STATE_RESOURCE, id, R600_SHADER_PS);
 	tmp = (struct r600_resource_texture*)view->texture;
 	rbuffer = &tmp->resource;
-	rstate->bo[0] = radeon_bo_incref(rscreen->rw, rbuffer->bo);
-	rstate->bo[1] = radeon_bo_incref(rscreen->rw, rbuffer->bo);
+	if (tmp->depth) {
+		r = r600_texture_from_depth(ctx, tmp, view->first_level);
+		if (r) {
+			return;
+		}
+		rstate->bo[0] = radeon_bo_incref(rscreen->rw, tmp->uncompressed);
+		rstate->bo[1] = radeon_bo_incref(rscreen->rw, tmp->uncompressed);
+	} else {
+		rstate->bo[0] = radeon_bo_incref(rscreen->rw, rbuffer->bo);
+		rstate->bo[1] = radeon_bo_incref(rscreen->rw, rbuffer->bo);
+	}
 	rstate->nbo = 2;
 	rstate->placement[0] = RADEON_GEM_DOMAIN_GTT;
 	rstate->placement[1] = RADEON_GEM_DOMAIN_GTT;
 	rstate->placement[2] = RADEON_GEM_DOMAIN_GTT;
 	rstate->placement[3] = RADEON_GEM_DOMAIN_GTT;
 
+	pitch = (tmp->pitch[0] / tmp->bpt);
+	pitch = (pitch + 0x7) & ~0x7;
+
 	/* FIXME properly handle first level != 0 */
 	rstate->states[R600_PS_RESOURCE__RESOURCE0_WORD0] =
 			S_038000_DIM(r600_tex_dim(view->texture->target)) |
-			S_038000_PITCH(((tmp->pitch[0] / tmp->bpt) / 8) - 1) |
+			S_038000_TILE_MODE(array_mode) |
+			S_038000_TILE_TYPE(tile_type) |
+			S_038000_PITCH((pitch / 8) - 1) |
 			S_038000_TEX_WIDTH(view->texture->width0 - 1);
 	rstate->states[R600_PS_RESOURCE__RESOURCE0_WORD1] =
 			S_038004_TEX_HEIGHT(view->texture->height0 - 1) |
 			S_038004_TEX_DEPTH(view->texture->depth0 - 1) |
 			S_038004_DATA_FORMAT(format);
-	rstate->states[R600_PS_RESOURCE__RESOURCE0_WORD2] = 0;
+	rstate->states[R600_PS_RESOURCE__RESOURCE0_WORD2] = tmp->offset[0] >> 8;
 	rstate->states[R600_PS_RESOURCE__RESOURCE0_WORD3] = tmp->offset[1] >> 8;
 	rstate->states[R600_PS_RESOURCE__RESOURCE0_WORD4] =
 		        word4 | 
@@ -1232,17 +1299,12 @@ static struct radeon_state *r600_resource(struct r600_context *rctx,
 			S_038014_LAST_ARRAY(0);
 	rstate->states[R600_PS_RESOURCE__RESOURCE0_WORD6] =
 			S_038018_TYPE(V_038010_SQ_TEX_VTX_VALID_TEXTURE);
-	if (radeon_state_pm4(rstate)) {
-		radeon_state_decref(rstate);
-		return NULL;
-	}
-	return rstate;
+	radeon_state_pm4(rstate);
 }
 
-static struct radeon_state *r600_cb_cntl(struct r600_context *rctx)
+static void r600_cb_cntl(struct r600_context *rctx, struct radeon_state *rstate)
 {
 	struct r600_screen *rscreen = rctx->screen;
-	struct radeon_state *rstate;
 	const struct pipe_blend_state *pbs = &rctx->blend->state.blend;
 	int nr_cbufs = rctx->framebuffer->state.framebuffer.nr_cbufs;
 	uint32_t color_control, target_mask, shader_mask;
@@ -1257,7 +1319,7 @@ static struct radeon_state *r600_cb_cntl(struct r600_context *rctx)
 	}
 
 	if (pbs->logicop_enable) {
-		color_control |= (pbs->logicop_func) << 16;
+		color_control |= (pbs->logicop_func << 16) | (pbs->logicop_func << 20);
 	} else {
 		color_control |= (0xcc << 16);
 	}
@@ -1277,7 +1339,7 @@ static struct radeon_state *r600_cb_cntl(struct r600_context *rctx)
 			target_mask |= (pbs->rt[0].colormask << (4 * i));
 		}
 	}
-	rstate = radeon_state(rscreen->rw, R600_CB_CNTL_TYPE, R600_CB_CNTL);
+	radeon_state_init(rstate, rscreen->rw, R600_STATE_CB_CNTL, 0, 0);
 	rstate->states[R600_CB_CNTL__CB_SHADER_MASK] = shader_mask;
 	rstate->states[R600_CB_CNTL__CB_TARGET_MASK] = target_mask;
 	rstate->states[R600_CB_CNTL__CB_COLOR_CONTROL] = color_control;
@@ -1289,115 +1351,51 @@ static struct radeon_state *r600_cb_cntl(struct r600_context *rctx)
 	rstate->states[R600_CB_CNTL__CB_CLRCMP_DST] = 0x000000FF;
 	rstate->states[R600_CB_CNTL__CB_CLRCMP_MSK] = 0xFFFFFFFF;
 	rstate->states[R600_CB_CNTL__PA_SC_AA_MASK] = 0xFFFFFFFF;
-	if (radeon_state_pm4(rstate)) {
-		radeon_state_decref(rstate);
-		return NULL;
-	}
-	return rstate;
+	radeon_state_pm4(rstate);
 }
 
-int r600_context_hw_states(struct r600_context *rctx)
+int r600_context_hw_states(struct pipe_context *ctx)
 {
+	struct r600_context *rctx = r600_context(ctx);
 	unsigned i;
-	int r;
-	int nr_cbufs = rctx->framebuffer->state.framebuffer.nr_cbufs;
 
-	/* free previous TODO determine what need to be updated, what
-	 * doesn't
-	 */
-	//radeon_state_decref(rctx->hw_states.config);
-	rctx->hw_states.cb_cntl = radeon_state_decref(rctx->hw_states.cb_cntl);
-	rctx->hw_states.db = radeon_state_decref(rctx->hw_states.db);
-	rctx->hw_states.rasterizer = radeon_state_decref(rctx->hw_states.rasterizer);
-	rctx->hw_states.scissor = radeon_state_decref(rctx->hw_states.scissor);
-	rctx->hw_states.dsa = radeon_state_decref(rctx->hw_states.dsa);
-	rctx->hw_states.blend = radeon_state_decref(rctx->hw_states.blend);
-	rctx->hw_states.viewport = radeon_state_decref(rctx->hw_states.viewport);
-	for (i = 0; i < 8; i++) {
-		rctx->hw_states.cb[i] = radeon_state_decref(rctx->hw_states.cb[i]);
+	/* build new states */
+	r600_rasterizer(rctx, &rctx->hw_states.rasterizer);
+	r600_scissor(rctx, &rctx->hw_states.scissor);
+	r600_dsa(rctx, &rctx->hw_states.dsa);
+	r600_cb_cntl(rctx, &rctx->hw_states.cb_cntl);
+
+	/* bind states */
+	radeon_draw_bind(&rctx->draw, &rctx->hw_states.rasterizer);
+	radeon_draw_bind(&rctx->draw, &rctx->hw_states.scissor);
+	radeon_draw_bind(&rctx->draw, &rctx->hw_states.dsa);
+	radeon_draw_bind(&rctx->draw, &rctx->hw_states.cb_cntl);
+
+	radeon_draw_bind(&rctx->draw, &rctx->config);
+
+	if (rctx->viewport) {
+		radeon_draw_bind(&rctx->draw, &rctx->viewport->rstate[0]);
 	}
-	for (i = 0; i < rctx->hw_states.ps_nresource; i++) {
-		radeon_state_decref(rctx->hw_states.ps_resource[i]);
-		rctx->hw_states.ps_resource[i] = NULL;
+	if (rctx->blend) {
+		radeon_draw_bind(&rctx->draw, &rctx->blend->rstate[0]);
 	}
-	rctx->hw_states.ps_nresource = 0;
-	for (i = 0; i < rctx->hw_states.ps_nsampler; i++) {
-		radeon_state_decref(rctx->hw_states.ps_sampler[i]);
-		rctx->hw_states.ps_sampler[i] = NULL;
+	if (rctx->clip) {
+		radeon_draw_bind(&rctx->draw, &rctx->clip->rstate[0]);
 	}
-	rctx->hw_states.ps_nsampler = 0;
-
-	/* build new states */
-	rctx->hw_states.rasterizer = r600_rasterizer(rctx);
-	rctx->hw_states.scissor = r600_scissor(rctx);
-	rctx->hw_states.dsa = r600_dsa(rctx);
-	rctx->hw_states.blend = r600_blend(rctx);
-	rctx->hw_states.viewport = r600_viewport(rctx);
-	for (i = 0; i < nr_cbufs; i++) {
-		rctx->hw_states.cb[i] = r600_cb(rctx, i);
+	for (i = 0; i < rctx->framebuffer->state.framebuffer.nr_cbufs; i++) {
+		radeon_draw_bind(&rctx->draw, &rctx->framebuffer->rstate[i+1]);
+	}
+	if (rctx->framebuffer->state.framebuffer.zsbuf) {
+		radeon_draw_bind(&rctx->draw, &rctx->framebuffer->rstate[0]);
 	}
-	rctx->hw_states.db = r600_db(rctx);
-	rctx->hw_states.cb_cntl = r600_cb_cntl(rctx);
-
 	for (i = 0; i < rctx->ps_nsampler; i++) {
 		if (rctx->ps_sampler[i]) {
-			rctx->hw_states.ps_sampler[i] = r600_sampler(rctx,
-							&rctx->ps_sampler[i]->state.sampler,
-							R600_PS_SAMPLER + i);
+			radeon_draw_bind(&rctx->draw, rctx->ps_sampler[i]);
 		}
 	}
-	rctx->hw_states.ps_nsampler = rctx->ps_nsampler;
 	for (i = 0; i < rctx->ps_nsampler_view; i++) {
 		if (rctx->ps_sampler_view[i]) {
-			rctx->hw_states.ps_resource[i] = r600_resource(rctx,
-							&rctx->ps_sampler_view[i]->state.sampler_view,
-							R600_PS_RESOURCE + i);
-		}
-	}
-	rctx->hw_states.ps_nresource = rctx->ps_nsampler_view;
-
-	/* bind states */
-	r = radeon_draw_set(rctx->draw, rctx->hw_states.db);
-	if (r)
-		return r;
-	r = radeon_draw_set(rctx->draw, rctx->hw_states.rasterizer);
-	if (r)
-		return r;
-	r = radeon_draw_set(rctx->draw, rctx->hw_states.scissor);
-	if (r)
-		return r;
-	r = radeon_draw_set(rctx->draw, rctx->hw_states.dsa);
-	if (r)
-		return r;
-	r = radeon_draw_set(rctx->draw, rctx->hw_states.blend);
-	if (r)
-		return r;
-	r = radeon_draw_set(rctx->draw, rctx->hw_states.viewport);
-	if (r)
-		return r;
-	for (i = 0; i < nr_cbufs; i++) {
-		r = radeon_draw_set(rctx->draw, rctx->hw_states.cb[i]);
-		if (r)
-			return r;
-	}
-	r = radeon_draw_set(rctx->draw, rctx->hw_states.config);
-	if (r)
-		return r;
-	r = radeon_draw_set(rctx->draw, rctx->hw_states.cb_cntl);
-	if (r)
-		return r;
-	for (i = 0; i < rctx->hw_states.ps_nresource; i++) {
-		if (rctx->hw_states.ps_resource[i]) {
-			r = radeon_draw_set(rctx->draw, rctx->hw_states.ps_resource[i]);
-			if (r)
-				return r;
-		}
-	}
-	for (i = 0; i < rctx->hw_states.ps_nsampler; i++) {
-		if (rctx->hw_states.ps_sampler[i]) {
-			r = radeon_draw_set(rctx->draw, rctx->hw_states.ps_sampler[i]);
-			if (r)
-				return r;
+			radeon_draw_bind(&rctx->draw, rctx->ps_sampler_view[i]);
 		}
 	}
 	return 0;
diff --git a/src/gallium/drivers/r600/r600_state_inlines.h b/src/gallium/drivers/r600/r600_state_inlines.h
index f93c20da35..84866825aa 100644
--- a/src/gallium/drivers/r600/r600_state_inlines.h
+++ b/src/gallium/drivers/r600/r600_state_inlines.h
@@ -252,6 +252,7 @@ static INLINE uint32_t r600_translate_colorformat(enum pipe_format format)
 	case PIPE_FORMAT_R8SG8SB8UX8U_NORM:
 	case PIPE_FORMAT_X8B8G8R8_UNORM:
 	case PIPE_FORMAT_X8R8G8B8_UNORM:
+	case PIPE_FORMAT_R8G8B8_UNORM:
 		return V_0280A0_COLOR_8_8_8_8;
 
 	case PIPE_FORMAT_R10G10B10A2_UNORM:
@@ -262,7 +263,10 @@ static INLINE uint32_t r600_translate_colorformat(enum pipe_format format)
 
 	case PIPE_FORMAT_Z24X8_UNORM:
 	case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
-		return V_0280A0_COLOR_24_8;
+		return V_0280A0_COLOR_8_24;
+
+	case PIPE_FORMAT_R32_FLOAT:
+		return V_0280A0_COLOR_32_FLOAT;
 
 		/* 64-bit buffers. */
 	case PIPE_FORMAT_R16G16B16A16_UNORM:
@@ -275,6 +279,7 @@ static INLINE uint32_t r600_translate_colorformat(enum pipe_format format)
 
 		/* 128-bit buffers. */
 	case PIPE_FORMAT_R32G32B32_FLOAT:
+	  	return V_0280A0_COLOR_32_32_32_FLOAT;
 	case PIPE_FORMAT_R32G32B32A32_FLOAT:
 		return V_0280A0_COLOR_32_32_32_32_FLOAT;
 
diff --git a/src/gallium/drivers/r600/r600_texture.c b/src/gallium/drivers/r600/r600_texture.c
index 30d79ebdd6..b6698e3885 100644
--- a/src/gallium/drivers/r600/r600_texture.c
+++ b/src/gallium/drivers/r600/r600_texture.c
@@ -24,6 +24,7 @@
  *      Jerome Glisse
  *      Corbin Simpson
  */
+#include <errno.h>
 #include <pipe/p_screen.h>
 #include <util/u_format.h>
 #include <util/u_math.h>
@@ -33,10 +34,26 @@
 #include "r600_screen.h"
 #include "r600_context.h"
 #include "r600_resource.h"
+#include "r600_state_inlines.h"
 #include "r600d.h"
 
 extern struct u_resource_vtbl r600_texture_vtbl;
 
+/* Copy the transfer box of a tiled texture into the transfer's linear (detiled) texture. */
+static void r600_copy_from_tiled_texture(struct pipe_context *ctx, struct r600_transfer *rtransfer)
+{
+	struct pipe_transfer *xfer = (struct pipe_transfer*)rtransfer;
+	const struct pipe_box *src_box = &xfer->box;
+	struct pipe_subresource dst_sub;
+
+	/* the copy always lands at the origin of face 0, level 0 of the linear texture */
+	dst_sub.level = 0;
+	dst_sub.face = 0;
+	ctx->resource_copy_region(ctx, rtransfer->linear_texture, dst_sub, 0, 0, 0,
+				xfer->resource, xfer->sr, src_box->x, src_box->y, src_box->z,
+				src_box->width, src_box->height);
+}
+
 static unsigned long r600_texture_get_offset(struct r600_resource_texture *rtex,
 					unsigned level, unsigned zslice,
 					unsigned face)
@@ -65,7 +82,9 @@ static void r600_setup_miptree(struct r600_screen *rscreen, struct r600_resource
 	for (i = 0, offset = 0; i <= ptex->last_level; i++) {
 		w = u_minify(ptex->width0, i);
 		h = u_minify(ptex->height0, i);
+		h = util_next_power_of_two(h);
 		pitch = util_format_get_stride(ptex->format, align(w, 64));
+		pitch = align(pitch, 256);
 		layer_size = pitch * h;
 		if (ptex->target == PIPE_TEXTURE_CUBE)
 			size = layer_size * 6;
@@ -74,6 +93,8 @@ static void r600_setup_miptree(struct r600_screen *rscreen, struct r600_resource
 		rtex->offset[i] = offset;
 		rtex->layer_size[i] = layer_size;
 		rtex->pitch[i] = pitch;
+		rtex->width[i] = w;
+		rtex->height[i] = h;
 		offset += size;
 	}
 	rtex->size = offset;
@@ -104,10 +125,22 @@ struct pipe_resource *r600_texture_create(struct pipe_screen *screen,
 		FREE(rtex);
 		return NULL;
 	}
-
 	return &resource->base.b;
 }
 
+static void r600_texture_destroy_state(struct pipe_resource *ptexture)
+{
+	struct r600_resource_texture *rtex = (struct r600_resource_texture*)ptexture;
+	/* for each mip level: finish its scissor, depth and eight colour buffer states */
+	for (int level = 0; level < PIPE_MAX_TEXTURE_LEVELS; level++) {
+		radeon_state_fini(&rtex->scissor[level]);
+		radeon_state_fini(&rtex->db[level]);
+		for (int slot = 0; slot < 8; slot++) {
+			radeon_state_fini(&rtex->cb[slot][level]);
+		}
+	}
+}
+
 static void r600_texture_destroy(struct pipe_screen *screen,
 				 struct pipe_resource *ptex)
 {
@@ -118,6 +151,10 @@ static void r600_texture_destroy(struct pipe_screen *screen,
 	if (resource->bo) {
 		radeon_bo_decref(rscreen->rw, resource->bo);
 	}
+	if (rtex->uncompressed) {
+		radeon_bo_decref(rscreen->rw, rtex->uncompressed);
+	}
+	r600_texture_destroy_state(ptex);
 	FREE(rtex);
 }
 
@@ -168,7 +205,8 @@ struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen,
 	}
 
 	/* Support only 2D textures without mipmaps */
-	if (templ->target != PIPE_TEXTURE_2D || templ->depth0 != 1 || templ->last_level != 0)
+	if ((templ->target != PIPE_TEXTURE_2D && templ->target != PIPE_TEXTURE_RECT) ||
+	      templ->depth0 != 1 || templ->last_level != 0)
 		return NULL;
 
 	rtex = CALLOC_STRUCT(r600_resource_texture);
@@ -181,9 +219,12 @@ struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen,
 	pipe_reference_init(&resource->base.b.reference, 1);
 	resource->base.b.screen = screen;
 	resource->bo = bo;
+	rtex->depth = 0;
 	rtex->pitch_override = whandle->stride;
 	rtex->bpt = util_format_get_blocksize(templ->format);
 	rtex->pitch[0] = whandle->stride;
+	rtex->width[0] = templ->width0;
+	rtex->height[0] = templ->height0;
 	rtex->offset[0] = 0;
 	rtex->size = align(rtex->pitch[0] * templ->height0, 64);
 
@@ -205,6 +246,7 @@ struct pipe_transfer* r600_texture_get_transfer(struct pipe_context *ctx,
 						const struct pipe_box *box)
 {
 	struct r600_resource_texture *rtex = (struct r600_resource_texture*)texture;
+	struct pipe_resource resource;
 	struct r600_transfer *trans;
 
 	trans = CALLOC_STRUCT(r600_transfer);
@@ -216,48 +258,117 @@ struct pipe_transfer* r600_texture_get_transfer(struct pipe_context *ctx,
 	trans->transfer.box = *box;
 	trans->transfer.stride = rtex->pitch[sr.level];
 	trans->offset = r600_texture_get_offset(rtex, sr.level, box->z, sr.face);
+	if (rtex->tilled && !rtex->depth) {
+		resource.target = PIPE_TEXTURE_2D;
+		resource.format = texture->format;
+		resource.width0 = box->width;
+		resource.height0 = box->height;
+		resource.depth0 = 0;
+		resource.last_level = 0;
+		resource.nr_samples = 0;
+		resource.usage = PIPE_USAGE_DYNAMIC;
+		resource.bind = 0;
+		resource.flags = 0;
+		/* For texture reading, the temporary (detiled) texture is used as
+		 * a render target when blitting from a tiled texture. */
+		if (usage & PIPE_TRANSFER_READ) {
+			resource.bind |= PIPE_BIND_RENDER_TARGET;
+		}
+		/* For texture writing, the temporary texture is used as a sampler
+		 * when blitting into a tiled texture. */
+		if (usage & PIPE_TRANSFER_WRITE) {
+			resource.bind |= PIPE_BIND_SAMPLER_VIEW;
+		}
+		/* Create the temporary texture. */
+		trans->linear_texture = ctx->screen->resource_create(ctx->screen, &resource);
+		if (trans->linear_texture == NULL) {
+			R600_ERR("failed to create temporary texture to hold untiled copy\n");
+			pipe_resource_reference(&trans->transfer.resource, NULL);
+			FREE(trans);
+			return NULL;
+		}
+		if (usage & PIPE_TRANSFER_READ) {
+			/* We cannot map a tiled texture directly because the data is
+			 * in a different order, therefore we do detiling using a blit. */
+			r600_copy_from_tiled_texture(ctx, trans);
+			/* Always referenced in the blit. */
+			ctx->flush(ctx, 0, NULL);
+		}
+	}
 	return &trans->transfer;
 }
 
 void r600_texture_transfer_destroy(struct pipe_context *ctx,
-				   struct pipe_transfer *trans)
+				   struct pipe_transfer *transfer)
 {
-	pipe_resource_reference(&trans->resource, NULL);
-	FREE(trans);
+	struct r600_transfer *rtransfer = (struct r600_transfer*)transfer;
+
+	if (rtransfer->linear_texture) {
+		pipe_resource_reference(&rtransfer->linear_texture, NULL);
+	}
+	pipe_resource_reference(&transfer->resource, NULL);
+	FREE(transfer);
 }
 
 void* r600_texture_transfer_map(struct pipe_context *ctx,
 				struct pipe_transfer* transfer)
 {
 	struct r600_transfer *rtransfer = (struct r600_transfer*)transfer;
-	struct r600_resource *resource;
+	struct radeon_bo *bo;
 	enum pipe_format format = transfer->resource->format;
 	struct r600_screen *rscreen = r600_screen(ctx->screen);
+	struct r600_resource_texture *rtex;
+	unsigned long offset = 0;
 	char *map;
+	int r;
 
 	r600_flush(ctx, 0, NULL);
-
-	resource = (struct r600_resource *)transfer->resource;
-	if (radeon_bo_map(rscreen->rw, resource->bo)) {
+	if (rtransfer->linear_texture) {
+		bo = ((struct r600_resource *)rtransfer->linear_texture)->bo;
+	} else {
+		rtex = (struct r600_resource_texture*)transfer->resource;
+		if (rtex->depth) {
+			r = r600_texture_from_depth(ctx, rtex, transfer->sr.level);
+			if (r) {
+				return NULL;
+			}
+			r600_flush(ctx, 0, NULL);
+			bo = rtex->uncompressed;
+		} else {
+			bo = ((struct r600_resource *)transfer->resource)->bo;
+		}
+		offset = rtransfer->offset +
+			transfer->box.y / util_format_get_blockheight(format) * transfer->stride +
+			transfer->box.x / util_format_get_blockwidth(format) * util_format_get_blocksize(format);
+	}
+	if (radeon_bo_map(rscreen->rw, bo)) {
 		return NULL;
 	}
-	radeon_bo_wait(rscreen->rw, resource->bo);
-
-	map = resource->bo->data;
+	radeon_bo_wait(rscreen->rw, bo);
 
-	return map + rtransfer->offset +
-		transfer->box.y / util_format_get_blockheight(format) * transfer->stride +
-		transfer->box.x / util_format_get_blockwidth(format) * util_format_get_blocksize(format);
+	map = bo->data;
+	return map + offset;
 }
 
 void r600_texture_transfer_unmap(struct pipe_context *ctx,
 				 struct pipe_transfer* transfer)
 {
+	struct r600_transfer *rtransfer = (struct r600_transfer*)transfer;
 	struct r600_screen *rscreen = r600_screen(ctx->screen);
-	struct r600_resource *resource;
-
-	resource = (struct r600_resource *)transfer->resource;
-	radeon_bo_unmap(rscreen->rw, resource->bo);
+	struct r600_resource_texture *rtex;
+	struct radeon_bo *bo;
+
+	if (rtransfer->linear_texture) {
+		bo = ((struct r600_resource *)rtransfer->linear_texture)->bo;
+	} else {
+		rtex = (struct r600_resource_texture*)transfer->resource;
+		if (rtex->depth) {
+			bo = rtex->uncompressed;
+		} else {
+			bo = ((struct r600_resource *)transfer->resource)->bo;
+		}
+	}
+	radeon_bo_unmap(rscreen->rw, bo);
 }
 
 struct u_resource_vtbl r600_texture_vtbl =
@@ -280,51 +391,51 @@ void r600_init_screen_texture_functions(struct pipe_screen *screen)
 }
 
 static unsigned r600_get_swizzle_combined(const unsigned char *swizzle_format,
-					  const unsigned char *swizzle_view)
+		const unsigned char *swizzle_view)
 {
-    unsigned i;
-    unsigned char swizzle[4];
-    unsigned result = 0;
-    const uint32_t swizzle_shift[4] = {
-	    16, 19, 22, 25,
-    };
-    const uint32_t swizzle_bit[4] = {
-	    0, 1, 2, 3,
-    };
-
-    if (swizzle_view) {
-        /* Combine two sets of swizzles. */
-        for (i = 0; i < 4; i++) {
-            swizzle[i] = swizzle_view[i] <= UTIL_FORMAT_SWIZZLE_W ?
-                         swizzle_format[swizzle_view[i]] : swizzle_view[i];
-        }
-    } else {
-        memcpy(swizzle, swizzle_format, 4);
-    }
-
-    /* Get swizzle. */
-    for (i = 0; i < 4; i++) {
-        switch (swizzle[i]) {
-            case UTIL_FORMAT_SWIZZLE_Y:
-                result |= swizzle_bit[1] << swizzle_shift[i];
-                break;
-            case UTIL_FORMAT_SWIZZLE_Z:
-                result |= swizzle_bit[2] << swizzle_shift[i];
-                break;
-            case UTIL_FORMAT_SWIZZLE_W:
-                result |= swizzle_bit[3] << swizzle_shift[i];
-                break;
-            case UTIL_FORMAT_SWIZZLE_0:
-                result |= V_038010_SQ_SEL_0 << swizzle_shift[i];
-                break;
-            case UTIL_FORMAT_SWIZZLE_1:
-                result |= V_038010_SQ_SEL_1 << swizzle_shift[i];
-                break;
-            default: /* UTIL_FORMAT_SWIZZLE_X */
-                result |= swizzle_bit[0] << swizzle_shift[i];
-        }
-    }
-    return result;
+	unsigned i;
+	unsigned char swizzle[4];
+	unsigned result = 0;
+	const uint32_t swizzle_shift[4] = {
+		16, 19, 22, 25,
+	};
+	const uint32_t swizzle_bit[4] = {
+		0, 1, 2, 3,
+	};
+
+	if (swizzle_view) {
+		/* Combine two sets of swizzles. */
+		for (i = 0; i < 4; i++) {
+			swizzle[i] = swizzle_view[i] <= UTIL_FORMAT_SWIZZLE_W ?
+				swizzle_format[swizzle_view[i]] : swizzle_view[i];
+		}
+	} else {
+		memcpy(swizzle, swizzle_format, 4);
+	}
+
+	/* Get swizzle. */
+	for (i = 0; i < 4; i++) {
+		switch (swizzle[i]) {
+		case UTIL_FORMAT_SWIZZLE_Y:
+			result |= swizzle_bit[1] << swizzle_shift[i];
+			break;
+		case UTIL_FORMAT_SWIZZLE_Z:
+			result |= swizzle_bit[2] << swizzle_shift[i];
+			break;
+		case UTIL_FORMAT_SWIZZLE_W:
+			result |= swizzle_bit[3] << swizzle_shift[i];
+			break;
+		case UTIL_FORMAT_SWIZZLE_0:
+			result |= V_038010_SQ_SEL_0 << swizzle_shift[i];
+			break;
+		case UTIL_FORMAT_SWIZZLE_1:
+			result |= V_038010_SQ_SEL_1 << swizzle_shift[i];
+			break;
+		default: /* UTIL_FORMAT_SWIZZLE_X */
+			result |= swizzle_bit[0] << swizzle_shift[i];
+		}
+	}
+	return result;
 }
 
 /* texture format translate */
@@ -344,19 +455,21 @@ uint32_t r600_translate_texformat(enum pipe_format format,
 	};
 	desc = util_format_description(format);
 
+	word4 |= r600_get_swizzle_combined(desc->swizzle, swizzle_view);
+
 	/* Colorspace (return non-RGB formats directly). */
 	switch (desc->colorspace) {
 		/* Depth stencil formats */
 	case UTIL_FORMAT_COLORSPACE_ZS:
 		switch (format) {
 		case PIPE_FORMAT_Z16_UNORM:
-			result = V_028010_DEPTH_16;
+			result = V_0280A0_COLOR_16;
 			goto out_word4;
 		case PIPE_FORMAT_Z24X8_UNORM:
-			result = V_028010_DEPTH_X8_24;
+			result = V_0280A0_COLOR_8_24;
 			goto out_word4;
 		case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
-			result = V_028010_DEPTH_8_24;
+			result = V_0280A0_COLOR_8_24;
 			goto out_word4;
 		default:
 			goto out_unknown;
@@ -382,8 +495,6 @@ uint32_t r600_translate_texformat(enum pipe_format format,
 		break;
 	}
 	
-	word4 |= r600_get_swizzle_combined(desc->swizzle, swizzle_view);
-
 	/* S3TC formats. TODO */
 	if (desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
 		goto out_unknown;
@@ -519,9 +630,221 @@ out_word4:
 		*word4_p = word4;
 	if (yuv_format_p)
 		*yuv_format_p = yuv_format;
-//	fprintf(stderr,"returning %08x %08x %08x\n", result, word4, yuv_format);
 	return result;
 out_unknown:
 //	R600_ERR("Unable to handle texformat %d %s\n", format, util_format_name(format));
 	return ~0;
 }
+
+int r600_texture_from_depth(struct pipe_context *ctx, struct r600_resource_texture *rtexture, unsigned level)
+{
+	struct r600_screen *rscreen = r600_screen(ctx->screen);
+	struct radeon_bo *shadow = rtexture->uncompressed;
+	int err;
+
+	/* Nothing to do for a texture not flagged as depth (callers are not
+	 * expected to pass one), nor when a clean uncompressed bo already exists.
+	 */
+	if (!rtexture->depth || (shadow && !rtexture->dirty)) {
+		return 0;
+	}
+
+	/* the uncompressed bo is not allocated yet: create it, sized like the texture */
+	if (!shadow) {
+		shadow = radeon_bo(rscreen->rw, 0, rtexture->size, 4096, NULL);
+		if (!shadow) {
+			return -ENOMEM;
+		}
+		rtexture->uncompressed = shadow;
+	}
+
+	/* render a rectangle covering the whole buffer to uncompress depth;
+	 * the dirty flag is only cleared when that blit reports success
+	 */
+	err = r600_blit_uncompress_depth(ctx, rtexture, level);
+	if (!err) {
+		rtexture->dirty = 0;
+	}
+	return err;
+}
+
+static void r600_texture_state_scissor(struct r600_screen *rscreen,
+					struct r600_resource_texture *rtexture,
+					unsigned level)
+{
+	struct radeon_state *rstate = &rtexture->scissor[level];
+	unsigned br, tl = 0x80000000;
+
+	radeon_state_init(rstate, rscreen->rw, R600_STATE_SCISSOR, 0, 0);
+	/* all rectangles share one BR word (the level's width/height) and one TL word; other states keep their 0 default */
+	br = S_028244_BR_X(rtexture->width[level]) | S_028244_BR_Y(rtexture->height[level]);
+	rstate->states[R600_SCISSOR__PA_SC_CLIPRECT_0_BR] = br;
+	rstate->states[R600_SCISSOR__PA_SC_CLIPRECT_0_TL] = tl;
+	rstate->states[R600_SCISSOR__PA_SC_CLIPRECT_1_BR] = br;
+	rstate->states[R600_SCISSOR__PA_SC_CLIPRECT_1_TL] = tl;
+	rstate->states[R600_SCISSOR__PA_SC_CLIPRECT_2_BR] = br;
+	rstate->states[R600_SCISSOR__PA_SC_CLIPRECT_2_TL] = tl;
+	rstate->states[R600_SCISSOR__PA_SC_CLIPRECT_3_BR] = br;
+	rstate->states[R600_SCISSOR__PA_SC_CLIPRECT_3_TL] = tl;
+	rstate->states[R600_SCISSOR__PA_SC_CLIPRECT_RULE] = 0x0000FFFF;
+	rstate->states[R600_SCISSOR__PA_SC_EDGERULE] = 0xAAAAAAAA;
+	rstate->states[R600_SCISSOR__PA_SC_GENERIC_SCISSOR_BR] = br;
+	rstate->states[R600_SCISSOR__PA_SC_GENERIC_SCISSOR_TL] = tl;
+	rstate->states[R600_SCISSOR__PA_SC_SCREEN_SCISSOR_BR] = br;
+	rstate->states[R600_SCISSOR__PA_SC_SCREEN_SCISSOR_TL] = tl;
+	rstate->states[R600_SCISSOR__PA_SC_VPORT_SCISSOR_0_BR] = br;
+	rstate->states[R600_SCISSOR__PA_SC_VPORT_SCISSOR_0_TL] = tl;
+	rstate->states[R600_SCISSOR__PA_SC_WINDOW_SCISSOR_BR] = br;
+	rstate->states[R600_SCISSOR__PA_SC_WINDOW_SCISSOR_TL] = tl;
+
+	radeon_state_pm4(rstate);
+}
+
+/* Fill the colour buffer state cached in rtexture->cb[cb][level] and pass
+ * it to radeon_state_pm4(). Textures in the ZS colorspace are bound through
+ * rtexture->uncompressed, every other texture through its own bo.
+ */
+static void r600_texture_state_cb(struct r600_screen *rscreen, struct r600_resource_texture *rtexture, unsigned cb, unsigned level)
+{
+	struct radeon_state *rstate = &rtexture->cb[cb][level];
+	enum pipe_format pformat = rtexture->resource.base.b.format;
+	const struct util_format_description *desc;
+	struct radeon_bo *bo;
+	unsigned pitch, slice, color_info;
+	unsigned format, swap, ntype, i;
+
+	radeon_state_init(rstate, rscreen->rw, R600_STATE_CB0 + cb, 0, 0);
+
+	/* set states (most default values are 0 and the struct is already
+	 * initialized to 0, so those are not written again)
+	 */
+	pitch = (rtexture->pitch[level] / rtexture->bpt) / 8 - 1;
+	slice = (rtexture->pitch[level] / rtexture->bpt) * rtexture->height[level] / 64 - 1;
+	desc = util_format_description(pformat);
+	ntype = (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) ? V_0280A0_NUMBER_SRGB : 0;
+	format = r600_translate_colorformat(pformat);
+	swap = r600_translate_colorswap(pformat);
+
+	if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
+		bo = rtexture->uncompressed;
+		color_info = 0;
+	} else {
+		bo = rtexture->resource.bo;
+		color_info = S_0280A0_SOURCE_FORMAT(1);
+	}
+
+	/* the chosen bo is referenced once per bo slot (three in total);
+	 * only placement entries 0, 2 and 4 are written
+	 */
+	for (i = 0; i < 3; i++) {
+		rstate->bo[i] = radeon_bo_incref(rscreen->rw, bo);
+		rstate->placement[2 * i] = RADEON_GEM_DOMAIN_GTT;
+	}
+	rstate->nbo = 3;
+
+	/* add the format dependent fields to the info word started above */
+	color_info |= S_0280A0_FORMAT(format) |
+		S_0280A0_COMP_SWAP(swap) |
+		S_0280A0_BLEND_CLAMP(1) |
+		S_0280A0_NUMBER_TYPE(ntype);
+	rstate->states[R600_CB0__CB_COLOR0_BASE] = rtexture->offset[level] >> 8;
+	rstate->states[R600_CB0__CB_COLOR0_INFO] = color_info;
+	rstate->states[R600_CB0__CB_COLOR0_SIZE] = S_028060_PITCH_TILE_MAX(pitch) |
+						S_028060_SLICE_TILE_MAX(slice);
+
+	radeon_state_pm4(rstate);
+}
+
+static void r600_texture_state_db(struct r600_screen *rscreen, struct r600_resource_texture *rtexture, unsigned level)
+{
+	struct radeon_state *rstate = &rtexture->db[level];
+	unsigned pitch_tile_max, slice_tile_max, dbformat;
+
+	radeon_state_init(rstate, rscreen->rw, R600_STATE_DB, 0, 0);
+
+	/* side effect: building the depth state flags the texture itself as a tiled depth buffer */
+	rtexture->depth = 1;
+	rtexture->tilled = 1;
+	rtexture->tile_type = 1;
+	rtexture->array_mode = 2;
+
+	/* set states (most default values are 0 and the struct is already initialized to 0) */
+	pitch_tile_max = (rtexture->pitch[level] / rtexture->bpt) / 8 - 1;
+	slice_tile_max = (rtexture->pitch[level] / rtexture->bpt) * rtexture->height[level] / 64 - 1;
+	dbformat = r600_translate_dbformat(rtexture->resource.base.b.format);
+	rstate->states[R600_DB__DB_DEPTH_BASE] = rtexture->offset[level] >> 8;
+	rstate->states[R600_DB__DB_DEPTH_INFO] = S_028010_FORMAT(dbformat) |
+					S_028010_ARRAY_MODE(rtexture->array_mode);
+	rstate->states[R600_DB__DB_DEPTH_VIEW] = 0x00000000;
+	rstate->states[R600_DB__DB_PREFETCH_LIMIT] = (rtexture->height[level] / 8) -1;
+	rstate->states[R600_DB__DB_DEPTH_SIZE] = S_028000_PITCH_TILE_MAX(pitch_tile_max) |
+						S_028000_SLICE_TILE_MAX(slice_tile_max);
+
+	/* the depth state takes a single reference on the texture's own bo */
+	rstate->bo[0] = radeon_bo_incref(rscreen->rw, rtexture->resource.bo);
+	rstate->placement[0] = RADEON_GEM_DOMAIN_GTT;
+	rstate->nbo = 1;
+
+	radeon_state_pm4(rstate);
+}
+
+/* Build the scissor state of a mip level unless it already has a non-zero cpm4; always returns 0. */
+int r600_texture_scissor(struct pipe_context *ctx, struct r600_resource_texture *rtexture, unsigned level)
+{
+	struct r600_screen *rscreen = r600_screen(ctx->screen);
+	if (rtexture->scissor[level].cpm4 == 0) {
+		r600_texture_state_scissor(rscreen, rtexture, level);
+	}
+	return 0;
+}
+
+static void r600_texture_state_viewport(struct r600_screen *rscreen, struct r600_resource_texture *rtexture, unsigned level)
+{
+	struct radeon_state *rstate = &rtexture->viewport[level];
+
+	radeon_state_init(rstate, rscreen->rw, R600_STATE_VIEWPORT, 0, 0);
+	/* Viewport sized after the mip level: X/Y offset and scale are half the
+	 * level's width/height, with the Y scale negated. The negation is applied
+	 * after the float conversion so it cannot wrap if height[] is unsigned.
+	 */
+	rstate->states[R600_VIEWPORT__PA_CL_VPORT_XOFFSET_0] = fui((float)rtexture->width[level]/2.0);
+	rstate->states[R600_VIEWPORT__PA_CL_VPORT_XSCALE_0] = fui((float)rtexture->width[level]/2.0);
+	rstate->states[R600_VIEWPORT__PA_CL_VPORT_YOFFSET_0] = fui((float)rtexture->height[level]/2.0);
+	rstate->states[R600_VIEWPORT__PA_CL_VPORT_YSCALE_0] = fui(-(float)rtexture->height[level]/2.0);
+	rstate->states[R600_VIEWPORT__PA_CL_VPORT_ZOFFSET_0] = 0x3F000000;
+	rstate->states[R600_VIEWPORT__PA_CL_VPORT_ZSCALE_0] = 0x3F000000;
+	rstate->states[R600_VIEWPORT__PA_CL_VTE_CNTL] = 0x0000043F;
+	rstate->states[R600_VIEWPORT__PA_SC_VPORT_ZMAX_0] = 0x3F800000;
+
+	radeon_state_pm4(rstate);
+}
+
+/* Build the state of colour buffer slot cb for a mip level unless it already has a non-zero cpm4; always returns 0. */
+int r600_texture_cb(struct pipe_context *ctx, struct r600_resource_texture *rtexture, unsigned cb, unsigned level)
+{
+	struct r600_screen *rscreen = r600_screen(ctx->screen);
+	if (rtexture->cb[cb][level].cpm4 == 0) {
+		r600_texture_state_cb(rscreen, rtexture, cb, level);
+	}
+	return 0;
+}
+
+/* Build the depth buffer state of a mip level unless it already has a non-zero cpm4; always returns 0. */
+int r600_texture_db(struct pipe_context *ctx, struct r600_resource_texture *rtexture, unsigned level)
+{
+	struct r600_screen *rscreen = r600_screen(ctx->screen);
+	if (rtexture->db[level].cpm4 == 0) {
+		r600_texture_state_db(rscreen, rtexture, level);
+	}
+	return 0;
+}
+
+/* Build the viewport state of a mip level unless it already has a non-zero cpm4; always returns 0. */
+int r600_texture_viewport(struct pipe_context *ctx, struct r600_resource_texture *rtexture, unsigned level)
+{
+	struct r600_screen *rscreen = r600_screen(ctx->screen);
+	if (rtexture->viewport[level].cpm4 == 0) {
+		r600_texture_state_viewport(rscreen, rtexture, level);
+	}
+	return 0;
+}
diff --git a/src/gallium/drivers/r600/r600d.h b/src/gallium/drivers/r600/r600d.h
index 53388f822e..7b9a983d53 100644
--- a/src/gallium/drivers/r600/r600d.h
+++ b/src/gallium/drivers/r600/r600d.h
@@ -199,6 +199,7 @@
 #define     V_0280A0_COLOR_16_16_16_16_FLOAT           0x00000020
 #define     V_0280A0_COLOR_32_32_32_32                 0x00000022
 #define     V_0280A0_COLOR_32_32_32_32_FLOAT           0x00000023
+#define     V_0280A0_COLOR_32_32_32_FLOAT              0x00000030
 #define   S_0280A0_ARRAY_MODE(x)                       (((x) & 0xF) << 8)
 #define   G_0280A0_ARRAY_MODE(x)                       (((x) >> 8) & 0xF)
 #define   C_0280A0_ARRAY_MODE                          0xFFFFF0FF
@@ -1316,4 +1317,11 @@
 #define   G_0286D4_PNT_SPRITE_TOP_1(x)                 (((x) >> 14) & 0x1)
 #define   C_0286D4_PNT_SPRITE_TOP_1                    0xFFFFBFFF
 
+#define SQ_TEX_INST_LD 0x03
+#define SQ_TEX_INST_GET_GRADIENTS_H 0x7
+#define SQ_TEX_INST_GET_GRADIENTS_V 0x8
+
+#define SQ_TEX_INST_SAMPLE 0x10
+#define SQ_TEX_INST_SAMPLE_L 0x11
+#define SQ_TEX_INST_SAMPLE_C 0x18
 #endif
diff --git a/src/gallium/drivers/r600/radeon.h b/src/gallium/drivers/r600/radeon.h
index 8f00a4895a..aaac8de528 100644
--- a/src/gallium/drivers/r600/radeon.h
+++ b/src/gallium/drivers/r600/radeon.h
@@ -77,6 +77,14 @@ enum radeon_family {
 	CHIP_LAST,
 };
 
+enum {	/* numbering starts at 1; presumably the values for a state's shader class/index - TODO confirm */
+	R600_SHADER_PS = 1,
+	R600_SHADER_VS = 2,
+	R600_SHADER_GS = 3,
+	R600_SHADER_FS = 4,
+	R600_SHADER_MAX = R600_SHADER_FS,	/* alias of the last entry, not a count */
+};
+
 enum radeon_family radeon_get_family(struct radeon *rw);
 
 /*
@@ -98,22 +106,23 @@ struct radeon_bo *radeon_bo_incref(struct radeon *radeon, struct radeon_bo *bo);
 struct radeon_bo *radeon_bo_decref(struct radeon *radeon, struct radeon_bo *bo);
 int radeon_bo_wait(struct radeon *radeon, struct radeon_bo *bo);
 
+struct radeon_stype_info;
 /*
  * states functions
  */
 struct radeon_state {
 	struct radeon			*radeon;
 	unsigned			refcount;
-	unsigned			type;
+	struct radeon_stype_info	*stype;
+	unsigned			state_id;
 	unsigned			id;
+	unsigned			shader_index;
 	unsigned			nstates;
-	u32				*states;
+	u32				states[64];
 	unsigned			npm4;
 	unsigned			cpm4;
 	u32				pm4_crc;
-	u32				*pm4;
-	u32				nimmd;
-	u32				*immd;
+	u32				pm4[128];
 	unsigned			nbo;
 	struct radeon_bo		*bo[4];
 	unsigned			nreloc;
@@ -123,38 +132,22 @@ struct radeon_state {
 	unsigned			bo_dirty[4];
 };
 
-struct radeon_state *radeon_state(struct radeon *radeon, u32 type, u32 id);
-struct radeon_state *radeon_state_incref(struct radeon_state *state);
-struct radeon_state *radeon_state_decref(struct radeon_state *state);
+int radeon_state_init(struct radeon_state *rstate, struct radeon *radeon, u32 type, u32 id, u32 shader_class);
+void radeon_state_fini(struct radeon_state *state);
 int radeon_state_pm4(struct radeon_state *state);
+int radeon_state_convert(struct radeon_state *state, u32 stype, u32 id, u32 shader_type);
 
 /*
  * draw functions
  */
 struct radeon_draw {
-	unsigned			refcount;
 	struct radeon			*radeon;
-	unsigned			nstate;
 	struct radeon_state		**state;
-	unsigned			cpm4;
 };
 
-struct radeon_draw *radeon_draw(struct radeon *radeon);
-struct radeon_draw *radeon_draw_duplicate(struct radeon_draw *draw);
-struct radeon_draw *radeon_draw_incref(struct radeon_draw *draw);
-struct radeon_draw *radeon_draw_decref(struct radeon_draw *draw);
-int radeon_draw_set(struct radeon_draw *draw, struct radeon_state *state);
-int radeon_draw_set_new(struct radeon_draw *draw, struct radeon_state *state);
-int radeon_draw_check(struct radeon_draw *draw);
-
-struct radeon_ctx *radeon_ctx(struct radeon *radeon);
-struct radeon_ctx *radeon_ctx_decref(struct radeon_ctx *ctx);
-struct radeon_ctx *radeon_ctx_incref(struct radeon_ctx *ctx);
-int radeon_ctx_set_draw(struct radeon_ctx *ctx, struct radeon_draw *draw);
-int radeon_ctx_set_draw_new(struct radeon_ctx *ctx, struct radeon_draw *draw);
-int radeon_ctx_pm4(struct radeon_ctx *ctx);
-int radeon_ctx_submit(struct radeon_ctx *ctx);
-void radeon_ctx_dump_bof(struct radeon_ctx *ctx, const char *file);
+int radeon_draw_init(struct radeon_draw *draw, struct radeon *radeon);
+void radeon_draw_bind(struct radeon_draw *draw, struct radeon_state *state);
+void radeon_draw_unbind(struct radeon_draw *draw, struct radeon_state *state);
 
 /*
  * radeon context functions
@@ -169,95 +162,57 @@ struct radeon_cs_reloc {
 #pragma pack()
 
 struct radeon_ctx {
-	int				refcount;
 	struct radeon			*radeon;
 	u32				*pm4;
-	u32				cpm4;
-	u32				draw_cpm4;
-	unsigned			id;
-	unsigned			next_id;
+	int				cdwords;
+	int				ndwords;
 	unsigned			nreloc;
 	struct radeon_cs_reloc		*reloc;
 	unsigned			nbo;
 	struct radeon_bo		**bo;
-	unsigned			ndraw;
-	struct radeon_draw		*cdraw;
-	struct radeon_draw		**draw;
-	unsigned			nstate;
-	struct radeon_state		**state;
 };
 
+int radeon_ctx_init(struct radeon_ctx *ctx, struct radeon *radeon);
+void radeon_ctx_fini(struct radeon_ctx *ctx);
+void radeon_ctx_clear(struct radeon_ctx *ctx);
+int radeon_ctx_set_draw(struct radeon_ctx *ctx, struct radeon_draw *draw);
+int radeon_ctx_submit(struct radeon_ctx *ctx);
+void radeon_ctx_dump_bof(struct radeon_ctx *ctx, const char *file);
+int radeon_ctx_set_query_state(struct radeon_ctx *ctx, struct radeon_state *state);
+
 /*
  * R600/R700
  */
 
-#define R600_NSTATE				1280
-#define R600_NTYPE				32
+enum r600_stype {	/* state type ids, e.g. R600_STATE_VIEWPORT is passed as the type argument of radeon_state_init() */
+	R600_STATE_CONFIG,	/* first entry, implicitly 0; every following entry is previous + 1 */
+	R600_STATE_CB_CNTL,
+	R600_STATE_RASTERIZER,
+	R600_STATE_VIEWPORT,
+	R600_STATE_SCISSOR,
+	R600_STATE_BLEND,
+	R600_STATE_DSA,
+	R600_STATE_SHADER,          /* has PS,VS,GS,FS variants */
+	R600_STATE_CONSTANT,        /* has PS,VS,GS,FS variants */
+	R600_STATE_RESOURCE,        /* has PS,VS,GS,FS variants */
+	R600_STATE_SAMPLER,         /* has PS,VS,GS,FS variants */
+	R600_STATE_SAMPLER_BORDER,  /* has PS,VS,GS,FS variants */
+	R600_STATE_CB0,	/* CB0..CB7 are declared back to back, so their values are consecutive */
+	R600_STATE_CB1,
+	R600_STATE_CB2,
+	R600_STATE_CB3,
+	R600_STATE_CB4,
+	R600_STATE_CB5,
+	R600_STATE_CB6,
+	R600_STATE_CB7,
+	R600_STATE_DB,
+	R600_STATE_QUERY_BEGIN,
+	R600_STATE_QUERY_END,
+	R600_STATE_UCP,
+	R600_STATE_VGT,
+	R600_STATE_DRAW,
+};
 
-#define R600_CONFIG				0
-#define R600_CONFIG_TYPE				0
-#define R600_CB_CNTL				1
-#define R600_CB_CNTL_TYPE				1
-#define R600_RASTERIZER				2
-#define R600_RASTERIZER_TYPE				2
-#define R600_VIEWPORT				3
-#define R600_VIEWPORT_TYPE				3
-#define R600_SCISSOR				4
-#define R600_SCISSOR_TYPE				4
-#define R600_BLEND				5
-#define R600_BLEND_TYPE				5
-#define R600_DSA				6
-#define R600_DSA_TYPE				6
-#define R600_VS_SHADER				7
-#define R600_VS_SHADER_TYPE				7
-#define R600_PS_SHADER				8
-#define R600_PS_SHADER_TYPE				8
-#define R600_PS_CONSTANT				9
-#define R600_PS_CONSTANT_TYPE				9
-#define R600_VS_CONSTANT				265
-#define R600_VS_CONSTANT_TYPE				10
-#define R600_PS_RESOURCE				521
-#define R600_PS_RESOURCE_TYPE				11
-#define R600_VS_RESOURCE				681
-#define R600_VS_RESOURCE_TYPE				12
-#define R600_FS_RESOURCE				841
-#define R600_FS_RESOURCE_TYPE				13
-#define R600_GS_RESOURCE				1001
-#define R600_GS_RESOURCE_TYPE				14
-#define R600_PS_SAMPLER				1161
-#define R600_PS_SAMPLER_TYPE				15
-#define R600_VS_SAMPLER				1179
-#define R600_VS_SAMPLER_TYPE				16
-#define R600_GS_SAMPLER				1197
-#define R600_GS_SAMPLER_TYPE				17
-#define R600_PS_SAMPLER_BORDER				1215
-#define R600_PS_SAMPLER_BORDER_TYPE				18
-#define R600_VS_SAMPLER_BORDER				1233
-#define R600_VS_SAMPLER_BORDER_TYPE				19
-#define R600_GS_SAMPLER_BORDER				1251
-#define R600_GS_SAMPLER_BORDER_TYPE				20
-#define R600_CB0				1269
-#define R600_CB0_TYPE				21
-#define R600_CB1				1270
-#define R600_CB1_TYPE				22
-#define R600_CB2				1271
-#define R600_CB2_TYPE				23
-#define R600_CB3				1272
-#define R600_CB3_TYPE				24
-#define R600_CB4				1273
-#define R600_CB4_TYPE				25
-#define R600_CB5				1274
-#define R600_CB5_TYPE				26
-#define R600_CB6				1275
-#define R600_CB6_TYPE				27
-#define R600_CB7				1276
-#define R600_CB7_TYPE				28
-#define R600_DB				1277
-#define R600_DB_TYPE				29
-#define R600_VGT				1278
-#define R600_VGT_TYPE				30
-#define R600_DRAW				1279
-#define R600_DRAW_TYPE				31
 /* R600_CONFIG */
 #define R600_CONFIG__SQ_CONFIG			0
 #define R600_CONFIG__SQ_GPR_RESOURCE_MGMT_1			1
@@ -639,9 +594,40 @@ struct radeon_ctx {
 /* R600_DRAW */
 #define R600_DRAW__VGT_NUM_INDICES			0
 #define R600_DRAW__VGT_DMA_BASE_HI			1
-#define R600_DRAW__VGT_DMA_BASE			2
+#define R600_DRAW__VGT_DMA_BASE				2
 #define R600_DRAW__VGT_DRAW_INITIATOR			3
-#define R600_DRAW_SIZE				4
-#define R600_DRAW_PM4				128
+#define R600_DRAW_SIZE					4
+#define R600_DRAW_PM4					128
+/* R600_CLIP */
+#define R600_CLIP__PA_CL_UCP_X_0			0
+#define R600_CLIP__PA_CL_UCP_Y_0			1
+#define R600_CLIP__PA_CL_UCP_Z_0			2
+#define R600_CLIP__PA_CL_UCP_W_0			3
+#define R600_CLIP__PA_CL_UCP_X_1			4
+#define R600_CLIP__PA_CL_UCP_Y_1			5
+#define R600_CLIP__PA_CL_UCP_Z_1			6
+#define R600_CLIP__PA_CL_UCP_W_1			7
+#define R600_CLIP__PA_CL_UCP_X_2			8
+#define R600_CLIP__PA_CL_UCP_Y_2			9
+#define R600_CLIP__PA_CL_UCP_Z_2			10
+#define R600_CLIP__PA_CL_UCP_W_2			11
+#define R600_CLIP__PA_CL_UCP_X_3			12
+#define R600_CLIP__PA_CL_UCP_Y_3			13
+#define R600_CLIP__PA_CL_UCP_Z_3			14
+#define R600_CLIP__PA_CL_UCP_W_3			15
+#define R600_CLIP__PA_CL_UCP_X_4			16
+#define R600_CLIP__PA_CL_UCP_Y_4			17
+#define R600_CLIP__PA_CL_UCP_Z_4			18
+#define R600_CLIP__PA_CL_UCP_W_4			19
+#define R600_CLIP__PA_CL_UCP_X_5			20
+#define R600_CLIP__PA_CL_UCP_Y_5			21
+#define R600_CLIP__PA_CL_UCP_Z_5			22
+#define R600_CLIP__PA_CL_UCP_W_5			23
+#define R600_CLIP_SIZE					24
+#define R600_CLIP_PM4					128
+/* R600 QUERY BEGIN/END */
+#define R600_QUERY__OFFSET			0
+#define R600_QUERY_SIZE				1
+#define R600_QUERY_PM4				128
 
 #endif
diff --git a/src/gallium/drivers/softpipe/sp_draw_arrays.c b/src/gallium/drivers/softpipe/sp_draw_arrays.c
index 386c8acb8c..01b4ca985d 100644
--- a/src/gallium/drivers/softpipe/sp_draw_arrays.c
+++ b/src/gallium/drivers/softpipe/sp_draw_arrays.c
@@ -75,14 +75,10 @@ softpipe_draw_stream_output(struct pipe_context *pipe, unsigned mode)
    buf = (void*)((int32_t*)buf + offset);
    draw_set_mapped_vertex_buffer(draw, 0, buf);
 
-   draw_set_mapped_element_buffer_range(draw,
-                                        0, 0,
-                                        start,
-                                        start + count - 1,
-                                        NULL);
+   draw_set_mapped_index_buffer(draw, NULL);
 
    /* draw! */
-   draw_arrays_instanced(draw, mode, start, count, 0, 1);
+   draw_arrays(draw, mode, start, count);
 
    /* unmap vertex/index buffers - will cause draw module to flush */
    draw_set_mapped_vertex_buffer(draw, 0, NULL);
@@ -138,28 +134,20 @@ softpipe_draw_vbo(struct pipe_context *pipe,
    }
 
    /* Map index buffer, if present */
-   if (info->indexed && sp->index_buffer.buffer) {
-      char *indices = (char *) softpipe_resource(sp->index_buffer.buffer)->data;
-      mapped_indices = (void *) (indices + sp->index_buffer.offset);
-   }
+   if (info->indexed && sp->index_buffer.buffer)
+      mapped_indices = softpipe_resource(sp->index_buffer.buffer)->data;
 
-   draw_set_mapped_element_buffer_range(draw, (mapped_indices) ?
-                                        sp->index_buffer.index_size : 0,
-                                        info->index_bias,
-                                        info->min_index,
-                                        info->max_index,
-                                        mapped_indices);
+   draw_set_mapped_index_buffer(draw, mapped_indices);
 
    /* draw! */
-   draw_arrays_instanced(draw, info->mode, info->start, info->count,
-         info->start_instance, info->instance_count);
+   draw_vbo(draw, info);
 
    /* unmap vertex/index buffers - will cause draw module to flush */
    for (i = 0; i < sp->num_vertex_buffers; i++) {
       draw_set_mapped_vertex_buffer(draw, i, NULL);
    }
    if (mapped_indices) {
-      draw_set_mapped_element_buffer(draw, 0, 0, NULL);
+      draw_set_mapped_index_buffer(draw, NULL);
    }
 
    /*
diff --git a/src/gallium/drivers/softpipe/sp_flush.c b/src/gallium/drivers/softpipe/sp_flush.c
index 4a53ef048f..1071011db0 100644
--- a/src/gallium/drivers/softpipe/sp_flush.c
+++ b/src/gallium/drivers/softpipe/sp_flush.c
@@ -31,6 +31,7 @@
 
 
 #include "pipe/p_defines.h"
+#include "pipe/p_screen.h"
 #include "draw/draw_context.h"
 #include "sp_flush.h"
 #include "sp_context.h"
diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c
index 93af6ee5b0..73ae2dea56 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -199,6 +199,7 @@ softpipe_is_format_supported( struct pipe_screen *screen,
    assert(target == PIPE_BUFFER ||
           target == PIPE_TEXTURE_1D ||
           target == PIPE_TEXTURE_2D ||
+          target == PIPE_TEXTURE_RECT ||
           target == PIPE_TEXTURE_3D ||
           target == PIPE_TEXTURE_CUBE);
 
diff --git a/src/gallium/drivers/softpipe/sp_state_vertex.c b/src/gallium/drivers/softpipe/sp_state_vertex.c
index 880a7c7cd2..b650fcaea5 100644
--- a/src/gallium/drivers/softpipe/sp_state_vertex.c
+++ b/src/gallium/drivers/softpipe/sp_state_vertex.c
@@ -100,5 +100,5 @@ softpipe_set_index_buffer(struct pipe_context *pipe,
    else
       memset(&softpipe->index_buffer, 0, sizeof(softpipe->index_buffer));
 
-   /* TODO make this more like a state */
+   draw_set_index_buffer(softpipe->draw, ib);
 }
diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c b/src/gallium/drivers/softpipe/sp_tex_sample.c
index cf7ab81405..e654bb77c2 100644
--- a/src/gallium/drivers/softpipe/sp_tex_sample.c
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.c
@@ -1785,6 +1785,7 @@ get_lambda_func(const union sp_sampler_key key)
    case PIPE_TEXTURE_1D:
       return compute_lambda_1d;
    case PIPE_TEXTURE_2D:
+   case PIPE_TEXTURE_RECT:
    case PIPE_TEXTURE_CUBE:
       return compute_lambda_2d;
    case PIPE_TEXTURE_3D:
@@ -1809,6 +1810,7 @@ get_img_filter(const union sp_sampler_key key,
          return img_filter_1d_linear;
       break;
    case PIPE_TEXTURE_2D:
+   case PIPE_TEXTURE_RECT:
       /* Try for fast path:
        */
       if (key.bits.is_pot &&
diff --git a/src/gallium/drivers/svga/svga_cmd.c b/src/gallium/drivers/svga/svga_cmd.c
index 7b2dfe2549..e975f3b02f 100644
--- a/src/gallium/drivers/svga/svga_cmd.c
+++ b/src/gallium/drivers/svga/svga_cmd.c
@@ -67,7 +67,7 @@ void surface_to_surfaceid(struct svga_winsys_context *swc, // IN
       id->mipmap = s->real_level;
    }
    else {
-      id->sid = SVGA3D_INVALID_ID;
+      swc->surface_relocation(swc, &id->sid, NULL, flags);
       id->face = 0;
       id->mipmap = 0;
    }
diff --git a/src/gallium/drivers/svga/svga_context.c b/src/gallium/drivers/svga/svga_context.c
index 3b30b9e341..cd3f6b8982 100644
--- a/src/gallium/drivers/svga/svga_context.c
+++ b/src/gallium/drivers/svga/svga_context.c
@@ -214,6 +214,11 @@ void svga_context_flush( struct svga_context *svga,
 
    svga_screen_cache_flush(svgascreen, fence);
 
+   /* To force the reemission of rendertargets and texture bindings at
+    * the beginning of every command buffer.
+    */
+   svga->dirty |= SVGA_NEW_COMMAND_BUFFER;
+
    if (SVGA_DEBUG & DEBUG_SYNC) {
       if (fence)
          svga->pipe.screen->fence_finish( svga->pipe.screen, fence, 0);
diff --git a/src/gallium/drivers/svga/svga_context.h b/src/gallium/drivers/svga/svga_context.h
index 67a7614c8a..1fb5a04887 100644
--- a/src/gallium/drivers/svga/svga_context.h
+++ b/src/gallium/drivers/svga/svga_context.h
@@ -382,6 +382,7 @@ struct svga_context
 #define SVGA_NEW_ZERO_STRIDE         0x2000000
 #define SVGA_NEW_TEXTURE_FLAGS       0x4000000
 #define SVGA_NEW_STENCIL_REF         0x8000000
+#define SVGA_NEW_COMMAND_BUFFER      0x10000000
 
 
 
diff --git a/src/gallium/drivers/svga/svga_pipe_draw.c b/src/gallium/drivers/svga/svga_pipe_draw.c
index de08bc5e56..001ec3616c 100644
--- a/src/gallium/drivers/svga/svga_pipe_draw.c
+++ b/src/gallium/drivers/svga/svga_pipe_draw.c
@@ -146,23 +146,15 @@ retry:
 }
 
 
-
-
-
 static void
-svga_draw_range_elements( struct pipe_context *pipe,
-                          struct pipe_resource *index_buffer,
-                          unsigned index_size,
-                          int index_bias,
-                          unsigned min_index,
-                          unsigned max_index,
-                          unsigned prim, unsigned start, unsigned count)
+svga_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
 {
    struct svga_context *svga = svga_context( pipe );
-   unsigned reduced_prim = u_reduced_prim(prim);
+   unsigned reduced_prim = u_reduced_prim( info->mode );
+   unsigned count = info->count;
    enum pipe_error ret = 0;
 
-   if (!u_trim_pipe_prim( prim, &count ))
+   if (!u_trim_pipe_prim( info->mode, &count ))
       return;
 
    /*
@@ -187,34 +179,32 @@ svga_draw_range_elements( struct pipe_context *pipe,
       return;
 #endif
 
-   if (svga->state.sw.need_swtnl)
-   {
-      ret = svga_swtnl_draw_range_elements( svga, 
-                                            index_buffer, 
-                                            index_size,
-                                            index_bias,
-                                            min_index, max_index,
-                                            prim,
-                                            start, count );
+   if (svga->state.sw.need_swtnl) {
+      ret = svga_swtnl_draw_vbo( svga, info );
    }
    else {
-      if (index_buffer) {
+      if (info->indexed && svga->curr.ib.buffer) {
+         unsigned offset;
+
+         assert(svga->curr.ib.offset % svga->curr.ib.index_size == 0);
+         offset = svga->curr.ib.offset / svga->curr.ib.index_size;
+
          ret = retry_draw_range_elements( svga,
-                                          index_buffer,
-                                          index_size,
-                                          index_bias,
-                                          min_index,
-                                          max_index,
-                                          prim,
-                                          start,
-                                          count,
+                                          svga->curr.ib.buffer,
+                                          svga->curr.ib.index_size,
+                                          info->index_bias,
+                                          info->min_index,
+                                          info->max_index,
+                                          info->mode,
+                                          info->start + offset,
+                                          info->count,
                                           TRUE );
       }
       else {
-         ret = retry_draw_arrays( svga, 
-                                  prim, 
-                                  start, 
-                                  count,
+         ret = retry_draw_arrays( svga,
+                                  info->mode,
+                                  info->start,
+                                  info->count,
                                   TRUE );
       }
    }
@@ -226,30 +216,6 @@ svga_draw_range_elements( struct pipe_context *pipe,
 }
 
 
-static void
-svga_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
-{
-   struct svga_context *svga = svga_context(pipe);
-
-   if (info->indexed && svga->curr.ib.buffer) {
-      unsigned offset;
-
-      assert(svga->curr.ib.offset % svga->curr.ib.index_size == 0);
-      offset = svga->curr.ib.offset / svga->curr.ib.index_size;
-
-      svga_draw_range_elements(pipe, svga->curr.ib.buffer,
-                               svga->curr.ib.index_size, info->index_bias,
-                               info->min_index, info->max_index,
-                               info->mode, info->start + offset, info->count);
-   }
-   else {
-      svga_draw_range_elements(pipe, NULL, 0, 0,
-                               info->min_index, info->max_index,
-                               info->mode, info->start, info->count);
-   }
-}
-
-
 void svga_init_draw_functions( struct svga_context *svga )
 {
    svga->pipe.draw_vbo = svga_draw_vbo;
diff --git a/src/gallium/drivers/svga/svga_resource_texture.c b/src/gallium/drivers/svga/svga_resource_texture.c
index ff83c750aa..26eb03a895 100644
--- a/src/gallium/drivers/svga/svga_resource_texture.c
+++ b/src/gallium/drivers/svga/svga_resource_texture.c
@@ -583,7 +583,8 @@ svga_texture_from_handle(struct pipe_screen *screen,
    assert(screen);
 
    /* Only supports one type */
-   if (template->target != PIPE_TEXTURE_2D ||
+   if ((template->target != PIPE_TEXTURE_2D &&
+       template->target != PIPE_TEXTURE_RECT) ||
        template->last_level != 0 ||
        template->depth0 != 1) {
       return NULL;
diff --git a/src/gallium/drivers/svga/svga_state_framebuffer.c b/src/gallium/drivers/svga/svga_state_framebuffer.c
index bd92f00343..fcbb35e797 100644
--- a/src/gallium/drivers/svga/svga_state_framebuffer.c
+++ b/src/gallium/drivers/svga/svga_state_framebuffer.c
@@ -43,15 +43,18 @@ static int emit_framebuffer( struct svga_context *svga,
 {
    const struct pipe_framebuffer_state *curr = &svga->curr.framebuffer;
    struct pipe_framebuffer_state *hw = &svga->state.hw_clear.framebuffer;
+   boolean reemit = !!(dirty & SVGA_NEW_COMMAND_BUFFER);
    unsigned i;
    enum pipe_error ret;
 
-   /* XXX: Need shadow state in svga->hw to eliminate redundant
-    * uploads, especially of NULL buffers.
+   /*
+    * We need to reemit non-null surface bindings, even when they are not
+    * dirty, to ensure that the resources are paged in.
     */
    
    for(i = 0; i < PIPE_MAX_COLOR_BUFS; ++i) {
-      if (curr->cbufs[i] != hw->cbufs[i]) {
+      if (curr->cbufs[i] != hw->cbufs[i] ||
+          (reemit && hw->cbufs[i])) {
          if (svga->curr.nr_fbs++ > 8)
             return PIPE_ERROR_OUT_OF_MEMORY;
 
@@ -64,7 +67,8 @@ static int emit_framebuffer( struct svga_context *svga,
    }
 
    
-   if (curr->zsbuf != hw->zsbuf) {
+   if (curr->zsbuf != hw->zsbuf ||
+       (reemit && hw->zsbuf)) {
       ret = SVGA3D_SetRenderTarget(svga->swc, SVGA3D_RT_DEPTH, curr->zsbuf);
       if (ret != PIPE_OK)
          return ret;
@@ -92,7 +96,8 @@ static int emit_framebuffer( struct svga_context *svga,
 struct svga_tracked_state svga_hw_framebuffer = 
 {
    "hw framebuffer state",
-   SVGA_NEW_FRAME_BUFFER,
+   SVGA_NEW_FRAME_BUFFER |
+   SVGA_NEW_COMMAND_BUFFER,
    emit_framebuffer
 };
 
diff --git a/src/gallium/drivers/svga/svga_state_tss.c b/src/gallium/drivers/svga/svga_state_tss.c
index 76a2dae143..4a50b19474 100644
--- a/src/gallium/drivers/svga/svga_state_tss.c
+++ b/src/gallium/drivers/svga/svga_state_tss.c
@@ -56,6 +56,7 @@ static int
 update_tss_binding(struct svga_context *svga, 
                    unsigned dirty )
 {
+   boolean reemit = !!(dirty & SVGA_NEW_COMMAND_BUFFER);
    unsigned i;
    unsigned count = MAX2( svga->curr.num_sampler_views,
                           svga->state.hw_draw.num_views );
@@ -107,12 +108,18 @@ update_tss_binding(struct svga_context *svga,
                                                 max_lod);
       }
 
-      if (view->dirty) {
+      /*
+       * We need to reemit non-null texture bindings, even when they are not
+       * dirty, to ensure that the resources are paged in.
+       */
+
+      if (view->dirty ||
+          (reemit && view->v)) {
          queue.bind[queue.bind_count].unit = i;
          queue.bind[queue.bind_count].view = view;
          queue.bind_count++;
       } 
-      else if (view->v) {
+      if (!view->dirty && view->v) {
          svga_validate_sampler_view(svga, view->v);
       }
    }
@@ -128,18 +135,21 @@ update_tss_binding(struct svga_context *svga,
          goto fail;
 
       for (i = 0; i < queue.bind_count; i++) {
+         struct svga_winsys_surface *handle;
+
          ts[i].stage = queue.bind[i].unit;
          ts[i].name = SVGA3D_TS_BIND_TEXTURE;
 
          if (queue.bind[i].view->v) {
-            svga->swc->surface_relocation(svga->swc,
-                                          &ts[i].value,
-                                          queue.bind[i].view->v->handle,
-                                          SVGA_RELOC_READ);
+            handle = queue.bind[i].view->v->handle;
          }
          else {
-            ts[i].value = SVGA3D_INVALID_ID;
+            handle = NULL;
          }
+         svga->swc->surface_relocation(svga->swc,
+                                       &ts[i].value,
+                                       handle,
+                                       SVGA_RELOC_READ);
          
          queue.bind[i].view->dirty = FALSE;
       }
@@ -157,7 +167,8 @@ fail:
 struct svga_tracked_state svga_hw_tss_binding = {
    "texture binding emit",
    SVGA_NEW_TEXTURE_BINDING |
-   SVGA_NEW_SAMPLER,
+   SVGA_NEW_SAMPLER |
+   SVGA_NEW_COMMAND_BUFFER,
    update_tss_binding
 };
 
diff --git a/src/gallium/drivers/svga/svga_swtnl.h b/src/gallium/drivers/svga/svga_swtnl.h
index 65c675f99c..fc094e5142 100644
--- a/src/gallium/drivers/svga/svga_swtnl.h
+++ b/src/gallium/drivers/svga/svga_swtnl.h
@@ -38,15 +38,8 @@ void svga_destroy_swtnl( struct svga_context *svga );
 
 
 enum pipe_error
-svga_swtnl_draw_range_elements(struct svga_context *svga,
-                               struct pipe_resource *indexBuffer,
-                               unsigned indexSize,
-                               int indexBias,
-                               unsigned min_index,
-                               unsigned max_index,
-                               unsigned prim, 
-                               unsigned start, 
-                               unsigned count);
+svga_swtnl_draw_vbo(struct svga_context *svga,
+                    const struct pipe_draw_info *info);
 
 
 #endif
diff --git a/src/gallium/drivers/svga/svga_swtnl_draw.c b/src/gallium/drivers/svga/svga_swtnl_draw.c
index eb71c23195..814e8edd70 100644
--- a/src/gallium/drivers/svga/svga_swtnl_draw.c
+++ b/src/gallium/drivers/svga/svga_swtnl_draw.c
@@ -36,13 +36,8 @@
 
 
 enum pipe_error
-svga_swtnl_draw_range_elements(struct svga_context *svga,
-                               struct pipe_resource *indexBuffer,
-                               unsigned indexSize,
-                               int indexBias,
-                               unsigned min_index,
-                               unsigned max_index,
-                               unsigned prim, unsigned start, unsigned count)
+svga_swtnl_draw_vbo(struct svga_context *svga,
+                    const struct pipe_draw_info *info)
 {
    struct pipe_transfer *vb_transfer[PIPE_MAX_ATTRIBS];
    struct pipe_transfer *ib_transfer = NULL;
@@ -76,19 +71,18 @@ svga_swtnl_draw_range_elements(struct svga_context *svga,
       draw_set_mapped_vertex_buffer(draw, i, map);
    }
 
+   /* TODO move this to update_swtnl_draw */
+   draw_set_index_buffer(draw, &svga->curr.ib);
+
    /* Map index buffer, if present */
-   if (indexBuffer) {
-      map = pipe_buffer_map(&svga->pipe, indexBuffer,
+   map = NULL;
+   if (info->indexed && svga->curr.ib.buffer) {
+      map = pipe_buffer_map(&svga->pipe, svga->curr.ib.buffer,
                             PIPE_TRANSFER_READ,
-			    &ib_transfer);
-
-      draw_set_mapped_element_buffer_range(draw, 
-                                           indexSize, indexBias,
-                                           min_index,
-                                           max_index,
-                                           map);
+                            &ib_transfer);
    }
-   
+   draw_set_mapped_index_buffer(draw, map);
+
    if (svga->curr.cb[PIPE_SHADER_VERTEX]) {
       map = pipe_buffer_map(&svga->pipe,
                             svga->curr.cb[PIPE_SHADER_VERTEX],
@@ -101,7 +95,7 @@ svga_swtnl_draw_range_elements(struct svga_context *svga,
          svga->curr.cb[PIPE_SHADER_VERTEX]->width0);
    }
 
-   draw_arrays(svga->swtnl.draw, prim, start, count);
+   draw_vbo(draw, info);
 
    draw_flush(svga->swtnl.draw);
 
@@ -117,9 +111,9 @@ svga_swtnl_draw_range_elements(struct svga_context *svga,
       draw_set_mapped_vertex_buffer(draw, i, NULL);
    }
 
-   if (indexBuffer) {
-      pipe_buffer_unmap(&svga->pipe, indexBuffer, ib_transfer);
-      draw_set_mapped_element_buffer(draw, 0, 0, NULL);
+   if (ib_transfer) {
+      pipe_buffer_unmap(&svga->pipe, svga->curr.ib.buffer, ib_transfer);
+      draw_set_mapped_index_buffer(draw, NULL);
    }
 
    if (svga->curr.cb[PIPE_SHADER_VERTEX]) {
@@ -157,7 +151,8 @@ boolean svga_init_swtnl( struct svga_context *svga )
    draw_install_aapoint_stage(svga->swtnl.draw, &svga->pipe);
    draw_install_pstipple_stage(svga->swtnl.draw, &svga->pipe);
 
-   draw_set_driver_clipping(svga->swtnl.draw, debug_get_bool_option("SVGA_SWTNL_FSE", FALSE));
+   if (debug_get_bool_option("SVGA_SWTNL_FSE", FALSE))
+      draw_set_driver_clipping(svga->swtnl.draw, TRUE, TRUE);
 
    return TRUE;
 
diff --git a/src/gallium/drivers/svga/svga_tgsi_emit.h b/src/gallium/drivers/svga/svga_tgsi_emit.h
index 48eced2ece..b4e90a957d 100644
--- a/src/gallium/drivers/svga/svga_tgsi_emit.h
+++ b/src/gallium/drivers/svga/svga_tgsi_emit.h
@@ -353,6 +353,7 @@ static INLINE ubyte svga_tgsi_sampler_type( struct svga_shader_emitter *emit,
    case PIPE_TEXTURE_1D:
       return SVGA3DSAMP_2D;
    case PIPE_TEXTURE_2D:
+   case PIPE_TEXTURE_RECT:
       return SVGA3DSAMP_2D;
    case PIPE_TEXTURE_3D:
       return SVGA3DSAMP_VOLUME;
diff --git a/src/gallium/drivers/svga/svga_tgsi_insn.c b/src/gallium/drivers/svga/svga_tgsi_insn.c
index 67e1f22a70..72dccdf150 100644
--- a/src/gallium/drivers/svga/svga_tgsi_insn.c
+++ b/src/gallium/drivers/svga/svga_tgsi_insn.c
@@ -806,6 +806,20 @@ static boolean emit_cmp(struct svga_shader_emitter *emit,
    const struct src_register src2 = translate_src_register(
       emit, &insn->Src[2] );
 
+   if (emit->unit == PIPE_SHADER_VERTEX) {
+      SVGA3dShaderDestToken temp = get_temp(emit);
+      struct src_register zero = scalar(get_zero_immediate(emit), TGSI_SWIZZLE_X);
+
+      /* Since vertex shaders don't support the CMP instruction,
+       * simulate it with SLT and LRP instructions.
+       *    SLT  TMP, SRC0, 0.0
+       *    LRP  DST, TMP, SRC1, SRC2
+       */
+      if (!submit_op2(emit, inst_token(SVGA3DOP_SLT), temp, src0, zero))
+         return FALSE;
+      return submit_op3(emit, inst_token(SVGA3DOP_LRP), dst, src(temp), src1, src2);
+   }
+
    /* CMP  DST, SRC0, SRC2, SRC1 */
    return submit_op3( emit, inst_token( SVGA3DOP_CMP ), dst, src0, src2, src1);
 }
@@ -2682,6 +2696,11 @@ needs_to_create_zero( struct svga_shader_emitter *emit )
          return TRUE;
    }
 
+   if (emit->unit == PIPE_SHADER_VERTEX) {
+      if (emit->info.opcode_count[TGSI_OPCODE_CMP] >= 1)
+         return TRUE;
+   }
+
    if (emit->info.opcode_count[TGSI_OPCODE_IF] >= 1 ||
        emit->info.opcode_count[TGSI_OPCODE_BGNLOOP] >= 1 ||
        emit->info.opcode_count[TGSI_OPCODE_DDX] >= 1 ||
diff --git a/src/gallium/drivers/trace/tr_context.c b/src/gallium/drivers/trace/tr_context.c
index 84e5a6a824..271cd4aff5 100644
--- a/src/gallium/drivers/trace/tr_context.c
+++ b/src/gallium/drivers/trace/tr_context.c
@@ -885,7 +885,7 @@ trace_sampler_view_destroy(struct pipe_context *_pipe,
    trace_dump_arg(ptr, pipe);
    trace_dump_arg(ptr, view);
 
-   pipe->sampler_view_destroy(pipe, view);
+   pipe_sampler_view_reference(&tr_view->sampler_view, NULL);
 
    trace_dump_call_end();
 
@@ -1002,7 +1002,7 @@ trace_context_set_index_buffer(struct pipe_context *_pipe,
    trace_dump_call_begin("pipe_context", "set_index_buffer");
 
    trace_dump_arg(ptr, pipe);
-   trace_dump_arg(index_buffer, ib);
+   trace_dump_arg(index_buffer, _ib);
 
    pipe->set_index_buffer(pipe, ib);
 
@@ -1063,7 +1063,10 @@ trace_context_clear(struct pipe_context *_pipe,
 
    trace_dump_arg(ptr, pipe);
    trace_dump_arg(uint, buffers);
-   trace_dump_arg_array(float, rgba, 4);
+   if (rgba)
+      trace_dump_arg_array(float, rgba, 4);
+   else
+      trace_dump_null();
    trace_dump_arg(float, depth);
    trace_dump_arg(uint, stencil);
 
diff --git a/src/gallium/include/pipe/p_compiler.h b/src/gallium/include/pipe/p_compiler.h
index 1fa3ec8300..0a5be43f6b 100644
--- a/src/gallium/include/pipe/p_compiler.h
+++ b/src/gallium/include/pipe/p_compiler.h
@@ -79,6 +79,14 @@ typedef unsigned char boolean;
 #define FALSE false
 #endif
 
+#ifndef va_copy /* provide va_copy on compilers that lack it */
+#ifdef __va_copy
+#define va_copy(dest, src) __va_copy((dest), (src))
+#else /* last resort; only works where va_list is plainly assignable */
+#define va_copy(dest, src) ((dest) = (src))
+#endif
+#endif
+
 /* Function inlining */
 #ifndef INLINE
 #  ifdef __cplusplus
diff --git a/src/gallium/include/pipe/p_context.h b/src/gallium/include/pipe/p_context.h
index 0579962ec6..0e53aef6d2 100644
--- a/src/gallium/include/pipe/p_context.h
+++ b/src/gallium/include/pipe/p_context.h
@@ -28,19 +28,37 @@
 #ifndef PIPE_CONTEXT_H
 #define PIPE_CONTEXT_H
 
-#include "p_state.h"
-
+#include "p_compiler.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-   
-struct pipe_screen;
+
+struct pipe_blend_color;
+struct pipe_blend_state;
+struct pipe_box;
+struct pipe_clip_state;
+struct pipe_depth_stencil_alpha_state;
+struct pipe_draw_info;
 struct pipe_fence_handle;
-struct pipe_state_cache;
+struct pipe_framebuffer_state;
+struct pipe_index_buffer;
 struct pipe_query;
-struct pipe_winsys;
+struct pipe_poly_stipple;
+struct pipe_rasterizer_state;
+struct pipe_resource;
+struct pipe_sampler_state;
+struct pipe_sampler_view;
+struct pipe_scissor_state;
+struct pipe_shader_state;
+struct pipe_stencil_ref;
+struct pipe_stream_output_state;
+struct pipe_subresource;
+struct pipe_surface;
+struct pipe_vertex_buffer;
+struct pipe_vertex_element;
+struct pipe_viewport_state;
 
 /**
  * Gallium rendering context.  Basically:
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index 00aa2076ed..627b5ae538 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -28,7 +28,7 @@
 #ifndef PIPE_DEFINES_H
 #define PIPE_DEFINES_H
 
-#include "p_format.h"
+#include "p_compiler.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -135,13 +135,15 @@ enum pipe_error {
 #define PIPE_STENCIL_OP_DECR_WRAP  6
 #define PIPE_STENCIL_OP_INVERT     7
 
-/** Texture types */
+/** Texture types.
+ * See the documentation for info on PIPE_TEXTURE_RECT vs PIPE_TEXTURE_2D */
 enum pipe_texture_target {
    PIPE_BUFFER       = 0,
    PIPE_TEXTURE_1D   = 1,
    PIPE_TEXTURE_2D   = 2,
    PIPE_TEXTURE_3D   = 3,
    PIPE_TEXTURE_CUBE = 4,
+   PIPE_TEXTURE_RECT = 5,
    PIPE_MAX_TEXTURE_TYPES
 };
 
diff --git a/src/gallium/include/pipe/p_format.h b/src/gallium/include/pipe/p_format.h
index 436c3f627a..06412f4894 100644
--- a/src/gallium/include/pipe/p_format.h
+++ b/src/gallium/include/pipe/p_format.h
@@ -29,8 +29,6 @@
 #ifndef PIPE_FORMAT_H
 #define PIPE_FORMAT_H
 
-#include "p_compiler.h"
-
 #ifdef __cplusplus
 extern "C" {
 #endif
diff --git a/src/gallium/include/pipe/p_shader_tokens.h b/src/gallium/include/pipe/p_shader_tokens.h
index 9df20ea858..c4bd17e92b 100644
--- a/src/gallium/include/pipe/p_shader_tokens.h
+++ b/src/gallium/include/pipe/p_shader_tokens.h
@@ -33,8 +33,6 @@
 extern "C" {
 #endif
 
-#include "p_compiler.h"
-
 
 struct tgsi_header
 {
diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h
index 0f1a44cde4..9a2b31da50 100644
--- a/src/gallium/include/pipe/p_state.h
+++ b/src/gallium/include/pipe/p_state.h
@@ -43,7 +43,6 @@
 #include "p_compiler.h"
 #include "p_defines.h"
 #include "p_format.h"
-#include "p_screen.h"
 
 
 #ifdef __cplusplus
diff --git a/src/gallium/include/state_tracker/graw.h b/src/gallium/include/state_tracker/graw.h
index 59b0e337c9..6a99b234aa 100644
--- a/src/gallium/include/state_tracker/graw.h
+++ b/src/gallium/include/state_tracker/graw.h
@@ -1,3 +1,30 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
 #ifndef GALLIUM_RAW_H
 #define GALLIUM_RAW_H
 
@@ -14,6 +41,7 @@
  * those for parsing text representations of TGSI shaders.
  */
 
+#include "pipe/p_compiler.h"
 #include "pipe/p_format.h"
 
 struct pipe_screen;
diff --git a/src/gallium/state_trackers/dri/common/dri_context.h b/src/gallium/state_trackers/dri/common/dri_context.h
index 692c49d7cd..35b870a8a3 100644
--- a/src/gallium/state_trackers/dri/common/dri_context.h
+++ b/src/gallium/state_trackers/dri/common/dri_context.h
@@ -34,7 +34,6 @@
 
 #include "pipe/p_compiler.h"
 #include "dri_wrapper.h"
-#include "main/mtypes.h"
 
 struct pipe_context;
 struct pipe_fence;
diff --git a/src/gallium/state_trackers/dri/common/dri_screen.c b/src/gallium/state_trackers/dri/common/dri_screen.c
index 6ad2c7da4d..0ab4dd1893 100644
--- a/src/gallium/state_trackers/dri/common/dri_screen.c
+++ b/src/gallium/state_trackers/dri/common/dri_screen.c
@@ -383,6 +383,11 @@ dri_init_screen_helper(struct dri_screen *screen,
    if (!screen->st_api)
       return NULL;
 
+   if(pscreen->get_param(pscreen, PIPE_CAP_NPOT_TEXTURES))
+      screen->target = PIPE_TEXTURE_2D;
+   else
+      screen->target = PIPE_TEXTURE_RECT;
+
    driParseOptionInfo(&screen->optionCache,
                       __driConfigOptions, __driNConfigOptions);
 
diff --git a/src/gallium/state_trackers/dri/common/dri_screen.h b/src/gallium/state_trackers/dri/common/dri_screen.h
index 53ccce145b..849f399b2f 100644
--- a/src/gallium/state_trackers/dri/common/dri_screen.h
+++ b/src/gallium/state_trackers/dri/common/dri_screen.h
@@ -68,6 +68,7 @@ struct dri_screen
    boolean d_depth_bits_last;
    boolean sd_depth_bits_last;
    boolean auto_fake_front;
+   enum pipe_texture_target target;
 };
 
 /** cast wrapper */
diff --git a/src/gallium/state_trackers/dri/drm/dri2.c b/src/gallium/state_trackers/dri/drm/dri2.c
index 47005c17e2..93f910a26d 100644
--- a/src/gallium/state_trackers/dri/drm/dri2.c
+++ b/src/gallium/state_trackers/dri/drm/dri2.c
@@ -195,7 +195,7 @@ dri2_drawable_process_buffers(struct dri_drawable *drawable,
       pipe_resource_reference(&drawable->textures[i], NULL);
 
    memset(&templ, 0, sizeof(templ));
-   templ.target = PIPE_TEXTURE_2D;
+   templ.target = screen->target;
    templ.last_level = 0;
    templ.width0 = dri_drawable->w;
    templ.height0 = dri_drawable->h;
@@ -342,7 +342,7 @@ dri2_create_image_from_name(__DRIcontext *context,
    memset(&templ, 0, sizeof(templ));
    templ.bind = tex_usage;
    templ.format = pf;
-   templ.target = PIPE_TEXTURE_2D;
+   templ.target = screen->target;
    templ.last_level = 0;
    templ.width0 = width;
    templ.height0 = height;
diff --git a/src/gallium/state_trackers/dri/sw/drisw.c b/src/gallium/state_trackers/dri/sw/drisw.c
index 249ccd7fcf..04bba631ae 100644
--- a/src/gallium/state_trackers/dri/sw/drisw.c
+++ b/src/gallium/state_trackers/dri/sw/drisw.c
@@ -216,7 +216,7 @@ drisw_allocate_textures(struct dri_drawable *drawable,
    }
 
    memset(&templ, 0, sizeof(templ));
-   templ.target = PIPE_TEXTURE_2D;
+   templ.target = screen->target;
    templ.width0 = width;
    templ.height0 = height;
    templ.depth0 = 1;
diff --git a/src/gallium/state_trackers/egl/Makefile b/src/gallium/state_trackers/egl/Makefile
index 9e9e479e7e..4199d7c6ba 100644
--- a/src/gallium/state_trackers/egl/Makefile
+++ b/src/gallium/state_trackers/egl/Makefile
@@ -24,7 +24,7 @@ x11_SOURCES = $(wildcard x11/*.c) \
 x11_OBJECTS = $(x11_SOURCES:.c=.o)
 
 
-kms_INCLUDES = $(shell pkg-config --cflags-only-I libdrm)
+kms_INCLUDES = -I$(TOP)/src/gallium/winsys $(shell pkg-config --cflags-only-I libdrm)
 kms_SOURCES = $(wildcard kms/*.c)
 kms_OBJECTS = $(kms_SOURCES:.c=.o)
 
diff --git a/src/gallium/state_trackers/egl/SConscript b/src/gallium/state_trackers/egl/SConscript
index e71aec35b7..efcce25e31 100644
--- a/src/gallium/state_trackers/egl/SConscript
+++ b/src/gallium/state_trackers/egl/SConscript
@@ -21,6 +21,7 @@ if 'egl' in env['statetrackers']:
         'common/egl_g3d_api.c',
         'common/egl_g3d_image.c',
         'common/egl_g3d_st.c',
+        'common/egl_g3d_sync.c',
         'common/native_helper.c',
     ]
 
diff --git a/src/gallium/state_trackers/egl/common/egl_g3d.c b/src/gallium/state_trackers/egl/common/egl_g3d.c
index 56d575ffe0..4e653bdf3b 100644
--- a/src/gallium/state_trackers/egl/common/egl_g3d.c
+++ b/src/gallium/state_trackers/egl/common/egl_g3d.c
@@ -530,6 +530,18 @@ egl_g3d_initialize(_EGLDriver *drv, _EGLDisplay *dpy,
    if (gdpy->native->get_param(gdpy->native, NATIVE_PARAM_USE_NATIVE_BUFFER))
       dpy->Extensions.KHR_image_pixmap = EGL_TRUE;
 
+   dpy->Extensions.KHR_reusable_sync = EGL_TRUE;
+   dpy->Extensions.KHR_fence_sync = EGL_TRUE;
+
+   dpy->Extensions.KHR_surfaceless_gles1 = EGL_TRUE;
+   dpy->Extensions.KHR_surfaceless_gles2 = EGL_TRUE;
+   dpy->Extensions.KHR_surfaceless_opengl = EGL_TRUE;
+
+   if (dpy->Platform == _EGL_PLATFORM_DRM) {
+      dpy->Extensions.MESA_drm_display = EGL_TRUE;
+      dpy->Extensions.MESA_drm_image = EGL_TRUE;
+   }
+
    if (egl_g3d_add_configs(drv, dpy, 1) == 1) {
       _eglError(EGL_NOT_INITIALIZED, "eglInitialize(unable to add configs)");
       goto fail;
diff --git a/src/gallium/state_trackers/egl/common/egl_g3d.h b/src/gallium/state_trackers/egl/common/egl_g3d.h
index f33dc91cf9..be450bbede 100644
--- a/src/gallium/state_trackers/egl/common/egl_g3d.h
+++ b/src/gallium/state_trackers/egl/common/egl_g3d.h
@@ -30,12 +30,14 @@
 #include "pipe/p_screen.h"
 #include "pipe/p_context.h"
 #include "pipe/p_format.h"
+#include "os/os_thread.h"
 #include "egldriver.h"
 #include "egldisplay.h"
 #include "eglcontext.h"
 #include "eglsurface.h"
 #include "eglconfig.h"
 #include "eglimage.h"
+#include "eglsync.h"
 #include "eglscreen.h"
 #include "eglmode.h"
 
@@ -99,6 +101,24 @@ struct egl_g3d_image {
 _EGL_DRIVER_STANDARD_TYPECASTS(egl_g3d)
 _EGL_DRIVER_TYPECAST(egl_g3d_image, _EGLImage, obj)
 
+#ifdef EGL_KHR_reusable_sync
+
+struct egl_g3d_sync {
+   _EGLSync base;
+   /* reference count; changed through p_atomic_* only */
+   int refs;
+
+   /* the mutex protects only the condvar, not the struct */
+   pipe_mutex mutex;
+   pipe_condvar condvar;
+
+   /* for fence sync */
+   struct pipe_fence_handle *fence;
+};
+_EGL_DRIVER_TYPECAST(egl_g3d_sync, _EGLSync, obj)
+
+#endif /* EGL_KHR_reusable_sync */
+
 #ifdef EGL_MESA_screen_surface
 
 struct egl_g3d_screen {
diff --git a/src/gallium/state_trackers/egl/common/egl_g3d_api.c b/src/gallium/state_trackers/egl/common/egl_g3d_api.c
index edac72a822..3ec53653f4 100644
--- a/src/gallium/state_trackers/egl/common/egl_g3d_api.c
+++ b/src/gallium/state_trackers/egl/common/egl_g3d_api.c
@@ -34,6 +34,7 @@
 #include "egl_g3d.h"
 #include "egl_g3d_api.h"
 #include "egl_g3d_image.h"
+#include "egl_g3d_sync.h"
 #include "egl_g3d_st.h"
 #include "egl_g3d_loader.h"
 #include "native.h"
@@ -103,7 +104,7 @@ egl_g3d_create_context(_EGLDriver *drv, _EGLDisplay *dpy, _EGLConfig *conf,
    }
 
    gctx->stctxi = gctx->stapi->create_context(gctx->stapi, gdpy->smapi,
-         &gconf->stvis, (gshare) ? gshare->stctxi : NULL);
+         (gconf) ? &gconf->stvis : NULL, (gshare) ? gshare->stctxi : NULL);
    if (!gctx->stctxi) {
       FREE(gctx);
       return NULL;
@@ -437,16 +438,19 @@ egl_g3d_make_current(_EGLDriver *drv, _EGLDisplay *dpy,
       ok = gctx->stapi->make_current(gctx->stapi, gctx->stctxi,
             (gdraw) ? gdraw->stfbi : NULL, (gread) ? gread->stfbi : NULL);
       if (ok) {
-         gctx->stctxi->notify_invalid_framebuffer(gctx->stctxi, gdraw->stfbi);
-         if (gread != gdraw) {
+         if (gdraw) {
             gctx->stctxi->notify_invalid_framebuffer(gctx->stctxi,
-                  gread->stfbi);
-         }
+                  gdraw->stfbi);
 
-         if (gdraw->base.Type == EGL_WINDOW_BIT) {
-            gctx->base.WindowRenderBuffer =
-               (gdraw->stvis.render_buffer == ST_ATTACHMENT_FRONT_LEFT) ?
-               EGL_SINGLE_BUFFER : EGL_BACK_BUFFER;
+            if (gdraw->base.Type == EGL_WINDOW_BIT) {
+               gctx->base.WindowRenderBuffer =
+                  (gdraw->stvis.render_buffer == ST_ATTACHMENT_FRONT_LEFT) ?
+                  EGL_SINGLE_BUFFER : EGL_BACK_BUFFER;
+            }
+         }
+         if (gread && gread != gdraw) {
+            gctx->stctxi->notify_invalid_framebuffer(gctx->stctxi,
+                  gread->stfbi);
          }
       }
    }
@@ -805,6 +809,17 @@ egl_g3d_init_driver_api(_EGLDriver *drv)
 
    drv->API.CreateImageKHR = egl_g3d_create_image;
    drv->API.DestroyImageKHR = egl_g3d_destroy_image;
+#ifdef EGL_MESA_drm_image
+   drv->API.CreateDRMImageMESA = egl_g3d_create_drm_image;
+   drv->API.ExportDRMImageMESA = egl_g3d_export_drm_image;
+#endif
+
+#ifdef EGL_KHR_reusable_sync
+   drv->API.CreateSyncKHR = egl_g3d_create_sync;
+   drv->API.DestroySyncKHR = egl_g3d_destroy_sync;
+   drv->API.ClientWaitSyncKHR = egl_g3d_client_wait_sync;
+   drv->API.SignalSyncKHR = egl_g3d_signal_sync;
+#endif
 
 #ifdef EGL_MESA_screen_surface
    drv->API.CreateScreenSurfaceMESA = egl_g3d_create_screen_surface;
diff --git a/src/gallium/state_trackers/egl/common/egl_g3d_image.c b/src/gallium/state_trackers/egl/common/egl_g3d_image.c
index 1e13cfcf7e..558638e72f 100644
--- a/src/gallium/state_trackers/egl/common/egl_g3d_image.c
+++ b/src/gallium/state_trackers/egl/common/egl_g3d_image.c
@@ -31,12 +31,16 @@
 #include "util/u_rect.h"
 #include "util/u_inlines.h"
 #include "eglcurrent.h"
+#include "egllog.h"
 
 #include "native.h"
 #include "egl_g3d.h"
 #include "egl_g3d_api.h"
 #include "egl_g3d_image.h"
 
+/* move this to native display? */
+#include "state_tracker/drm_driver.h"
+
 /**
  * Reference and return the front left buffer of the native pixmap.
  */
@@ -67,6 +71,165 @@ egl_g3d_reference_native_pixmap(_EGLDisplay *dpy, EGLNativePixmapType pix)
    return textures[natt];
 }
 
+#ifdef EGL_MESA_drm_image
+
+/* Create the 2D texture for a new DRM image; NULL on bad or missing attribs. */
+static struct pipe_resource *
+egl_g3d_create_drm_buffer(_EGLDisplay *dpy, const EGLint *attribs)
+{
+   struct egl_g3d_display *gdpy = egl_g3d_display(dpy);
+   struct pipe_screen *screen = gdpy->native->screen;
+   struct pipe_resource templ;
+   EGLint width = 0, height = 0, format = 0, use = 0;
+   EGLint valid_use;
+   EGLint i, err = EGL_SUCCESS;
+
+   for (i = 0; attribs && attribs[i] != EGL_NONE; i++) {
+      EGLint attr = attribs[i++];
+      EGLint val = attribs[i];
+
+      switch (attr) {
+      case EGL_WIDTH:
+         width = val;
+         break;
+      case EGL_HEIGHT:
+         height = val;
+         break;
+      case EGL_DRM_BUFFER_FORMAT_MESA:
+         format = val;
+         break;
+      case EGL_DRM_BUFFER_USE_MESA:
+         use = val;
+         break;
+      default:
+         err = EGL_BAD_ATTRIBUTE;
+         break;
+      }
+
+      if (err != EGL_SUCCESS) {
+         _eglLog(_EGL_DEBUG, "bad image attribute 0x%04x", attr);
+         return NULL;
+      }
+   }
+
+   if (width <= 0 || height <= 0) {
+      _eglLog(_EGL_DEBUG, "bad width or height (%dx%d)", width, height);
+      return NULL;
+   }
+
+   switch (format) {
+   case EGL_DRM_BUFFER_FORMAT_ARGB32_MESA:
+      format = PIPE_FORMAT_B8G8R8A8_UNORM;
+      break;
+   default:
+      _eglLog(_EGL_DEBUG, "bad image format value 0x%04x", format);
+      return NULL;
+   }
+
+   valid_use = EGL_DRM_BUFFER_USE_SCANOUT_MESA |
+               EGL_DRM_BUFFER_USE_SHARE_MESA;
+   if (use & ~valid_use) {
+      _eglLog(_EGL_DEBUG, "bad image use bit 0x%04x", use);
+      return NULL;
+   }
+
+   memset(&templ, 0, sizeof(templ));
+   templ.target = PIPE_TEXTURE_2D;
+   templ.format = format;
+   templ.bind = PIPE_BIND_RENDER_TARGET | PIPE_BIND_SAMPLER_VIEW;
+   templ.width0 = width;
+   templ.height0 = height;
+   templ.depth0 = 1;
+
+   /*
+    * XXX fix apps (e.g. wayland) and pipe drivers (e.g. i915) and remove the
+    * size check
+    */
+   if ((use & EGL_DRM_BUFFER_USE_SCANOUT_MESA) &&
+       width >= 640 && height >= 480)
+      templ.bind |= PIPE_BIND_SCANOUT;
+   if (use & EGL_DRM_BUFFER_USE_SHARE_MESA)
+      templ.bind |= PIPE_BIND_SHARED;
+
+   return screen->resource_create(screen, &templ);
+}
+
+/* Wrap DRM buffer "name" in a 2D texture; NULL on bad or missing attribs. */
+static struct pipe_resource *
+egl_g3d_reference_drm_buffer(_EGLDisplay *dpy, EGLint name,
+                             const EGLint *attribs)
+{
+   struct egl_g3d_display *gdpy = egl_g3d_display(dpy);
+   struct pipe_screen *screen = gdpy->native->screen;
+   struct pipe_resource templ;
+   struct winsys_handle wsh;
+   EGLint width = 0, height = 0, format = 0, stride = 0;
+   EGLint i, err = EGL_SUCCESS;
+
+   /* winsys_handle is in theory platform-specific */
+   if (dpy->Platform != _EGL_PLATFORM_DRM)
+      return NULL;
+
+   for (i = 0; attribs && attribs[i] != EGL_NONE; i++) {
+      EGLint attr = attribs[i++];
+      EGLint val = attribs[i];
+
+      switch (attr) {
+      case EGL_WIDTH:
+         width = val;
+         break;
+      case EGL_HEIGHT:
+         height = val;
+         break;
+      case EGL_DRM_BUFFER_FORMAT_MESA:
+         format = val;
+         break;
+      case EGL_DRM_BUFFER_STRIDE_MESA:
+         stride = val;
+         break;
+      default:
+         err = EGL_BAD_ATTRIBUTE;
+         break;
+      }
+
+      if (err != EGL_SUCCESS) {
+         _eglLog(_EGL_DEBUG, "bad image attribute 0x%04x", attr);
+         return NULL;
+      }
+   }
+
+   if (width <= 0 || height <= 0 || stride <= 0) {
+      _eglLog(_EGL_DEBUG, "bad width, height, or stride (%dx%dx%d)",
+            width, height, stride);
+      return NULL;
+   }
+
+   switch (format) {
+   case EGL_DRM_BUFFER_FORMAT_ARGB32_MESA:
+      format = PIPE_FORMAT_B8G8R8A8_UNORM;
+      break;
+   default:
+      _eglLog(_EGL_DEBUG, "bad image format value 0x%04x", format);
+      return NULL;
+   }
+
+   memset(&templ, 0, sizeof(templ));
+   templ.target = PIPE_TEXTURE_2D;
+   templ.format = format;
+   templ.bind = PIPE_BIND_RENDER_TARGET | PIPE_BIND_SAMPLER_VIEW;
+   templ.width0 = width;
+   templ.height0 = height;
+   templ.depth0 = 1;
+
+   memset(&wsh, 0, sizeof(wsh));
+   wsh.handle = (unsigned) name;
+   wsh.stride = stride;
+
+   return screen->resource_from_handle(screen, &templ, &wsh);
+}
+
+#endif /* EGL_MESA_drm_image */
+
 _EGLImage *
 egl_g3d_create_image(_EGLDriver *drv, _EGLDisplay *dpy, _EGLContext *ctx,
                      EGLenum target, EGLClientBuffer buffer,
@@ -92,6 +255,11 @@ egl_g3d_create_image(_EGLDriver *drv, _EGLDisplay *dpy, _EGLContext *ctx,
       ptex = egl_g3d_reference_native_pixmap(dpy,
             (EGLNativePixmapType) buffer);
       break;
+#ifdef EGL_MESA_drm_image
+   case EGL_DRM_BUFFER_MESA:
+      ptex = egl_g3d_reference_drm_buffer(dpy, (EGLint) buffer, attribs);
+      break;
+#endif
    default:
       ptex = NULL;
       break;
@@ -134,3 +302,80 @@ egl_g3d_destroy_image(_EGLDriver *drv, _EGLDisplay *dpy, _EGLImage *img)
 
    return EGL_TRUE;
 }
+
+/* Back eglCreateDRMImageMESA: make an image that owns a new DRM buffer. */
+_EGLImage *
+egl_g3d_create_drm_image(_EGLDriver *drv, _EGLDisplay *dpy,
+                         const EGLint *attribs)
+{
+   struct egl_g3d_image *gimg;
+   struct pipe_resource *ptex;
+
+   gimg = CALLOC_STRUCT(egl_g3d_image);
+   if (!gimg) {
+      _eglError(EGL_BAD_ALLOC, "eglCreateDRMImageMESA");
+      return NULL;
+   }
+
+   if (!_eglInitImage(&gimg->base, dpy, attribs)) {
+      FREE(gimg);
+      return NULL;
+   }
+
+#ifdef EGL_MESA_drm_image
+   ptex = egl_g3d_create_drm_buffer(dpy, attribs);
+#else
+   ptex = NULL;
+#endif
+   if (!ptex) {
+      FREE(gimg);
+      return NULL;
+   }
+
+   /* transfer the ownership to the image */
+   gimg->texture = ptex;
+   gimg->face = 0;
+   gimg->level = 0;
+   gimg->zslice = 0;
+   return &gimg->base;
+}
+
+/* Export the image's shared name and/or its KMS handle and stride. */
+EGLBoolean
+egl_g3d_export_drm_image(_EGLDriver *drv, _EGLDisplay *dpy, _EGLImage *img,
+                         EGLint *name, EGLint *handle, EGLint *stride)
+{
+   struct egl_g3d_display *gdpy = egl_g3d_display(dpy);
+   struct egl_g3d_image *gimg = egl_g3d_image(img);
+   struct pipe_screen *screen = gdpy->native->screen;
+   struct winsys_handle wsh;
+
+   /* winsys_handle is in theory platform-specific */
+   if (dpy->Platform != _EGL_PLATFORM_DRM)
+      return EGL_FALSE;
+
+   /* get shared handle */
+   if (name) {
+      memset(&wsh, 0, sizeof(wsh));
+      wsh.type = DRM_API_HANDLE_TYPE_SHARED;
+      if (!screen->resource_get_handle(screen, gimg->texture, &wsh))
+         return EGL_FALSE;
+
+      *name = wsh.handle;
+   }
+
+   /* get KMS handle */
+   if (handle || stride) {
+      memset(&wsh, 0, sizeof(wsh));
+      wsh.type = DRM_API_HANDLE_TYPE_KMS;
+      if (!screen->resource_get_handle(screen, gimg->texture, &wsh))
+         return EGL_FALSE;
+
+      if (handle)
+         *handle = wsh.handle;
+      if (stride)
+         *stride = wsh.stride;
+   }
+
+   return EGL_TRUE;
+}
diff --git a/src/gallium/state_trackers/egl/common/egl_g3d_image.h b/src/gallium/state_trackers/egl/common/egl_g3d_image.h
index adda933371..f051da8283 100644
--- a/src/gallium/state_trackers/egl/common/egl_g3d_image.h
+++ b/src/gallium/state_trackers/egl/common/egl_g3d_image.h
@@ -39,4 +39,12 @@ egl_g3d_create_image(_EGLDriver *drv, _EGLDisplay *dpy, _EGLContext *ctx,
 EGLBoolean
 egl_g3d_destroy_image(_EGLDriver *drv, _EGLDisplay *dpy, _EGLImage *image);
 
+_EGLImage *
+egl_g3d_create_drm_image(_EGLDriver *drv, _EGLDisplay *dpy,
+                         const EGLint *attribs);
+
+EGLBoolean
+egl_g3d_export_drm_image(_EGLDriver *drv, _EGLDisplay *dpy, _EGLImage *img,
+			 EGLint *name, EGLint *handle, EGLint *stride);
+
 #endif /* _EGL_G3D_IMAGE_H_ */
diff --git a/src/gallium/state_trackers/egl/common/egl_g3d_sync.c b/src/gallium/state_trackers/egl/common/egl_g3d_sync.c
new file mode 100644
index 0000000000..ec74e9eb94
--- /dev/null
+++ b/src/gallium/state_trackers/egl/common/egl_g3d_sync.c
@@ -0,0 +1,284 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  7.9
+ *
+ * Copyright (C) 2010 LunarG Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chia-I Wu <olv@lunarg.com>
+ */
+
+#include "util/u_memory.h"
+#include "util/u_atomic.h"
+#include "os/os_thread.h"
+#include "eglsync.h"
+#include "eglcurrent.h"
+
+#include "egl_g3d.h"
+#include "egl_g3d_sync.h"
+
+#ifdef EGL_KHR_reusable_sync
+
+/**
+ * Block on the sync's condition variable; timeout is ignored (waits forever).
+ */
+static EGLint
+egl_g3d_wait_sync_condvar(struct egl_g3d_sync *gsync, EGLTimeKHR timeout)
+{
+   _EGLDisplay *dpy = gsync->base.Resource.Display;
+
+   pipe_mutex_lock(gsync->mutex);
+
+   /* unlock display lock just before waiting */
+   _eglUnlockMutex(&dpy->Mutex);
+
+   /* No timed wait.  Always treat timeout as EGL_FOREVER_KHR */
+   pipe_condvar_wait(gsync->condvar, gsync->mutex);
+
+   /* drop the sync mutex first: signalers take it while holding dpy->Mutex */
+   pipe_mutex_unlock(gsync->mutex);
+   _eglLockMutex(&dpy->Mutex);
+
+   return EGL_CONDITION_SATISFIED_KHR;
+}
+
+/**
+ * Wake every thread waiting on the sync's condition variable.
+ */
+static void
+egl_g3d_signal_sync_condvar(struct egl_g3d_sync *gsync)
+{
+   pipe_mutex_lock(gsync->mutex);
+   pipe_condvar_broadcast(gsync->condvar);
+   pipe_mutex_unlock(gsync->mutex);
+}
+
+/**
+ * Flush the current context and store the fence it returns in the sync.
+ */
+static EGLint
+egl_g3d_insert_fence_sync(struct egl_g3d_sync *gsync)
+{
+   _EGLContext *ctx = _eglGetCurrentContext();
+   struct egl_g3d_context *gctx = egl_g3d_context(ctx);
+
+   /* already checked in egl_g3d_create_sync */
+   assert(gctx);
+
+   /* insert the fence command; with no fence back, mark the sync signaled */
+   gctx->stctxi->flush(gctx->stctxi, 0x0, &gsync->fence);
+   if (!gsync->fence)
+      gsync->base.SyncStatus = EGL_SIGNALED_KHR;
+
+   return EGL_SUCCESS;
+}
+
+/**
+ * Wait for a fence sync: finish the fence, or wait for whoever took it.
+ */
+static EGLint
+egl_g3d_wait_fence_sync(struct egl_g3d_sync *gsync, EGLTimeKHR timeout)
+{
+   EGLint ret;
+
+   if (gsync->fence) {
+      _EGLDisplay *dpy = gsync->base.Resource.Display;
+      struct egl_g3d_display *gdpy = egl_g3d_display(dpy);
+      struct pipe_screen *screen = gdpy->native->screen;
+      struct pipe_fence_handle *fence = gsync->fence;
+      /* take over the fence; callers that find it gone wait on the condvar */
+      gsync->fence = NULL;
+
+      _eglUnlockMutex(&dpy->Mutex);
+      /* no timed finish?  the wait runs with the display unlocked */
+      screen->fence_finish(screen, fence, 0x0);
+      ret = EGL_CONDITION_SATISFIED_KHR;
+      _eglLockMutex(&dpy->Mutex);
+
+      gsync->base.SyncStatus = EGL_SIGNALED_KHR;
+      /* release the fence and wake the threads waiting on the condvar */
+      screen->fence_reference(screen, &fence, NULL);
+      egl_g3d_signal_sync_condvar(gsync);
+   }
+   else {
+      ret = egl_g3d_wait_sync_condvar(gsync, timeout);
+   }
+
+   return ret;
+}
+
+static INLINE void
+egl_g3d_ref_sync(struct egl_g3d_sync *gsync)
+{
+   p_atomic_inc(&gsync->refs); /* take one more reference on the sync */
+}
+
+static INLINE void
+egl_g3d_unref_sync(struct egl_g3d_sync *gsync)
+{
+   if (p_atomic_dec_zero(&gsync->refs)) { /* presumably: last reference */
+      pipe_condvar_destroy(gsync->condvar);
+      pipe_mutex_destroy(gsync->mutex);
+      /* release the fence if one is still attached */
+      if (gsync->fence) {
+         struct egl_g3d_display *gdpy =
+            egl_g3d_display(gsync->base.Resource.Display);
+         struct pipe_screen *screen = gdpy->native->screen;
+
+         screen->fence_reference(screen, &gsync->fence, NULL);
+      }
+
+      FREE(gsync);
+   }
+}
+
+_EGLSync *
+egl_g3d_create_sync(_EGLDriver *drv, _EGLDisplay *dpy,
+                    EGLenum type, const EGLint *attrib_list)
+{
+   _EGLContext *ctx = _eglGetCurrentContext();
+   struct egl_g3d_sync *gsync;
+   EGLint err;
+   /* a context current on this display is required */
+   if (!ctx || ctx->Resource.Display != dpy) {
+      _eglError(EGL_BAD_MATCH, "eglCreateSyncKHR");
+      return NULL;
+   }
+
+   gsync = CALLOC_STRUCT(egl_g3d_sync);
+   if (!gsync) {
+      _eglError(EGL_BAD_ALLOC, "eglCreateSyncKHR");
+      return NULL;
+   }
+
+   if (!_eglInitSync(&gsync->base, dpy, type, attrib_list)) {
+      FREE(gsync);
+      return NULL;
+   }
+
+   switch (type) {
+   case EGL_SYNC_REUSABLE_KHR:
+      err = EGL_SUCCESS;
+      break;
+   case EGL_SYNC_FENCE_KHR:
+      err = egl_g3d_insert_fence_sync(gsync);
+      break;
+   default:
+      err = EGL_BAD_ATTRIBUTE;
+      break;
+   }
+
+   if (err != EGL_SUCCESS) {
+      _eglError(err, "eglCreateSyncKHR");
+      FREE(gsync);
+      return NULL;
+   }
+   /* set up the condvar and its mutex; the sync starts with one reference */
+   pipe_mutex_init(gsync->mutex);
+   pipe_condvar_init(gsync->condvar);
+   p_atomic_set(&gsync->refs, 1);
+
+   return &gsync->base;
+}
+
+EGLBoolean
+egl_g3d_destroy_sync(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync)
+{
+   struct egl_g3d_sync *gsync = egl_g3d_sync(sync);
+
+   switch (gsync->base.Type) {
+   case EGL_SYNC_REUSABLE_KHR:
+      /* signal the waiters */
+      if (gsync->base.SyncStatus != EGL_SIGNALED_KHR) {
+         gsync->base.SyncStatus = EGL_SIGNALED_KHR;
+         egl_g3d_signal_sync_condvar(gsync);
+      }
+      break;
+   default:
+      break;
+   }
+   /* drop this reference; a thread still waiting holds its own */
+   egl_g3d_unref_sync(gsync);
+
+   return EGL_TRUE;
+}
+
+/* Wait on a sync object, optionally flushing the current context first. */
+EGLint
+egl_g3d_client_wait_sync(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync,
+                         EGLint flags, EGLTimeKHR timeout)
+{
+   struct egl_g3d_sync *gsync = egl_g3d_sync(sync);
+   EGLint ret = EGL_CONDITION_SATISFIED_KHR;
+
+   if (gsync->base.SyncStatus != EGL_SIGNALED_KHR) {
+      /* flush if there is a current context */
+      if (flags & EGL_SYNC_FLUSH_COMMANDS_BIT_KHR) {
+         _EGLContext *ctx = _eglGetCurrentContext();
+         struct egl_g3d_context *gctx = egl_g3d_context(ctx);
+
+         if (gctx)
+            gctx->stctxi->flush(gctx->stctxi, PIPE_FLUSH_RENDER_CACHE, NULL);
+      }
+
+      if (timeout) {
+         /* reference the sync object in case it is destroyed while waiting */
+         egl_g3d_ref_sync(gsync);
+         switch (gsync->base.Type) {
+         case EGL_SYNC_REUSABLE_KHR:
+            ret = egl_g3d_wait_sync_condvar(gsync, timeout);
+            break;
+         case EGL_SYNC_FENCE_KHR:
+            ret = egl_g3d_wait_fence_sync(gsync, timeout);
+            break;
+         default:
+            break;
+         }
+         egl_g3d_unref_sync(gsync);
+      }
+      else {
+         ret = EGL_TIMEOUT_EXPIRED_KHR;
+      }
+   }
+
+   return ret;
+}
+
+EGLBoolean
+egl_g3d_signal_sync(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync,
+                    EGLenum mode)
+{
+   struct egl_g3d_sync *gsync = egl_g3d_sync(sync);
+
+   /* only for reusable sync */
+   if (sync->Type != EGL_SYNC_REUSABLE_KHR)
+      return _eglError(EGL_BAD_MATCH, "eglSignalSyncKHR");
+   /* waiters are woken only when the status changes to signaled */
+   if (gsync->base.SyncStatus != mode) {
+      gsync->base.SyncStatus = mode;
+      if (mode == EGL_SIGNALED_KHR)
+         egl_g3d_signal_sync_condvar(gsync);
+   }
+
+   return EGL_TRUE;
+}
+
+#endif /* EGL_KHR_reusable_sync */
diff --git a/src/gallium/state_trackers/egl/common/egl_g3d_sync.h b/src/gallium/state_trackers/egl/common/egl_g3d_sync.h
new file mode 100644
index 0000000000..3179ca04e1
--- /dev/null
+++ b/src/gallium/state_trackers/egl/common/egl_g3d_sync.h
@@ -0,0 +1,53 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  7.9
+ *
+ * Copyright (C) 2010 LunarG Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chia-I Wu <olv@lunarg.com>
+ */
+
+#ifndef _EGL_G3D_SYNC_H_
+#define _EGL_G3D_SYNC_H_
+
+#include "egl_g3d.h"
+
+#ifdef EGL_KHR_reusable_sync
+
+_EGLSync *
+egl_g3d_create_sync(_EGLDriver *drv, _EGLDisplay *dpy,
+                    EGLenum type, const EGLint *attrib_list);
+
+EGLBoolean
+egl_g3d_destroy_sync(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync);
+
+EGLint
+egl_g3d_client_wait_sync(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync,
+                         EGLint flags, EGLTimeKHR timeout);
+
+EGLBoolean
+egl_g3d_signal_sync(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync,
+                    EGLenum mode);
+
+#endif /* EGL_KHR_reusable_sync */
+
+#endif /* _EGL_G3D_SYNC_H_ */
diff --git a/src/gallium/state_trackers/egl/kms/native_kms.c b/src/gallium/state_trackers/egl/kms/native_kms.c
index d4e8fbc913..208f73306c 100644
--- a/src/gallium/state_trackers/egl/kms/native_kms.c
+++ b/src/gallium/state_trackers/egl/kms/native_kms.c
@@ -38,6 +38,10 @@
 
 #include "native_kms.h"
 
+/* see get_drm_screen_name */
+#include <radeon_drm.h>
+#include "radeon/drm/radeon_drm.h"
+
 static boolean
 kms_surface_validate(struct native_surface *nsurf, uint attachment_mask,
                      unsigned int *seq_num, struct pipe_resource **textures,
@@ -584,7 +588,9 @@ kms_display_get_configs(struct native_display *ndpy, int *num_configs)
 
       nconf->color_format = format;
 
-      nconf->scanout_bit = TRUE;
+      /* support KMS */
+      if (kdpy->resources)
+         nconf->scanout_bit = TRUE;
    }
 
    configs = MALLOC(sizeof(*configs));
@@ -664,6 +670,27 @@ kms_display_destroy(struct native_display *ndpy)
    FREE(kdpy);
 }
 
+/* Return the screen name to use for fd; NULL if the radeon query fails. */
+static const char *
+get_drm_screen_name(int fd, drmVersionPtr version)
+{
+   struct drm_radeon_info req;
+   int device_id;
+
+   /* names other than "radeon" (including NULL) are passed through as-is */
+   if (!version->name || strcmp(version->name, "radeon") != 0)
+      return version->name;
+
+   /* "radeon" is split into "r300" and "r600" based on the device id */
+   memset(&req, 0, sizeof(req));
+   req.request = RADEON_INFO_DEVICE_ID;
+   req.value = pointer_to_intptr(&device_id);
+   if (drmCommandWriteRead(fd, DRM_RADEON_INFO, &req, sizeof(req)) != 0)
+      return NULL;
+
+   return is_r3xx(device_id) ? "r300" : "r600";
+}
+
 /**
  * Initialize KMS and pipe screen.
  */
@@ -672,6 +699,7 @@ kms_display_init_screen(struct native_display *ndpy)
 {
    struct kms_display *kdpy = kms_display(ndpy);
    drmVersionPtr version;
+   const char *name;
 
    version = drmGetVersion(kdpy->fd);
    if (!version) {
@@ -679,8 +707,11 @@ kms_display_init_screen(struct native_display *ndpy)
       return FALSE;
    }
 
-   kdpy->base.screen = kdpy->event_handler->new_drm_screen(&kdpy->base,
-         version->name, kdpy->fd);;
+   name = get_drm_screen_name(kdpy->fd, version);
+   if (name) {
+      kdpy->base.screen =
+         kdpy->event_handler->new_drm_screen(&kdpy->base, name, kdpy->fd);
+   }
    drmFreeVersion(version);
 
    if (!kdpy->base.screen) {
@@ -717,32 +748,32 @@ kms_create_display(int fd, struct native_event_handler *event_handler,
       return NULL;
    }
 
+   kdpy->base.destroy = kms_display_destroy;
+   kdpy->base.get_param = kms_display_get_param;
+   kdpy->base.get_configs = kms_display_get_configs;
+
    /* resources are fixed, unlike crtc, connector, or encoder */
    kdpy->resources = drmModeGetResources(kdpy->fd);
-   if (!kdpy->resources) {
-      kms_display_destroy(&kdpy->base);
-      return NULL;
-   }
+   if (kdpy->resources) {
+      kdpy->saved_crtcs =
+         CALLOC(kdpy->resources->count_crtcs, sizeof(*kdpy->saved_crtcs));
+      if (!kdpy->saved_crtcs) {
+         kms_display_destroy(&kdpy->base);
+         return NULL;
+      }
 
-   kdpy->saved_crtcs =
-      CALLOC(kdpy->resources->count_crtcs, sizeof(*kdpy->saved_crtcs));
-   if (!kdpy->saved_crtcs) {
-      kms_display_destroy(&kdpy->base);
-      return NULL;
-   }
+      kdpy->shown_surfaces =
+         CALLOC(kdpy->resources->count_crtcs, sizeof(*kdpy->shown_surfaces));
+      if (!kdpy->shown_surfaces) {
+         kms_display_destroy(&kdpy->base);
+         return NULL;
+      }
 
-   kdpy->shown_surfaces =
-      CALLOC(kdpy->resources->count_crtcs, sizeof(*kdpy->shown_surfaces));
-   if (!kdpy->shown_surfaces) {
-      kms_display_destroy(&kdpy->base);
-      return NULL;
+      kdpy->base.modeset = &kms_display_modeset;
+   }
+   else {
+      _eglLog(_EGL_DEBUG, "Failed to get KMS resources.  Disable modeset.");
    }
-
-   kdpy->base.destroy = kms_display_destroy;
-   kdpy->base.get_param = kms_display_get_param;
-   kdpy->base.get_configs = kms_display_get_configs;
-
-   kdpy->base.modeset = &kms_display_modeset;
 
    return &kdpy->base;
 }
diff --git a/src/gallium/state_trackers/glx/xlib/glx_api.c b/src/gallium/state_trackers/glx/xlib/glx_api.c
index eb8d6a1933..dcd50e19d7 100644
--- a/src/gallium/state_trackers/glx/xlib/glx_api.c
+++ b/src/gallium/state_trackers/glx/xlib/glx_api.c
@@ -34,10 +34,6 @@
 #include "GL/glx.h"
 
 #include "xm_api.h"
-#include "main/context.h"
-#include "main/macros.h"
-#include "main/imports.h"
-#include "main/version.h"
 
 
 /* This indicates the client-side GLX API and GLX encoder version. */
@@ -603,8 +599,8 @@ destroy_visuals_on_display(Display *dpy)
 static int
 close_display_callback(Display *dpy, XExtCodes *codes)
 {
-   destroy_visuals_on_display(dpy);
    xmesa_destroy_buffers_on_display(dpy);
+   destroy_visuals_on_display(dpy);
    return 0;
 }
 
@@ -1299,7 +1295,7 @@ glXCopyContext( Display *dpy, GLXContext src, GLXContext dst,
    XMesaContext xm_dst = dst->xmesaContext;
    (void) dpy;
    if (MakeCurrent_PrevContext == src) {
-      _mesa_Flush();
+      glFlush();
    }
    XMesaCopyContext(xm_src, xm_dst, mask);
 }
diff --git a/src/gallium/state_trackers/glx/xlib/glx_usefont.c b/src/gallium/state_trackers/glx/xlib/glx_usefont.c
index 8903b0e6cb..0aa37e150b 100644
--- a/src/gallium/state_trackers/glx/xlib/glx_usefont.c
+++ b/src/gallium/state_trackers/glx/xlib/glx_usefont.c
@@ -30,8 +30,7 @@
  */
 
 
-#include "main/context.h"
-#include "main/imports.h"
+#include "main/core.h"
 #include <GL/glx.h>
 
 
diff --git a/src/gallium/state_trackers/glx/xlib/xm_api.c b/src/gallium/state_trackers/glx/xlib/xm_api.c
index c0c418306f..eb4ce74266 100644
--- a/src/gallium/state_trackers/glx/xlib/xm_api.c
+++ b/src/gallium/state_trackers/glx/xlib/xm_api.c
@@ -56,7 +56,6 @@
 #include "xm_api.h"
 #include "xm_st.h"
 
-#include "main/context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_screen.h"
 #include "pipe/p_context.h"
@@ -72,10 +71,35 @@
 static struct xm_driver driver;
 static struct st_api *stapi;
 
+/* Default strict invalidate to false.  This means we will not call
+ * XGetGeometry after every swapbuffers, which allows swapbuffers to
+ * remain asynchronous.  For apps running at 100fps with synchronous
+ * swapping, a 10% boost is typical.  For gears, I see closer to 20%
+ * speedup.
+ *
+ * Note that the work of copying data on swapbuffers doesn't disappear
+ * - this change just allows the X server to execute the PutImage
+ * asynchronously without us effectively blocked until its completion.
+ *
+ * This speeds up even llvmpipe's threaded rasterization as the
+ * swapbuffers operation was a large part of the serial component of
+ * an llvmpipe frame.
+ *
+ * The downside of this is correctness - applications which don't call
+ * glViewport on window resizes will get incorrect rendering.  A
+ * better solution would be to have per-frame but asynchronous
+ * invalidation.  Xcb almost looks as if it could provide this, but
+ * the API doesn't seem to quite be there.
+ */
+boolean xmesa_strict_invalidate = FALSE;
+
 void xmesa_set_driver( const struct xm_driver *templ )
 {
    driver = *templ;
    stapi = driver.create_st_api();
+
+   xmesa_strict_invalidate =
+      debug_get_bool_option("XMESA_STRICT_INVALIDATE", FALSE);
 }
 
 
@@ -91,7 +115,12 @@ static int
 xmesa_get_param(struct st_manager *smapi,
                 enum st_manager_param param)
 {
-   return 0;
+   switch(param) {
+   case ST_MANAGER_BROKEN_INVALIDATE:
+      return !xmesa_strict_invalidate;
+   default:
+      return 0;
+   }
 }
 
 static XMesaDisplay
@@ -263,7 +292,6 @@ xmesa_get_window_size(Display *dpy, XMesaBuffer b,
    Status stat;
 
    pipe_mutex_lock(xmdpy->mutex);
-   XSync(b->xm_visual->display, 0); /* added for Chromium */
    stat = get_drawable_size(dpy, b->ws.drawable, width, height);
    pipe_mutex_unlock(xmdpy->mutex);
 
@@ -726,15 +754,39 @@ XMesaVisual XMesaCreateVisual( Display *display,
       alpha_bits = v->mesa_visual.alphaBits;
    }
 
-   _mesa_initialize_visual( &v->mesa_visual,
-                            db_flag, stereo_flag,
-                            red_bits, green_bits,
-                            blue_bits, alpha_bits,
-                            depth_size,
-                            stencil_size,
-                            accum_red_size, accum_green_size,
-                            accum_blue_size, accum_alpha_size,
-                            0 );
+   /* initialize visual */
+   {
+      __GLcontextModes *vis = &v->mesa_visual;
+
+      vis->rgbMode          = GL_TRUE;
+      vis->doubleBufferMode = db_flag;
+      vis->stereoMode       = stereo_flag;
+
+      vis->redBits          = red_bits;
+      vis->greenBits        = green_bits;
+      vis->blueBits         = blue_bits;
+      vis->alphaBits        = alpha_bits;
+      vis->rgbBits          = red_bits + green_bits + blue_bits;
+
+      vis->indexBits      = 0;
+      vis->depthBits      = depth_size;
+      vis->stencilBits    = stencil_size;
+
+      vis->accumRedBits   = accum_red_size;
+      vis->accumGreenBits = accum_green_size;
+      vis->accumBlueBits  = accum_blue_size;
+      vis->accumAlphaBits = accum_alpha_size;
+
+      vis->haveAccumBuffer   = accum_red_size > 0;
+      vis->haveDepthBuffer   = depth_size > 0;
+      vis->haveStencilBuffer = stencil_size > 0;
+
+      vis->numAuxBuffers = 0;
+      vis->level = 0;
+      vis->pixmapMode = 0;
+      vis->sampleBuffers = 0;
+      vis->samples = 0;
+   }
 
    v->stvis.buffer_mask = ST_ATTACHMENT_FRONT_LEFT_MASK;
    if (db_flag)
@@ -1154,7 +1206,7 @@ void XMesaFlush( XMesaContext c )
          xmdpy->screen->fence_finish(xmdpy->screen, fence, 0);
          xmdpy->screen->fence_reference(xmdpy->screen, &fence, NULL);
       }
-      XSync( c->xm_visual->display, False );
+      XFlush( c->xm_visual->display );
    }
 }
 
diff --git a/src/gallium/state_trackers/glx/xlib/xm_api.h b/src/gallium/state_trackers/glx/xlib/xm_api.h
index 4f2c8a6e6a..f209b14ea1 100644
--- a/src/gallium/state_trackers/glx/xlib/xm_api.h
+++ b/src/gallium/state_trackers/glx/xlib/xm_api.h
@@ -57,7 +57,7 @@ and create a window, you must do the following to use the X/Mesa interface:
 #define XMESA_H
 
 
-#include "main/mtypes.h"
+#include "main/core.h" /* for GLvisual and MESA_VERSION_STRING */
 #include "state_tracker/st_api.h"
 #include "os/os_thread.h"
 
@@ -378,6 +378,6 @@ xmesa_buffer_height(XMesaBuffer b)
    return b->height;
 }
 
-
+extern boolean xmesa_strict_invalidate;
 
 #endif
diff --git a/src/gallium/state_trackers/glx/xlib/xm_st.c b/src/gallium/state_trackers/glx/xlib/xm_st.c
index c62eb8bfbd..0f74b3f7aa 100644
--- a/src/gallium/state_trackers/glx/xlib/xm_st.c
+++ b/src/gallium/state_trackers/glx/xlib/xm_st.c
@@ -26,18 +26,18 @@
  *    Chia-I Wu <olv@lunarg.com>
  */
 
-#include "util/u_memory.h"
-#include "util/u_inlines.h"
-
 #include "xm_api.h"
 #include "xm_st.h"
 
+#include "util/u_inlines.h"
+
 struct xmesa_st_framebuffer {
    XMesaDisplay display;
    XMesaBuffer buffer;
    struct pipe_screen *screen;
 
    struct st_visual stvis;
+   enum pipe_texture_target target;
 
    unsigned texture_width, texture_height, texture_mask;
    struct pipe_resource *textures[ST_ATTACHMENT_COUNT];
@@ -139,7 +139,7 @@ xmesa_st_framebuffer_validate_textures(struct st_framebuffer_iface *stfbi,
    }
 
    memset(&templ, 0, sizeof(templ));
-   templ.target = PIPE_TEXTURE_2D;
+   templ.target = xstfb->target;
    templ.width0 = width;
    templ.height0 = height;
    templ.depth0 = 1;
@@ -210,6 +210,12 @@ xmesa_st_framebuffer_validate(struct st_framebuffer_iface *stfbi,
    /* record newly allocated textures */
    new_mask = statt_mask & ~xstfb->texture_mask;
 
+   /* If xmesa_strict_invalidate is not set, we will not yet have
+    * called XGetGeometry().  Do so here:
+    */
+   if (!xmesa_strict_invalidate)
+      xmesa_check_buffer_size(xstfb->buffer);
+
    resized = (xstfb->buffer->width != xstfb->texture_width ||
               xstfb->buffer->height != xstfb->texture_height);
 
@@ -251,7 +257,8 @@ xmesa_st_framebuffer_flush_front(struct st_framebuffer_iface *stfbi,
    boolean ret;
 
    ret = xmesa_st_framebuffer_display(stfbi, statt);
-   if (ret)
+
+   if (ret && xmesa_strict_invalidate)
       xmesa_check_buffer_size(xstfb->buffer);
 
    return ret;
@@ -279,6 +286,10 @@ xmesa_create_st_framebuffer(XMesaDisplay xmdpy, XMesaBuffer b)
    xstfb->buffer = b;
    xstfb->screen = xmdpy->screen;
    xstfb->stvis = b->xm_visual->stvis;
+   if(xstfb->screen->get_param(xstfb->screen, PIPE_CAP_NPOT_TEXTURES))
+      xstfb->target = PIPE_TEXTURE_2D;
+   else
+      xstfb->target = PIPE_TEXTURE_RECT;
 
    stfbi->visual = &xstfb->stvis;
    stfbi->flush_front = xmesa_st_framebuffer_flush_front;
@@ -322,7 +333,8 @@ xmesa_swap_st_framebuffer(struct st_framebuffer_iface *stfbi)
          *back = tmp;
       }
 
-      xmesa_check_buffer_size(xstfb->buffer);
+      if (xmesa_strict_invalidate)
+	 xmesa_check_buffer_size(xstfb->buffer);
    }
 }
 
diff --git a/src/gallium/state_trackers/wgl/stw_context.c b/src/gallium/state_trackers/wgl/stw_context.c
index 0fb7cd8306..a0e14b9601 100644
--- a/src/gallium/state_trackers/wgl/stw_context.c
+++ b/src/gallium/state_trackers/wgl/stw_context.c
@@ -33,7 +33,7 @@
 
 /* for _mesa_share_state */
 #include "state_tracker/st_context.h"
-#include "main/context.h"
+#include "main/core.h"
 
 #include "stw_icd.h"
 #include "stw_device.h"
diff --git a/src/gallium/state_trackers/wgl/stw_device.c b/src/gallium/state_trackers/wgl/stw_device.c
index a107c71bda..37809d084c 100644
--- a/src/gallium/state_trackers/wgl/stw_device.c
+++ b/src/gallium/state_trackers/wgl/stw_device.c
@@ -27,7 +27,7 @@
 
 #include <windows.h>
 
-#include "glapi/glthread.h"
+#include "glapi/glapi.h"
 #include "util/u_debug.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
diff --git a/src/gallium/state_trackers/wgl/stw_pixelformat.c b/src/gallium/state_trackers/wgl/stw_pixelformat.c
index e606477e97..18ac482369 100644
--- a/src/gallium/state_trackers/wgl/stw_pixelformat.c
+++ b/src/gallium/state_trackers/wgl/stw_pixelformat.c
@@ -25,15 +25,13 @@
  * 
  **************************************************************************/
 
-#include "main/mtypes.h"
-#include "main/context.h"
-
 #include "pipe/p_format.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_screen.h"
 
 #include "util/u_format.h"
 #include "util/u_debug.h"
+#include "util/u_memory.h"
 
 #include "stw_icd.h"
 #include "stw_device.h"
diff --git a/src/gallium/state_trackers/wgl/stw_pixelformat.h b/src/gallium/state_trackers/wgl/stw_pixelformat.h
index d405172773..282c9f643c 100644
--- a/src/gallium/state_trackers/wgl/stw_pixelformat.h
+++ b/src/gallium/state_trackers/wgl/stw_pixelformat.h
@@ -34,8 +34,6 @@
 #define PFD_SUPPORT_COMPOSITION 0x00008000
 #endif
 
-#include "main/mtypes.h"
-
 #include "pipe/p_compiler.h"
 #include "pipe/p_format.h"
 #include "state_tracker/st_api.h"
diff --git a/src/gallium/targets/Makefile.dri b/src/gallium/targets/Makefile.dri
index de05f96d23..59961e982a 100644
--- a/src/gallium/targets/Makefile.dri
+++ b/src/gallium/targets/Makefile.dri
@@ -1,11 +1,12 @@
 # -*-makefile-*-
 
+
 ifeq ($(MESA_LLVM),1)
 PIPE_DRIVERS += $(TOP)/src/gallium/drivers/llvmpipe/libllvmpipe.a
 LDFLAGS += $(LLVM_LDFLAGS)
-LD = g++
 DRIVER_EXTRAS = $(LLVM_LIBS)
-USE_CXX=1
+else
+LDFLAGS += -lstdc++
 endif
 
 MESA_MODULES = \
@@ -75,15 +76,11 @@ default: depend symlinks $(TOP)/$(LIB_DIR)/gallium $(LIBNAME) $(LIBNAME_STAGING)
 
 $(LIBNAME): $(OBJECTS) $(MESA_MODULES) $(PIPE_DRIVERS) Makefile \
 		$(TOP)/src/mesa/drivers/dri/Makefile.template $(TOP)/src/mesa/drivers/dri/common/dri_test.o
-	$(MKLIB) -o $@.tmp -noprefix -linker '$(CC)' -ldflags '$(LDFLAGS)' \
+	$(MKLIB) -o $@.tmp -noprefix -linker '$(CXX)' -ldflags '$(LDFLAGS)' \
 		$(OBJECTS) $(PIPE_DRIVERS) \
                 -Wl,--start-group $(MESA_MODULES) -Wl,--end-group \
                  $(DRI_LIB_DEPS) $(DRIVER_EXTRAS)
-	if [ "x${USE_CXX}" == "x" ]; then \
-		$(CC) $(CFLAGS) -o $@.test $(TOP)/src/mesa/drivers/dri/common/dri_test.o $@.tmp $(DRI_LIB_DEPS); \
-	else \
-		$(CXX) $(CFLAGS) -o $@.test $(TOP)/src/mesa/drivers/dri/common/dri_test.o $@.tmp $(DRI_LIB_DEPS); \
-	fi
+	$(CXX) $(CFLAGS) -o $@.test $(TOP)/src/mesa/drivers/dri/common/dri_test.o $@.tmp $(DRI_LIB_DEPS);
 	@rm -f $@.test
 	mv -f $@.tmp $@
 
diff --git a/src/gallium/targets/SConscript b/src/gallium/targets/SConscript
index f8276b1555..e447d09361 100644
--- a/src/gallium/targets/SConscript
+++ b/src/gallium/targets/SConscript
@@ -1,18 +1,13 @@
 import os
 Import('*')
-	
+
 # Compatibility with old build scripts:
 #
 if 'mesa' in env['statetrackers']:
-	if 'xlib' in env['winsys']:
-		SConscript([
-			'libgl-xlib/SConscript',
-		])
-
-	if 'gdi' in env['winsys']:
-		SConscript([
-			'libgl-gdi/SConscript',
-		])
+    if 'xlib' in env['winsys'] and 'libgl-xlib' not in env['targets']:
+        env['targets'].append('libgl-xlib')
+    if 'gdi' in env['winsys'] and 'libgl-gdi' not in env['targets']:
+        env['targets'].append('libgl-gdi')
 
 if not 'graw-xlib' in env['targets'] and not 'graw-null' in env['targets'] and not env['msvc']:
         # XXX: disable until MSVC can link correctly
diff --git a/src/gallium/targets/dri-radeong/Makefile b/src/gallium/targets/dri-r300/Makefile
index 3f9ec36166..9afbb13276 100644
--- a/src/gallium/targets/dri-radeong/Makefile
+++ b/src/gallium/targets/dri-r300/Makefile
@@ -1,7 +1,7 @@
 TOP = ../../../..
 include $(TOP)/configs/current
 
-LIBNAME = radeong_dri.so
+LIBNAME = r300_dri.so
 
 PIPE_DRIVERS = \
 	$(TOP)/src/gallium/state_trackers/dri/drm/libdridrm.a \
diff --git a/src/gallium/targets/dri-radeong/SConscript b/src/gallium/targets/dri-r300/SConscript
index 1402c3bd12..33a458f2e6 100644
--- a/src/gallium/targets/dri-radeong/SConscript
+++ b/src/gallium/targets/dri-r300/SConscript
@@ -1,7 +1,7 @@
 Import('*')
 
 if not 'r300' in env['drivers']:
-    print 'warning: r300 pipe driver not built skipping radeong_dri.so'
+    print 'warning: r300 pipe driver not built skipping r300_dri.so'
     Return()
 
 env = drienv.Clone()
@@ -24,7 +24,7 @@ env.Prepend(LIBS = [
 ])
 
 env.SharedLibrary(
-    target ='radeon_dri.so',
+    target ='r300_dri.so',
     source = 'target.c',
     SHLIBPREFIX = '',
 )
diff --git a/src/gallium/targets/dri-radeong/target.c b/src/gallium/targets/dri-r300/target.c
index 5a0a8dc573..2ecf3457a7 100644
--- a/src/gallium/targets/dri-radeong/target.c
+++ b/src/gallium/targets/dri-r300/target.c
@@ -23,4 +23,4 @@ create_screen(int fd)
    return screen;
 }
 
-DRM_DRIVER_DESCRIPTOR("radeon", "radeon", create_screen)
+DRM_DRIVER_DESCRIPTOR("r300", "radeon", create_screen)
diff --git a/src/gallium/targets/dri-r600/Makefile b/src/gallium/targets/dri-r600/Makefile
index 932303d194..661283de6a 100644
--- a/src/gallium/targets/dri-r600/Makefile
+++ b/src/gallium/targets/dri-r600/Makefile
@@ -4,12 +4,12 @@ include $(TOP)/configs/current
 LIBNAME = r600_dri.so
 
 PIPE_DRIVERS = \
+	$(TOP)/src/gallium/drivers/r600/libr600.a \
 	$(TOP)/src/gallium/state_trackers/dri/drm/libdridrm.a \
 	$(TOP)/src/gallium/winsys/r600/drm/libr600winsys.a \
 	$(TOP)/src/gallium/drivers/softpipe/libsoftpipe.a \
 	$(TOP)/src/gallium/drivers/trace/libtrace.a \
-	$(TOP)/src/gallium/drivers/rbug/librbug.a \
-	$(TOP)/src/gallium/drivers/r600/libr600.a
+	$(TOP)/src/gallium/drivers/rbug/librbug.a
 
 C_SOURCES = \
 	target.c \
@@ -21,6 +21,6 @@ DRIVER_DEFINES = \
 
 include ../Makefile.dri
 
-DRI_LIB_DEPS += -ldrm_radeon
+DRI_LIB_DEPS +=
 
 symlinks:
diff --git a/src/gallium/targets/egl-gdi/egl-static.c b/src/gallium/targets/egl-gdi/egl-static.c
index ec2f865c31..4655d79117 100644
--- a/src/gallium/targets/egl-gdi/egl-static.c
+++ b/src/gallium/targets/egl-gdi/egl-static.c
@@ -33,6 +33,8 @@
 #include "target-helpers/inline_debug_helper.h"
 #include "egldriver.h"
 
+static struct st_api *stapis[ST_API_COUNT];
+
 static uint
 get_api_mask(void)
 {
@@ -57,7 +59,11 @@ get_api_mask(void)
 static struct st_api *
 get_st_api(enum st_api_type api)
 {
-   struct st_api *stapi = NULL;
+   struct st_api *stapi;
+
+   stapi = stapis[api];
+   if (stapi)
+      return stapi;
 
    switch (api) {
 #if FEATURE_GL
@@ -84,13 +90,33 @@ get_st_api(enum st_api_type api)
          break;
    }
 
+   stapis[api] = stapi;
+
    return stapi;
 }
 
 static struct st_api *
 guess_gl_api(void)
 {
-   return NULL;
+   struct st_api *stapi = NULL;
+
+#if FEATURE_GL
+   stapi = get_st_api(ST_API_OPENGL);
+   if (stapi)
+      return stapi;
+#endif
+#if FEATURE_ES1
+   stapi = get_st_api(ST_API_OPENGL_ES1);
+   if (stapi)
+      return stapi;
+#endif
+#if FEATURE_ES2
+   stapi = get_st_api(ST_API_OPENGL_ES2);
+   if (stapi)
+      return stapi;
+#endif
+
+   return stapi;
 }
 
 static struct pipe_screen *
@@ -127,7 +153,16 @@ init_loader(struct egl_g3d_loader *loader)
 static void
 egl_g3d_unload(_EGLDriver *drv)
 {
+   int i;
+
    egl_g3d_destroy_driver(drv);
+
+   for (i = 0; i < ST_API_COUNT; i++) {
+      if (stapis[i]) {
+         stapis[i]->destroy(stapis[i]);
+         stapis[i] = NULL;
+      }
+   }
 }
 
 static struct egl_g3d_loader loader;
diff --git a/src/gallium/targets/egl/Makefile b/src/gallium/targets/egl/Makefile
index 1e4bb4d94c..2784fd0d10 100644
--- a/src/gallium/targets/egl/Makefile
+++ b/src/gallium/targets/egl/Makefile
@@ -90,13 +90,20 @@ nouveau_LIBS := \
 	$(TOP)/src/gallium/drivers/nv50/libnv50.a \
 	$(TOP)/src/gallium/drivers/nouveau/libnouveau.a
 
-# radeon pipe driver
-radeon_CPPFLAGS :=
-radeon_SYS := -ldrm -ldrm_radeon
-radeon_LIBS := \
+# r300 pipe driver
+r300_CPPFLAGS :=
+r300_SYS := -ldrm -ldrm_radeon
+r300_LIBS := \
 	$(TOP)/src/gallium/winsys/radeon/drm/libradeonwinsys.a \
 	$(TOP)/src/gallium/drivers/r300/libr300.a
 
+# r600 pipe driver
+r600_CPPFLAGS :=
+r600_SYS := -ldrm -ldrm_radeon
+r600_LIBS := \
+	$(TOP)/src/gallium/winsys/r600/drm/libr600winsys.a \
+	$(TOP)/src/gallium/drivers/r600/libr600.a
+
 # vmwgfx pipe driver
 vmwgfx_CPPFLAGS :=
 vmwgfx_SYS :=
@@ -119,17 +126,17 @@ endif
 
 # OpenGL state tracker
 GL_CPPFLAGS := -I$(TOP)/src/mesa $(API_DEFINES)
-GL_SYS := -lpthread -lm -L$(TOP)/$(LIB_DIR) -l$(GL_LIB)
+GL_SYS := $(DRI_LIB_DEPS) -L$(TOP)/$(LIB_DIR) -l$(GL_LIB)
 GL_LIBS := $(TOP)/src/mesa/libmesagallium.a
 
 # OpenGL ES 1.x state tracker
 GLESv1_CM_CPPFLAGS := -I$(TOP)/src/mesa
-GLESv1_CM_SYS := -lpthread -lm -L$(TOP)/$(LIB_DIR) -l$(GLESv1_CM_LIB)
+GLESv1_CM_SYS := $(DRI_LIB_DEPS) -L$(TOP)/$(LIB_DIR) -l$(GLESv1_CM_LIB)
 GLESv1_CM_LIBS := $(TOP)/src/mesa/libes1gallium.a
 
 # OpenGL ES 2.x state tracker
 GLESv2_CPPFLAGS := -I$(TOP)/src/mesa
-GLESv2_SYS := -lpthread -lm -L$(TOP)/$(LIB_DIR) -l$(GLESv2_LIB)
+GLESv2_SYS := $(DRI_LIB_DEPS) -L$(TOP)/$(LIB_DIR) -l$(GLESv2_LIB)
 GLESv2_LIBS := $(TOP)/src/mesa/libes2gallium.a
 
 # OpenVG state tracker
@@ -151,7 +158,10 @@ ifneq ($(findstring nouveau/drm,$(GALLIUM_WINSYS_DIRS)),)
 OUTPUTS += nouveau
 endif
 ifneq ($(findstring radeon/drm,$(GALLIUM_WINSYS_DIRS)),)
-OUTPUTS += radeon
+OUTPUTS += r300
+endif
+ifneq ($(findstring r600/drm,$(GALLIUM_WINSYS_DIRS)),)
+OUTPUTS += r600
 endif
 ifneq ($(findstring svga/drm,$(GALLIUM_WINSYS_DIRS)),)
 OUTPUTS += vmwgfx
@@ -188,8 +198,11 @@ $(OUTPUT_PATH)/$(PIPE_PREFIX)i965.so: pipe_i965.o $(i965_LIBS)
 $(OUTPUT_PATH)/$(PIPE_PREFIX)nouveau.so: pipe_nouveau.o $(nouveau_LIBS)
 	$(call mklib,nouveau)
 
-$(OUTPUT_PATH)/$(PIPE_PREFIX)radeon.so: pipe_radeon.o $(radeon_LIBS)
-	$(call mklib,radeon)
+$(OUTPUT_PATH)/$(PIPE_PREFIX)r300.so: pipe_r300.o $(r300_LIBS)
+	$(call mklib,r300)
+
+$(OUTPUT_PATH)/$(PIPE_PREFIX)r600.so: pipe_r600.o $(r600_LIBS)
+	$(call mklib,r600)
 
 $(OUTPUT_PATH)/$(PIPE_PREFIX)vmwgfx.so: pipe_vmwgfx.o $(vmwgfx_LIBS)
 	$(call mklib,vmwgfx)
diff --git a/src/gallium/targets/egl/egl.c b/src/gallium/targets/egl/egl.c
index d9d89485c3..a573b21217 100644
--- a/src/gallium/targets/egl/egl.c
+++ b/src/gallium/targets/egl/egl.c
@@ -155,24 +155,23 @@ load_pipe_module(struct pipe_module *pmod, const char *name)
    if (!pmod->name)
       return FALSE;
 
+   _eglLog(_EGL_DEBUG, "searching for pipe module %s", pmod->name);
    _eglSearchPathForEach(dlopen_pipe_module_cb, (void *) pmod);
    if (pmod->lib) {
       pmod->drmdd = (const struct drm_driver_descriptor *)
          util_dl_get_proc_address(pmod->lib, "driver_descriptor");
-      if (pmod->drmdd) {
-         if (pmod->drmdd->driver_name) {
-            /* driver name mismatch */
-            if (strcmp(pmod->drmdd->driver_name, pmod->name) != 0)
-               pmod->drmdd = NULL;
-         }
-         else {
-            /* swrast */
-            pmod->swrast_create_screen =
-               (struct pipe_screen *(*)(struct sw_winsys *))
-               util_dl_get_proc_address(pmod->lib, "swrast_create_screen");
-            if (!pmod->swrast_create_screen)
-               pmod->drmdd = NULL;
-         }
+
+      /* sanity check on the name */
+      if (pmod->drmdd && strcmp(pmod->drmdd->name, pmod->name) != 0)
+         pmod->drmdd = NULL;
+
+      /* swrast */
+      if (pmod->drmdd && !pmod->drmdd->driver_name) {
+         pmod->swrast_create_screen =
+            (struct pipe_screen *(*)(struct sw_winsys *))
+            util_dl_get_proc_address(pmod->lib, "swrast_create_screen");
+         if (!pmod->swrast_create_screen)
+            pmod->drmdd = NULL;
       }
 
       if (!pmod->drmdd) {
diff --git a/src/gallium/targets/egl/pipe_radeon.c b/src/gallium/targets/egl/pipe_r300.c
index 35550bcb26..d84bb92539 100644
--- a/src/gallium/targets/egl/pipe_radeon.c
+++ b/src/gallium/targets/egl/pipe_r300.c
@@ -24,4 +24,4 @@ create_screen(int fd)
 }
 
 PUBLIC
-DRM_DRIVER_DESCRIPTOR("radeon", "radeon", create_screen)
+DRM_DRIVER_DESCRIPTOR("r300", "radeon", create_screen)
diff --git a/src/gallium/targets/egl/pipe_r600.c b/src/gallium/targets/egl/pipe_r600.c
new file mode 100644
index 0000000000..486a659258
--- /dev/null
+++ b/src/gallium/targets/egl/pipe_r600.c
@@ -0,0 +1,27 @@
+
+#include "state_tracker/drm_driver.h"
+#include "target-helpers/inline_debug_helper.h"
+#include "r600/drm/r600_drm_public.h"
+#include "r600/r600_public.h"
+
+/* Create the r600 pipe screen for a DRM fd and pass it to debug_screen_wrap. */
+static struct pipe_screen *
+create_screen(int fd)
+{
+   struct radeon *winsys = r600_drm_winsys_create(fd);
+   struct pipe_screen *pscreen;
+
+   if (!winsys)
+      return NULL;
+
+   /* NOTE(review): winsys is not released when screen creation fails */
+   pscreen = r600_screen_create(winsys);
+   if (!pscreen)
+      return NULL;
+
+   /* callers receive whatever debug_screen_wrap() returns for the screen */
+   return debug_screen_wrap(pscreen);
+}
+
+PUBLIC
+DRM_DRIVER_DESCRIPTOR("r600", "radeon", create_screen)
diff --git a/src/gallium/targets/graw-xlib/graw_util.c b/src/gallium/targets/graw-xlib/graw_util.c
index 47aca4464d..fc7c9ae6f9 100644
--- a/src/gallium/targets/graw-xlib/graw_util.c
+++ b/src/gallium/targets/graw-xlib/graw_util.c
@@ -1,6 +1,7 @@
 
 #include "pipe/p_compiler.h"
 #include "pipe/p_context.h"
+#include "pipe/p_state.h"
 #include "tgsi/tgsi_text.h"
 #include "util/u_memory.h"
 #include "state_tracker/graw.h"
diff --git a/src/gallium/targets/graw-xlib/graw_xlib.c b/src/gallium/targets/graw-xlib/graw_xlib.c
index 41120ba3c7..8b64a0b819 100644
--- a/src/gallium/targets/graw-xlib/graw_xlib.c
+++ b/src/gallium/targets/graw-xlib/graw_xlib.c
@@ -1,5 +1,6 @@
 #include "pipe/p_compiler.h"
 #include "pipe/p_context.h"
+#include "pipe/p_screen.h"
 #include "util/u_debug.h"
 #include "util/u_memory.h"
 #include "target-helpers/wrap_screen.h"
diff --git a/src/gallium/targets/libgl-gdi/SConscript b/src/gallium/targets/libgl-gdi/SConscript
index 144084f74f..12fe403f62 100644
--- a/src/gallium/targets/libgl-gdi/SConscript
+++ b/src/gallium/targets/libgl-gdi/SConscript
@@ -17,6 +17,7 @@ if env['platform'] == 'windows':
         'user32',
         'kernel32',
         'ws2_32',
+        talloc,
     ])
 
     sources = []
diff --git a/src/gallium/targets/libgl-xlib/Makefile b/src/gallium/targets/libgl-xlib/Makefile
index e745023ba5..fe0541543a 100644
--- a/src/gallium/targets/libgl-xlib/Makefile
+++ b/src/gallium/targets/libgl-xlib/Makefile
@@ -68,8 +68,9 @@ $(TOP)/$(LIB_DIR)/gallium:
 # Make the libGL.so library
 $(TOP)/$(LIB_DIR)/gallium/$(GL_LIB_NAME): $(XLIB_TARGET_OBJECTS) $(LIBS) Makefile
 	$(TOP)/bin/mklib -o $(GL_LIB) \
-		-linker "$(CC)" \
+		-linker "$(CXX)" \
 		-major $(GL_MAJOR) -minor $(GL_MINOR) -patch $(GL_TINY) \
+		-cplusplus \
 		-install $(TOP)/$(LIB_DIR)/gallium \
 		$(MKLIB_OPTIONS) $(XLIB_TARGET_OBJECTS) \
 		-Wl,--start-group $(LIBS) -Wl,--end-group $(GL_LIB_DEPS)
diff --git a/src/gallium/targets/libgl-xlib/SConscript b/src/gallium/targets/libgl-xlib/SConscript
index 78703fd096..88e216a65b 100644
--- a/src/gallium/targets/libgl-xlib/SConscript
+++ b/src/gallium/targets/libgl-xlib/SConscript
@@ -35,6 +35,7 @@ env.Prepend(LIBS = [
     mesa,
     glsl,
     gallium,
+    'talloc'
 ])
 
 sources = [
diff --git a/src/gallium/targets/libgl-xlib/xlib.c b/src/gallium/targets/libgl-xlib/xlib.c
index 69b4ddd33f..5a9c80c856 100644
--- a/src/gallium/targets/libgl-xlib/xlib.c
+++ b/src/gallium/targets/libgl-xlib/xlib.c
@@ -36,6 +36,7 @@
 #include "state_tracker/xlib_sw_winsys.h"
 #include "xm_public.h"
 
+#include "state_tracker/st_api.h"
 #include "state_tracker/st_gl_api.h"
 
 /* piggy back on this libGL for OpenGL support in EGL */
diff --git a/src/gallium/tests/python/tests/texture_blit.py b/src/gallium/tests/python/tests/texture_blit.py
index 58706dab93..089d05c623 100755
--- a/src/gallium/tests/python/tests/texture_blit.py
+++ b/src/gallium/tests/python/tests/texture_blit.py
@@ -55,7 +55,7 @@ def tex_coords(texture, face, level, zslice):
         [0.0, 1.0],
     ] 
     
-    if texture.target == PIPE_TEXTURE_2D:
+    if texture.target == PIPE_TEXTURE_2D or texture.target == PIPE_TEXTURE_RECT:
         return [[s, t, 0.0] for s, t in st]
     elif texture.target == PIPE_TEXTURE_3D:
         depth = texture.get_depth(level)
diff --git a/src/gallium/tools/addr2line.sh b/src/gallium/tools/addr2line.sh
new file mode 100755
index 0000000000..34dec14271
--- /dev/null
+++ b/src/gallium/tools/addr2line.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+# Translate the raw "binary(+0xaddr)" symbols that Gallium prints via glibc into human-readable function names
+
+lastbin=
+i=-1
+dir="$(mktemp -d)" || exit 1	# without a scratch directory the files below would be created under "/"
+input="$1"
+
+# Gather all unique addresses for each binary (-r keeps backslashes in paths intact)
+sed -nre 's|([^ ]*/[^ ]*)\(\+0x([^)]*).*|\1 \2|p' "$input"|sort -u|while read -r bin addr; do
+	if test "$lastbin" != "$bin"; then
+		((++i))
+		lastbin="$bin"
+		echo "$bin" > "$dir/$i.addrs.bin"
+	fi
+	echo "$addr" >> "$dir/$i.addrs"
+done
+
+# Construct a sed script that converts the hex addresses to human-readable form, and apply it
+for i in "$dir"/*.addrs; do
+	bin="$(<"$i.bin")"
+	addr2line -p -e "$bin" -a -f < "$i"|sed -nre 's@^0x0*([^:]*): ([^?]*)$@s|'"$bin"'(+0x\1)|\2|g@gp'
+	rm -f "$i" "$i.bin"
+done|sed -f - "$input"
+
+rmdir "$dir"
diff --git a/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c b/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c
index 660dbd0c33..d4bf124ce6 100644
--- a/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c
+++ b/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c
@@ -19,7 +19,8 @@ nouveau_drm_destroy_winsys(struct pipe_winsys *s)
 {
 	struct nouveau_winsys *nv_winsys = nouveau_winsys(s);
 	struct nouveau_screen *nv_screen= nouveau_screen(nv_winsys->pscreen);
-	nouveau_device_close(&nv_screen->device);
+	if (nv_screen)
+		nouveau_device_close(&nv_screen->device);
 	FREE(nv_winsys);
 }
 
diff --git a/src/gallium/winsys/r600/drm/r600_state.c b/src/gallium/winsys/r600/drm/r600_state.c
index d17d6e7954..71d65f0fea 100644
--- a/src/gallium/winsys/r600/drm/r600_state.c
+++ b/src/gallium/winsys/r600/drm/r600_state.c
@@ -30,6 +30,8 @@
 #include "radeon_priv.h"
 #include "r600d.h"
 
+#include "util/u_memory.h"
+
 static int r600_state_pm4_resource(struct radeon_state *state);
 static int r600_state_pm4_cb0(struct radeon_state *state);
 static int r600_state_pm4_vgt(struct radeon_state *state);
@@ -38,24 +40,69 @@ static int r600_state_pm4_shader(struct radeon_state *state);
 static int r600_state_pm4_draw(struct radeon_state *state);
 static int r600_state_pm4_config(struct radeon_state *state);
 static int r600_state_pm4_generic(struct radeon_state *state);
+static int r600_state_pm4_query_begin(struct radeon_state *state);
+static int r600_state_pm4_query_end(struct radeon_state *state);
 static int r700_state_pm4_config(struct radeon_state *state);
 static int r700_state_pm4_cb0(struct radeon_state *state);
 static int r700_state_pm4_db(struct radeon_state *state);
 
 #include "r600_states.h"
 
+
+#define SUB_NONE(param) { { 0, R600_names_##param, (sizeof(R600_names_##param)/sizeof(struct radeon_register)) } }
+#define SUB_PS(param) { R600_SHADER_PS, R600_names_##param, (sizeof(R600_names_##param)/sizeof(struct radeon_register)) }
+#define SUB_VS(param) { R600_SHADER_VS, R600_names_##param, (sizeof(R600_names_##param)/sizeof(struct radeon_register)) }
+#define SUB_GS(param) { R600_SHADER_GS, R600_names_##param, (sizeof(R600_names_##param)/sizeof(struct radeon_register)) }
+#define SUB_FS(param) { R600_SHADER_FS, R600_names_##param, (sizeof(R600_names_##param)/sizeof(struct radeon_register)) }
+
+/* some of these are overridden at runtime for R700 */
+struct radeon_stype_info r600_stypes[] = {
+	{ R600_STATE_CONFIG, 1, 0, r600_state_pm4_config, SUB_NONE(CONFIG), },
+	{ R600_STATE_CB_CNTL, 1, 0, r600_state_pm4_generic, SUB_NONE(CB_CNTL) },
+	{ R600_STATE_RASTERIZER, 1, 0, r600_state_pm4_generic, SUB_NONE(RASTERIZER) },
+	{ R600_STATE_VIEWPORT, 1, 0, r600_state_pm4_generic, SUB_NONE(VIEWPORT) },
+	{ R600_STATE_SCISSOR, 1, 0, r600_state_pm4_generic, SUB_NONE(SCISSOR) },
+	{ R600_STATE_BLEND, 1, 0, r600_state_pm4_generic, SUB_NONE(BLEND), },
+	{ R600_STATE_DSA, 1, 0, r600_state_pm4_generic, SUB_NONE(DSA), },
+	{ R600_STATE_SHADER, 1, 0, r600_state_pm4_shader, { SUB_PS(PS_SHADER), SUB_VS(VS_SHADER) } },
+	{ R600_STATE_CONSTANT, 256, 0x10, r600_state_pm4_generic,  { SUB_PS(PS_CONSTANT), SUB_VS(VS_CONSTANT) } },
+	{ R600_STATE_RESOURCE, 160, 0x1c, r600_state_pm4_resource, { SUB_PS(PS_RESOURCE), SUB_VS(VS_RESOURCE), SUB_GS(GS_RESOURCE), SUB_FS(FS_RESOURCE)} },
+	{ R600_STATE_SAMPLER, 18, 0xc, r600_state_pm4_generic, { SUB_PS(PS_SAMPLER), SUB_VS(VS_SAMPLER), SUB_GS(GS_SAMPLER) } },
+	{ R600_STATE_SAMPLER_BORDER, 18, 0x10, r600_state_pm4_generic, { SUB_PS(PS_SAMPLER_BORDER), SUB_VS(VS_SAMPLER_BORDER), SUB_GS(GS_SAMPLER_BORDER) } },
+	{ R600_STATE_CB0, 1, 0, r600_state_pm4_cb0, SUB_NONE(CB0) },
+	{ R600_STATE_CB1, 1, 0, r600_state_pm4_cb0, SUB_NONE(CB1) },
+	{ R600_STATE_CB2, 1, 0, r600_state_pm4_cb0, SUB_NONE(CB2) },
+	{ R600_STATE_CB3, 1, 0, r600_state_pm4_cb0, SUB_NONE(CB3) },
+	{ R600_STATE_CB4, 1, 0, r600_state_pm4_cb0, SUB_NONE(CB4) },
+	{ R600_STATE_CB5, 1, 0, r600_state_pm4_cb0, SUB_NONE(CB5) },
+	{ R600_STATE_CB6, 1, 0, r600_state_pm4_cb0, SUB_NONE(CB6) },
+	{ R600_STATE_CB7, 1, 0, r600_state_pm4_cb0, SUB_NONE(CB7) },
+	{ R600_STATE_QUERY_BEGIN, 1, 0, r600_state_pm4_query_begin, SUB_NONE(VGT_EVENT) },
+	{ R600_STATE_QUERY_END, 1, 0, r600_state_pm4_query_end, SUB_NONE(VGT_EVENT) },
+	{ R600_STATE_DB, 1, 0, r600_state_pm4_db, SUB_NONE(DB) },
+	{ R600_STATE_UCP, 1, 0, r600_state_pm4_generic, SUB_NONE(UCP) },
+	{ R600_STATE_VGT, 1, 0, r600_state_pm4_vgt, SUB_NONE(VGT) },
+	{ R600_STATE_DRAW, 1, 0, r600_state_pm4_draw, SUB_NONE(DRAW) },
+};
+#define STYPES_SIZE Elements(r600_stypes)
+
+static const struct radeon_register *get_regs(struct radeon_state *state)
+{
+	return state->stype->reginfo[state->shader_index].regs;
+}
+
 /*
  * r600/r700 state functions
  */
 static int r600_state_pm4_bytecode(struct radeon_state *state, unsigned offset, unsigned id, unsigned nreg)
 {
-	const struct radeon_register *regs = state->radeon->type[state->type].regs;
+	const struct radeon_register *regs = get_regs(state);
 	unsigned i;
 	int r;
 
 	if (!offset) {
 		fprintf(stderr, "%s invalid register for state %d %d\n",
-			__func__, state->type, id);
+			__func__, state->stype->stype, id);
 		return -EINVAL;
 	}
 	if (offset >= R600_CONFIG_REG_OFFSET && offset < R600_CONFIG_REG_END) {
@@ -114,19 +161,18 @@ static int r600_state_pm4_bytecode(struct radeon_state *state, unsigned offset,
 
 static int r600_state_pm4_generic(struct radeon_state *state)
 {
-	struct radeon *radeon = state->radeon;
-	unsigned i, offset, nreg, type, coffset, loffset, soffset;
+	const struct radeon_register *regs = get_regs(state);
+	unsigned i, offset, nreg, coffset, loffset, soffset;
 	unsigned start;
 	int r;
 
 	if (!state->nstates)
 		return 0;
-	type = state->type;
-	soffset = (state->id - radeon->type[type].id) * radeon->type[type].stride;
-	offset = loffset = radeon->type[type].regs[0].offset + soffset;
+	soffset = state->id * state->stype->stride;
+	offset = loffset = regs[0].offset + soffset;
 	start = 0;
 	for (i = 1, nreg = 1; i < state->nstates; i++) {
-		coffset = radeon->type[type].regs[i].offset + soffset;
+		coffset = regs[i].offset + soffset;
 		if (coffset == (loffset + 4)) {
 			nreg++;
 			loffset = coffset;
@@ -233,20 +279,54 @@ static int r600_state_pm4_config(struct radeon_state *state)
 	state->pm4[state->cpm4++] = 0x80000000;
 	state->pm4[state->cpm4++] = 0x80000000;
 	state->pm4[state->cpm4++] = PKT3(PKT3_EVENT_WRITE, 0);
-	state->pm4[state->cpm4++] = 0x00000016;
+	state->pm4[state->cpm4++] = EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT;
 	state->pm4[state->cpm4++] = PKT3(PKT3_SET_CONFIG_REG, 1);
 	state->pm4[state->cpm4++] = 0x00000010;
 	state->pm4[state->cpm4++] = 0x00028000;
 	return r600_state_pm4_generic(state);
 }
 
+static int r600_state_pm4_query_begin(struct radeon_state *state)
+{
+	int r;
+
+	state->cpm4 = 0;
+	state->pm4[state->cpm4++] = PKT3(PKT3_EVENT_WRITE, 2);
+	state->pm4[state->cpm4++] = EVENT_TYPE_ZPASS_DONE;
+	state->pm4[state->cpm4++] = state->states[0];
+	state->pm4[state->cpm4++] = 0x0;
+	state->pm4[state->cpm4++] = PKT3(PKT3_NOP, 0);
+	r = radeon_state_reloc(state, state->cpm4, 0);
+	if (r)
+		return r;
+	state->pm4[state->cpm4++] = state->bo[0]->handle;
+	return 0;
+}
+
+static int r600_state_pm4_query_end(struct radeon_state *state)
+{
+	int r;
+
+	state->cpm4 = 0;
+	state->pm4[state->cpm4++] = PKT3(PKT3_EVENT_WRITE, 2);
+	state->pm4[state->cpm4++] = EVENT_TYPE_ZPASS_DONE;
+	state->pm4[state->cpm4++] = state->states[0];
+	state->pm4[state->cpm4++] = 0x0;
+	state->pm4[state->cpm4++] = PKT3(PKT3_NOP, 0);
+	r = radeon_state_reloc(state, state->cpm4, 0);
+	if (r)
+		return r;
+	state->pm4[state->cpm4++] = state->bo[0]->handle;
+	return 0;
+}
+
 static int r700_state_pm4_config(struct radeon_state *state)
 {
 	state->pm4[state->cpm4++] = PKT3(PKT3_CONTEXT_CONTROL, 1);
 	state->pm4[state->cpm4++] = 0x80000000;
 	state->pm4[state->cpm4++] = 0x80000000;
 	state->pm4[state->cpm4++] = PKT3(PKT3_EVENT_WRITE, 0);
-	state->pm4[state->cpm4++] = 0x00000016;
+	state->pm4[state->cpm4++] = EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT;
 	state->pm4[state->cpm4++] = PKT3(PKT3_SET_CONFIG_REG, 1);
 	state->pm4[state->cpm4++] = 0x00000010;
 	state->pm4[state->cpm4++] = 0x00028000;
@@ -287,7 +367,6 @@ static int r600_state_pm4_vgt(struct radeon_state *state)
 
 static int r600_state_pm4_draw(struct radeon_state *state)
 {
-	unsigned i;
 	int r;
 
 	if (state->nbo) {
@@ -301,20 +380,13 @@ static int r600_state_pm4_draw(struct radeon_state *state)
 		if (r)
 			return r;
 		state->pm4[state->cpm4++] = state->bo[0]->handle;
-	} else if  (state->nimmd) {
-		state->pm4[state->cpm4++] = PKT3(PKT3_DRAW_INDEX_IMMD, state->nimmd + 1);
-		state->pm4[state->cpm4++] = state->states[R600_DRAW__VGT_NUM_INDICES];
-		state->pm4[state->cpm4++] = state->states[R600_DRAW__VGT_DRAW_INITIATOR];
-		for (i = 0; i < state->nimmd; i++) {
-			state->pm4[state->cpm4++] = state->immd[i];
-		}
 	} else {
 		state->pm4[state->cpm4++] = PKT3(PKT3_DRAW_INDEX_AUTO, 1);
 		state->pm4[state->cpm4++] = state->states[R600_DRAW__VGT_NUM_INDICES];
 		state->pm4[state->cpm4++] = state->states[R600_DRAW__VGT_DRAW_INITIATOR];
 	}
 	state->pm4[state->cpm4++] = PKT3(PKT3_EVENT_WRITE, 0);
-	state->pm4[state->cpm4++] = 0x00000016;
+	state->pm4[state->cpm4++] = EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT;
 	return 0;
 }
 
@@ -322,8 +394,9 @@ static int r600_state_pm4_resource(struct radeon_state *state)
 {
 	u32 flags, type, nbo, offset, soffset;
 	int r;
+	const struct radeon_register *regs = get_regs(state);
 
-	soffset = (state->id - state->radeon->type[state->type].id) * state->radeon->type[state->type].stride;
+	soffset = state->id * state->stype->stride;
 	type = G_038018_TYPE(state->states[6]);
 	switch (type) {
 	case 2:
@@ -342,7 +415,7 @@ static int r600_state_pm4_resource(struct radeon_state *state)
 		return -EINVAL;
 	}
 	r600_state_pm4_with_flush(state, flags);
-	offset = state->radeon->type[state->type].regs[0].offset + soffset;
+	offset = regs[0].offset + soffset;
 	state->pm4[state->cpm4++] = PKT3(PKT3_SET_RESOURCE, 7);
 	state->pm4[state->cpm4++] = (offset - R_038000_SQ_TEX_RESOURCE_WORD0_0) >> 2;
 	state->pm4[state->cpm4++] = state->states[0];
@@ -367,33 +440,62 @@ static int r600_state_pm4_resource(struct radeon_state *state)
 	return 0;
 }
 
-int r600_init(struct radeon *radeon)
+
+static void r600_modify_type_array(struct radeon *radeon)
 {
+	int i;
 	switch (radeon->family) {
-	case CHIP_R600:
-	case CHIP_RV610:
-	case CHIP_RV630:
-	case CHIP_RV670:
-	case CHIP_RV620:
-	case CHIP_RV635:
-	case CHIP_RS780:
-	case CHIP_RS880:
-		radeon->ntype = R600_NTYPE;
-		radeon->nstate = R600_NSTATE;
-		radeon->type = R600_types;
-		break;
 	case CHIP_RV770:
 	case CHIP_RV730:
 	case CHIP_RV710:
 	case CHIP_RV740:
-		radeon->ntype = R600_NTYPE;
-		radeon->nstate = R600_NSTATE;
-		radeon->type = R700_types;
 		break;
 	default:
-		fprintf(stderr, "%s unknown or unsupported chipset 0x%04X\n",
-			__func__, radeon->device);
-		return -EINVAL;
+		return;
+	}
+
+	/* r700 needs some mods */
+	for (i = 0; i < radeon->nstype; i++) {
+		struct radeon_stype_info *info = &radeon->stype[i];
+		
+		switch(info->stype) {
+		case R600_STATE_CONFIG:
+			info->pm4 = r700_state_pm4_config;
+			break;
+		case R600_STATE_CB0:
+			info->pm4 = r700_state_pm4_cb0;
+			break;
+		case R600_STATE_DB:
+			info->pm4 = r700_state_pm4_db;
+		};
 	}
+}
+
+static void r600_build_types_array(struct radeon *radeon)
+{
+	int i, j;
+	int id = 0;
+
+	for (i = 0; i < STYPES_SIZE; i++) {
+		r600_stypes[i].base_id = id;
+		r600_stypes[i].npm4 = 128;
+		if (r600_stypes[i].reginfo[0].shader_type == 0) {
+			id += r600_stypes[i].num;
+		} else {
+			for (j = 0; j < R600_SHADER_MAX; j++) {
+				if (r600_stypes[i].reginfo[j].shader_type)
+					id += r600_stypes[i].num;
+			}
+		}
+	}
+	radeon->stype = r600_stypes;
+	radeon->nstype = STYPES_SIZE;
+
+	r600_modify_type_array(radeon);
+}
+
+int r600_init(struct radeon *radeon)
+{
+	r600_build_types_array(radeon);
 	return 0;
 }
diff --git a/src/gallium/winsys/r600/drm/r600_states.h b/src/gallium/winsys/r600/drm/r600_states.h
index e40c77d8f6..09d79d498d 100644
--- a/src/gallium/winsys/r600/drm/r600_states.h
+++ b/src/gallium/winsys/r600/drm/r600_states.h
@@ -17,7 +17,7 @@
 #ifndef R600_STATES_H
 #define R600_STATES_H
 
-static const struct radeon_register R600_CONFIG_names[] = {
+static const struct radeon_register R600_names_CONFIG[] = {
 	{0x00008C00, 0, 0, "SQ_CONFIG"},
 	{0x00008C04, 0, 0, "SQ_GPR_RESOURCE_MGMT_1"},
 	{0x00008C08, 0, 0, "SQ_GPR_RESOURCE_MGMT_2"},
@@ -61,7 +61,7 @@ static const struct radeon_register R600_CONFIG_names[] = {
 	{0x00028B20, 0, 0, "VGT_STRMOUT_BUFFER_EN"},
 };
 
-static const struct radeon_register R600_CB_CNTL_names[] = {
+static const struct radeon_register R600_names_CB_CNTL[] = {
 	{0x00028120, 0, 0, "CB_CLEAR_RED"},
 	{0x00028124, 0, 0, "CB_CLEAR_GREEN"},
 	{0x00028128, 0, 0, "CB_CLEAR_BLUE"},
@@ -82,7 +82,7 @@ static const struct radeon_register R600_CB_CNTL_names[] = {
 	{0x00028C48, 0, 0, "PA_SC_AA_MASK"},
 };
 
-static const struct radeon_register R600_RASTERIZER_names[] = {
+static const struct radeon_register R600_names_RASTERIZER[] = {
 	{0x000286D4, 0, 0, "SPI_INTERP_CONTROL_0"},
 	{0x00028810, 0, 0, "PA_CL_CLIP_CNTL"},
 	{0x00028814, 0, 0, "PA_SU_SC_MODE_CNTL"},
@@ -106,7 +106,7 @@ static const struct radeon_register R600_RASTERIZER_names[] = {
 	{0x00028E0C, 0, 0, "PA_SU_POLY_OFFSET_BACK_OFFSET"},
 };
 
-static const struct radeon_register R600_VIEWPORT_names[] = {
+static const struct radeon_register R600_names_VIEWPORT[] = {
 	{0x000282D0, 0, 0, "PA_SC_VPORT_ZMIN_0"},
 	{0x000282D4, 0, 0, "PA_SC_VPORT_ZMAX_0"},
 	{0x0002843C, 0, 0, "PA_CL_VPORT_XSCALE_0"},
@@ -118,7 +118,7 @@ static const struct radeon_register R600_VIEWPORT_names[] = {
 	{0x00028818, 0, 0, "PA_CL_VTE_CNTL"},
 };
 
-static const struct radeon_register R600_SCISSOR_names[] = {
+static const struct radeon_register R600_names_SCISSOR[] = {
 	{0x00028030, 0, 0, "PA_SC_SCREEN_SCISSOR_TL"},
 	{0x00028034, 0, 0, "PA_SC_SCREEN_SCISSOR_BR"},
 	{0x00028200, 0, 0, "PA_SC_WINDOW_OFFSET"},
@@ -140,7 +140,7 @@ static const struct radeon_register R600_SCISSOR_names[] = {
 	{0x00028254, 0, 0, "PA_SC_VPORT_SCISSOR_0_BR"},
 };
 
-static const struct radeon_register R600_BLEND_names[] = {
+static const struct radeon_register R600_names_BLEND[] = {
 	{0x00028414, 0, 0, "CB_BLEND_RED"},
 	{0x00028418, 0, 0, "CB_BLEND_GREEN"},
 	{0x0002841C, 0, 0, "CB_BLEND_BLUE"},
@@ -156,7 +156,7 @@ static const struct radeon_register R600_BLEND_names[] = {
 	{0x00028804, 0, 0, "CB_BLEND_CONTROL"},
 };
 
-static const struct radeon_register R600_DSA_names[] = {
+static const struct radeon_register R600_names_DSA[] = {
 	{0x00028028, 0, 0, "DB_STENCIL_CLEAR"},
 	{0x0002802C, 0, 0, "DB_DEPTH_CLEAR"},
 	{0x00028410, 0, 0, "SX_ALPHA_TEST_CONTROL"},
@@ -175,7 +175,7 @@ static const struct radeon_register R600_DSA_names[] = {
 	{0x00028D44, 0, 0, "DB_ALPHA_TO_MASK"},
 };
 
-static const struct radeon_register R600_VS_SHADER_names[] = {
+static const struct radeon_register R600_names_VS_SHADER[] = {
 	{0x00028380, 0, 0, "SQ_VTX_SEMANTIC_0"},
 	{0x00028384, 0, 0, "SQ_VTX_SEMANTIC_1"},
 	{0x00028388, 0, 0, "SQ_VTX_SEMANTIC_2"},
@@ -227,7 +227,7 @@ static const struct radeon_register R600_VS_SHADER_names[] = {
 	{0x000288DC, 0, 0, "SQ_PGM_CF_OFFSET_FS"},
 };
 
-static const struct radeon_register R600_PS_SHADER_names[] = {
+static const struct radeon_register R600_names_PS_SHADER[] = {
 	{0x00028644, 0, 0, "SPI_PS_INPUT_CNTL_0"},
 	{0x00028648, 0, 0, "SPI_PS_INPUT_CNTL_1"},
 	{0x0002864C, 0, 0, "SPI_PS_INPUT_CNTL_2"},
@@ -269,21 +269,48 @@ static const struct radeon_register R600_PS_SHADER_names[] = {
 	{0x000288CC, 0, 0, "SQ_PGM_CF_OFFSET_PS"},
 };
 
-static const struct radeon_register R600_PS_CONSTANT_names[] = {
+static const struct radeon_register R600_names_PS_CONSTANT[] = {
 	{0x00030000, 0, 0, "SQ_ALU_CONSTANT0_0"},
 	{0x00030004, 0, 0, "SQ_ALU_CONSTANT1_0"},
 	{0x00030008, 0, 0, "SQ_ALU_CONSTANT2_0"},
 	{0x0003000C, 0, 0, "SQ_ALU_CONSTANT3_0"},
 };
 
-static const struct radeon_register R600_VS_CONSTANT_names[] = {
+static const struct radeon_register R600_names_VS_CONSTANT[] = {
 	{0x00031000, 0, 0, "SQ_ALU_CONSTANT0_256"},
 	{0x00031004, 0, 0, "SQ_ALU_CONSTANT1_256"},
 	{0x00031008, 0, 0, "SQ_ALU_CONSTANT2_256"},
 	{0x0003100C, 0, 0, "SQ_ALU_CONSTANT3_256"},
 };
 
-static const struct radeon_register R600_PS_RESOURCE_names[] = {
+static const struct radeon_register R600_names_UCP[] = {
+	{0x00028E20, 0, 0, "PA_CL_UCP0_X"},
+	{0x00028E24, 0, 0, "PA_CL_UCP0_Y"},
+	{0x00028E28, 0, 0, "PA_CL_UCP0_Z"},
+	{0x00028E2C, 0, 0, "PA_CL_UCP0_W"},
+	{0x00028E30, 0, 0, "PA_CL_UCP1_X"},
+	{0x00028E34, 0, 0, "PA_CL_UCP1_Y"},
+	{0x00028E38, 0, 0, "PA_CL_UCP1_Z"},
+	{0x00028E3C, 0, 0, "PA_CL_UCP1_W"},
+	{0x00028E40, 0, 0, "PA_CL_UCP2_X"},
+	{0x00028E44, 0, 0, "PA_CL_UCP2_Y"},
+	{0x00028E48, 0, 0, "PA_CL_UCP2_Z"},
+	{0x00028E4C, 0, 0, "PA_CL_UCP2_W"},
+	{0x00028E50, 0, 0, "PA_CL_UCP3_X"},
+	{0x00028E54, 0, 0, "PA_CL_UCP3_Y"},
+	{0x00028E58, 0, 0, "PA_CL_UCP3_Z"},
+	{0x00028E5C, 0, 0, "PA_CL_UCP3_W"},
+	{0x00028E60, 0, 0, "PA_CL_UCP4_X"},
+	{0x00028E64, 0, 0, "PA_CL_UCP4_Y"},
+	{0x00028E68, 0, 0, "PA_CL_UCP4_Z"},
+	{0x00028E6C, 0, 0, "PA_CL_UCP4_W"},
+	{0x00028E70, 0, 0, "PA_CL_UCP5_X"},
+	{0x00028E74, 0, 0, "PA_CL_UCP5_Y"},
+	{0x00028E78, 0, 0, "PA_CL_UCP5_Z"},
+	{0x00028E7C, 0, 0, "PA_CL_UCP5_W"},
+};
+
+static const struct radeon_register R600_names_PS_RESOURCE[] = {
 	{0x00038000, 0, 0, "RESOURCE0_WORD0"},
 	{0x00038004, 0, 0, "RESOURCE0_WORD1"},
 	{0x00038008, 0, 0, "RESOURCE0_WORD2"},
@@ -293,7 +320,7 @@ static const struct radeon_register R600_PS_RESOURCE_names[] = {
 	{0x00038018, 0, 0, "RESOURCE0_WORD6"},
 };
 
-static const struct radeon_register R600_VS_RESOURCE_names[] = {
+static const struct radeon_register R600_names_VS_RESOURCE[] = {
 	{0x00039180, 0, 0, "RESOURCE160_WORD0"},
 	{0x00039184, 0, 0, "RESOURCE160_WORD1"},
 	{0x00039188, 0, 0, "RESOURCE160_WORD2"},
@@ -303,7 +330,7 @@ static const struct radeon_register R600_VS_RESOURCE_names[] = {
 	{0x00039198, 0, 0, "RESOURCE160_WORD6"},
 };
 
-static const struct radeon_register R600_FS_RESOURCE_names[] = {
+static const struct radeon_register R600_names_FS_RESOURCE[] = {
 	{0x0003A300, 0, 0, "RESOURCE320_WORD0"},
 	{0x0003A304, 0, 0, "RESOURCE320_WORD1"},
 	{0x0003A308, 0, 0, "RESOURCE320_WORD2"},
@@ -313,7 +340,7 @@ static const struct radeon_register R600_FS_RESOURCE_names[] = {
 	{0x0003A318, 0, 0, "RESOURCE320_WORD6"},
 };
 
-static const struct radeon_register R600_GS_RESOURCE_names[] = {
+static const struct radeon_register R600_names_GS_RESOURCE[] = {
 	{0x0003A4C0, 0, 0, "RESOURCE336_WORD0"},
 	{0x0003A4C4, 0, 0, "RESOURCE336_WORD1"},
 	{0x0003A4C8, 0, 0, "RESOURCE336_WORD2"},
@@ -323,46 +350,46 @@ static const struct radeon_register R600_GS_RESOURCE_names[] = {
 	{0x0003A4D8, 0, 0, "RESOURCE336_WORD6"},
 };
 
-static const struct radeon_register R600_PS_SAMPLER_names[] = {
+static const struct radeon_register R600_names_PS_SAMPLER[] = {
 	{0x0003C000, 0, 0, "SQ_TEX_SAMPLER_WORD0_0"},
 	{0x0003C004, 0, 0, "SQ_TEX_SAMPLER_WORD1_0"},
 	{0x0003C008, 0, 0, "SQ_TEX_SAMPLER_WORD2_0"},
 };
 
-static const struct radeon_register R600_VS_SAMPLER_names[] = {
+static const struct radeon_register R600_names_VS_SAMPLER[] = {
 	{0x0003C0D8, 0, 0, "SQ_TEX_SAMPLER_WORD0_18"},
 	{0x0003C0DC, 0, 0, "SQ_TEX_SAMPLER_WORD1_18"},
 	{0x0003C0E0, 0, 0, "SQ_TEX_SAMPLER_WORD2_18"},
 };
 
-static const struct radeon_register R600_GS_SAMPLER_names[] = {
+static const struct radeon_register R600_names_GS_SAMPLER[] = {
 	{0x0003C1B0, 0, 0, "SQ_TEX_SAMPLER_WORD0_36"},
 	{0x0003C1B4, 0, 0, "SQ_TEX_SAMPLER_WORD1_36"},
 	{0x0003C1B8, 0, 0, "SQ_TEX_SAMPLER_WORD2_36"},
 };
 
-static const struct radeon_register R600_PS_SAMPLER_BORDER_names[] = {
+static const struct radeon_register R600_names_PS_SAMPLER_BORDER[] = {
 	{0x0000A400, 0, 0, "TD_PS_SAMPLER0_BORDER_RED"},
 	{0x0000A404, 0, 0, "TD_PS_SAMPLER0_BORDER_GREEN"},
 	{0x0000A408, 0, 0, "TD_PS_SAMPLER0_BORDER_BLUE"},
 	{0x0000A40C, 0, 0, "TD_PS_SAMPLER0_BORDER_ALPHA"},
 };
 
-static const struct radeon_register R600_VS_SAMPLER_BORDER_names[] = {
+static const struct radeon_register R600_names_VS_SAMPLER_BORDER[] = {
 	{0x0000A600, 0, 0, "TD_VS_SAMPLER0_BORDER_RED"},
 	{0x0000A604, 0, 0, "TD_VS_SAMPLER0_BORDER_GREEN"},
 	{0x0000A608, 0, 0, "TD_VS_SAMPLER0_BORDER_BLUE"},
 	{0x0000A60C, 0, 0, "TD_VS_SAMPLER0_BORDER_ALPHA"},
 };
 
-static const struct radeon_register R600_GS_SAMPLER_BORDER_names[] = {
+static const struct radeon_register R600_names_GS_SAMPLER_BORDER[] = {
 	{0x0000A800, 0, 0, "TD_GS_SAMPLER0_BORDER_RED"},
 	{0x0000A804, 0, 0, "TD_GS_SAMPLER0_BORDER_GREEN"},
 	{0x0000A808, 0, 0, "TD_GS_SAMPLER0_BORDER_BLUE"},
 	{0x0000A80C, 0, 0, "TD_GS_SAMPLER0_BORDER_ALPHA"},
 };
 
-static const struct radeon_register R600_CB0_names[] = {
+static const struct radeon_register R600_names_CB0[] = {
 	{0x00028040, 1, 0, "CB_COLOR0_BASE"},
 	{0x000280A0, 0, 0, "CB_COLOR0_INFO"},
 	{0x00028060, 0, 0, "CB_COLOR0_SIZE"},
@@ -372,7 +399,7 @@ static const struct radeon_register R600_CB0_names[] = {
 	{0x00028100, 0, 0, "CB_COLOR0_MASK"},
 };
 
-static const struct radeon_register R600_CB1_names[] = {
+static const struct radeon_register R600_names_CB1[] = {
 	{0x00028044, 1, 0, "CB_COLOR1_BASE"},
 	{0x000280A4, 0, 0, "CB_COLOR1_INFO"},
 	{0x00028064, 0, 0, "CB_COLOR1_SIZE"},
@@ -382,7 +409,7 @@ static const struct radeon_register R600_CB1_names[] = {
 	{0x00028104, 0, 0, "CB_COLOR1_MASK"},
 };
 
-static const struct radeon_register R600_CB2_names[] = {
+static const struct radeon_register R600_names_CB2[] = {
 	{0x00028048, 1, 0, "CB_COLOR2_BASE"},
 	{0x000280A8, 0, 0, "CB_COLOR2_INFO"},
 	{0x00028068, 0, 0, "CB_COLOR2_SIZE"},
@@ -392,7 +419,7 @@ static const struct radeon_register R600_CB2_names[] = {
 	{0x00028108, 0, 0, "CB_COLOR2_MASK"},
 };
 
-static const struct radeon_register R600_CB3_names[] = {
+static const struct radeon_register R600_names_CB3[] = {
 	{0x0002804C, 1, 0, "CB_COLOR3_BASE"},
 	{0x000280AC, 0, 0, "CB_COLOR3_INFO"},
 	{0x0002806C, 0, 0, "CB_COLOR3_SIZE"},
@@ -402,7 +429,7 @@ static const struct radeon_register R600_CB3_names[] = {
 	{0x0002810C, 0, 0, "CB_COLOR3_MASK"},
 };
 
-static const struct radeon_register R600_CB4_names[] = {
+static const struct radeon_register R600_names_CB4[] = {
 	{0x00028050, 1, 0, "CB_COLOR4_BASE"},
 	{0x000280B0, 0, 0, "CB_COLOR4_INFO"},
 	{0x00028070, 0, 0, "CB_COLOR4_SIZE"},
@@ -412,7 +439,7 @@ static const struct radeon_register R600_CB4_names[] = {
 	{0x00028110, 0, 0, "CB_COLOR4_MASK"},
 };
 
-static const struct radeon_register R600_CB5_names[] = {
+static const struct radeon_register R600_names_CB5[] = {
 	{0x00028054, 1, 0, "CB_COLOR5_BASE"},
 	{0x000280B4, 0, 0, "CB_COLOR5_INFO"},
 	{0x00028074, 0, 0, "CB_COLOR5_SIZE"},
@@ -422,7 +449,7 @@ static const struct radeon_register R600_CB5_names[] = {
 	{0x00028114, 0, 0, "CB_COLOR5_MASK"},
 };
 
-static const struct radeon_register R600_CB6_names[] = {
+static const struct radeon_register R600_names_CB6[] = {
 	{0x00028058, 1, 0, "CB_COLOR6_BASE"},
 	{0x000280B8, 0, 0, "CB_COLOR6_INFO"},
 	{0x00028078, 0, 0, "CB_COLOR6_SIZE"},
@@ -432,7 +459,7 @@ static const struct radeon_register R600_CB6_names[] = {
 	{0x00028118, 0, 0, "CB_COLOR6_MASK"},
 };
 
-static const struct radeon_register R600_CB7_names[] = {
+static const struct radeon_register R600_names_CB7[] = {
 	{0x0002805C, 1, 0, "CB_COLOR7_BASE"},
 	{0x000280BC, 0, 0, "CB_COLOR7_INFO"},
 	{0x0002807C, 0, 0, "CB_COLOR7_SIZE"},
@@ -442,7 +469,7 @@ static const struct radeon_register R600_CB7_names[] = {
 	{0x0002811C, 0, 0, "CB_COLOR7_MASK"},
 };
 
-static const struct radeon_register R600_DB_names[] = {
+static const struct radeon_register R600_names_DB[] = {
 	{0x0002800C, 1, 0, "DB_DEPTH_BASE"},
 	{0x00028000, 0, 0, "DB_DEPTH_SIZE"},
 	{0x00028004, 0, 0, "DB_DEPTH_VIEW"},
@@ -451,7 +478,7 @@ static const struct radeon_register R600_DB_names[] = {
 	{0x00028D34, 0, 0, "DB_PREFETCH_LIMIT"},
 };
 
-static const struct radeon_register R600_VGT_names[] = {
+static const struct radeon_register R600_names_VGT[] = {
 	{0x00008958, 0, 0, "VGT_PRIMITIVE_TYPE"},
 	{0x00028400, 0, 0, "VGT_MAX_VTX_INDX"},
 	{0x00028404, 0, 0, "VGT_MIN_VTX_INDX"},
@@ -465,81 +492,15 @@ static const struct radeon_register R600_VGT_names[] = {
 	{0x00028AA4, 0, 0, "VGT_INSTANCE_STEP_RATE_1"},
 };
 
-static const struct radeon_register R600_DRAW_names[] = {
+static const struct radeon_register R600_names_DRAW[] = {
 	{0x00008970, 0, 0, "VGT_NUM_INDICES"},
 	{0x000287E4, 0, 0, "VGT_DMA_BASE_HI"},
 	{0x000287E8, 1, 0, "VGT_DMA_BASE"},
 	{0x000287F0, 0, 0, "VGT_DRAW_INITIATOR"},
 };
 
-static struct radeon_type R600_types[] = {
-	{ 128,    0, 0x00000000, 0x00000000, 0x0000, 0, "R600_CONFIG", 41, r600_state_pm4_config, R600_CONFIG_names},
-	{ 128,    1, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB_CNTL", 18, r600_state_pm4_generic, R600_CB_CNTL_names},
-	{ 128,    2, 0x00000000, 0x00000000, 0x0000, 0, "R600_RASTERIZER", 21, r600_state_pm4_generic, R600_RASTERIZER_names},
-	{ 128,    3, 0x00000000, 0x00000000, 0x0000, 0, "R600_VIEWPORT", 9, r600_state_pm4_generic, R600_VIEWPORT_names},
-	{ 128,    4, 0x00000000, 0x00000000, 0x0000, 0, "R600_SCISSOR", 19, r600_state_pm4_generic, R600_SCISSOR_names},
-	{ 128,    5, 0x00000000, 0x00000000, 0x0000, 0, "R600_BLEND", 13, r600_state_pm4_generic, R600_BLEND_names},
-	{ 128,    6, 0x00000000, 0x00000000, 0x0000, 0, "R600_DSA", 16, r600_state_pm4_generic, R600_DSA_names},
-	{ 128,    7, 0x00000000, 0x00000000, 0x0000, 0, "R600_VS_SHADER", 49, r600_state_pm4_shader, R600_VS_SHADER_names},
-	{ 128,    8, 0x00000000, 0x00000000, 0x0000, 0, "R600_PS_SHADER", 39, r600_state_pm4_shader, R600_PS_SHADER_names},
-	{ 128,    9, 0x00030000, 0x00031000, 0x0010, 0, "R600_PS_CONSTANT", 4, r600_state_pm4_generic, R600_PS_CONSTANT_names},
-	{ 128,  265, 0x00031000, 0x00032000, 0x0010, 0, "R600_VS_CONSTANT", 4, r600_state_pm4_generic, R600_VS_CONSTANT_names},
-	{ 128,  521, 0x00038000, 0x00039180, 0x001C, 0, "R600_PS_RESOURCE", 7, r600_state_pm4_resource, R600_PS_RESOURCE_names},
-	{ 128,  681, 0x00039180, 0x0003A300, 0x001C, 0, "R600_VS_RESOURCE", 7, r600_state_pm4_resource, R600_VS_RESOURCE_names},
-	{ 128,  841, 0x00039180, 0x0003A300, 0x001C, 0, "R600_FS_RESOURCE", 7, r600_state_pm4_resource, R600_FS_RESOURCE_names},
-	{ 128, 1001, 0x00039180, 0x0003A300, 0x001C, 0, "R600_GS_RESOURCE", 7, r600_state_pm4_resource, R600_GS_RESOURCE_names},
-	{ 128, 1161, 0x0003C000, 0x0003C0D8, 0x000C, 0, "R600_PS_SAMPLER", 3, r600_state_pm4_generic, R600_PS_SAMPLER_names},
-	{ 128, 1179, 0x0003C0D8, 0x0003C1B0, 0x000C, 0, "R600_VS_SAMPLER", 3, r600_state_pm4_generic, R600_VS_SAMPLER_names},
-	{ 128, 1197, 0x0003C1B0, 0x0003C288, 0x000C, 0, "R600_GS_SAMPLER", 3, r600_state_pm4_generic, R600_GS_SAMPLER_names},
-	{ 128, 1215, 0x0000A400, 0x0000A520, 0x0010, 0, "R600_PS_SAMPLER_BORDER", 4, r600_state_pm4_generic, R600_PS_SAMPLER_BORDER_names},
-	{ 128, 1233, 0x0000A600, 0x0000A720, 0x0010, 0, "R600_VS_SAMPLER_BORDER", 4, r600_state_pm4_generic, R600_VS_SAMPLER_BORDER_names},
-	{ 128, 1251, 0x0000A800, 0x0000A920, 0x0010, 0, "R600_GS_SAMPLER_BORDER", 4, r600_state_pm4_generic, R600_GS_SAMPLER_BORDER_names},
-	{ 128, 1269, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB0", 7, r600_state_pm4_cb0, R600_CB0_names},
-	{ 128, 1270, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB1", 7, r600_state_pm4_cb0, R600_CB1_names},
-	{ 128, 1271, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB2", 7, r600_state_pm4_cb0, R600_CB2_names},
-	{ 128, 1272, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB3", 7, r600_state_pm4_cb0, R600_CB3_names},
-	{ 128, 1273, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB4", 7, r600_state_pm4_cb0, R600_CB4_names},
-	{ 128, 1274, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB5", 7, r600_state_pm4_cb0, R600_CB5_names},
-	{ 128, 1275, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB6", 7, r600_state_pm4_cb0, R600_CB6_names},
-	{ 128, 1276, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB7", 7, r600_state_pm4_cb0, R600_CB7_names},
-	{ 128, 1277, 0x00000000, 0x00000000, 0x0000, 0, "R600_DB", 6, r600_state_pm4_db, R600_DB_names},
-	{ 128, 1278, 0x00000000, 0x00000000, 0x0000, 0, "R600_VGT", 11, r600_state_pm4_vgt, R600_VGT_names},
-	{ 128, 1279, 0x00000000, 0x00000000, 0x0000, 0, "R600_DRAW", 4, r600_state_pm4_draw, R600_DRAW_names},
-};
-
-static struct radeon_type R700_types[] = {
-	{ 128,    0, 0x00000000, 0x00000000, 0x0000, 0, "R600_CONFIG", 41, r700_state_pm4_config, R600_CONFIG_names},
-	{ 128,    1, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB_CNTL", 18, r600_state_pm4_generic, R600_CB_CNTL_names},
-	{ 128,    2, 0x00000000, 0x00000000, 0x0000, 0, "R600_RASTERIZER", 21, r600_state_pm4_generic, R600_RASTERIZER_names},
-	{ 128,    3, 0x00000000, 0x00000000, 0x0000, 0, "R600_VIEWPORT", 9, r600_state_pm4_generic, R600_VIEWPORT_names},
-	{ 128,    4, 0x00000000, 0x00000000, 0x0000, 0, "R600_SCISSOR", 19, r600_state_pm4_generic, R600_SCISSOR_names},
-	{ 128,    5, 0x00000000, 0x00000000, 0x0000, 0, "R600_BLEND", 13, r600_state_pm4_generic, R600_BLEND_names},
-	{ 128,    6, 0x00000000, 0x00000000, 0x0000, 0, "R600_DSA", 16, r600_state_pm4_generic, R600_DSA_names},
-	{ 128,    7, 0x00000000, 0x00000000, 0x0000, 0, "R600_VS_SHADER", 49, r600_state_pm4_shader, R600_VS_SHADER_names},
-	{ 128,    8, 0x00000000, 0x00000000, 0x0000, 0, "R600_PS_SHADER", 39, r600_state_pm4_shader, R600_PS_SHADER_names},
-	{ 128,    9, 0x00030000, 0x00031000, 0x0010, 0, "R600_PS_CONSTANT", 4, r600_state_pm4_generic, R600_PS_CONSTANT_names},
-	{ 128,  265, 0x00031000, 0x00032000, 0x0010, 0, "R600_VS_CONSTANT", 4, r600_state_pm4_generic, R600_VS_CONSTANT_names},
-	{ 128,  521, 0x00038000, 0x00039180, 0x001C, 0, "R600_PS_RESOURCE", 7, r600_state_pm4_resource, R600_PS_RESOURCE_names},
-	{ 128,  681, 0x00039180, 0x0003A300, 0x001C, 0, "R600_VS_RESOURCE", 7, r600_state_pm4_resource, R600_VS_RESOURCE_names},
-	{ 128,  841, 0x00039180, 0x0003A300, 0x001C, 0, "R600_FS_RESOURCE", 7, r600_state_pm4_resource, R600_FS_RESOURCE_names},
-	{ 128, 1001, 0x00039180, 0x0003A300, 0x001C, 0, "R600_GS_RESOURCE", 7, r600_state_pm4_resource, R600_GS_RESOURCE_names},
-	{ 128, 1161, 0x0003C000, 0x0003C0D8, 0x000C, 0, "R600_PS_SAMPLER", 3, r600_state_pm4_generic, R600_PS_SAMPLER_names},
-	{ 128, 1179, 0x0003C0D8, 0x0003C1B0, 0x000C, 0, "R600_VS_SAMPLER", 3, r600_state_pm4_generic, R600_VS_SAMPLER_names},
-	{ 128, 1197, 0x0003C1B0, 0x0003C288, 0x000C, 0, "R600_GS_SAMPLER", 3, r600_state_pm4_generic, R600_GS_SAMPLER_names},
-	{ 128, 1215, 0x0000A400, 0x0000A520, 0x0010, 0, "R600_PS_SAMPLER_BORDER", 4, r600_state_pm4_generic, R600_PS_SAMPLER_BORDER_names},
-	{ 128, 1233, 0x0000A600, 0x0000A720, 0x0010, 0, "R600_VS_SAMPLER_BORDER", 4, r600_state_pm4_generic, R600_VS_SAMPLER_BORDER_names},
-	{ 128, 1251, 0x0000A800, 0x0000A920, 0x0010, 0, "R600_GS_SAMPLER_BORDER", 4, r600_state_pm4_generic, R600_GS_SAMPLER_BORDER_names},
-	{ 128, 1269, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB0", 7, r700_state_pm4_cb0, R600_CB0_names},
-	{ 128, 1270, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB1", 7, r600_state_pm4_cb0, R600_CB1_names},
-	{ 128, 1271, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB2", 7, r600_state_pm4_cb0, R600_CB2_names},
-	{ 128, 1272, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB3", 7, r600_state_pm4_cb0, R600_CB3_names},
-	{ 128, 1273, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB4", 7, r600_state_pm4_cb0, R600_CB4_names},
-	{ 128, 1274, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB5", 7, r600_state_pm4_cb0, R600_CB5_names},
-	{ 128, 1275, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB6", 7, r600_state_pm4_cb0, R600_CB6_names},
-	{ 128, 1276, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB7", 7, r600_state_pm4_cb0, R600_CB7_names},
-	{ 128, 1277, 0x00000000, 0x00000000, 0x0000, 0, "R600_DB", 6, r700_state_pm4_db, R600_DB_names},
-	{ 128, 1278, 0x00000000, 0x00000000, 0x0000, 0, "R600_VGT", 11, r600_state_pm4_vgt, R600_VGT_names},
-	{ 128, 1279, 0x00000000, 0x00000000, 0x0000, 0, "R600_DRAW", 4, r600_state_pm4_draw, R600_DRAW_names},
+static const struct radeon_register R600_names_VGT_EVENT[] = {
+	{0x00028A90, 1, 0, "VGT_EVENT_INITIATOR"},
 };
 
 #endif
diff --git a/src/gallium/winsys/r600/drm/r600d.h b/src/gallium/winsys/r600/drm/r600d.h
index 5d13378627..e8c2dc0651 100644
--- a/src/gallium/winsys/r600/drm/r600d.h
+++ b/src/gallium/winsys/r600/drm/r600d.h
@@ -82,6 +82,9 @@
 #define PKT3_SET_CTL_CONST                     0x6F
 #define PKT3_SURFACE_BASE_UPDATE               0x73
 
+#define EVENT_TYPE_ZPASS_DONE                  0x15
+#define EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT   0x16
+
 #define PKT_TYPE_S(x)                   (((x) & 0x3) << 30)
 #define PKT_TYPE_G(x)                   (((x) >> 30) & 0x3)
 #define PKT_TYPE_C                      0x3FFFFFFF
diff --git a/src/gallium/winsys/r600/drm/radeon.c b/src/gallium/winsys/r600/drm/radeon.c
index 80b0a1d397..e2d813ebac 100644
--- a/src/gallium/winsys/r600/drm/radeon.c
+++ b/src/gallium/winsys/r600/drm/radeon.c
@@ -42,24 +42,13 @@ static int radeon_get_device(struct radeon *radeon)
 	return r;
 }
 
-/* symbol missing drove me crazy hack to get symbol exported */
-static void fake(void)
-{
-	struct radeon_ctx *ctx;
-	struct radeon_draw *draw;
-
-	ctx = radeon_ctx(NULL);
-	draw = radeon_draw(NULL);
-}
-
 struct radeon *radeon_new(int fd, unsigned device)
 {
 	struct radeon *radeon;
-	int r;
+	int r, i, id;
 
 	radeon = calloc(1, sizeof(*radeon));
 	if (radeon == NULL) {
-		fake();
 		return NULL;
 	}
 	radeon->fd = fd;
@@ -131,6 +120,19 @@ struct radeon *radeon_new(int fd, unsigned device)
 			__func__, radeon->device);
 		break;
 	}
+	radeon->state_type_id = calloc(radeon->nstype, sizeof(unsigned));
+	if (radeon->state_type_id == NULL) {
+		return radeon_decref(radeon);
+	}
+	for (i = 0, id = 0; i < radeon->nstype; i++) {
+		radeon->state_type_id[i] = id;
+		for (int j = 0; j < radeon->nstype; j++) {
+			if (radeon->stype[j].stype != i)
+				continue;
+			id += radeon->stype[j].num;
+		}
+	}
+	radeon->nstate_per_shader = id;
 	return radeon;
 }
 
@@ -153,47 +155,3 @@ struct radeon *radeon_decref(struct radeon *radeon)
 	free(radeon);
 	return NULL;
 }
-
-int radeon_reg_id(struct radeon *radeon, unsigned offset, unsigned *typeid, unsigned *stateid, unsigned *id)
-{
-	unsigned i, j;
-
-	for (i = 0; i < radeon->ntype; i++) {
-		if (radeon->type[i].range_start) {
-			if (offset >= radeon->type[i].range_start && offset < radeon->type[i].range_end) {
-				*typeid = i;
-				j = offset - radeon->type[i].range_start;
-				j /= radeon->type[i].stride;
-				*stateid = radeon->type[i].id + j;
-				*id = (offset - radeon->type[i].range_start - radeon->type[i].stride * j) / 4;
-				return 0;
-			}
-		} else {
-			for (j = 0; j < radeon->type[i].nstates; j++) {
-				if (radeon->type[i].regs[j].offset == offset) {
-					*typeid = i;
-					*stateid = radeon->type[i].id;
-					*id = j;
-					return 0;
-				}
-			}
-		}
-	}
-	fprintf(stderr, "%s unknown register 0x%08X\n", __func__, offset);
-	return -EINVAL;
-}
-
-unsigned radeon_type_from_id(struct radeon *radeon, unsigned id)
-{
-	unsigned i;
-
-	for (i = 0; i < radeon->ntype - 1; i++) {
-		if (radeon->type[i].id == id)
-			return i;
-		if (id > radeon->type[i].id && id < radeon->type[i + 1].id)
-			return i;
-	}
-	if (radeon->type[i].id == id)
-		return i;
-	return -1;
-}
diff --git a/src/gallium/winsys/r600/drm/radeon_bo.c b/src/gallium/winsys/r600/drm/radeon_bo.c
index f259ae7fb5..a1306f6e9d 100644
--- a/src/gallium/winsys/r600/drm/radeon_bo.c
+++ b/src/gallium/winsys/r600/drm/radeon_bo.c
@@ -145,7 +145,9 @@ struct radeon_bo *radeon_bo_decref(struct radeon *radeon, struct radeon_bo *bo)
 		return NULL;
 	}
 
-	munmap(bo->data, bo->size);
+	if (bo->map_count) {
+		munmap(bo->data, bo->size);
+	}
 	memset(&args, 0, sizeof(args));
 	args.handle = bo->handle;
 	drmIoctl(radeon->fd, DRM_IOCTL_GEM_CLOSE, &args);
diff --git a/src/gallium/winsys/r600/drm/radeon_ctx.c b/src/gallium/winsys/r600/drm/radeon_ctx.c
index 45b706bb0f..47fca76136 100644
--- a/src/gallium/winsys/r600/drm/radeon_ctx.c
+++ b/src/gallium/winsys/r600/drm/radeon_ctx.c
@@ -30,21 +30,16 @@
 #include "radeon_drm.h"
 #include "bof.h"
 
-int radeon_ctx_set_bo_new(struct radeon_ctx *ctx, struct radeon_bo *bo)
+static int radeon_ctx_set_bo_new(struct radeon_ctx *ctx, struct radeon_bo *bo)
 {
-	void *ptr;
-
-	ptr = realloc(ctx->bo, sizeof(struct radeon_bo) * (ctx->nbo + 1));
-	if (ptr == NULL) {
-		return -ENOMEM;
-	}
-	ctx->bo = ptr;
+	if (ctx->nbo >= RADEON_CTX_MAX_PM4)
+		return -EBUSY;
 	ctx->bo[ctx->nbo] = bo;
 	ctx->nbo++;
 	return 0;
 }
 
-struct radeon_bo *radeon_ctx_get_bo(struct radeon_ctx *ctx, unsigned reloc)
+static struct radeon_bo *radeon_ctx_get_bo(struct radeon_ctx *ctx, unsigned reloc)
 {
 	struct radeon_cs_reloc *greloc;
 	unsigned i;
@@ -59,7 +54,7 @@ struct radeon_bo *radeon_ctx_get_bo(struct radeon_ctx *ctx, unsigned reloc)
 	return NULL;
 }
 
-void radeon_ctx_get_placement(struct radeon_ctx *ctx, unsigned reloc, u32 *placement)
+static void radeon_ctx_get_placement(struct radeon_ctx *ctx, unsigned reloc, u32 *placement)
 {
 	struct radeon_cs_reloc *greloc;
 	unsigned i;
@@ -76,50 +71,57 @@ void radeon_ctx_get_placement(struct radeon_ctx *ctx, unsigned reloc, u32 *place
 	}
 }
 
-struct radeon_ctx *radeon_ctx(struct radeon *radeon)
+void radeon_ctx_clear(struct radeon_ctx *ctx)
 {
-	struct radeon_ctx *ctx;
-
-	if (radeon == NULL)
-		return NULL;
-	ctx = calloc(1, sizeof(*ctx));
-	if (ctx == NULL)
-		return NULL;
-	ctx->radeon = radeon_incref(radeon);
-	return ctx;
+	for (int i = 0; i < ctx->nbo; i++) {
+		ctx->bo[i] = radeon_bo_decref(ctx->radeon, ctx->bo[i]);
+	}
+	ctx->ndwords = RADEON_CTX_MAX_PM4;
+	ctx->cdwords = 0;
+	ctx->nreloc = 0;
+	ctx->nbo = 0;
 }
 
-struct radeon_ctx *radeon_ctx_incref(struct radeon_ctx *ctx)
+int radeon_ctx_init(struct radeon_ctx *ctx, struct radeon *radeon)
 {
-	ctx->refcount++;
-	return ctx;
+	if (radeon == NULL)
+		return -EINVAL;
+	memset(ctx, 0, sizeof(struct radeon_ctx));
+	ctx->radeon = radeon_incref(radeon);
+	radeon_ctx_clear(ctx);
+	ctx->pm4 = malloc(RADEON_CTX_MAX_PM4 * 4);
+	if (ctx->pm4 == NULL) {
+		radeon_ctx_fini(ctx);
+		return -ENOMEM;
+	}
+	ctx->reloc = malloc(sizeof(struct radeon_cs_reloc) * RADEON_CTX_MAX_PM4);
+	if (ctx->reloc == NULL) {
+		radeon_ctx_fini(ctx);
+		return -ENOMEM;
+	}
+	ctx->bo = malloc(sizeof(void *) * RADEON_CTX_MAX_PM4);
+	if (ctx->bo == NULL) {
+		radeon_ctx_fini(ctx);
+		return -ENOMEM;
+	}
+	return 0;
 }
 
-struct radeon_ctx *radeon_ctx_decref(struct radeon_ctx *ctx)
+void radeon_ctx_fini(struct radeon_ctx *ctx)
 {
 	unsigned i;
 
 	if (ctx == NULL)
-		return NULL;
-	if (--ctx->refcount > 0) {
-		return NULL;
-	}
+		return;
 
-	for (i = 0; i < ctx->ndraw; i++) {
-		ctx->draw[i] = radeon_draw_decref(ctx->draw[i]);
-	}
 	for (i = 0; i < ctx->nbo; i++) {
 		ctx->bo[i] = radeon_bo_decref(ctx->radeon, ctx->bo[i]);
 	}
 	ctx->radeon = radeon_decref(ctx->radeon);
-	free(ctx->state);
-	free(ctx->draw);
 	free(ctx->bo);
 	free(ctx->pm4);
 	free(ctx->reloc);
-	memset(ctx, 0, sizeof(*ctx));
-	free(ctx);
-	return NULL;
+	memset(ctx, 0, sizeof(struct radeon_ctx));
 }
 
 static int radeon_ctx_state_bo(struct radeon_ctx *ctx, struct radeon_state *state)
@@ -152,17 +154,17 @@ int radeon_ctx_submit(struct radeon_ctx *ctx)
 	uint64_t chunk_array[2];
 	int r = 0;
 
-	if (!ctx->cpm4)
+	if (!ctx->cdwords)
 		return 0;
 #if 0
-	for (r = 0; r < ctx->cpm4; r++) {
+	for (r = 0; r < ctx->cdwords; r++) {
 		fprintf(stderr, "0x%08X\n", ctx->pm4[r]);
 	}
 #endif
 	drmib.num_chunks = 2;
 	drmib.chunks = (uint64_t)(uintptr_t)chunk_array;
 	chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
-	chunks[0].length_dw = ctx->cpm4;
+	chunks[0].length_dw = ctx->cdwords;
 	chunks[0].chunk_data = (uint64_t)(uintptr_t)ctx->pm4;
 	chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
 	chunks[1].length_dw = ctx->nreloc * sizeof(struct radeon_cs_reloc) / 4;
@@ -180,7 +182,6 @@ static int radeon_ctx_reloc(struct radeon_ctx *ctx, struct radeon_bo *bo,
 			unsigned id, unsigned *placement)
 {
 	unsigned i;
-	struct radeon_cs_reloc *ptr;
 
 	for (i = 0; i < ctx->nreloc; i++) {
 		if (ctx->reloc[i].handle == bo->handle) {
@@ -188,14 +189,13 @@ static int radeon_ctx_reloc(struct radeon_ctx *ctx, struct radeon_bo *bo,
 			return 0;
 		}
 	}
-	ptr = realloc(ctx->reloc, sizeof(struct radeon_cs_reloc) * (ctx->nreloc + 1));
-	if (ptr == NULL)
-		return -ENOMEM;
-	ctx->reloc = ptr;
-	ptr[ctx->nreloc].handle = bo->handle;
-	ptr[ctx->nreloc].read_domain = placement[0] | placement [1];
-	ptr[ctx->nreloc].write_domain = placement[0] | placement [1];
-	ptr[ctx->nreloc].flags = 0;
+	if (ctx->nreloc >= RADEON_CTX_MAX_PM4) {
+		return -EBUSY;
+	}
+	ctx->reloc[ctx->nreloc].handle = bo->handle;
+	ctx->reloc[ctx->nreloc].read_domain = placement[0] | placement [1];
+	ctx->reloc[ctx->nreloc].write_domain = placement[0] | placement [1];
+	ctx->reloc[ctx->nreloc].flags = 0;
 	ctx->pm4[id] = ctx->nreloc * sizeof(struct radeon_cs_reloc) / 4;
 	ctx->nreloc++;
 	return 0;
@@ -208,97 +208,80 @@ static int radeon_ctx_state_schedule(struct radeon_ctx *ctx, struct radeon_state
 
 	if (state == NULL)
 		return 0;
-	memcpy(&ctx->pm4[ctx->id], state->pm4, state->cpm4 * 4);
+	if (state->cpm4 > ctx->ndwords) {
+		return -EBUSY;
+	}
+	memcpy(&ctx->pm4[ctx->cdwords], state->pm4, state->cpm4 * 4);
 	for (i = 0; i < state->nreloc; i++) {
 		rid = state->reloc_pm4_id[i];
 		bid = state->reloc_bo_id[i];
-		cid = ctx->id + rid;
+		cid = ctx->cdwords + rid;
 		r = radeon_ctx_reloc(ctx, state->bo[bid], cid,
 					&state->placement[bid * 2]);
 		if (r) {
-			fprintf(stderr, "%s state %d failed to reloc\n", __func__, state->type);
+			fprintf(stderr, "%s state %d failed to reloc\n", __func__, state->stype->stype);
 			return r;
 		}
 	}
-	ctx->id += state->cpm4;
+	ctx->cdwords += state->cpm4;
+	ctx->ndwords -= state->cpm4;
 	return 0;
 }
 
-int radeon_ctx_set_draw_new(struct radeon_ctx *ctx, struct radeon_draw *draw)
+int radeon_ctx_set_query_state(struct radeon_ctx *ctx, struct radeon_state *state)
 {
-	struct radeon_draw *pdraw = NULL;
-	struct radeon_draw **ndraw;
-	struct radeon_state *nstate, *ostate;
-	unsigned cpm4, i, cstate;
-	void *tmp;
 	int r = 0;
 
-	ndraw = realloc(ctx->draw, sizeof(void*) * (ctx->ndraw + 1));
-	if (ndraw == NULL)
-		return -ENOMEM;
-	ctx->draw = ndraw;
-	for (i = 0; i < draw->nstate; i++) {
-		r = radeon_ctx_state_bo(ctx, draw->state[i]);
-		if (r)
-			return r;
-	}
-	r = radeon_draw_check(draw);
+	/* !!! ONLY ACCEPT QUERY STATE HERE !!! */
+	r = radeon_state_pm4(state);
 	if (r)
 		return r;
-	if (draw->cpm4 >= RADEON_CTX_MAX_PM4) {
-		fprintf(stderr, "%s single draw too big %d, max %d\n",
-			__func__, draw->cpm4, RADEON_CTX_MAX_PM4);
+	/* BEGIN/END queries are balanced within the same cs, so reserve room
+	 * for the END query when scheduling the BEGIN query
+	 */
+	switch (state->stype->stype) {
+	case R600_STATE_QUERY_BEGIN:
+		/* is there enough place for begin & end */
+		if ((state->cpm4 * 2) > ctx->ndwords)
+			return -EBUSY;
+		ctx->ndwords -= state->cpm4;
+		break;
+	case R600_STATE_QUERY_END:
+		ctx->ndwords += state->cpm4;
+		break;
+	default:
 		return -EINVAL;
 	}
-	tmp = realloc(ctx->state, (ctx->nstate + draw->nstate) * sizeof(void*));
-	if (tmp == NULL)
-		return -ENOMEM;
-	ctx->state = tmp;
-	pdraw = ctx->cdraw;
-	for (i = 0, cpm4 = 0, cstate = ctx->nstate; i < draw->nstate - 1; i++) {
-		nstate = draw->state[i];
-		if (nstate) {
-			if (pdraw && pdraw->state[i]) {
-				ostate = pdraw->state[i];
-				if (ostate->pm4_crc != nstate->pm4_crc) {
-					ctx->state[cstate++] = nstate;
-					cpm4 += nstate->cpm4;
-				}
-			} else {
-				ctx->state[cstate++] = nstate;
-				cpm4 += nstate->cpm4;
-			}
-		}
-	}
-	/* The last state is the draw state always add it */
-	if (draw->state[i] == NULL) {
-		fprintf(stderr, "%s no draw command\n", __func__);
-		return -EINVAL;
-	}
-	ctx->state[cstate++] = draw->state[i];
-	cpm4 += draw->state[i]->cpm4;
-	if ((ctx->draw_cpm4 + cpm4) > RADEON_CTX_MAX_PM4) {
-		/* need to flush */
-		return -EBUSY;
-	}
-	ctx->draw_cpm4 += cpm4;
-	ctx->nstate = cstate;
-	ctx->draw[ctx->ndraw++] = draw;
-	ctx->cdraw = draw;
-	return 0;
+	return radeon_ctx_state_schedule(ctx, state);
 }
 
 int radeon_ctx_set_draw(struct radeon_ctx *ctx, struct radeon_draw *draw)
 {
-	int r;
+	unsigned previous_cdwords;
+	int r = 0;
 
-	radeon_draw_incref(draw);
-	r = radeon_ctx_set_draw_new(ctx, draw);
-	if (r)
-		radeon_draw_decref(draw);
-	return r;
+	for (int i = 0; i < (ctx->radeon->nstate_per_shader * R600_SHADER_MAX); i++) {
+		r = radeon_ctx_state_bo(ctx, draw->state[i]);
+		if (r)
+			return r;
+	}
+	previous_cdwords = ctx->cdwords;
+	for (int i = 0, id = 0; i < ctx->radeon->nstate_per_shader; i++) {
+		for (int j = 0; j < R600_SHADER_MAX; j++) {
+			id = j * ctx->radeon->nstate_per_shader + i;
+			r = radeon_ctx_state_schedule(ctx, draw->state[id]); /* NULL slot: no-op */
+			if (r) {
+				/* undo partial scheduling: hand back the dwords consumed so far */
+				ctx->ndwords += ctx->cdwords - previous_cdwords;
+				ctx->cdwords = previous_cdwords;
+				return r;
+			}
+		}
+	}
+	return 0;
 }
 
+#if 0
 int radeon_ctx_pm4(struct radeon_ctx *ctx)
 {
 	unsigned i;
@@ -310,9 +293,6 @@ int radeon_ctx_pm4(struct radeon_ctx *ctx)
 	if (ctx->pm4 == NULL)
 		return -EINVAL;
 	for (i = 0, ctx->id = 0; i < ctx->nstate; i++) {
-		r = radeon_ctx_state_schedule(ctx, ctx->state[i]);
-		if (r)
-			return r;
 	}
 	if (ctx->id != ctx->draw_cpm4) {
 		fprintf(stderr, "%s miss predicted pm4 size %d for %d\n",
@@ -322,6 +302,7 @@ int radeon_ctx_pm4(struct radeon_ctx *ctx)
 	ctx->cpm4 = ctx->draw_cpm4;
 	return 0;
 }
+#endif
 
 void radeon_ctx_dump_bof(struct radeon_ctx *ctx, const char *file)
 {
@@ -349,8 +330,8 @@ printf("%d relocs\n", ctx->nreloc);
 	bof_decref(blob);
 	blob = NULL;
 	/* dump cs */
-printf("%d pm4\n", ctx->cpm4);
-	blob = bof_blob(ctx->cpm4 * 4, ctx->pm4);
+printf("%d pm4\n", ctx->cdwords);
+	blob = bof_blob(ctx->cdwords * 4, ctx->pm4);
 	if (blob == NULL)
 		goto out_err;
 	if (bof_object_set(root, "pm4", blob))
@@ -366,7 +347,6 @@ printf("%d pm4\n", ctx->cpm4);
 		if (bo == NULL)
 			goto out_err;
 		size = bof_int32(ctx->bo[i]->size);
-printf("[%d] %d bo\n", i, size);
 		if (size == NULL)
 			goto out_err;
 		if (bof_object_set(bo, "size", size))
diff --git a/src/gallium/winsys/r600/drm/radeon_draw.c b/src/gallium/winsys/r600/drm/radeon_draw.c
index 4413ed79fb..b992c4a55d 100644
--- a/src/gallium/winsys/r600/drm/radeon_draw.c
+++ b/src/gallium/winsys/r600/drm/radeon_draw.c
@@ -31,111 +31,27 @@
 /*
  * draw functions
  */
-struct radeon_draw *radeon_draw(struct radeon *radeon)
+int radeon_draw_init(struct radeon_draw *draw, struct radeon *radeon)
 {
-	struct radeon_draw *draw;
-
-	draw = calloc(1, sizeof(*draw));
-	if (draw == NULL)
-		return NULL;
-	draw->nstate = radeon->nstate;
 	draw->radeon = radeon;
-	draw->refcount = 1;
-	draw->state = calloc(1, sizeof(void*) * draw->nstate);
-	if (draw->state == NULL) {
-		free(draw);
-		return NULL;
-	}
-	return draw;
-}
-
-struct radeon_draw *radeon_draw_incref(struct radeon_draw *draw)
-{
-	draw->refcount++;
-	return draw;
-}
-
-struct radeon_draw *radeon_draw_decref(struct radeon_draw *draw)
-{
-	unsigned i;
-
-	if (draw == NULL)
-		return NULL;
-	if (--draw->refcount > 0)
-		return NULL;
-	for (i = 0; i < draw->nstate; i++) {
-		draw->state[i] = radeon_state_decref(draw->state[i]);
-	}
-	free(draw->state);
-	memset(draw, 0, sizeof(*draw));
-	free(draw);
-	return NULL;
-}
-
-int radeon_draw_set_new(struct radeon_draw *draw, struct radeon_state *state)
-{
-	if (state == NULL)
-		return 0;
-	if (state->type >= draw->radeon->ntype)
-		return -EINVAL;
-	draw->state[state->id] = radeon_state_decref(draw->state[state->id]);
-	draw->state[state->id] = state;
+	draw->state = calloc(radeon->nstate_per_shader * R600_SHADER_MAX, sizeof(void*));
+	if (draw->state == NULL)
+		return -ENOMEM;
 	return 0;
 }
 
-int radeon_draw_set(struct radeon_draw *draw, struct radeon_state *state)
+void radeon_draw_bind(struct radeon_draw *draw, struct radeon_state *state)
 {
 	if (state == NULL)
-		return 0;
-	radeon_state_incref(state);
-	return radeon_draw_set_new(draw, state);
+		return;
+	draw->state[state->state_id] = state;
 }
 
-int radeon_draw_check(struct radeon_draw *draw)
+void radeon_draw_unbind(struct radeon_draw *draw, struct radeon_state *state)
 {
-	unsigned i;
-	int r;
-
-	r = radeon_draw_pm4(draw);
-	if (r)
-		return r;
-	for (i = 0, draw->cpm4 = 0; i < draw->nstate; i++) {
-		if (draw->state[i]) {
-			draw->cpm4 += draw->state[i]->cpm4;
-		}
-	}
-	return 0;
-}
-
-struct radeon_draw *radeon_draw_duplicate(struct radeon_draw *draw)
-{
-	struct radeon_draw *ndraw;
-	unsigned i;
-
-	if (draw == NULL)
-		return NULL;
-	ndraw = radeon_draw(draw->radeon);
-	if (ndraw == NULL) {
-		return NULL;
-	}
-	for (i = 0; i < draw->nstate; i++) {
-		if (radeon_draw_set(ndraw, draw->state[i])) {
-			radeon_draw_decref(ndraw);
-			return NULL;
-		}
-	}
-	return ndraw;
-}
-
-int radeon_draw_pm4(struct radeon_draw *draw)
-{
-	unsigned i;
-	int r;
-
-	for (i = 0; i < draw->nstate; i++) {
-		r = radeon_state_pm4(draw->state[i]);
-		if (r)
-			return r;
+	if (state == NULL)
+		return;
+	if (draw->state[state->state_id] == state) {
+		draw->state[state->state_id] = NULL;
 	}
-	return 0;
 }
diff --git a/src/gallium/winsys/r600/drm/radeon_priv.h b/src/gallium/winsys/r600/drm/radeon_priv.h
index 96c0d060f7..84e552ba4d 100644
--- a/src/gallium/winsys/r600/drm/radeon_priv.h
+++ b/src/gallium/winsys/r600/drm/radeon_priv.h
@@ -37,17 +37,20 @@ struct radeon_register {
 	char				name[64];
 };
 
-struct radeon_type {
-	unsigned			npm4;
-	unsigned			id;
-	unsigned			range_start;
-	unsigned			range_end;
-	unsigned			stride;
-	unsigned			immediate;
-	char				name[64];
+struct radeon_sub_type {
+	int				shader_type;
+	const struct radeon_register	*regs;
 	unsigned			nstates;
+};
+
+struct radeon_stype_info {
+	unsigned			stype;
+	unsigned			num;
+	unsigned			stride;
 	radeon_state_pm4_t		pm4;
-	const struct radeon_register	*regs;
+	struct radeon_sub_type		reginfo[R600_SHADER_MAX];
+	unsigned			base_id;
+	unsigned			npm4;
 };
 
 struct radeon {
@@ -55,9 +58,10 @@ struct radeon {
 	int				refcount;
 	unsigned			device;
 	unsigned			family;
-	unsigned			nstate;
-	unsigned			ntype;
-	const struct radeon_type	*type;
+	unsigned			nstype;
+	unsigned			nstate_per_shader;
+	unsigned			*state_type_id;
+	struct radeon_stype_info	*stype;
 };
 
 extern struct radeon *radeon_new(int fd, unsigned device);
@@ -65,15 +69,6 @@ extern struct radeon *radeon_incref(struct radeon *radeon);
 extern struct radeon *radeon_decref(struct radeon *radeon);
 extern unsigned radeon_family_from_device(unsigned device);
 extern int radeon_is_family_compatible(unsigned family1, unsigned family2);
-extern int radeon_reg_id(struct radeon *radeon, unsigned offset, unsigned *typeid, unsigned *stateid, unsigned *id);
-extern unsigned radeon_type_from_id(struct radeon *radeon, unsigned id);
-
-
-int radeon_ctx_set_bo_new(struct radeon_ctx *ctx, struct radeon_bo *bo);
-struct radeon_bo *radeon_ctx_get_bo(struct radeon_ctx *ctx, unsigned reloc);
-void radeon_ctx_get_placement(struct radeon_ctx *ctx, unsigned reloc, u32 *placement);
-int radeon_ctx_set_draw_new(struct radeon_ctx *ctx, struct radeon_draw *draw);
-int radeon_ctx_draw(struct radeon_ctx *ctx);
 
 /*
  * r600/r700 context functions
diff --git a/src/gallium/winsys/r600/drm/radeon_state.c b/src/gallium/winsys/r600/drm/radeon_state.c
index 308288557a..ac60485b28 100644
--- a/src/gallium/winsys/r600/drm/radeon_state.c
+++ b/src/gallium/winsys/r600/drm/radeon_state.c
@@ -32,82 +32,116 @@
 /*
  * state core functions
  */
-struct radeon_state *radeon_state(struct radeon *radeon, u32 type, u32 id)
+int radeon_state_init(struct radeon_state *state, struct radeon *radeon, u32 stype, u32 id, u32 shader_type)
 {
-	struct radeon_state *state;
+	struct radeon_stype_info *found = NULL;
+	int i, j, shader_index = -1;
 
-	if (type > radeon->ntype) {
-		fprintf(stderr, "%s invalid type %d\n", __func__, type);
-		return NULL;
+	/* find the stype entry matching the type, shader class and id */
+	for (i = 0; i < radeon->nstype; i++) {
+		shader_index = -1; /* never reuse a match from a rejected entry */
+		if (stype != radeon->stype[i].stype)
+			continue;
+		if (shader_type) {
+			for (j = 0; j < R600_SHADER_MAX; j++) {
+				if (radeon->stype[i].reginfo[j].shader_type == shader_type) {
+					shader_index = j;
+					break;
+				}
+			}
+			if (shader_index == -1)
+				continue;
+		} else {
+			if (radeon->stype[i].reginfo[0].shader_type)
+				continue;
+			else
+				shader_index = 0;
+		}
+		/* ids are 0-based; id == num would alias the next type's slot */
+		if (id >= radeon->stype[i].num)
+			continue;
+		found = &radeon->stype[i];
+		break;
 	}
-	if (id > radeon->nstate) {
-		fprintf(stderr, "%s invalid state id %d\n", __func__, id);
-		return NULL;
+
+	if (!found) {
+		fprintf(stderr, "%s invalid type %d/id %d/shader class %d\n", __func__, stype, id, shader_type);
+		return -EINVAL;
 	}
-	state = calloc(1, sizeof(*state));
-	if (state == NULL)
-		return NULL;
+
+	memset(state, 0, sizeof(struct radeon_state));
+	state->state_id = radeon->nstate_per_shader * shader_index + radeon->state_type_id[stype] + id;
+	state->stype = found;
 	state->radeon = radeon;
-	state->type = type;
 	state->id = id;
+	state->shader_index = shader_index;
 	state->refcount = 1;
-	state->npm4 = radeon->type[type].npm4;
-	state->nstates = radeon->type[type].nstates;
-	state->states = calloc(1, state->nstates * 4);
-	state->pm4 = calloc(1, radeon->type[type].npm4 * 4);
-	if (state->states == NULL || state->pm4 == NULL) {
-		radeon_state_decref(state);
-		return NULL;
-	}
-	return state;
+	state->npm4 = found->npm4;
+	state->nstates = found->reginfo[shader_index].nstates;
+	return 0;
 }
 
-struct radeon_state *radeon_state_duplicate(struct radeon_state *state)
+int radeon_state_convert(struct radeon_state *state, u32 stype, u32 id, u32 shader_type)
 {
-	struct radeon_state *nstate = radeon_state(state->radeon, state->type, state->id);
-	unsigned i;
+	struct radeon_stype_info *found = NULL;
+	int i, j, shader_index = -1;
 
 	if (state == NULL)
-		return NULL;
-	nstate->cpm4 = state->cpm4;
-	nstate->nbo = state->nbo;
-	nstate->nreloc = state->nreloc;
-	memcpy(nstate->states, state->states, state->nstates * 4);
-	memcpy(nstate->pm4, state->pm4, state->npm4 * 4);
-	memcpy(nstate->placement, state->placement, 8 * 4);
-	memcpy(nstate->reloc_pm4_id, state->reloc_pm4_id, 8 * 4);
-	memcpy(nstate->reloc_bo_id, state->reloc_bo_id, 8 * 4);
-	memcpy(nstate->bo_dirty, state->bo_dirty, 4 * 4);
-	for (i = 0; i < state->nbo; i++) {
-		nstate->bo[i] = radeon_bo_incref(state->radeon, state->bo[i]);
+		return 0;
+	/* find the stype entry matching the type, shader class and id */
+	for (i = 0; i < state->radeon->nstype; i++) {
+		shader_index = -1; /* never reuse a match from a rejected entry */
+		if (stype != state->radeon->stype[i].stype)
+			continue;
+		if (shader_type) {
+			for (j = 0; j < R600_SHADER_MAX; j++) {
+				if (state->radeon->stype[i].reginfo[j].shader_type == shader_type) {
+					shader_index = j;
+					break;
+				}
+			}
+			if (shader_index == -1)
+				continue;
+		} else {
+			if (state->radeon->stype[i].reginfo[0].shader_type)
+				continue;
+			else
+				shader_index = 0;
+		}
+		/* ids are 0-based; id == num would alias the next type's slot */
+		if (id >= state->radeon->stype[i].num)
+			continue;
+		found = &state->radeon->stype[i];
+		break;
 	}
-	return nstate;
-}
 
-struct radeon_state *radeon_state_incref(struct radeon_state *state)
-{
-	state->refcount++;
-	return state;
+	if (!found) {
+		fprintf(stderr, "%s invalid type %d/id %d/shader class %d\n", __func__, stype, id, shader_type);
+		return -EINVAL;
+	}
+
+	if (found->reginfo[shader_index].nstates != state->nstates) {
+		fprintf(stderr, "invalid type change from (%d %d %d) to (%d %d %d)\n",
+			state->stype->stype, state->id, state->shader_index, stype, id, shader_index);
+	}
+
+	state->stype = found;
+	state->id = id;
+	state->shader_index = shader_index;
+	state->state_id = state->radeon->nstate_per_shader * shader_index + state->radeon->state_type_id[stype] + id;
+	return radeon_state_pm4(state);
 }
 
-struct radeon_state *radeon_state_decref(struct radeon_state *state)
+void radeon_state_fini(struct radeon_state *state)
 {
 	unsigned i;
 
 	if (state == NULL)
-		return NULL;
-	if (--state->refcount > 0) {
-		return NULL;
-	}
+		return;
 	for (i = 0; i < state->nbo; i++) {
 		state->bo[i] = radeon_bo_decref(state->radeon, state->bo[i]);
 	}
-	free(state->immd);
-	free(state->states);
-	free(state->pm4);
-	memset(state, 0, sizeof(*state));
-	free(state);
-	return NULL;
+	memset(state, 0, sizeof(struct radeon_state));
 }
 
 int radeon_state_replace_always(struct radeon_state *ostate,
@@ -147,12 +181,13 @@ int radeon_state_pm4(struct radeon_state *state)
 {
 	int r;
 
-	if (state == NULL || state->cpm4)
+	if (state == NULL)
 		return 0;
-	r = state->radeon->type[state->type].pm4(state);
+	state->cpm4 = 0;
+	r = state->stype->pm4(state);
 	if (r) {
 		fprintf(stderr, "%s failed to build PM4 for state(%d %d)\n",
-			__func__, state->type, state->id);
+			__func__, state->stype->stype, state->id);
 		return r;
 	}
 	state->pm4_crc = crc32(state->pm4, state->cpm4 * 4);
diff --git a/src/gallium/winsys/sw/wrapper/wrapper_sw_winsys.c b/src/gallium/winsys/sw/wrapper/wrapper_sw_winsys.c
index b997abda9b..3a76098b65 100644
--- a/src/gallium/winsys/sw/wrapper/wrapper_sw_winsys.c
+++ b/src/gallium/winsys/sw/wrapper/wrapper_sw_winsys.c
@@ -52,6 +52,7 @@ struct wrapper_sw_winsys
    struct sw_winsys base;
    struct pipe_screen *screen;
    struct pipe_context *pipe;
+   enum pipe_texture_target target;
 };
 
 struct wrapper_sw_displaytarget
@@ -145,7 +146,7 @@ wsw_dt_create(struct sw_winsys *ws,
     * XXX Why don't we just get the template.
     */
    memset(&templ, 0, sizeof(templ));
-   templ.target = PIPE_TEXTURE_2D;
+   templ.target = wsw->target;
    templ.width0 = width;
    templ.height0 = height;
    templ.format = format;
@@ -291,6 +292,11 @@ wrapper_sw_winsys_warp_pipe_screen(struct pipe_screen *screen)
    if (!wsw->pipe)
       goto err_free;
 
+   if(screen->get_param(screen, PIPE_CAP_NPOT_TEXTURES))
+      wsw->target = PIPE_TEXTURE_2D;
+   else
+      wsw->target = PIPE_TEXTURE_RECT;
+
    return &wsw->base;
 
 err_free: