146 files changed, 8704 insertions, 5759 deletions
diff --git a/src/gallium/drivers/cell/ppu/cell_screen.c b/src/gallium/drivers/cell/ppu/cell_screen.c
index bd48ce7005..d185c6b849 100644
--- a/src/gallium/drivers/cell/ppu/cell_screen.c
+++ b/src/gallium/drivers/cell/ppu/cell_screen.c
@@ -41,7 +41,7 @@
 static const char *
 cell_get_vendor(struct pipe_screen *screen)
 {
-   return "Tungsten Graphics, Inc.";
+   return "VMware, Inc.";
 }
 
 
@@ -64,8 +64,6 @@ cell_get_param(struct pipe_screen *screen, int param)
       return 1;
    case PIPE_CAP_GLSL:
       return 1;
-   case PIPE_CAP_S3TC:
-      return 0;
    case PIPE_CAP_ANISOTROPIC_FILTER:
       return 0;
    case PIPE_CAP_POINT_SPRITE:
diff --git a/src/gallium/drivers/i915simple/i915_context.c b/src/gallium/drivers/i915simple/i915_context.c
index b43f735245..e745f3342d 100644
--- a/src/gallium/drivers/i915simple/i915_context.c
+++ b/src/gallium/drivers/i915simple/i915_context.c
@@ -175,12 +175,19 @@ i915_is_buffer_referenced(struct pipe_context *pipe,
 static void i915_destroy(struct pipe_context *pipe)
 {
    struct i915_context *i915 = i915_context(pipe);
+   int i;
 
    draw_destroy(i915->draw);
    
    if(i915->batch)
       i915->iws->batchbuffer_destroy(i915->batch);
 
+   /* unbind framebuffer */
+   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
+      pipe_surface_reference(&i915->framebuffer.cbufs[i], NULL);
+   }
+   pipe_surface_reference(&i915->framebuffer.zsbuf, NULL);
+
    FREE(i915);
 }
 
diff --git a/src/gallium/drivers/i915simple/i915_prim_vbuf.c b/src/gallium/drivers/i915simple/i915_prim_vbuf.c
index 508f4560e4..b3a7774fd6 100644
--- a/src/gallium/drivers/i915simple/i915_prim_vbuf.c
+++ b/src/gallium/drivers/i915simple/i915_prim_vbuf.c
@@ -44,6 +44,7 @@
 #include "pipe/p_inlines.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
+#include "util/u_fifo.h"
 
 #include "i915_context.h"
 #include "i915_reg.h"
@@ -76,8 +77,13 @@ struct i915_vbuf_render {
    size_t vbo_size;
    size_t vbo_offset;
    void *vbo_ptr;
-   size_t vbo_alloc_size;
    size_t vbo_max_used;
+
+   /* stuff for the pool */
+   struct util_fifo *pool_fifo;
+   unsigned pool_used;
+   unsigned pool_buffer_size;
+   boolean pool_not_used;
 };
 
 
@@ -106,33 +112,72 @@ i915_vbuf_render_get_vertex_info(struct vbuf_render *render)
 }
 
 static boolean
+i915_vbuf_render_reserve(struct i915_vbuf_render *i915_render, size_t size)
+{
+   struct i915_context *i915 = i915_render->i915;
+
+   if (i915_render->vbo_size < size + i915_render->vbo_offset)
+      return FALSE;
+
+   if (i915->vbo_flushed)
+      return FALSE;
+
+   return TRUE;
+}
+
+static void
+i915_vbuf_render_new_buf(struct i915_vbuf_render *i915_render, size_t size)
+{
+   struct i915_context *i915 = i915_render->i915;
+   struct intel_winsys *iws = i915->iws;
+
+   if (i915_render->vbo) {
+      if (i915_render->pool_not_used)
+         iws->buffer_destroy(iws, i915_render->vbo);
+      else
+         u_fifo_add(i915_render->pool_fifo, i915_render->vbo);
+      i915_render->vbo = NULL;
+   }
+
+   i915->vbo_flushed = 0;
+
+   i915_render->vbo_size = MAX2(size, i915_render->pool_buffer_size);
+   i915_render->vbo_offset = 0;
+
+   if (i915_render->vbo_size != i915_render->pool_buffer_size) {
+      i915_render->pool_not_used = TRUE;
+      i915_render->vbo = iws->buffer_create(iws, i915_render->vbo_size, 64,
+            INTEL_NEW_VERTEX);
+   } else {
+      i915_render->pool_not_used = FALSE;
+
+      if (i915_render->pool_used >= 2) {
+         FLUSH_BATCH(NULL);
+         i915->vbo_flushed = 0;
+         i915_render->pool_used = 0;
+      }
+      u_fifo_pop(i915_render->pool_fifo, (void**)&i915_render->vbo);
+   }
+}
+
+static boolean
 i915_vbuf_render_allocate_vertices(struct vbuf_render *render,
                                    ushort vertex_size,
                                    ushort nr_vertices)
 {
    struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
    struct i915_context *i915 = i915_render->i915;
-   struct intel_winsys *iws = i915->iws;
    size_t size = (size_t)vertex_size * (size_t)nr_vertices;
 
    /* FIXME: handle failure */
    assert(!i915->vbo);
 
-   if (i915_render->vbo_size > size + i915_render->vbo_offset && !i915->vbo_flushed) {
-   } else {
-      i915->vbo_flushed = 0;
-      if (i915_render->vbo) {
-         iws->buffer_destroy(iws, i915_render->vbo);
-         i915_render->vbo = NULL;
-      }
-   }
+   if (!i915_vbuf_render_reserve(i915_render, size)) {
 
-   if (!i915_render->vbo) {
-      i915_render->vbo_size = MAX2(size, i915_render->vbo_alloc_size);
-      i915_render->vbo_offset = 0;
-      i915_render->vbo = iws->buffer_create(iws, i915_render->vbo_size, 64,
-                                            INTEL_NEW_VERTEX);
+      if (i915->vbo_flushed)
+         i915_render->pool_used = 0;
 
+      i915_vbuf_render_new_buf(i915_render, size);
    }
 
    i915_render->vertex_size = vertex_size;
@@ -504,6 +549,7 @@ i915_vbuf_render_create(struct i915_context *i915)
 {
    struct i915_vbuf_render *i915_render = CALLOC_STRUCT(i915_vbuf_render);
    struct intel_winsys *iws = i915->iws;
+   int i;
 
    i915_render->i915 = i915;
    
@@ -524,14 +570,24 @@ i915_vbuf_render_create(struct i915_context *i915)
    i915_render->base.release_vertices = i915_vbuf_render_release_vertices;
    i915_render->base.destroy = i915_vbuf_render_destroy;
 
-   i915_render->vbo_alloc_size = 128 * 4096;
-   i915_render->vbo_size = i915_render->vbo_alloc_size;
+
+   i915_render->vbo = NULL;
+   i915_render->vbo_size = 0;
    i915_render->vbo_offset = 0;
-   i915_render->vbo = iws->buffer_create(iws, i915_render->vbo_size, 64,
-                                         INTEL_NEW_VERTEX);
+
+   i915_render->pool_used = FALSE;
+   i915_render->pool_buffer_size = 128 * 4096;
+   i915_render->pool_fifo = u_fifo_create(6);
+   for (i = 0; i < 6; i++)
+      u_fifo_add(i915_render->pool_fifo,
+                 iws->buffer_create(iws, i915_render->pool_buffer_size, 64,
+                                    INTEL_NEW_VERTEX));
+
+#if 0
    /* TODO JB: is this realy needed? */
    i915_render->vbo_ptr = iws->buffer_map(iws, i915_render->vbo, TRUE);
    iws->buffer_unmap(iws, i915_render->vbo);
+#endif
 
    return &i915_render->base;
 }
diff --git a/src/gallium/drivers/i915simple/i915_screen.c b/src/gallium/drivers/i915simple/i915_screen.c
index 9f017a14cc..c66558c320 100644
--- a/src/gallium/drivers/i915simple/i915_screen.c
+++ b/src/gallium/drivers/i915simple/i915_screen.c
@@ -46,7 +46,7 @@
 static const char *
 i915_get_vendor(struct pipe_screen *screen)
 {
-   return "Tungsten Graphics, Inc.";
+   return "VMware, Inc.";
 }
 
 static const char *
@@ -101,8 +101,6 @@ i915_get_param(struct pipe_screen *screen, int param)
       return 1;
    case PIPE_CAP_GLSL:
       return 0;
-   case PIPE_CAP_S3TC:
-      return 0;
    case PIPE_CAP_ANISOTROPIC_FILTER:
       return 0;
    case PIPE_CAP_POINT_SPRITE:
diff --git a/src/gallium/drivers/i915simple/i915_state.c b/src/gallium/drivers/i915simple/i915_state.c
index 0087dfa410..7d48e6e84d 100644
--- a/src/gallium/drivers/i915simple/i915_state.c
+++ b/src/gallium/drivers/i915simple/i915_state.c
@@ -588,9 +588,17 @@ static void i915_set_framebuffer_state(struct pipe_context *pipe,
 				       const struct pipe_framebuffer_state *fb)
 {
    struct i915_context *i915 = i915_context(pipe);
+   int i;
+
    draw_flush(i915->draw);
 
-   i915->framebuffer = *fb; /* struct copy */
+   i915->framebuffer.width = fb->width;
+   i915->framebuffer.height = fb->height;
+   i915->framebuffer.nr_cbufs = fb->nr_cbufs;
+   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
+      pipe_surface_reference(&i915->framebuffer.cbufs[i], fb->cbufs[i]);
+   }
+   pipe_surface_reference(&i915->framebuffer.zsbuf, fb->zsbuf);
 
    i915->dirty |= I915_NEW_FRAMEBUFFER;
 }
diff --git a/src/gallium/drivers/i915simple/intel_winsys.h b/src/gallium/drivers/i915simple/intel_winsys.h
index f949f52a9c..42c5e7470e 100644
--- a/src/gallium/drivers/i915simple/intel_winsys.h
+++ b/src/gallium/drivers/i915simple/intel_winsys.h
@@ -150,6 +150,17 @@ struct intel_winsys {
    void (*buffer_unmap)(struct intel_winsys *iws,
                         struct intel_buffer *buffer);
 
+   /**
+    * Write to a buffer.
+    *
+    * Arguments follows pwrite(2)
+    */
+   int (*buffer_write)(struct intel_winsys *iws,
+                       struct intel_buffer *dst,
+                       const void *src,
+                       size_t size,
+                       size_t offset);
+
    void (*buffer_destroy)(struct intel_winsys *iws,
                           struct intel_buffer *buffer);
    /*@}*/
diff --git a/src/gallium/drivers/i965simple/brw_screen.c b/src/gallium/drivers/i965simple/brw_screen.c
index b22e105f10..4a84c4db23 100644
--- a/src/gallium/drivers/i965simple/brw_screen.c
+++ b/src/gallium/drivers/i965simple/brw_screen.c
@@ -39,7 +39,7 @@
 static const char *
 brw_get_vendor( struct pipe_screen *screen )
 {
-   return "Tungsten Graphics, Inc.";
+   return "VMware, Inc.";
 }
 
 
@@ -85,8 +85,6 @@ brw_get_param(struct pipe_screen *screen, int param)
       return 1;
    case PIPE_CAP_GLSL:
       return 0;
-   case PIPE_CAP_S3TC:
-      return 0;
    case PIPE_CAP_ANISOTROPIC_FILTER:
       return 0;
    case PIPE_CAP_POINT_SPRITE:
diff --git a/src/gallium/drivers/llvmpipe/Makefile b/src/gallium/drivers/llvmpipe/Makefile
index 5ac09de79e..cd7b6356d2 100644
--- a/src/gallium/drivers/llvmpipe/Makefile
+++ b/src/gallium/drivers/llvmpipe/Makefile
@@ -17,9 +17,11 @@ C_SOURCES = \
 	lp_bld_depth.c \
 	lp_bld_flow.c \
 	lp_bld_format_aos.c \
+	lp_bld_format_soa.c \
 	lp_bld_interp.c \
 	lp_bld_intr.c \
 	lp_bld_logic.c \
+	lp_bld_sample_soa.c \
 	lp_bld_swizzle.c \
 	lp_bld_struct.c \
 	lp_bld_tgsi_soa.c \
@@ -46,7 +48,8 @@ C_SOURCES = \
 	lp_state_vs.c \
 	lp_surface.c \
 	lp_tex_cache.c \
-	lp_tex_sample.c \
+	lp_tex_sample_c.c \
+	lp_tex_sample_llvm.c \
 	lp_texture.c \
 	lp_tile_cache.c \
 	lp_tile_soa.c
diff --git a/src/gallium/drivers/llvmpipe/README b/src/gallium/drivers/llvmpipe/README
index 498d21dea6..89d08834a3 100644
--- a/src/gallium/drivers/llvmpipe/README
+++ b/src/gallium/drivers/llvmpipe/README
@@ -8,13 +8,16 @@ Done so far is:
 
  - the whole fragment pipeline is code generated in a single function
  
+   - input interpolation
+   
    - depth testing
  
+   - texture sampling (not all state/formats are supported) 
+   
    - fragment shader TGSI translation
      - same level of support as the TGSI SSE2 exec machine, with the exception
        we don't fallback to TGSI interpretation when an unsupported opcode is
        found, but just ignore it
-     - texture sampling via an intrinsic call
      - done in SoA layout
      - input interpolation also code generated
  
@@ -28,16 +31,17 @@ Done so far is:
      any width and length
    - not all operations are implemented for these types yet though
 
-Most mesa/progs/demos/* work. Speed is on par with Keith's softpipe-opt branch,
-which includes hand written fast implementations for common cases.
+Most mesa/progs/demos/* work. 
 
 To do (probably by this order):
 
  - code generate stipple and stencil testing
 
- - code generate texture sampling
+ - translate the remaining bits of texture sampling state
 
  - translate TGSI control flow instructions, and all other remaining opcodes
+ 
+ - integrate with the draw module for VS code generation
 
  - code generate the triangle setup and rasterization
 
@@ -93,7 +97,7 @@ Alternatively, you can build it with GNU make, if you prefer, by invoking it as
 
   make linux-llvm
 
-but the rest of these instructions assume scons is used.
+but the rest of these instructions assume that scons is used.
 
 
 Using
@@ -108,6 +112,9 @@ or
 
   export LD_LIBRARY_PATH=$PWD/build/linux-x86-debug/lib:$LD_LIBRARY_PATH
 
+For performance evaluation pass debug=no to scons, and use the corresponding
+lib directory without the "-debug" suffix.
+
 
 Unit testing
 ============
@@ -119,7 +126,7 @@ build/linux-???-debug/gallium/drivers/llvmpipe:
  - lp_test_conv: SIMD vector conversion
  - lp_test_format: pixel unpacking/packing
 
-Some of this tests can output results and benchmarks to a tab-seperated-file
+Some of this tests can output results and benchmarks to a tab-separated-file
 for posterior analysis, e.g.:
 
   build/linux-x86_64-debug/gallium/drivers/llvmpipe/lp_test_blend -o blend.tsv
@@ -133,10 +140,10 @@ Development Notes
   at the top of the lp_bld_*.c functions.  
 
 - All lp_bld_*.[ch] are isolated from the rest of the driver, and could/may be 
-  put in a standalone Gallium state -> LLVM IR translation module.
+  put in a stand-alone Gallium state -> LLVM IR translation module.
 
 - We use LLVM-C bindings for now. They are not documented, but follow the C++
   interfaces very closely, and appear to be complete enough for code
   generation. See 
   http://npcontemplation.blogspot.com/2008/06/secret-of-llvm-c-bindings.html
-  for a standalone example.
+  for a stand-alone example.
diff --git a/src/gallium/drivers/llvmpipe/SConscript b/src/gallium/drivers/llvmpipe/SConscript
index 5c29bdac56..f4a9a3b22e 100644
--- a/src/gallium/drivers/llvmpipe/SConscript
+++ b/src/gallium/drivers/llvmpipe/SConscript
@@ -3,7 +3,7 @@ Import('*')
 env = env.Clone()
 
 env.Tool('llvm')
-if 'LLVM_VERSION' not in env:
+if not env.has_key('LLVM_VERSION'):
     print 'warning: LLVM not found: not building llvmpipe'
     Return()
 
@@ -23,8 +23,10 @@ llvmpipe = env.ConvenienceLibrary(
 		'lp_bld_depth.c',
 		'lp_bld_flow.c',
 		'lp_bld_format_aos.c',
+		'lp_bld_format_soa.c',
 		'lp_bld_interp.c',
 		'lp_bld_intr.c',
+		'lp_bld_sample_soa.c',
 		'lp_bld_struct.c',
 		'lp_bld_logic.c',
 		'lp_bld_swizzle.c',
@@ -52,7 +54,8 @@ llvmpipe = env.ConvenienceLibrary(
 		'lp_state_vs.c',
 		'lp_surface.c',
 		'lp_tex_cache.c',
-		'lp_tex_sample.c',
+		'lp_tex_sample_c.c',
+		'lp_tex_sample_llvm.c',
 		'lp_texture.c',
 		'lp_tile_cache.c',
 		'lp_tile_soa.c',
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_alpha.c b/src/gallium/drivers/llvmpipe/lp_bld_alpha.c
index 49c2f911af..2b4bc5c819 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_alpha.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_alpha.c
@@ -45,7 +45,7 @@
 void
 lp_build_alpha_test(LLVMBuilderRef builder,
                     const struct pipe_alpha_state *state,
-                    union lp_type type,
+                    struct lp_type type,
                     struct lp_build_mask_context *mask,
                     LLVMValueRef alpha,
                     LLVMValueRef ref)
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_alpha.h b/src/gallium/drivers/llvmpipe/lp_bld_alpha.h
index 9dbcdb4daa..634575670d 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_alpha.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_alpha.h
@@ -38,14 +38,14 @@
 #include <llvm-c/Core.h>  
 
 struct pipe_alpha_state;
-union lp_type;
+struct lp_type;
 struct lp_build_mask_context;
 
 
 void
 lp_build_alpha_test(LLVMBuilderRef builder,
                     const struct pipe_alpha_state *state,
-                    union lp_type type,
+                    struct lp_type type,
                     struct lp_build_mask_context *mask,
                     LLVMValueRef alpha,
                     LLVMValueRef ref);
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_arit.c b/src/gallium/drivers/llvmpipe/lp_bld_arit.c
index 09a57ff33d..0b115fc9b0 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_arit.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_arit.c
@@ -65,7 +65,7 @@ lp_build_min_simple(struct lp_build_context *bld,
                     LLVMValueRef a,
                     LLVMValueRef b)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    const char *intrinsic = NULL;
    LLVMValueRef cond;
 
@@ -113,7 +113,7 @@ lp_build_max_simple(struct lp_build_context *bld,
                     LLVMValueRef a,
                     LLVMValueRef b)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    const char *intrinsic = NULL;
    LLVMValueRef cond;
 
@@ -159,7 +159,7 @@ LLVMValueRef
 lp_build_comp(struct lp_build_context *bld,
               LLVMValueRef a)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
 
    if(a == bld->one)
       return bld->zero;
@@ -188,7 +188,7 @@ lp_build_add(struct lp_build_context *bld,
              LLVMValueRef a,
              LLVMValueRef b)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    LLVMValueRef res;
 
    if(a == bld->zero)
@@ -241,7 +241,7 @@ lp_build_sub(struct lp_build_context *bld,
              LLVMValueRef a,
              LLVMValueRef b)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    LLVMValueRef res;
 
    if(b == bld->zero)
@@ -405,7 +405,7 @@ lp_build_mul(struct lp_build_context *bld,
              LLVMValueRef a,
              LLVMValueRef b)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
 
    if(a == bld->zero)
       return bld->zero;
@@ -477,7 +477,7 @@ lp_build_div(struct lp_build_context *bld,
              LLVMValueRef a,
              LLVMValueRef b)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
 
    if(a == bld->zero)
       return bld->zero;
@@ -502,6 +502,31 @@ lp_build_div(struct lp_build_context *bld,
 }
 
 
+LLVMValueRef
+lp_build_lerp(struct lp_build_context *bld,
+              LLVMValueRef x,
+              LLVMValueRef v0,
+              LLVMValueRef v1)
+{
+   return lp_build_add(bld, v0, lp_build_mul(bld, x, lp_build_sub(bld, v1, v0)));
+}
+
+
+LLVMValueRef
+lp_build_lerp_2d(struct lp_build_context *bld,
+                 LLVMValueRef x,
+                 LLVMValueRef y,
+                 LLVMValueRef v00,
+                 LLVMValueRef v01,
+                 LLVMValueRef v10,
+                 LLVMValueRef v11)
+{
+   LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
+   LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
+   return lp_build_lerp(bld, y, v0, v1);
+}
+
+
 /**
  * Generate min(a, b)
  * Do checks for special cases.
@@ -565,21 +590,32 @@ LLVMValueRef
 lp_build_abs(struct lp_build_context *bld,
              LLVMValueRef a)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
+   LLVMTypeRef vec_type = lp_build_vec_type(type);
 
    if(!type.sign)
       return a;
 
-   /* XXX: is this really necessary? */
+   if(type.floating) {
+      /* Mask out the sign bit */
+      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+      LLVMValueRef mask = lp_build_int_const_scalar(type, ((unsigned long long)1 << type.width) - 1);
+      a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+      a = LLVMBuildAnd(bld->builder, a, mask, "");
+      a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
+      return a;
+   }
+
 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   if(!type.floating && type.width*type.length == 128) {
-      LLVMTypeRef vec_type = lp_build_vec_type(type);
-      if(type.width == 8)
+   if(type.width*type.length == 128) {
+      switch(type.width) {
+      case 8:
          return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
-      if(type.width == 16)
+      case 16:
          return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
-      if(type.width == 32)
+      case 32:
          return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
+      }
    }
 #endif
 
@@ -588,10 +624,188 @@ lp_build_abs(struct lp_build_context *bld,
 
 
 LLVMValueRef
+lp_build_sgn(struct lp_build_context *bld,
+             LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef vec_type = lp_build_vec_type(type);
+   LLVMValueRef cond;
+   LLVMValueRef res;
+
+   /* Handle non-zero case */
+   if(!type.sign) {
+      /* if not zero then sign must be positive */
+      res = bld->one;
+   }
+   else if(type.floating) {
+      /* Take the sign bit and add it to 1 constant */
+      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+      LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
+      LLVMValueRef sign;
+      LLVMValueRef one;
+      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
+      one = LLVMConstBitCast(bld->one, int_vec_type);
+      res = LLVMBuildOr(bld->builder, sign, one, "");
+      res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
+   }
+   else
+   {
+      LLVMValueRef minus_one = lp_build_const_scalar(type, -1.0);
+      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
+      res = lp_build_select(bld, cond, bld->one, minus_one);
+   }
+
+   /* Handle zero */
+   cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
+   res = lp_build_select(bld, cond, bld->zero, bld->one);
+
+   return res;
+}
+
+
+enum lp_build_round_sse41_mode
+{
+   LP_BUILD_ROUND_SSE41_NEAREST = 0,
+   LP_BUILD_ROUND_SSE41_FLOOR = 1,
+   LP_BUILD_ROUND_SSE41_CEIL = 2,
+   LP_BUILD_ROUND_SSE41_TRUNCATE = 3
+};
+
+
+static INLINE LLVMValueRef
+lp_build_round_sse41(struct lp_build_context *bld,
+                     LLVMValueRef a,
+                     enum lp_build_round_sse41_mode mode)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef vec_type = lp_build_vec_type(type);
+   const char *intrinsic;
+
+   assert(type.floating);
+   assert(type.width*type.length == 128);
+
+   switch(type.width) {
+   case 32:
+      intrinsic = "llvm.x86.sse41.round.ps";
+      break;
+   case 64:
+      intrinsic = "llvm.x86.sse41.round.pd";
+      break;
+   default:
+      assert(0);
+      return bld->undef;
+   }
+
+   return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a,
+                                    LLVMConstInt(LLVMInt32Type(), mode, 0));
+}
+
+
+LLVMValueRef
+lp_build_round(struct lp_build_context *bld,
+               LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+
+   assert(type.floating);
+
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+   return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
+#endif
+
+   /* FIXME */
+   assert(0);
+   return bld->undef;
+}
+
+
+LLVMValueRef
+lp_build_floor(struct lp_build_context *bld,
+               LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+
+   assert(type.floating);
+
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+   return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
+#endif
+
+   /* FIXME */
+   assert(0);
+   return bld->undef;
+}
+
+
+LLVMValueRef
+lp_build_ceil(struct lp_build_context *bld,
+              LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+
+   assert(type.floating);
+
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+   return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
+#endif
+
+   /* FIXME */
+   assert(0);
+   return bld->undef;
+}
+
+
+LLVMValueRef
+lp_build_trunc(struct lp_build_context *bld,
+               LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+
+   assert(type.floating);
+
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+   return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
+#endif
+
+   /* FIXME */
+   assert(0);
+   return bld->undef;
+}
+
+
+/**
+ * Convert to integer, through whichever rounding method that's fastest,
+ * typically truncating to zero.
+ */
+LLVMValueRef
+lp_build_int(struct lp_build_context *bld,
+             LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+
+   assert(type.floating);
+
+   return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
+}
+
+
+LLVMValueRef
+lp_build_ifloor(struct lp_build_context *bld,
+                LLVMValueRef a)
+{
+   a = lp_build_floor(bld, a);
+   a = lp_build_int(bld, a);
+   return a;
+}
+
+
+LLVMValueRef
 lp_build_sqrt(struct lp_build_context *bld,
               LLVMValueRef a)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    LLVMTypeRef vec_type = lp_build_vec_type(type);
    char intrinsic[32];
 
@@ -609,7 +823,7 @@ LLVMValueRef
 lp_build_rcp(struct lp_build_context *bld,
              LLVMValueRef a)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
 
    if(a == bld->zero)
       return bld->undef;
@@ -640,7 +854,7 @@ LLVMValueRef
 lp_build_rsqrt(struct lp_build_context *bld,
                LLVMValueRef a)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
 
    assert(type.floating);
 
@@ -661,7 +875,7 @@ LLVMValueRef
 lp_build_cos(struct lp_build_context *bld,
               LLVMValueRef a)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    LLVMTypeRef vec_type = lp_build_vec_type(type);
    char intrinsic[32];
 
@@ -681,7 +895,7 @@ LLVMValueRef
 lp_build_sin(struct lp_build_context *bld,
               LLVMValueRef a)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    LLVMTypeRef vec_type = lp_build_vec_type(type);
    char intrinsic[32];
 
@@ -752,7 +966,7 @@ lp_build_polynomial(struct lp_build_context *bld,
                     const double *coeffs,
                     unsigned num_coeffs)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    LLVMValueRef res = NULL;
    unsigned i;
 
@@ -800,7 +1014,7 @@ lp_build_exp2_approx(struct lp_build_context *bld,
                      LLVMValueRef *p_frac_part,
                      LLVMValueRef *p_exp2)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    LLVMTypeRef vec_type = lp_build_vec_type(type);
    LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
    LLVMValueRef ipart = NULL;
@@ -893,7 +1107,7 @@ lp_build_log2_approx(struct lp_build_context *bld,
                      LLVMValueRef *p_floor_log2,
                      LLVMValueRef *p_log2)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    LLVMTypeRef vec_type = lp_build_vec_type(type);
    LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_arit.h b/src/gallium/drivers/llvmpipe/lp_bld_arit.h
index fc8cb25966..d68a97c4b8 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_arit.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_arit.h
@@ -40,7 +40,7 @@
 #include <llvm-c/Core.h>  
 
 
-union lp_type type;
+struct lp_type type;
 struct lp_build_context;
 
 
@@ -72,6 +72,26 @@ lp_build_div(struct lp_build_context *bld,
              LLVMValueRef b);
 
 LLVMValueRef
+lp_build_lerp(struct lp_build_context *bld,
+              LLVMValueRef x,
+              LLVMValueRef v0,
+              LLVMValueRef v1);
+
+/**
+ * Bilinear interpolation.
+ *
+ * Values indices are in v_{yx}.
+ */
+LLVMValueRef
+lp_build_lerp_2d(struct lp_build_context *bld,
+                 LLVMValueRef x,
+                 LLVMValueRef y,
+                 LLVMValueRef v00,
+                 LLVMValueRef v01,
+                 LLVMValueRef v10,
+                 LLVMValueRef v11);
+
+LLVMValueRef
 lp_build_min(struct lp_build_context *bld,
              LLVMValueRef a,
              LLVMValueRef b);
@@ -86,6 +106,34 @@ lp_build_abs(struct lp_build_context *bld,
              LLVMValueRef a);
 
 LLVMValueRef
+lp_build_sgn(struct lp_build_context *bld,
+             LLVMValueRef a);
+
+LLVMValueRef
+lp_build_round(struct lp_build_context *bld,
+               LLVMValueRef a);
+
+LLVMValueRef
+lp_build_floor(struct lp_build_context *bld,
+               LLVMValueRef a);
+
+LLVMValueRef
+lp_build_ceil(struct lp_build_context *bld,
+              LLVMValueRef a);
+
+LLVMValueRef
+lp_build_trunc(struct lp_build_context *bld,
+               LLVMValueRef a);
+
+LLVMValueRef
+lp_build_int(struct lp_build_context *bld,
+             LLVMValueRef a);
+
+LLVMValueRef
+lp_build_ifloor(struct lp_build_context *bld,
+                LLVMValueRef a);
+
+LLVMValueRef
 lp_build_sqrt(struct lp_build_context *bld,
               LLVMValueRef a);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend.h b/src/gallium/drivers/llvmpipe/lp_bld_blend.h
index d19e18846c..da272e549f 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_blend.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend.h
@@ -46,7 +46,7 @@
 
 
 struct pipe_blend_state;
-union lp_type;
+struct lp_type;
 struct lp_build_context;
 
 
@@ -74,7 +74,7 @@ lp_build_blend_func(struct lp_build_context *bld,
 LLVMValueRef
 lp_build_blend_aos(LLVMBuilderRef builder,
                    const struct pipe_blend_state *blend,
-                   union lp_type type,
+                   struct lp_type type,
                    LLVMValueRef src,
                    LLVMValueRef dst,
                    LLVMValueRef const_,
@@ -84,7 +84,7 @@ lp_build_blend_aos(LLVMBuilderRef builder,
 void
 lp_build_blend_soa(LLVMBuilderRef builder,
                    const struct pipe_blend_state *blend,
-                   union lp_type type,
+                   struct lp_type type,
                    LLVMValueRef src[4],
                    LLVMValueRef dst[4],
                    LLVMValueRef const_[4],
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
index c11a9398f8..d14f468ba9 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
@@ -303,7 +303,7 @@ lp_build_blend_func(struct lp_build_context *bld,
 LLVMValueRef
 lp_build_blend_aos(LLVMBuilderRef builder,
                    const struct pipe_blend_state *blend,
-                   union lp_type type,
+                   struct lp_type type,
                    LLVMValueRef src,
                    LLVMValueRef dst,
                    LLVMValueRef const_,
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend_soa.c b/src/gallium/drivers/llvmpipe/lp_bld_blend_soa.c
index b92254a7d6..9511299d55 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_blend_soa.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend_soa.c
@@ -199,7 +199,7 @@ lp_build_blend_soa_factor(struct lp_build_blend_soa_context *bld,
 void
 lp_build_blend_soa(LLVMBuilderRef builder,
                    const struct pipe_blend_state *blend,
-                   union lp_type type,
+                   struct lp_type type,
                    LLVMValueRef src[4],
                    LLVMValueRef dst[4],
                    LLVMValueRef con[4],
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_const.c b/src/gallium/drivers/llvmpipe/lp_bld_const.c
index 21487365ea..c8eaa8c394 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_const.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_const.c
@@ -42,7 +42,7 @@
 
 
 unsigned
-lp_mantissa(union lp_type type)
+lp_mantissa(struct lp_type type)
 {
    assert(type.floating);
 
@@ -72,7 +72,7 @@ lp_mantissa(union lp_type type)
  * Same as lp_const_scale(), but in terms of shifts.
  */
 unsigned
-lp_const_shift(union lp_type type)
+lp_const_shift(struct lp_type type)
 {
    if(type.floating)
       return 0;
@@ -86,7 +86,7 @@ lp_const_shift(union lp_type type)
 
 
 unsigned
-lp_const_offset(union lp_type type)
+lp_const_offset(struct lp_type type)
 {
    if(type.floating || type.fixed)
       return 0;
@@ -104,7 +104,7 @@ lp_const_offset(union lp_type type)
  * else for the fixed points types and normalized integers.
  */
 double
-lp_const_scale(union lp_type type)
+lp_const_scale(struct lp_type type)
 {
    unsigned long long llscale;
    double dscale;
@@ -122,7 +122,7 @@ lp_const_scale(union lp_type type)
  * Minimum value representable by the type.
  */
 double
-lp_const_min(union lp_type type)
+lp_const_min(struct lp_type type)
 {
    unsigned bits;
 
@@ -158,7 +158,7 @@ lp_const_min(union lp_type type)
  * Maximum value representable by the type.
  */
 double
-lp_const_max(union lp_type type)
+lp_const_max(struct lp_type type)
 {
    unsigned bits;
 
@@ -190,7 +190,7 @@ lp_const_max(union lp_type type)
 
 
 double
-lp_const_eps(union lp_type type)
+lp_const_eps(struct lp_type type)
 {
    if (type.floating) {
       switch(type.width) {
@@ -211,7 +211,7 @@ lp_const_eps(union lp_type type)
 
 
 LLVMValueRef
-lp_build_undef(union lp_type type)
+lp_build_undef(struct lp_type type)
 {
    LLVMTypeRef vec_type = lp_build_vec_type(type);
    return LLVMGetUndef(vec_type);
@@ -219,7 +219,7 @@ lp_build_undef(union lp_type type)
                
 
 LLVMValueRef
-lp_build_zero(union lp_type type)
+lp_build_zero(struct lp_type type)
 {
    LLVMTypeRef vec_type = lp_build_vec_type(type);
    return LLVMConstNull(vec_type);
@@ -227,7 +227,7 @@ lp_build_zero(union lp_type type)
                
 
 LLVMValueRef
-lp_build_one(union lp_type type)
+lp_build_one(struct lp_type type)
 {
    LLVMTypeRef elem_type;
    LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
@@ -269,7 +269,7 @@ lp_build_one(union lp_type type)
                
 
 LLVMValueRef
-lp_build_const_scalar(union lp_type type,
+lp_build_const_scalar(struct lp_type type,
                       double val)
 {
    LLVMTypeRef elem_type = lp_build_elem_type(type);
@@ -295,7 +295,7 @@ lp_build_const_scalar(union lp_type type,
 
 
 LLVMValueRef
-lp_build_int_const_scalar(union lp_type type,
+lp_build_int_const_scalar(struct lp_type type,
                           long long val)
 {
    LLVMTypeRef elem_type = lp_build_int_elem_type(type);
@@ -312,7 +312,7 @@ lp_build_int_const_scalar(union lp_type type,
 
 
 LLVMValueRef
-lp_build_const_aos(union lp_type type, 
+lp_build_const_aos(struct lp_type type, 
                    double r, double g, double b, double a, 
                    const unsigned char *swizzle)
 {
@@ -352,8 +352,8 @@ lp_build_const_aos(union lp_type type,
 
 
 LLVMValueRef
-lp_build_const_mask_aos(union lp_type type,
-                        boolean cond[4])
+lp_build_const_mask_aos(struct lp_type type,
+                        const boolean cond[4])
 {
    LLVMTypeRef elem_type = LLVMIntType(type.width);
    LLVMValueRef masks[LP_MAX_VECTOR_LENGTH];
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_const.h b/src/gallium/drivers/llvmpipe/lp_bld_const.h
index 1934530ea3..ffb302f736 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_const.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_const.h
@@ -42,67 +42,67 @@
 #include <pipe/p_compiler.h>
 
 
-union lp_type type;
+struct lp_type type;
 
 
 unsigned
-lp_mantissa(union lp_type type);
+lp_mantissa(struct lp_type type);
 
 
 unsigned
-lp_const_shift(union lp_type type);
+lp_const_shift(struct lp_type type);
 
 
 unsigned
-lp_const_offset(union lp_type type);
+lp_const_offset(struct lp_type type);
 
 
 double
-lp_const_scale(union lp_type type);
+lp_const_scale(struct lp_type type);
 
 double
-lp_const_min(union lp_type type);
+lp_const_min(struct lp_type type);
 
 
 double
-lp_const_max(union lp_type type);
+lp_const_max(struct lp_type type);
 
 
 double
-lp_const_eps(union lp_type type);
+lp_const_eps(struct lp_type type);
 
 
 LLVMValueRef
-lp_build_undef(union lp_type type);
+lp_build_undef(struct lp_type type);
 
 
 LLVMValueRef
-lp_build_zero(union lp_type type);
+lp_build_zero(struct lp_type type);
 
 
 LLVMValueRef
-lp_build_one(union lp_type type);
+lp_build_one(struct lp_type type);
 
 
 LLVMValueRef
-lp_build_const_scalar(union lp_type type,
+lp_build_const_scalar(struct lp_type type,
                       double val);
 
 
 LLVMValueRef
-lp_build_int_const_scalar(union lp_type type,
+lp_build_int_const_scalar(struct lp_type type,
                           long long val);
 
 
 LLVMValueRef
-lp_build_const_aos(union lp_type type, 
+lp_build_const_aos(struct lp_type type, 
                    double r, double g, double b, double a, 
                    const unsigned char *swizzle);
 
 
 LLVMValueRef
-lp_build_const_mask_aos(union lp_type type,
-                        boolean cond[4]);
+lp_build_const_mask_aos(struct lp_type type,
+                        const boolean cond[4]);
 
 
 #endif /* !LP_BLD_CONST_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_conv.c b/src/gallium/drivers/llvmpipe/lp_bld_conv.c
index c8954c8a34..186cac70f6 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_conv.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_conv.c
@@ -86,7 +86,7 @@
  */
 LLVMValueRef
 lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
-                                        union lp_type src_type,
+                                        struct lp_type src_type,
                                         unsigned dst_width,
                                         LLVMValueRef src)
 {
@@ -122,7 +122,7 @@ lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
       int shift = dst_width - n;
       res = LLVMBuildShl(builder, res, lp_build_int_const_scalar(src_type, shift), "");
 
-      /* Fill in the empty lower bits for added precision? */
+      /* TODO: Fill in the empty lower bits for additional precision? */
 #if 0
       {
          LLVMValueRef msb;
@@ -152,7 +152,7 @@ lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
 LLVMValueRef
 lp_build_unsigned_norm_to_float(LLVMBuilderRef builder,
                                 unsigned src_width,
-                                union lp_type dst_type,
+                                struct lp_type dst_type,
                                 LLVMValueRef src)
 {
    LLVMTypeRef vec_type = lp_build_vec_type(dst_type);
@@ -244,12 +244,12 @@ lp_build_const_pack_shuffle(unsigned n)
  * Expand the bit width.
  *
  * This will only change the number of bits the values are represented, not the
- * values themselved.
+ * values themselves.
  */
 static void
 lp_build_expand(LLVMBuilderRef builder,
-               union lp_type src_type,
-               union lp_type dst_type,
+               struct lp_type src_type,
+               struct lp_type dst_type,
                LLVMValueRef src,
                LLVMValueRef *dst, unsigned num_dsts)
 {
@@ -266,7 +266,7 @@ lp_build_expand(LLVMBuilderRef builder,
    dst[0] = src;
 
    while(src_type.width < dst_type.width) {
-      union lp_type new_type = src_type;
+      struct lp_type new_type = src_type;
       LLVMTypeRef new_vec_type;
 
       new_type.width *= 2;
@@ -314,8 +314,8 @@ lp_build_expand(LLVMBuilderRef builder,
  */
 static LLVMValueRef
 lp_build_pack2(LLVMBuilderRef builder,
-               union lp_type src_type,
-               union lp_type dst_type,
+               struct lp_type src_type,
+               struct lp_type dst_type,
                boolean clamped,
                LLVMValueRef lo,
                LLVMValueRef hi)
@@ -391,11 +391,11 @@ lp_build_pack2(LLVMBuilderRef builder,
  * TODO: Handle saturation consistently.
  */
 static LLVMValueRef
-lp_build_trunc(LLVMBuilderRef builder,
-               union lp_type src_type,
-               union lp_type dst_type,
-               boolean clamped,
-               const LLVMValueRef *src, unsigned num_srcs)
+lp_build_pack(LLVMBuilderRef builder,
+              struct lp_type src_type,
+              struct lp_type dst_type,
+              boolean clamped,
+              const LLVMValueRef *src, unsigned num_srcs)
 {
    LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
    unsigned i;
@@ -410,7 +410,7 @@ lp_build_trunc(LLVMBuilderRef builder,
       tmp[i] = src[i];
 
    while(src_type.width > dst_type.width) {
-      union lp_type new_type = src_type;
+      struct lp_type new_type = src_type;
 
       new_type.width /= 2;
       new_type.length *= 2;
@@ -442,12 +442,12 @@ lp_build_trunc(LLVMBuilderRef builder,
  */
 void
 lp_build_conv(LLVMBuilderRef builder,
-              union lp_type src_type,
-              union lp_type dst_type,
+              struct lp_type src_type,
+              struct lp_type dst_type,
               const LLVMValueRef *src, unsigned num_srcs,
               LLVMValueRef *dst, unsigned num_dsts)
 {
-   union lp_type tmp_type;
+   struct lp_type tmp_type;
    LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
    unsigned num_tmps;
    unsigned i;
@@ -470,7 +470,7 @@ lp_build_conv(LLVMBuilderRef builder,
     * Clamp if necessary
     */
 
-   if(src_type.value != dst_type.value) {
+   if(memcmp(&src_type, &dst_type, sizeof src_type) != 0) {
       struct lp_build_context bld;
       double src_min = lp_const_min(src_type);
       double dst_min = lp_const_min(dst_type);
@@ -565,7 +565,7 @@ lp_build_conv(LLVMBuilderRef builder,
 
    if(tmp_type.width > dst_type.width) {
       assert(num_dsts == 1);
-      tmp[0] = lp_build_trunc(builder, tmp_type, dst_type, TRUE, tmp, num_tmps);
+      tmp[0] = lp_build_pack(builder, tmp_type, dst_type, TRUE, tmp, num_tmps);
       tmp_type.width = dst_type.width;
       tmp_type.length = dst_type.length;
       num_tmps = 1;
@@ -656,8 +656,8 @@ lp_build_conv(LLVMBuilderRef builder,
  */
 void
 lp_build_conv_mask(LLVMBuilderRef builder,
-                   union lp_type src_type,
-                   union lp_type dst_type,
+                   struct lp_type src_type,
+                   struct lp_type dst_type,
                    const LLVMValueRef *src, unsigned num_srcs,
                    LLVMValueRef *dst, unsigned num_dsts)
 {
@@ -689,7 +689,7 @@ lp_build_conv_mask(LLVMBuilderRef builder,
 
    if(src_type.width > dst_type.width) {
       assert(num_dsts == 1);
-      dst[0] = lp_build_trunc(builder, src_type, dst_type, TRUE, src, num_srcs);
+      dst[0] = lp_build_pack(builder, src_type, dst_type, TRUE, src, num_srcs);
    }
    else if(src_type.width < dst_type.width) {
       assert(num_srcs == 1);
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_conv.h b/src/gallium/drivers/llvmpipe/lp_bld_conv.h
index 05c1ef2a10..ca378804d2 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_conv.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_conv.h
@@ -40,33 +40,33 @@
 #include <llvm-c/Core.h>  
 
 
-union lp_type type;
+struct lp_type type;
 
 
 LLVMValueRef
 lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
-                                        union lp_type src_type,
+                                        struct lp_type src_type,
                                         unsigned dst_width,
                                         LLVMValueRef src);
 
 LLVMValueRef
 lp_build_unsigned_norm_to_float(LLVMBuilderRef builder,
                                 unsigned src_width,
-                                union lp_type dst_type,
+                                struct lp_type dst_type,
                                 LLVMValueRef src);
 
 
 void
 lp_build_conv(LLVMBuilderRef builder,
-              union lp_type src_type,
-              union lp_type dst_type,
+              struct lp_type src_type,
+              struct lp_type dst_type,
               const LLVMValueRef *srcs, unsigned num_srcs,
               LLVMValueRef *dsts, unsigned num_dsts);
 
 void
 lp_build_conv_mask(LLVMBuilderRef builder,
-                   union lp_type src_type,
-                   union lp_type dst_type,
+                   struct lp_type src_type,
+                   struct lp_type dst_type,
                    const LLVMValueRef *src, unsigned num_srcs,
                    LLVMValueRef *dst, unsigned num_dsts);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_debug.c b/src/gallium/drivers/llvmpipe/lp_bld_debug.c
index 30925b5f41..59d8f492e6 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_debug.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_debug.c
@@ -30,10 +30,27 @@
 #include <udis86.h>
 #endif
 
+#include "util/u_math.h"
 #include "util/u_debug.h"
 #include "lp_bld_debug.h"
 
 
+/**
+ * Check alignment.
+ *
+ * It is important that this check is not implemented as a macro or inlined
+ * function, as the compiler assumptions in respect to alignment of global
+ * and stack variables would often make the check a no op, defeating the
+ * whole purpose of the exercise.
+ */
+boolean
+lp_check_alignment(const void *ptr, unsigned alignment)
+{
+   assert(util_is_pot(alignment));
+   return ((uintptr_t)ptr & (alignment - 1)) == 0;
+}
+
+
 void
 lp_disassemble(const void* func)
 {
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_debug.h b/src/gallium/drivers/llvmpipe/lp_bld_debug.h
index ecdafef76d..583e6132b4 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_debug.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_debug.h
@@ -53,6 +53,10 @@ lp_build_name(LLVMValueRef val, const char *format, ...)
 }
 
 
+boolean
+lp_check_alignment(const void *ptr, unsigned alignment);
+
+
 void
 lp_disassemble(const void* func);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
index 2cd6e6b921..21c665c4d4 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
@@ -71,11 +71,11 @@
 /**
  * Return a type appropriate for depth/stencil testing.
  */
-union lp_type
+struct lp_type
 lp_depth_type(const struct util_format_description *format_desc,
               unsigned length)
 {
-   union lp_type type;
+   struct lp_type type;
    unsigned swizzle;
 
    assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
@@ -85,7 +85,7 @@ lp_depth_type(const struct util_format_description *format_desc,
    swizzle = format_desc->swizzle[0];
    assert(swizzle < 4);
 
-   type.value = 0;
+   memset(&type, 0, sizeof type);
    type.width = format_desc->block.bits;
 
    if(format_desc->channel[swizzle].type == UTIL_FORMAT_TYPE_FLOAT) {
@@ -114,7 +114,7 @@ lp_depth_type(const struct util_format_description *format_desc,
 void
 lp_build_depth_test(LLVMBuilderRef builder,
                     const struct pipe_depth_state *state,
-                    union lp_type type,
+                    struct lp_type type,
                     const struct util_format_description *format_desc,
                     struct lp_build_mask_context *mask,
                     LLVMValueRef src,
@@ -179,12 +179,13 @@ lp_build_depth_test(LLVMBuilderRef builder,
       padding_right = 0;
       for(chan = 0; chan < z_swizzle; ++chan)
          padding_right += format_desc->channel[chan].size;
-      padding_left = format_desc->block.bits - format_desc->channel[z_swizzle].size;
+      padding_left = format_desc->block.bits -
+                     (padding_right + format_desc->channel[z_swizzle].size);
 
       if(padding_left || padding_right) {
-         const long long mask_left = ((long long)1 << (format_desc->block.bits - padding_left)) - 1;
-         const long long mask_right = ((long long)1 << (padding_right)) - 1;
-         z_bitmask = lp_build_int_const_scalar(type, mask_left & mask_right);
+         const unsigned long long mask_left = ((unsigned long long)1 << (format_desc->block.bits - padding_left)) - 1;
+         const unsigned long long mask_right = ((unsigned long long)1 << (padding_right)) - 1;
+         z_bitmask = lp_build_int_const_scalar(type, mask_left ^ mask_right);
       }
 
       if(padding_left)
@@ -210,5 +211,6 @@ lp_build_depth_test(LLVMBuilderRef builder,
       LLVMBuildStore(builder, dst, dst_ptr);
    }
 
+   /* FIXME */
    assert(!state->occlusion_count);
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.h b/src/gallium/drivers/llvmpipe/lp_bld_depth.h
index 5d2e042fcc..79d6981bb5 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.h
@@ -41,11 +41,11 @@
  
 struct pipe_depth_state;
 struct util_format_description;
-union lp_type;
+struct lp_type;
 struct lp_build_mask_context;
 
 
-union lp_type
+struct lp_type
 lp_depth_type(const struct util_format_description *format_desc,
               unsigned length);
 
@@ -53,7 +53,7 @@ lp_depth_type(const struct util_format_description *format_desc,
 void
 lp_build_depth_test(LLVMBuilderRef builder,
                     const struct pipe_depth_state *state,
-                    union lp_type type,
+                    struct lp_type type,
                     const struct util_format_description *format_desc,
                     struct lp_build_mask_context *mask,
                     LLVMValueRef src,
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_flow.c b/src/gallium/drivers/llvmpipe/lp_bld_flow.c
index 9d99e1a9d9..dcc25fbff8 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_flow.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_flow.c
@@ -32,59 +32,261 @@
  */
 
 #include "util/u_debug.h"
+#include "util/u_memory.h"
 
 #include "lp_bld_type.h"
 #include "lp_bld_flow.h"
 
 
+#define LP_BUILD_FLOW_MAX_VARIABLES 32
+#define LP_BUILD_FLOW_MAX_DEPTH 32
+
+
+/**
+ * Enumeration of all possible flow constructs.
+ */
+enum lp_build_flow_construct_kind {
+   lP_BUILD_FLOW_SCOPE,
+   LP_BUILD_FLOW_SKIP,
+};
+
+
+/**
+ * Variable declaration scope.
+ */
+struct lp_build_flow_scope
+{
+   /** Number of variables declared in this scope */
+   unsigned num_variables;
+};
+
+
+/**
+ * Early exit. Useful to skip to the end of a function or block when
+ * the execution mask becomes zero or when there is an error condition.
+ */
+struct lp_build_flow_skip
+{
+   /** Block to skip to */
+   LLVMBasicBlockRef block;
+
+   /** Number of variables declared at the beginning */
+   unsigned num_variables;
+
+   LLVMValueRef *phi;
+};
+
+
+/**
+ * Union of all possible flow constructs' data
+ */
+union lp_build_flow_construct_data
+{
+   struct lp_build_flow_scope scope;
+   struct lp_build_flow_skip skip;
+};
+
+
+/**
+ * Element of the flow construct stack.
+ */
+struct lp_build_flow_construct
+{
+   enum lp_build_flow_construct_kind kind;
+   union lp_build_flow_construct_data data;
+};
+
+
+/**
+ * All necessary data to generate LLVM control flow constructs.
+ *
+ * Besides keeping track of the control flow construct themselves we also
+ * need to keep track of variables in order to generate SSA Phi values.
+ */
+struct lp_build_flow_context
+{
+   LLVMBuilderRef builder;
+
+   /**
+    * Control flow stack.
+    */
+   struct lp_build_flow_construct constructs[LP_BUILD_FLOW_MAX_DEPTH];
+   unsigned num_constructs;
+
+   /**
+    * Variable stack
+    */
+   LLVMValueRef *variables[LP_BUILD_FLOW_MAX_VARIABLES];
+   unsigned num_variables;
+};
+
+
+struct lp_build_flow_context *
+lp_build_flow_create(LLVMBuilderRef builder)
+{
+   struct lp_build_flow_context *flow;
+
+   flow = CALLOC_STRUCT(lp_build_flow_context);
+   if(!flow)
+      return NULL;
+
+   flow->builder = builder;
+
+   return flow;
+}
+
+
 void
-lp_build_mask_begin(struct lp_build_mask_context *mask,
-                    LLVMBuilderRef builder,
-                    union lp_type type,
-                    LLVMValueRef value)
+lp_build_flow_destroy(struct lp_build_flow_context *flow)
 {
-   memset(mask, 0, sizeof *mask);
+   assert(flow->num_constructs == 0);
+   assert(flow->num_variables == 0);
+   FREE(flow);
+}
 
-   mask->builder = builder;
-   mask->reg_type = LLVMIntType(type.width * type.length);
-   mask->value = value;
+
+static union lp_build_flow_construct_data *
+lp_build_flow_push(struct lp_build_flow_context *flow,
+                   enum lp_build_flow_construct_kind kind)
+{
+   assert(flow->num_constructs < LP_BUILD_FLOW_MAX_DEPTH);
+   if(flow->num_constructs >= LP_BUILD_FLOW_MAX_DEPTH)
+      return NULL;
+
+   flow->constructs[flow->num_constructs].kind = kind;
+   return &flow->constructs[flow->num_constructs++].data;
+}
+
+
+static union lp_build_flow_construct_data *
+lp_build_flow_peek(struct lp_build_flow_context *flow,
+                   enum lp_build_flow_construct_kind kind)
+{
+   assert(flow->num_constructs);
+   if(!flow->num_constructs)
+      return NULL;
+
+   assert(flow->constructs[flow->num_constructs - 1].kind == kind);
+   if(flow->constructs[flow->num_constructs - 1].kind != kind)
+      return NULL;
+
+   return &flow->constructs[flow->num_constructs - 1].data;
 }
 
 
+static union lp_build_flow_construct_data *
+lp_build_flow_pop(struct lp_build_flow_context *flow,
+                  enum lp_build_flow_construct_kind kind)
+{
+   assert(flow->num_constructs);
+   if(!flow->num_constructs)
+      return NULL;
+
+   assert(flow->constructs[flow->num_constructs - 1].kind == kind);
+   if(flow->constructs[flow->num_constructs - 1].kind != kind)
+      return NULL;
+
+   return &flow->constructs[--flow->num_constructs].data;
+}
+
+
+/**
+ * Begin a variable scope.
+ *
+ *
+ */
 void
-lp_build_mask_update(struct lp_build_mask_context *mask,
-                     LLVMValueRef value)
+lp_build_flow_scope_begin(struct lp_build_flow_context *flow)
 {
+   struct lp_build_flow_scope *scope;
 
-   LLVMValueRef cond;
-   LLVMBasicBlockRef current_block;
-   LLVMBasicBlockRef next_block;
-   LLVMBasicBlockRef new_block;
+   scope = &lp_build_flow_push(flow, lP_BUILD_FLOW_SCOPE)->scope;
+   if(!scope)
+      return;
 
-   if(mask->value)
-      mask->value = LLVMBuildAnd(mask->builder, mask->value, value, "");
-   else
-      mask->value = value;
+   scope->num_variables = 0;
+}
 
-   /* FIXME: disabled until we have proper control flow helpers */
-#if 0
-   cond = LLVMBuildICmp(mask->builder,
-                        LLVMIntEQ,
-                        LLVMBuildBitCast(mask->builder, mask->value, mask->reg_type, ""),
-                        LLVMConstNull(mask->reg_type),
-                        "");
 
-   current_block = LLVMGetInsertBlock(mask->builder);
+/**
+ * Declare a variable.
+ *
+ * A variable is a named entity which can have different LLVMValueRef's at
+ * different points of the program. This is relevant for control flow because
+ * when there are mutiple branches to a same location we need to replace
+ * the variable's value with a Phi function as explained in
+ * http://en.wikipedia.org/wiki/Static_single_assignment_form .
+ *
+ * We keep track of variables by keeping around a pointer to where their
+ * current.
+ *
+ * There are a few cautions to observe:
+ *
+ * - Variable's value must not be NULL. If there is no initial value then
+ *   LLVMGetUndef() should be used.
+ *
+ * - Variable's value must be kept up-to-date. If the variable is going to be
+ *   modified by a function then a pointer should be passed so that its value
+ *   is accurate. Failure to do this will cause some of the variables'
+ *   transient values to be lost, leading to wrong results.
+ *
+ * - A program should be written from top to bottom, by always appending
+ *   instructions to the bottom with a single LLVMBuilderRef. Inserting and/or
+ *   modifying existing statements will most likely lead to wrong results.
+ *
+ */
+void
+lp_build_flow_scope_declare(struct lp_build_flow_context *flow,
+                            LLVMValueRef *variable)
+{
+   struct lp_build_flow_scope *scope;
+
+   scope = &lp_build_flow_peek(flow, lP_BUILD_FLOW_SCOPE)->scope;
+   if(!scope)
+      return;
 
-   if(!mask->skip_block) {
-      LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
-      mask->skip_block = LLVMAppendBasicBlock(function, "skip");
+   assert(*variable);
+   if(!*variable)
+      return;
+
+   assert(flow->num_variables < LP_BUILD_FLOW_MAX_VARIABLES);
+   if(flow->num_variables >= LP_BUILD_FLOW_MAX_VARIABLES)
+      return;
+
+   flow->variables[flow->num_variables++] = variable;
+   ++scope->num_variables;
+}
+
+
+void
+lp_build_flow_scope_end(struct lp_build_flow_context *flow)
+{
+   struct lp_build_flow_scope *scope;
+
+   scope = &lp_build_flow_pop(flow, lP_BUILD_FLOW_SCOPE)->scope;
+   if(!scope)
+      return;
 
-      mask->phi = LLVMBuildPhi(mask->builder, LLVMTypeOf(mask->value), "");
+   assert(flow->num_variables >= scope->num_variables);
+   if(flow->num_variables < scope->num_variables) {
+      flow->num_variables = 0;
+      return;
    }
 
+   flow->num_variables -= scope->num_variables;
+}
+
+
+static LLVMBasicBlockRef
+lp_build_flow_insert_block(struct lp_build_flow_context *flow)
+{
+   LLVMBasicBlockRef current_block;
+   LLVMBasicBlockRef next_block;
+   LLVMBasicBlockRef new_block;
+
+   current_block = LLVMGetInsertBlock(flow->builder);
+
    next_block = LLVMGetNextBasicBlock(current_block);
-   assert(next_block);
    if(next_block) {
       new_block = LLVMInsertBasicBlock(next_block, "");
    }
@@ -93,30 +295,148 @@ lp_build_mask_update(struct lp_build_mask_context *mask,
       new_block = LLVMAppendBasicBlock(function, "");
    }
 
-   LLVMAddIncoming(mask->phi, &mask->value, &current_block, 1);
-   LLVMBuildCondBr(mask->builder, cond, mask->skip_block, new_block);
+   return new_block;
+}
+
+void
+lp_build_flow_skip_begin(struct lp_build_flow_context *flow)
+{
+   struct lp_build_flow_skip *skip;
+   LLVMBuilderRef builder;
+   unsigned i;
+
+   skip = &lp_build_flow_push(flow, LP_BUILD_FLOW_SKIP)->skip;
+   if(!skip)
+      return;
+
+   skip->block = lp_build_flow_insert_block(flow);
+   skip->num_variables = flow->num_variables;
+   if(!skip->num_variables) {
+      skip->phi = NULL;
+      return;
+   }
 
-   LLVMPositionBuilderAtEnd(mask->builder, new_block);
-#endif
+   skip->phi = MALLOC(skip->num_variables * sizeof *skip->phi);
+   if(!skip->phi) {
+      skip->num_variables = 0;
+      return;
+   }
+
+   builder = LLVMCreateBuilder();
+   LLVMPositionBuilderAtEnd(builder, skip->block);
+
+   for(i = 0; i < skip->num_variables; ++i)
+      skip->phi[i] = LLVMBuildPhi(builder, LLVMTypeOf(*flow->variables[i]), "");
+
+   LLVMDisposeBuilder(builder);
 }
 
 
-LLVMValueRef
-lp_build_mask_end(struct lp_build_mask_context *mask)
+void
+lp_build_flow_skip_cond_break(struct lp_build_flow_context *flow,
+                              LLVMValueRef cond)
+{
+   struct lp_build_flow_skip *skip;
+   LLVMBasicBlockRef current_block;
+   LLVMBasicBlockRef new_block;
+   unsigned i;
+
+   skip = &lp_build_flow_peek(flow, LP_BUILD_FLOW_SKIP)->skip;
+   if(!skip)
+      return;
+
+   current_block = LLVMGetInsertBlock(flow->builder);
+
+   new_block = lp_build_flow_insert_block(flow);
+
+   for(i = 0; i < skip->num_variables; ++i) {
+      assert(*flow->variables[i]);
+      LLVMAddIncoming(skip->phi[i], flow->variables[i], &current_block, 1);
+   }
+
+   LLVMBuildCondBr(flow->builder, cond, skip->block, new_block);
+
+   LLVMPositionBuilderAtEnd(flow->builder, new_block);
+ }
+
+
+void
+lp_build_flow_skip_end(struct lp_build_flow_context *flow)
 {
-   if(mask->skip_block) {
-      LLVMBasicBlockRef current_block = LLVMGetInsertBlock(mask->builder);
+   struct lp_build_flow_skip *skip;
+   LLVMBasicBlockRef current_block;
+   unsigned i;
 
-      LLVMAddIncoming(mask->phi, &mask->value, &current_block, 1);
-      LLVMBuildBr(mask->builder, mask->skip_block);
+   skip = &lp_build_flow_pop(flow, LP_BUILD_FLOW_SKIP)->skip;
+   if(!skip)
+      return;
 
-      LLVMPositionBuilderAtEnd(mask->builder, mask->skip_block);
+   current_block = LLVMGetInsertBlock(flow->builder);
 
-      mask->value = mask->phi;
-      mask->phi = NULL;
-      mask->skip_block = NULL;
+   for(i = 0; i < skip->num_variables; ++i) {
+      assert(*flow->variables[i]);
+      LLVMAddIncoming(skip->phi[i], flow->variables[i], &current_block, 1);
+      *flow->variables[i] = skip->phi[i];
    }
 
+   LLVMBuildBr(flow->builder, skip->block);
+   LLVMPositionBuilderAtEnd(flow->builder, skip->block);
+
+   FREE(skip->phi);
+}
+
+
+static void
+lp_build_mask_check(struct lp_build_mask_context *mask)
+{
+   LLVMBuilderRef builder = mask->flow->builder;
+   LLVMValueRef cond;
+
+   cond = LLVMBuildICmp(builder,
+                        LLVMIntEQ,
+                        LLVMBuildBitCast(builder, mask->value, mask->reg_type, ""),
+                        LLVMConstNull(mask->reg_type),
+                        "");
+
+   lp_build_flow_skip_cond_break(mask->flow, cond);
+}
+
+
+void
+lp_build_mask_begin(struct lp_build_mask_context *mask,
+                    struct lp_build_flow_context *flow,
+                    struct lp_type type,
+                    LLVMValueRef value)
+{
+   memset(mask, 0, sizeof *mask);
+
+   mask->flow = flow;
+   mask->reg_type = LLVMIntType(type.width * type.length);
+   mask->value = value;
+
+   lp_build_flow_scope_begin(flow);
+   lp_build_flow_scope_declare(flow, &mask->value);
+   lp_build_flow_skip_begin(flow);
+
+   lp_build_mask_check(mask);
+}
+
+
+void
+lp_build_mask_update(struct lp_build_mask_context *mask,
+                     LLVMValueRef value)
+{
+   mask->value = LLVMBuildAnd( mask->flow->builder, mask->value, value, "");
+
+   lp_build_mask_check(mask);
+}
+
+
+LLVMValueRef
+lp_build_mask_end(struct lp_build_mask_context *mask)
+{
+   lp_build_flow_skip_end(mask->flow);
+   lp_build_flow_scope_end(mask->flow);
    return mask->value;
 }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_flow.h b/src/gallium/drivers/llvmpipe/lp_bld_flow.h
index 1b634ff038..e61999ff06 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_flow.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_flow.h
@@ -38,27 +38,53 @@
 #include <llvm-c/Core.h>  
 
 
-union lp_type;
+struct lp_type;
+
+
+struct lp_build_flow_context;
+
+
+struct lp_build_flow_context *
+lp_build_flow_create(LLVMBuilderRef builder);
+
+void
+lp_build_flow_destroy(struct lp_build_flow_context *flow);
+
+void
+lp_build_flow_scope_begin(struct lp_build_flow_context *flow);
+
+void
+lp_build_flow_scope_declare(struct lp_build_flow_context *flow,
+                            LLVMValueRef *variable);
+
+void
+lp_build_flow_scope_end(struct lp_build_flow_context *flow);
+
+void
+lp_build_flow_skip_begin(struct lp_build_flow_context *flow);
+
+void
+lp_build_flow_skip_cond_break(struct lp_build_flow_context *flow,
+                              LLVMValueRef cond);
+
+void
+lp_build_flow_skip_end(struct lp_build_flow_context *flow);
 
 
 struct lp_build_mask_context
 {
-   LLVMBuilderRef builder;
+   struct lp_build_flow_context *flow;
 
    LLVMTypeRef reg_type;
 
    LLVMValueRef value;
-
-   LLVMValueRef phi;
-
-   LLVMBasicBlockRef skip_block;
 };
 
 
 void
 lp_build_mask_begin(struct lp_build_mask_context *mask,
-                    LLVMBuilderRef builder,
-                    union lp_type type,
+                    struct lp_build_flow_context *flow,
+                    struct lp_type type,
                     LLVMValueRef value);
 
 /**
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_format.h b/src/gallium/drivers/llvmpipe/lp_bld_format.h
index 01c8a752d1..6d3f692619 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_format.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_format.h
@@ -31,21 +31,15 @@
 
 /**
  * @file
- * LLVM IR building helpers interfaces.
- *
- * We use LLVM-C bindings for now. They are not documented, but follow the C++
- * interfaces very closely, and appear to be complete enough for code
- * genration. See
- * http://npcontemplation.blogspot.com/2008/06/secret-of-llvm-c-bindings.html
- * for a standalone example.
+ * Pixel format helpers.
  */
 
 #include <llvm-c/Core.h>  
- 
-#include "pipe/p_format.h"
 
+#include "pipe/p_format.h"
 
-union lp_type;
+struct util_format_description;
+struct lp_type;
 
 
 /**
@@ -56,9 +50,9 @@ union lp_type;
  * @return RGBA in a 4 floats vector.
  */
 LLVMValueRef
-lp_build_unpack_rgba(LLVMBuilderRef builder,
-                     enum pipe_format format, 
-                     LLVMValueRef packed);
+lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
+                         enum pipe_format format,
+                         LLVMValueRef packed);
 
 
 /**
@@ -67,9 +61,9 @@ lp_build_unpack_rgba(LLVMBuilderRef builder,
  * @param rgba 4 float vector with the unpacked components.
  */
 LLVMValueRef
-lp_build_pack_rgba(LLVMBuilderRef builder,
-                   enum pipe_format format,
-                   LLVMValueRef rgba);
+lp_build_pack_rgba_aos(LLVMBuilderRef builder,
+                       enum pipe_format format,
+                       LLVMValueRef rgba);
 
 
 /**
@@ -81,9 +75,9 @@ lp_build_pack_rgba(LLVMBuilderRef builder,
  * @return RGBA in a 4 floats vector.
  */
 LLVMValueRef
-lp_build_load_rgba(LLVMBuilderRef builder,
-                   enum pipe_format format, 
-                   LLVMValueRef ptr);
+lp_build_load_rgba_aos(LLVMBuilderRef builder,
+                       enum pipe_format format,
+                       LLVMValueRef ptr);
 
 
 /**
@@ -92,10 +86,34 @@ lp_build_load_rgba(LLVMBuilderRef builder,
  * @param rgba 4 float vector with the unpacked components.
  */
 void 
-lp_build_store_rgba(LLVMBuilderRef builder,
-                    enum pipe_format format,
-                    LLVMValueRef ptr,
-                    LLVMValueRef rgba);
+lp_build_store_rgba_aos(LLVMBuilderRef builder,
+                        enum pipe_format format,
+                        LLVMValueRef ptr,
+                        LLVMValueRef rgba);
 
+LLVMValueRef
+lp_build_gather(LLVMBuilderRef builder,
+                unsigned length,
+                unsigned src_width,
+                unsigned dst_width,
+                LLVMValueRef base_ptr,
+                LLVMValueRef offsets);
+
+
+void
+lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
+                         const struct util_format_description *format_desc,
+                         struct lp_type type,
+                         LLVMValueRef packed,
+                         LLVMValueRef *rgba);
+
+
+void
+lp_build_load_rgba_soa(LLVMBuilderRef builder,
+                       const struct util_format_description *format_desc,
+                       struct lp_type type,
+                       LLVMValueRef base_ptr,
+                       LLVMValueRef offsets,
+                       LLVMValueRef *rgba);
 
 #endif /* !LP_BLD_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_format_aos.c b/src/gallium/drivers/llvmpipe/lp_bld_format_aos.c
index dcbc0076c7..b9b5d84bed 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_format_aos.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_format_aos.c
@@ -32,9 +32,9 @@
 
 
 LLVMValueRef
-lp_build_unpack_rgba(LLVMBuilderRef builder,
-                     enum pipe_format format,
-                     LLVMValueRef packed)
+lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
+                         enum pipe_format format,
+                         LLVMValueRef packed)
 {
    const struct util_format_description *desc;
    LLVMTypeRef type;
@@ -152,9 +152,9 @@ lp_build_unpack_rgba(LLVMBuilderRef builder,
 
 
 LLVMValueRef
-lp_build_pack_rgba(LLVMBuilderRef builder,
-                   enum pipe_format format,
-                   LLVMValueRef rgba)
+lp_build_pack_rgba_aos(LLVMBuilderRef builder,
+                       enum pipe_format format,
+                       LLVMValueRef rgba)
 {
    const struct util_format_description *desc;
    LLVMTypeRef type;
@@ -250,9 +250,9 @@ lp_build_pack_rgba(LLVMBuilderRef builder,
 
 
 LLVMValueRef
-lp_build_load_rgba(LLVMBuilderRef builder,
-                   enum pipe_format format,
-                   LLVMValueRef ptr)
+lp_build_load_rgba_aos(LLVMBuilderRef builder,
+                       enum pipe_format format,
+                       LLVMValueRef ptr)
 {
    const struct util_format_description *desc;
    LLVMTypeRef type;
@@ -272,15 +272,15 @@ lp_build_load_rgba(LLVMBuilderRef builder,
 
    packed = LLVMBuildLoad(builder, ptr, "");
 
-   return lp_build_unpack_rgba(builder, format, packed);
+   return lp_build_unpack_rgba_aos(builder, format, packed);
 }
 
 
 void
-lp_build_store_rgba(LLVMBuilderRef builder,
-                    enum pipe_format format,
-                    LLVMValueRef ptr,
-                    LLVMValueRef rgba)
+lp_build_store_rgba_aos(LLVMBuilderRef builder,
+                        enum pipe_format format,
+                        LLVMValueRef ptr,
+                        LLVMValueRef rgba)
 {
    const struct util_format_description *desc;
    LLVMTypeRef type;
@@ -294,7 +294,7 @@ lp_build_store_rgba(LLVMBuilderRef builder,
 
    type = LLVMIntType(desc->block.bits);
 
-   packed = lp_build_pack_rgba(builder, format, rgba);
+   packed = lp_build_pack_rgba_aos(builder, format, rgba);
 
    ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, 0), "");
 
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_format_soa.c b/src/gallium/drivers/llvmpipe/lp_bld_format_soa.c
new file mode 100644
index 0000000000..b5ff434e1a
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_format_soa.c
@@ -0,0 +1,208 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include "util/u_format.h"
+
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_conv.h"
+#include "lp_bld_format.h"
+
+
+/**
+ * Gather elements from scatter positions in memory into a single vector.
+ *
+ * @param src_width src element width
+ * @param dst_width result element width (source will be expanded to fit)
+ * @param length length of the offsets,
+ * @param base_ptr base pointer, should be a i8 pointer type.
+ * @param offsets vector with offsets
+ */
+LLVMValueRef
+lp_build_gather(LLVMBuilderRef builder,
+                unsigned length,
+                unsigned src_width,
+                unsigned dst_width,
+                LLVMValueRef base_ptr,
+                LLVMValueRef offsets)
+{
+   LLVMTypeRef src_type = LLVMIntType(src_width);
+   LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
+   LLVMTypeRef dst_elem_type = LLVMIntType(dst_width);
+   LLVMTypeRef dst_vec_type = LLVMVectorType(dst_elem_type, length);
+   LLVMValueRef res;
+   unsigned i;
+
+   res = LLVMGetUndef(dst_vec_type);
+   for(i = 0; i < length; ++i) {
+      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+      LLVMValueRef elem_offset;
+      LLVMValueRef elem_ptr;
+      LLVMValueRef elem;
+
+      elem_offset = LLVMBuildExtractElement(builder, offsets, index, "");
+      elem_ptr = LLVMBuildGEP(builder, base_ptr, &elem_offset, 1, "");
+      elem_ptr = LLVMBuildBitCast(builder, elem_ptr, src_ptr_type, "");
+      elem = LLVMBuildLoad(builder, elem_ptr, "");
+
+      assert(src_width <= dst_width);
+      if(src_width > dst_width)
+         elem = LLVMBuildTrunc(builder, elem, dst_elem_type, "");
+      if(src_width < dst_width)
+         elem = LLVMBuildZExt(builder, elem, dst_elem_type, "");
+
+      res = LLVMBuildInsertElement(builder, res, elem, index, "");
+   }
+
+   return res;
+}
+
+
+static LLVMValueRef
+lp_build_format_swizzle(struct lp_type type,
+                        const LLVMValueRef *inputs,
+                        enum util_format_swizzle swizzle)
+{
+   switch (swizzle) {
+   case UTIL_FORMAT_SWIZZLE_X:
+   case UTIL_FORMAT_SWIZZLE_Y:
+   case UTIL_FORMAT_SWIZZLE_Z:
+   case UTIL_FORMAT_SWIZZLE_W:
+      return inputs[swizzle];
+   case UTIL_FORMAT_SWIZZLE_0:
+      return lp_build_zero(type);
+   case UTIL_FORMAT_SWIZZLE_1:
+      return lp_build_one(type);
+   case UTIL_FORMAT_SWIZZLE_NONE:
+      return lp_build_undef(type);
+   default:
+      assert(0);
+      return lp_build_undef(type);
+   }
+}
+
+
+void
+lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
+                         const struct util_format_description *format_desc,
+                         struct lp_type type,
+                         LLVMValueRef packed,
+                         LLVMValueRef *rgba)
+{
+   LLVMValueRef inputs[4];
+   unsigned start;
+   unsigned chan;
+
+   /* FIXME: Support more formats */
+   assert(format_desc->layout == UTIL_FORMAT_LAYOUT_ARITH);
+   assert(format_desc->block.width == 1);
+   assert(format_desc->block.height == 1);
+   assert(format_desc->block.bits <= 32);
+
+   /* Decode the input vector components */
+   start = 0;
+   for (chan = 0; chan < 4; ++chan) {
+      unsigned width = format_desc->channel[chan].size;
+      unsigned stop = start + width;
+      LLVMValueRef input;
+
+      input = packed;
+
+      switch(format_desc->channel[chan].type) {
+      case UTIL_FORMAT_TYPE_VOID:
+         input = NULL;
+         break;
+
+      case UTIL_FORMAT_TYPE_UNSIGNED:
+         if(type.floating) {
+            if(start)
+               input = LLVMBuildLShr(builder, input, lp_build_int_const_scalar(type, start), "");
+            if(stop < format_desc->block.bits) {
+               unsigned mask = ((unsigned long long)1 << width) - 1;
+               input = LLVMBuildAnd(builder, input, lp_build_int_const_scalar(type, mask), "");
+            }
+
+            if(format_desc->channel[chan].normalized)
+               input = lp_build_unsigned_norm_to_float(builder, width, type, input);
+            else
+               input = LLVMBuildFPToSI(builder, input, lp_build_vec_type(type), "");
+         }
+         else {
+            /* FIXME */
+            assert(0);
+            input = lp_build_undef(type);
+         }
+         break;
+
+      default:
+         /* fall through */
+         input = lp_build_undef(type);
+         break;
+      }
+
+      inputs[chan] = input;
+
+      start = stop;
+   }
+
+   if(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
+      enum util_format_swizzle swizzle = format_desc->swizzle[0];
+      LLVMValueRef depth = lp_build_format_swizzle(type, inputs, swizzle);
+      rgba[2] = rgba[1] = rgba[0] = depth;
+      rgba[3] = lp_build_one(type);
+   }
+   else {
+      for (chan = 0; chan < 4; ++chan) {
+         enum util_format_swizzle swizzle = format_desc->swizzle[chan];
+         rgba[chan] = lp_build_format_swizzle(type, inputs, swizzle);
+      }
+   }
+}
+
+
+void
+lp_build_load_rgba_soa(LLVMBuilderRef builder,
+                       const struct util_format_description *format_desc,
+                       struct lp_type type,
+                       LLVMValueRef base_ptr,
+                       LLVMValueRef offsets,
+                       LLVMValueRef *rgba)
+{
+   LLVMValueRef packed;
+
+   assert(format_desc->layout == UTIL_FORMAT_LAYOUT_ARITH);
+   assert(format_desc->block.width == 1);
+   assert(format_desc->block.height == 1);
+   assert(format_desc->block.bits <= 32);
+
+   packed = lp_build_gather(builder,
+                            type.length, format_desc->block.bits, type.width,
+                            base_ptr, offsets);
+
+   lp_build_unpack_rgba_soa(builder, format_desc, type, packed, rgba);
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.c b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
index cfe20a0d75..338dbca6d1 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_interp.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
@@ -292,7 +292,7 @@ void
 lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
                          const struct tgsi_token *tokens,
                          LLVMBuilderRef builder,
-                         union lp_type type,
+                         struct lp_type type,
                          LLVMValueRef a0_ptr,
                          LLVMValueRef dadx_ptr,
                          LLVMValueRef dady_ptr,
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.h b/src/gallium/drivers/llvmpipe/lp_bld_interp.h
index 9194f6233a..9c57a10879 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_interp.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.h
@@ -83,7 +83,7 @@ void
 lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
                          const struct tgsi_token *tokens,
                          LLVMBuilderRef builder,
-                         union lp_type type,
+                         struct lp_type type,
                          LLVMValueRef a0_ptr,
                          LLVMValueRef dadx_ptr,
                          LLVMValueRef dady_ptr,
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_logic.c b/src/gallium/drivers/llvmpipe/lp_bld_logic.c
index 8631efd6c3..6b6f820769 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_logic.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_logic.c
@@ -45,7 +45,7 @@ lp_build_cmp(struct lp_build_context *bld,
              LLVMValueRef a,
              LLVMValueRef b)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    LLVMTypeRef vec_type = lp_build_vec_type(type);
    LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
    LLVMValueRef zeros = LLVMConstNull(int_vec_type);
@@ -301,7 +301,7 @@ lp_build_select(struct lp_build_context *bld,
                 LLVMValueRef a,
                 LLVMValueRef b)
 {
-   union lp_type type = bld->type;
+   struct lp_type type = bld->type;
    LLVMValueRef res;
 
    if(a == b)
@@ -313,8 +313,6 @@ lp_build_select(struct lp_build_context *bld,
       b = LLVMBuildBitCast(bld->builder, b, int_vec_type, "");
    }
 
-   /* TODO: On SSE4 we could do this with a single instruction -- PBLENDVB */
-
    a = LLVMBuildAnd(bld->builder, a, mask, "");
 
    /* This often gets translated to PANDN, but sometimes the NOT is
@@ -339,9 +337,9 @@ LLVMValueRef
 lp_build_select_aos(struct lp_build_context *bld,
                     LLVMValueRef a,
                     LLVMValueRef b,
-                    boolean cond[4])
+                    const boolean cond[4])
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    const unsigned n = type.length;
    unsigned i, j;
 
@@ -376,9 +374,9 @@ lp_build_select_aos(struct lp_build_context *bld,
 
       return LLVMBuildShuffleVector(bld->builder, a, b, LLVMConstVector(shuffles, n), "");
    }
+   else {
 #if 0
-   else if(0) {
-      /* FIXME: Unfortunately select of vectors do not work */
+      /* XXX: Unfortunately select of vectors do not work */
       /* Use a select */
       LLVMTypeRef elem_type = LLVMInt1Type();
       LLVMValueRef cond[LP_MAX_VECTOR_LENGTH];
@@ -388,10 +386,9 @@ lp_build_select_aos(struct lp_build_context *bld,
             cond[j + i] = LLVMConstInt(elem_type, cond[i] ? 1 : 0, 0);
 
       return LLVMBuildSelect(bld->builder, LLVMConstVector(cond, n), a, b, "");
-   }
-#endif
-   else {
+#else
       LLVMValueRef mask = lp_build_const_mask_aos(type, cond);
       return lp_build_select(bld, mask, a, b);
+#endif
    }
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_logic.h b/src/gallium/drivers/llvmpipe/lp_bld_logic.h
index 29b9e1c45b..a4ee7723b5 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_logic.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_logic.h
@@ -42,7 +42,7 @@
 #include "pipe/p_defines.h" /* For PIPE_FUNC_xxx */
 
 
-union lp_type type;
+struct lp_type type;
 struct lp_build_context;
 
 
@@ -66,7 +66,7 @@ LLVMValueRef
 lp_build_select_aos(struct lp_build_context *bld,
                     LLVMValueRef a,
                     LLVMValueRef b,
-                    boolean cond[4]);
+                    const boolean cond[4]);
 
 
 #endif /* !LP_BLD_LOGIC_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_sample.h b/src/gallium/drivers/llvmpipe/lp_bld_sample.h
new file mode 100644
index 0000000000..403d0e4836
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_sample.h
@@ -0,0 +1,135 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Texture sampling.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#ifndef LP_BLD_SAMPLE_H
+#define LP_BLD_SAMPLE_H
+
+
+#include <llvm-c/Core.h>
+
+struct pipe_texture;
+struct pipe_sampler_state;
+struct lp_type;
+
+
+/**
+ * Sampler static state.
+ *
+ * These are the bits of state from pipe_texture and pipe_sampler_state that
+ * are embedded in the generated code.
+ */
+struct lp_sampler_static_state
+{
+   /* pipe_texture's state */
+   enum pipe_format format;
+   unsigned target:2;
+   unsigned pot_width:1;
+   unsigned pot_height:1;
+   unsigned pot_depth:1;
+
+   /* pipe_sampler_state's state */
+   unsigned wrap_s:3;
+   unsigned wrap_t:3;
+   unsigned wrap_r:3;
+   unsigned min_img_filter:2;
+   unsigned min_mip_filter:2;
+   unsigned mag_img_filter:2;
+   unsigned compare_mode:1;
+   unsigned compare_func:3;
+   unsigned normalized_coords:1;
+   unsigned prefilter:4;
+};
+
+
+/**
+ * Sampler dynamic state.
+ *
+ * These are the bits of state from pipe_texture and pipe_sampler_state that
+ * are computed in runtime.
+ *
+ * There are obtained through callbacks, as we don't want to tie the texture
+ * sampling code generation logic to any particular texture layout or pipe
+ * driver.
+ */
+struct lp_sampler_dynamic_state
+{
+
+   /** Obtain the base texture width. */
+   LLVMValueRef
+   (*width)( struct lp_sampler_dynamic_state *state,
+             LLVMBuilderRef builder,
+             unsigned unit);
+
+   /** Obtain the base texture height. */
+   LLVMValueRef
+   (*height)( struct lp_sampler_dynamic_state *state,
+              LLVMBuilderRef builder,
+              unsigned unit);
+
+   LLVMValueRef
+   (*stride)( struct lp_sampler_dynamic_state *state,
+              LLVMBuilderRef builder,
+              unsigned unit);
+
+   LLVMValueRef
+   (*data_ptr)( struct lp_sampler_dynamic_state *state,
+                LLVMBuilderRef builder,
+                unsigned unit);
+
+};
+
+
+/**
+ * Derive the sampler static state.
+ */
+void
+lp_sampler_static_state(struct lp_sampler_static_state *state,
+                        const struct pipe_texture *texture,
+                        const struct pipe_sampler_state *sampler);
+
+
+void
+lp_build_sample_soa(LLVMBuilderRef builder,
+                    const struct lp_sampler_static_state *static_state,
+                    struct lp_sampler_dynamic_state *dynamic_state,
+                    struct lp_type fp_type,
+                    unsigned unit,
+                    unsigned num_coords,
+                    const LLVMValueRef *coords,
+                    LLVMValueRef lodbias,
+                    LLVMValueRef *texel);
+
+
+
+#endif /* LP_BLD_SAMPLE_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_sample_soa.c b/src/gallium/drivers/llvmpipe/lp_bld_sample_soa.c
new file mode 100644
index 0000000000..8ca1be6f1b
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_sample_soa.c
@@ -0,0 +1,416 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Texture sampling.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "util/u_debug.h"
+#include "util/u_memory.h"
+#include "util/u_math.h"
+#include "util/u_format.h"
+#include "lp_bld_debug.h"
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_arit.h"
+#include "lp_bld_logic.h"
+#include "lp_bld_swizzle.h"
+#include "lp_bld_format.h"
+#include "lp_bld_sample.h"
+
+
+void
+lp_sampler_static_state(struct lp_sampler_static_state *state,
+                        const struct pipe_texture *texture,
+                        const struct pipe_sampler_state *sampler)
+{
+   memset(state, 0, sizeof *state);
+
+   if(!texture)
+      return;
+
+   if(!sampler)
+      return;
+
+   state->format            = texture->format;
+   state->target            = texture->target;
+   state->pot_width         = util_is_pot(texture->width[0]);
+   state->pot_height        = util_is_pot(texture->height[0]);
+   state->pot_depth         = util_is_pot(texture->depth[0]);
+
+   state->wrap_s            = sampler->wrap_s;
+   state->wrap_t            = sampler->wrap_t;
+   state->wrap_r            = sampler->wrap_r;
+   state->min_img_filter    = sampler->min_img_filter;
+   state->min_mip_filter    = sampler->min_mip_filter;
+   state->mag_img_filter    = sampler->mag_img_filter;
+   if(sampler->compare_mode) {
+      state->compare_mode      = sampler->compare_mode;
+      state->compare_func      = sampler->compare_func;
+   }
+   state->normalized_coords = sampler->normalized_coords;
+   state->prefilter         = sampler->prefilter;
+}
+
+
+
+/**
+ * Keep all information for sampling code generation in a single place.
+ */
+struct lp_build_sample_context
+{
+   LLVMBuilderRef builder;
+
+   const struct lp_sampler_static_state *static_state;
+
+   struct lp_sampler_dynamic_state *dynamic_state;
+
+   const struct util_format_description *format_desc;
+
+   /** Incoming coordinates type and build context */
+   struct lp_type coord_type;
+   struct lp_build_context coord_bld;
+
+   /** Integer coordinates */
+   struct lp_type int_coord_type;
+   struct lp_build_context int_coord_bld;
+
+   /** Output texels type and build context */
+   struct lp_type texel_type;
+   struct lp_build_context texel_bld;
+};
+
+
+static void
+lp_build_sample_texel(struct lp_build_sample_context *bld,
+                      LLVMValueRef x,
+                      LLVMValueRef y,
+                      LLVMValueRef y_stride,
+                      LLVMValueRef data_ptr,
+                      LLVMValueRef *texel)
+{
+   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
+   LLVMValueRef x_stride;
+   LLVMValueRef offset;
+
+   x_stride = lp_build_const_scalar(bld->int_coord_type, bld->format_desc->block.bits/8);
+
+   if(bld->format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
+      LLVMValueRef x_lo, x_hi;
+      LLVMValueRef y_lo, y_hi;
+      LLVMValueRef x_stride_lo, x_stride_hi;
+      LLVMValueRef y_stride_lo, y_stride_hi;
+      LLVMValueRef x_offset_lo, x_offset_hi;
+      LLVMValueRef y_offset_lo, y_offset_hi;
+      LLVMValueRef offset_lo, offset_hi;
+
+      x_lo = LLVMBuildAnd(bld->builder, x, int_coord_bld->one, "");
+      y_lo = LLVMBuildAnd(bld->builder, y, int_coord_bld->one, "");
+
+      x_hi = LLVMBuildLShr(bld->builder, x, int_coord_bld->one, "");
+      y_hi = LLVMBuildLShr(bld->builder, y, int_coord_bld->one, "");
+
+      x_stride_lo = x_stride;
+      y_stride_lo = lp_build_const_scalar(bld->int_coord_type, 2*bld->format_desc->block.bits/8);
+
+      x_stride_hi = lp_build_const_scalar(bld->int_coord_type, 4*bld->format_desc->block.bits/8);
+      y_stride_hi = LLVMBuildShl(bld->builder, y_stride, int_coord_bld->one, "");
+
+      x_offset_lo = lp_build_mul(int_coord_bld, x_lo, x_stride_lo);
+      y_offset_lo = lp_build_mul(int_coord_bld, y_lo, y_stride_lo);
+      offset_lo = lp_build_add(int_coord_bld, x_offset_lo, y_offset_lo);
+
+      x_offset_hi = lp_build_mul(int_coord_bld, x_hi, x_stride_hi);
+      y_offset_hi = lp_build_mul(int_coord_bld, y_hi, y_stride_hi);
+      offset_hi = lp_build_add(int_coord_bld, x_offset_hi, y_offset_hi);
+
+      offset = lp_build_add(int_coord_bld, offset_hi, offset_lo);
+   }
+   else {
+      LLVMValueRef x_offset;
+      LLVMValueRef y_offset;
+
+      x_offset = lp_build_mul(int_coord_bld, x, x_stride);
+      y_offset = lp_build_mul(int_coord_bld, y, y_stride);
+
+      offset = lp_build_add(int_coord_bld, x_offset, y_offset);
+   }
+
+   lp_build_load_rgba_soa(bld->builder,
+                          bld->format_desc,
+                          bld->texel_type,
+                          data_ptr,
+                          offset,
+                          texel);
+}
+
+
+static LLVMValueRef
+lp_build_sample_wrap(struct lp_build_sample_context *bld,
+                     LLVMValueRef coord,
+                     LLVMValueRef length,
+                     boolean is_pot,
+                     unsigned wrap_mode)
+{
+   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
+   LLVMValueRef length_minus_one;
+
+   length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
+
+   switch(wrap_mode) {
+   case PIPE_TEX_WRAP_REPEAT:
+      if(is_pot)
+         coord = LLVMBuildAnd(bld->builder, coord, length_minus_one, "");
+      else
+         /* Signed remainder won't give the right results for negative
+          * dividends but unsigned remainder does.*/
+         coord = LLVMBuildURem(bld->builder, coord, length, "");
+      break;
+
+   case PIPE_TEX_WRAP_CLAMP:
+      coord = lp_build_max(int_coord_bld, coord, int_coord_bld->zero);
+      coord = lp_build_min(int_coord_bld, coord, length_minus_one);
+      break;
+
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+   case PIPE_TEX_WRAP_MIRROR_REPEAT:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+      /* FIXME */
+      _debug_printf("warning: failed to translate texture wrap mode %u\n", wrap_mode);
+      coord = lp_build_max(int_coord_bld, coord, int_coord_bld->zero);
+      coord = lp_build_min(int_coord_bld, coord, length_minus_one);
+      break;
+
+   default:
+      assert(0);
+   }
+
+   return coord;
+}
+
+
+static void
+lp_build_sample_2d_nearest_soa(struct lp_build_sample_context *bld,
+                               LLVMValueRef s,
+                               LLVMValueRef t,
+                               LLVMValueRef width,
+                               LLVMValueRef height,
+                               LLVMValueRef stride,
+                               LLVMValueRef data_ptr,
+                               LLVMValueRef *texel)
+{
+   LLVMValueRef x;
+   LLVMValueRef y;
+
+   x = lp_build_ifloor(&bld->coord_bld, s);
+   y = lp_build_ifloor(&bld->coord_bld, t);
+
+   x = lp_build_sample_wrap(bld, x, width,  bld->static_state->pot_width,  bld->static_state->wrap_s);
+   y = lp_build_sample_wrap(bld, y, height, bld->static_state->pot_height, bld->static_state->wrap_t);
+
+   lp_build_sample_texel(bld, x, y, stride, data_ptr, texel);
+}
+
+
+static void
+lp_build_sample_2d_linear_soa(struct lp_build_sample_context *bld,
+                              LLVMValueRef s,
+                              LLVMValueRef t,
+                              LLVMValueRef width,
+                              LLVMValueRef height,
+                              LLVMValueRef stride,
+                              LLVMValueRef data_ptr,
+                              LLVMValueRef *texel)
+{
+   LLVMValueRef half;
+   LLVMValueRef s_ipart;
+   LLVMValueRef t_ipart;
+   LLVMValueRef s_fpart;
+   LLVMValueRef t_fpart;
+   LLVMValueRef x0, x1;
+   LLVMValueRef y0, y1;
+   LLVMValueRef neighbors[2][2][4];
+   unsigned chan;
+
+   half = lp_build_const_scalar(bld->coord_type, 0.5);
+   s = lp_build_sub(&bld->coord_bld, s, half);
+   t = lp_build_sub(&bld->coord_bld, t, half);
+
+   s_ipart = lp_build_floor(&bld->coord_bld, s);
+   t_ipart = lp_build_floor(&bld->coord_bld, t);
+
+   s_fpart = lp_build_sub(&bld->coord_bld, s, s_ipart);
+   t_fpart = lp_build_sub(&bld->coord_bld, t, t_ipart);
+
+   x0 = lp_build_int(&bld->coord_bld, s_ipart);
+   y0 = lp_build_int(&bld->coord_bld, t_ipart);
+
+   x0 = lp_build_sample_wrap(bld, x0, width,  bld->static_state->pot_width,  bld->static_state->wrap_s);
+   y0 = lp_build_sample_wrap(bld, y0, height, bld->static_state->pot_height, bld->static_state->wrap_t);
+
+   x1 = lp_build_add(&bld->int_coord_bld, x0, bld->int_coord_bld.one);
+   y1 = lp_build_add(&bld->int_coord_bld, y0, bld->int_coord_bld.one);
+
+   x1 = lp_build_sample_wrap(bld, x1, width,  bld->static_state->pot_width,  bld->static_state->wrap_s);
+   y1 = lp_build_sample_wrap(bld, y1, height, bld->static_state->pot_height, bld->static_state->wrap_t);
+
+   lp_build_sample_texel(bld, x0, y0, stride, data_ptr, neighbors[0][0]);
+   lp_build_sample_texel(bld, x1, y0, stride, data_ptr, neighbors[0][1]);
+   lp_build_sample_texel(bld, x0, y1, stride, data_ptr, neighbors[1][0]);
+   lp_build_sample_texel(bld, x1, y1, stride, data_ptr, neighbors[1][1]);
+
+   /* TODO: Don't interpolate missing channels */
+   for(chan = 0; chan < 4; ++chan) {
+      texel[chan] = lp_build_lerp_2d(&bld->texel_bld,
+                                     s_fpart, t_fpart,
+                                     neighbors[0][0][chan],
+                                     neighbors[0][1][chan],
+                                     neighbors[1][0][chan],
+                                     neighbors[1][1][chan]);
+   }
+}
+
+
+static void
+lp_build_sample_compare(struct lp_build_sample_context *bld,
+                        LLVMValueRef p,
+                        LLVMValueRef *texel)
+{
+   struct lp_build_context *texel_bld = &bld->texel_bld;
+   LLVMValueRef res;
+   unsigned chan;
+
+   if(!bld->static_state->compare_mode)
+      return;
+
+   /* TODO: Compare before swizzling, to avoid redundant computations */
+   res = NULL;
+   for(chan = 0; chan < 4; ++chan) {
+      LLVMValueRef cmp;
+      cmp = lp_build_cmp(texel_bld, bld->static_state->compare_func, p, texel[chan]);
+      cmp = lp_build_select(texel_bld, cmp, texel_bld->one, texel_bld->zero);
+
+      if(res)
+         res = lp_build_add(texel_bld, res, cmp);
+      else
+         res = cmp;
+   }
+
+   assert(res);
+   res = lp_build_mul(texel_bld, res, lp_build_const_scalar(texel_bld->type, 0.25));
+
+   /* XXX returning result for default GL_DEPTH_TEXTURE_MODE = GL_LUMINANCE */
+   for(chan = 0; chan < 3; ++chan)
+      texel[chan] = res;
+   texel[3] = texel_bld->one;
+}
+
+
+void
+lp_build_sample_soa(LLVMBuilderRef builder,
+                    const struct lp_sampler_static_state *static_state,
+                    struct lp_sampler_dynamic_state *dynamic_state,
+                    struct lp_type type,
+                    unsigned unit,
+                    unsigned num_coords,
+                    const LLVMValueRef *coords,
+                    LLVMValueRef lodbias,
+                    LLVMValueRef *texel)
+{
+   struct lp_build_sample_context bld;
+   LLVMValueRef width;
+   LLVMValueRef height;
+   LLVMValueRef stride;
+   LLVMValueRef data_ptr;
+   LLVMValueRef s;
+   LLVMValueRef t;
+   LLVMValueRef p;
+
+   /* Setup our build context */
+   memset(&bld, 0, sizeof bld);
+   bld.builder = builder;
+   bld.static_state = static_state;
+   bld.dynamic_state = dynamic_state;
+   bld.format_desc = util_format_description(static_state->format);
+   bld.coord_type = type;
+   bld.int_coord_type = lp_int_type(type);
+   bld.texel_type = type;
+   lp_build_context_init(&bld.coord_bld, builder, bld.coord_type);
+   lp_build_context_init(&bld.int_coord_bld, builder, bld.int_coord_type);
+   lp_build_context_init(&bld.texel_bld, builder, bld.texel_type);
+
+   /* Get the dynamic state */
+   width = dynamic_state->width(dynamic_state, builder, unit);
+   height = dynamic_state->height(dynamic_state, builder, unit);
+   stride = dynamic_state->stride(dynamic_state, builder, unit);
+   data_ptr = dynamic_state->data_ptr(dynamic_state, builder, unit);
+
+   s = coords[0];
+   t = coords[1];
+   p = coords[2];
+
+   width = lp_build_broadcast_scalar(&bld.int_coord_bld, width);
+   height = lp_build_broadcast_scalar(&bld.int_coord_bld, height);
+   stride = lp_build_broadcast_scalar(&bld.int_coord_bld, stride);
+
+   if(static_state->target == PIPE_TEXTURE_1D)
+      t = bld.coord_bld.zero;
+
+   if(static_state->normalized_coords) {
+      LLVMTypeRef coord_vec_type = lp_build_vec_type(bld.coord_type);
+      LLVMValueRef fp_width = LLVMBuildSIToFP(builder, width, coord_vec_type, "");
+      LLVMValueRef fp_height = LLVMBuildSIToFP(builder, height, coord_vec_type, "");
+      s = lp_build_mul(&bld.coord_bld, s, fp_width);
+      t = lp_build_mul(&bld.coord_bld, t, fp_height);
+   }
+
+   switch (static_state->min_img_filter) {
+   case PIPE_TEX_FILTER_NEAREST:
+      lp_build_sample_2d_nearest_soa(&bld, s, t, width, height, stride, data_ptr, texel);
+      break;
+   case PIPE_TEX_FILTER_LINEAR:
+   case PIPE_TEX_FILTER_ANISO:
+      lp_build_sample_2d_linear_soa(&bld, s, t, width, height, stride, data_ptr, texel);
+      break;
+   default:
+      assert(0);
+   }
+
+   /* FIXME: respect static_state->min_mip_filter */;
+   /* FIXME: respect static_state->mag_img_filter */;
+   /* FIXME: respect static_state->prefilter */;
+
+   lp_build_sample_compare(&bld, p, texel);
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_struct.c b/src/gallium/drivers/llvmpipe/lp_bld_struct.c
index 14d2b10df9..3998ac374f 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_struct.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_struct.c
@@ -42,17 +42,30 @@
 
 
 LLVMValueRef
+lp_build_struct_get_ptr(LLVMBuilderRef builder,
+                        LLVMValueRef ptr,
+                        unsigned member,
+                        const char *name)
+{
+   LLVMValueRef indices[2];
+   LLVMValueRef member_ptr;
+   indices[0] = LLVMConstInt(LLVMInt32Type(), 0, 0);
+   indices[1] = LLVMConstInt(LLVMInt32Type(), member, 0);
+   member_ptr = LLVMBuildGEP(builder, ptr, indices, Elements(indices), "");
+   lp_build_name(member_ptr, "%s.%s_ptr", LLVMGetValueName(ptr), name);
+   return member_ptr;
+}
+
+
+LLVMValueRef
 lp_build_struct_get(LLVMBuilderRef builder,
                     LLVMValueRef ptr,
                     unsigned member,
                     const char *name)
 {
-   LLVMValueRef indices[2];
    LLVMValueRef member_ptr;
    LLVMValueRef res;
-   indices[0] = LLVMConstInt(LLVMInt32Type(), 0, 0);
-   indices[1] = LLVMConstInt(LLVMInt32Type(), member, 0);
-   member_ptr = LLVMBuildGEP(builder, ptr, indices, Elements(indices), "");
+   member_ptr = lp_build_struct_get_ptr(builder, ptr, member, name);
    res = LLVMBuildLoad(builder, member_ptr, "");
    lp_build_name(res, "%s.%s", LLVMGetValueName(ptr), name);
    return res;
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_struct.h b/src/gallium/drivers/llvmpipe/lp_bld_struct.h
index cbefdc9f81..740392f561 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_struct.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_struct.h
@@ -53,6 +53,18 @@
              offsetof(_ctype, _cmember))
 
 
+/**
+ * Get value pointer to a structure member.
+ */
+LLVMValueRef
+lp_build_struct_get_ptr(LLVMBuilderRef builder,
+                        LLVMValueRef ptr,
+                        unsigned member,
+                        const char *name);
+
+/**
+ * Get the value of a structure member.
+ */
 LLVMValueRef
 lp_build_struct_get(LLVMBuilderRef builder,
                     LLVMValueRef ptr,
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_swizzle.c b/src/gallium/drivers/llvmpipe/lp_bld_swizzle.c
index ac7eed9379..64e81f7b1f 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_swizzle.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_swizzle.c
@@ -64,7 +64,7 @@ LLVMValueRef
 lp_build_broadcast_scalar(struct lp_build_context *bld,
                           LLVMValueRef scalar)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    LLVMValueRef res;
    unsigned i;
 
@@ -83,7 +83,7 @@ lp_build_broadcast_aos(struct lp_build_context *bld,
                        LLVMValueRef a,
                        unsigned channel)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    const unsigned n = type.length;
    unsigned i, j;
 
@@ -115,7 +115,7 @@ lp_build_broadcast_aos(struct lp_build_context *bld,
        *   YY00 YY00 .... YY00
        *   YYYY YYYY .... YYYY  <= output
        */
-      union lp_type type4 = type;
+      struct lp_type type4 = type;
       const char shifts[4][2] = {
          { 1,  2},
          {-1,  2},
@@ -161,7 +161,7 @@ lp_build_broadcast_aos(struct lp_build_context *bld,
 LLVMValueRef
 lp_build_swizzle1_aos(struct lp_build_context *bld,
                       LLVMValueRef a,
-                      unsigned char swizzle[4])
+                      const unsigned char swizzle[4])
 {
    const unsigned n = bld->type.length;
    unsigned i, j;
@@ -192,7 +192,7 @@ LLVMValueRef
 lp_build_swizzle2_aos(struct lp_build_context *bld,
                       LLVMValueRef a,
                       LLVMValueRef b,
-                      unsigned char swizzle[4])
+                      const unsigned char swizzle[4])
 {
    const unsigned n = bld->type.length;
    unsigned i, j;
@@ -201,11 +201,12 @@ lp_build_swizzle2_aos(struct lp_build_context *bld,
       return lp_build_swizzle1_aos(bld, a, swizzle);
 
    if(a == b) {
-      swizzle[0] %= 4;
-      swizzle[1] %= 4;
-      swizzle[2] %= 4;
-      swizzle[3] %= 4;
-      return lp_build_swizzle1_aos(bld, a, swizzle);
+      unsigned char swizzle1[4];
+      swizzle1[0] = swizzle[0] % 4;
+      swizzle1[1] = swizzle[1] % 4;
+      swizzle1[2] = swizzle[2] % 4;
+      swizzle1[3] = swizzle[3] % 4;
+      return lp_build_swizzle1_aos(bld, a, swizzle1);
    }
 
    if(swizzle[0] % 4 == 0 &&
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_swizzle.h b/src/gallium/drivers/llvmpipe/lp_bld_swizzle.h
index d7dd6a8a60..1f6da80448 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_swizzle.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_swizzle.h
@@ -40,7 +40,7 @@
 #include <llvm-c/Core.h>  
 
 
-union lp_type type;
+struct lp_type type;
 struct lp_build_context;
 
 
@@ -73,7 +73,7 @@ lp_build_broadcast_aos(struct lp_build_context *bld,
 LLVMValueRef
 lp_build_swizzle1_aos(struct lp_build_context *bld,
                       LLVMValueRef a,
-                      unsigned char swizzle[4]);
+                      const unsigned char swizzle[4]);
 
 
 /**
@@ -85,7 +85,7 @@ LLVMValueRef
 lp_build_swizzle2_aos(struct lp_build_context *bld,
                       LLVMValueRef a,
                       LLVMValueRef b,
-                      unsigned char swizzle[4]);
+                      const unsigned char swizzle[4]);
 
 
 #endif /* !LP_BLD_SWIZZLE_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_tgsi.h b/src/gallium/drivers/llvmpipe/lp_bld_tgsi.h
index 912db24aec..eddb7a83fa 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_tgsi.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_tgsi.h
@@ -39,31 +39,46 @@
 
 
 struct tgsi_token;
-union lp_type;
+struct lp_type;
 struct lp_build_context;
 struct lp_build_mask_context;
 
 
-typedef void
-(*lp_emit_fetch_texel_soa_callback)( LLVMBuilderRef builder,
-                                void *context,
-                                unsigned unit,
-                                unsigned num_coords,
-                                const LLVMValueRef *coords,
-                                LLVMValueRef lodbias,
-                                LLVMValueRef *texel);
+/**
+ * Sampler code generation interface.
+ *
+ * Although texture sampling is a requirement for TGSI translation, it is
+ * a very different problem with several different approaches to it. This
+ * structure establishes an interface for texture sampling code generation, so
+ * that we can easily use different texture sampling strategies.
+ */
+struct lp_build_sampler_soa
+{
+   void
+   (*destroy)( struct lp_build_sampler_soa *sampler );
+
+   void
+   (*emit_fetch_texel)( struct lp_build_sampler_soa *sampler,
+                        LLVMBuilderRef builder,
+                        struct lp_type type,
+                        unsigned unit,
+                        unsigned num_coords,
+                        const LLVMValueRef *coords,
+                        LLVMValueRef lodbias,
+                        LLVMValueRef *texel);
+};
+
 
 void
 lp_build_tgsi_soa(LLVMBuilderRef builder,
                   const struct tgsi_token *tokens,
-                  union lp_type type,
+                  struct lp_type type,
                   struct lp_build_mask_context *mask,
                   LLVMValueRef consts_ptr,
                   const LLVMValueRef *pos,
                   const LLVMValueRef (*inputs)[4],
                   LLVMValueRef (*outputs)[4],
-                  lp_emit_fetch_texel_soa_callback emit_fetch_texel,
-                  void *emit_fetch_texel_context);
+                  struct lp_build_sampler_soa *sampler);
 
 
 #endif /* LP_BLD_TGSI_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c b/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c
index d4d18febec..adc81569ed 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c
@@ -78,6 +78,11 @@
 #define CHAN_Z 2
 #define CHAN_W 3
 
+#define QUAD_TOP_LEFT     0
+#define QUAD_TOP_RIGHT    1
+#define QUAD_BOTTOM_LEFT  2
+#define QUAD_BOTTOM_RIGHT 3
+
 
 struct lp_build_tgsi_soa_context
 {
@@ -88,8 +93,7 @@ struct lp_build_tgsi_soa_context
    const LLVMValueRef (*inputs)[NUM_CHANNELS];
    LLVMValueRef (*outputs)[NUM_CHANNELS];
 
-   lp_emit_fetch_texel_soa_callback emit_fetch_texel;
-   void *emit_fetch_texel_context;
+   struct lp_build_sampler_soa *sampler;
 
    LLVMValueRef immediates[LP_MAX_IMMEDIATES][NUM_CHANNELS];
    LLVMValueRef temps[LP_MAX_TEMPS][NUM_CHANNELS];
@@ -98,6 +102,51 @@ struct lp_build_tgsi_soa_context
 };
 
 
+static const unsigned char
+swizzle_left[4] = {
+   QUAD_TOP_LEFT,     QUAD_TOP_LEFT,
+   QUAD_BOTTOM_LEFT,  QUAD_BOTTOM_LEFT
+};
+
+static const unsigned char
+swizzle_right[4] = {
+   QUAD_TOP_RIGHT,    QUAD_TOP_RIGHT,
+   QUAD_BOTTOM_RIGHT, QUAD_BOTTOM_RIGHT
+};
+
+static const unsigned char
+swizzle_top[4] = {
+   QUAD_TOP_LEFT,     QUAD_TOP_RIGHT,
+   QUAD_TOP_LEFT,     QUAD_TOP_RIGHT
+};
+
+static const unsigned char
+swizzle_bottom[4] = {
+   QUAD_BOTTOM_LEFT,  QUAD_BOTTOM_RIGHT,
+   QUAD_BOTTOM_LEFT,  QUAD_BOTTOM_RIGHT
+};
+
+
+static LLVMValueRef
+emit_ddx(struct lp_build_tgsi_soa_context *bld,
+         LLVMValueRef src)
+{
+   LLVMValueRef src_left  = lp_build_swizzle1_aos(&bld->base, src, swizzle_left);
+   LLVMValueRef src_right = lp_build_swizzle1_aos(&bld->base, src, swizzle_right);
+   return lp_build_sub(&bld->base, src_right, src_left);
+}
+
+
+static LLVMValueRef
+emit_ddy(struct lp_build_tgsi_soa_context *bld,
+         LLVMValueRef src)
+{
+   LLVMValueRef src_top    = lp_build_swizzle1_aos(&bld->base, src, swizzle_top);
+   LLVMValueRef src_bottom = lp_build_swizzle1_aos(&bld->base, src, swizzle_bottom);
+   return lp_build_sub(&bld->base, src_top, src_bottom);
+}
+
+
 /**
  * Register fetch.
  */
@@ -168,6 +217,7 @@ emit_fetch(
       break;
 
    case TGSI_UTIL_SIGN_SET:
+      /* TODO: Use bitwese OR for floating point */
       res = lp_build_abs( &bld->base, res );
       res = LLVMBuildNeg( bld->base.builder, res, "" );
       break;
@@ -185,6 +235,36 @@ emit_fetch(
 
 
 /**
+ * Register fetch with derivatives.
+ */
+static void
+emit_fetch_deriv(
+   struct lp_build_tgsi_soa_context *bld,
+   const struct tgsi_full_instruction *inst,
+   unsigned index,
+   const unsigned chan_index,
+   LLVMValueRef *res,
+   LLVMValueRef *ddx,
+   LLVMValueRef *ddy)
+{
+   LLVMValueRef src;
+
+   src = emit_fetch(bld, inst, index, chan_index);
+
+   if(res)
+      *res = src;
+
+   /* TODO: use interpolation coeffs for inputs */
+
+   if(ddx)
+      *ddx = emit_ddx(bld, src);
+
+   if(ddy)
+      *ddy = emit_ddy(bld, src);
+}
+
+
+/**
  * Register store.
  */
 static void
@@ -239,17 +319,18 @@ emit_store(
  * High-level instruction translators.
  */
 
+
 static void
 emit_tex( struct lp_build_tgsi_soa_context *bld,
           const struct tgsi_full_instruction *inst,
           boolean apply_lodbias,
-          boolean projected)
+          boolean projected,
+          LLVMValueRef *texel)
 {
    const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
    LLVMValueRef lodbias;
    LLVMValueRef oow;
    LLVMValueRef coords[3];
-   LLVMValueRef texel[4];
    unsigned num_coords;
    unsigned i;
 
@@ -289,12 +370,11 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
          coords[i] = lp_build_mul(&bld->base, coords[i], oow);
    }
 
-   bld->emit_fetch_texel(bld->base.builder, bld->emit_fetch_texel_context,
-                         unit, num_coords, coords, lodbias, texel);
-
-   FOR_EACH_DST0_ENABLED_CHANNEL( inst, i ) {
-      emit_store( bld, inst, 0, i, texel[i] );
-   }
+   bld->sampler->emit_fetch_texel(bld->sampler,
+                                  bld->base.builder,
+                                  bld->base.type,
+                                  unit, num_coords, coords, lodbias,
+                                  texel);
 }
 
 
@@ -347,14 +427,6 @@ emit_kil(
 }
 
 
-static void
-emit_kilp(
-   struct lp_build_tgsi_soa_context *bld )
-{
-   /* XXX todo / fix me */
-}
-
-
 /**
  * Check if inst src/dest regs use indirect addressing into temporary
  * register file.
@@ -382,25 +454,35 @@ indirect_temp_reference(const struct tgsi_full_instruction *inst)
 static int
 emit_instruction(
    struct lp_build_tgsi_soa_context *bld,
-   struct tgsi_full_instruction *inst )
+   const struct tgsi_full_instruction *inst,
+   const struct tgsi_opcode_info *info)
 {
    unsigned chan_index;
    LLVMValueRef src0, src1, src2;
    LLVMValueRef tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-   LLVMValueRef dst0;
+   LLVMValueRef res;
+   LLVMValueRef dst0[NUM_CHANNELS];
 
    /* we can't handle indirect addressing into temp register file yet */
    if (indirect_temp_reference(inst))
       return FALSE;
 
+   assert(info->num_dst <= 1);
+   if(info->num_dst) {
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = bld->base.undef;
+      }
+   }
+
    switch (inst->Instruction.Opcode) {
 #if 0
    case TGSI_OPCODE_ARL:
+      /* FIXME */
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
          tmp0 = emit_fetch( bld, inst, 0, chan_index );
          emit_flr(bld, 0, 0);
          emit_f2it( bld, 0 );
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = tmp0;
       }
       break;
 #endif
@@ -408,19 +490,17 @@ emit_instruction(
    case TGSI_OPCODE_MOV:
    case TGSI_OPCODE_SWZ:
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         tmp0 = emit_fetch( bld, inst, 0, chan_index );
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = emit_fetch( bld, inst, 0, chan_index );
       }
       break;
 
    case TGSI_OPCODE_LIT:
       if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) ) {
-         emit_store( bld, inst, 0, CHAN_X, bld->base.one);
+         dst0[CHAN_X] = bld->base.one;
       }
       if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) ) {
          src0 = emit_fetch( bld, inst, 0, CHAN_X );
-         dst0 = lp_build_max( &bld->base, src0, bld->base.zero);
-         emit_store( bld, inst, 0, CHAN_Y, dst0);
+         dst0[CHAN_Y] = lp_build_max( &bld->base, src0, bld->base.zero);
       }
       if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) ) {
          /* XMM[1] = SrcReg[0].yyyy */
@@ -432,20 +512,19 @@ emit_instruction(
          tmp1 = lp_build_pow( &bld->base, tmp1, tmp2);
          tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
          tmp2 = lp_build_cmp(&bld->base, PIPE_FUNC_GREATER, tmp0, bld->base.zero);
-         dst0 = lp_build_select(&bld->base, tmp2, tmp1, bld->base.zero);
-         emit_store( bld, inst, 0, CHAN_Z, dst0);
+         dst0[CHAN_Z] = lp_build_select(&bld->base, tmp2, tmp1, bld->base.zero);
       }
       if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_W ) ) {
-         emit_store( bld, inst, 0, CHAN_W, bld->base.one);
+         dst0[CHAN_W] = bld->base.one;
       }
       break;
 
    case TGSI_OPCODE_RCP:
    /* TGSI_OPCODE_RECIP */
       src0 = emit_fetch( bld, inst, 0, CHAN_X );
-      dst0 = lp_build_rcp(&bld->base, src0);
+      res = lp_build_rcp(&bld->base, src0);
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         emit_store( bld, inst, 0, chan_index, dst0 );
+         dst0[chan_index] = res;
       }
       break;
 
@@ -453,9 +532,9 @@ emit_instruction(
    /* TGSI_OPCODE_RECIPSQRT */
       src0 = emit_fetch( bld, inst, 0, CHAN_X );
       src0 = lp_build_abs(&bld->base, src0);
-      dst0 = lp_build_rsqrt(&bld->base, src0);
+      res = lp_build_rsqrt(&bld->base, src0);
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         emit_store( bld, inst, 0, chan_index, dst0 );
+         dst0[chan_index] = res;
       }
       break;
 
@@ -479,16 +558,15 @@ emit_instruction(
          lp_build_exp2_approx(&bld->base, src0, p_exp2_int_part, p_frac_part, p_exp2);
 
          if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ))
-            emit_store( bld, inst, 0, CHAN_X, tmp0);
+            dst0[CHAN_X] = tmp0;
          if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ))
-            emit_store( bld, inst, 0, CHAN_Y, tmp1);
+            dst0[CHAN_Y] = tmp1;
          if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ))
-            emit_store( bld, inst, 0, CHAN_Z, tmp2);
+            dst0[CHAN_Z] = tmp2;
       }
       /* dst.w = 1.0 */
       if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_W )) {
-         tmp0 = bld->base.one;
-         emit_store( bld, inst, 0, CHAN_W, tmp0);
+         dst0[CHAN_W] = bld->base.one;
       }
       break;
 
@@ -514,20 +592,18 @@ emit_instruction(
 
          /* dst.x = floor(lg2(abs(src.x))) */
          if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ))
-            emit_store( bld, inst, 0, CHAN_X, tmp0);
+            dst0[CHAN_X] = tmp0;
          /* dst.y = abs(src)/ex2(floor(lg2(abs(src.x)))) */
          if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y )) {
-            tmp1 = lp_build_div( &bld->base, src0, tmp1);
-            emit_store( bld, inst, 0, CHAN_Y, tmp1);
+            dst0[CHAN_Y] = lp_build_div( &bld->base, src0, tmp1);
          }
          /* dst.z = lg2(abs(src.x)) */
          if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ))
-            emit_store( bld, inst, 0, CHAN_Z, tmp2);
+            dst0[CHAN_Z] = tmp2;
       }
       /* dst.w = 1.0 */
       if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_W )) {
-         tmp0 = bld->base.one;
-         emit_store( bld, inst, 0, CHAN_W, tmp0);
+         dst0[CHAN_W] = bld->base.one;
       }
       break;
 
@@ -535,8 +611,7 @@ emit_instruction(
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
          src0 = emit_fetch( bld, inst, 0, chan_index );
          src1 = emit_fetch( bld, inst, 1, chan_index );
-         dst0 = lp_build_mul(&bld->base, src0, src1);
-         emit_store( bld, inst, 0, chan_index, dst0);
+         dst0[chan_index] = lp_build_mul(&bld->base, src0, src1);
       }
       break;
 
@@ -544,8 +619,7 @@ emit_instruction(
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
          src0 = emit_fetch( bld, inst, 0, chan_index );
          src1 = emit_fetch( bld, inst, 1, chan_index );
-         dst0 = lp_build_add(&bld->base, src0, src1);
-         emit_store( bld, inst, 0, chan_index, dst0);
+         dst0[chan_index] = lp_build_add(&bld->base, src0, src1);
       }
       break;
 
@@ -563,7 +637,7 @@ emit_instruction(
       tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
       tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = tmp0;
       }
       break;
 
@@ -585,28 +659,24 @@ emit_instruction(
       tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
       tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = tmp0;
       }
       break;
 
    case TGSI_OPCODE_DST:
       IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) {
-         tmp0 = bld->base.one;
-         emit_store( bld, inst, 0, CHAN_X, tmp0);
+         dst0[CHAN_X] = bld->base.one;
       }
       IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) {
          tmp0 = emit_fetch( bld, inst, 0, CHAN_Y );
          tmp1 = emit_fetch( bld, inst, 1, CHAN_Y );
-         tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);
-         emit_store( bld, inst, 0, CHAN_Y, tmp0);
+         dst0[CHAN_Y] = lp_build_mul( &bld->base, tmp0, tmp1);
       }
       IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) {
-         tmp0 = emit_fetch( bld, inst, 0, CHAN_Z );
-         emit_store( bld, inst, 0, CHAN_Z, tmp0);
+         dst0[CHAN_Z] = emit_fetch( bld, inst, 0, CHAN_Z );
       }
       IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_W ) {
-         tmp0 = emit_fetch( bld, inst, 1, CHAN_W );
-         emit_store( bld, inst, 0, CHAN_W, tmp0);
+         dst0[CHAN_W] = emit_fetch( bld, inst, 1, CHAN_W );
       }
       break;
 
@@ -614,8 +684,7 @@ emit_instruction(
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
          src0 = emit_fetch( bld, inst, 0, chan_index );
          src1 = emit_fetch( bld, inst, 1, chan_index );
-         dst0 = lp_build_min( &bld->base, src0, src1 );
-         emit_store( bld, inst, 0, chan_index, dst0);
+         dst0[chan_index] = lp_build_min( &bld->base, src0, src1 );
       }
       break;
 
@@ -623,8 +692,7 @@ emit_instruction(
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
          src0 = emit_fetch( bld, inst, 0, chan_index );
          src1 = emit_fetch( bld, inst, 1, chan_index );
-         dst0 = lp_build_max( &bld->base, src0, src1 );
-         emit_store( bld, inst, 0, chan_index, dst0);
+         dst0[chan_index] = lp_build_max( &bld->base, src0, src1 );
       }
       break;
 
@@ -634,8 +702,7 @@ emit_instruction(
          src0 = emit_fetch( bld, inst, 0, chan_index );
          src1 = emit_fetch( bld, inst, 1, chan_index );
          tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_LESS, src0, src1 );
-         dst0 = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
-         emit_store( bld, inst, 0, chan_index, dst0);
+         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
       }
       break;
 
@@ -645,8 +712,7 @@ emit_instruction(
          src0 = emit_fetch( bld, inst, 0, chan_index );
          src1 = emit_fetch( bld, inst, 1, chan_index );
          tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_GEQUAL, src0, src1 );
-         dst0 = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
-         emit_store( bld, inst, 0, chan_index, dst0);
+         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
       }
       break;
 
@@ -658,7 +724,7 @@ emit_instruction(
          tmp2 = emit_fetch( bld, inst, 2, chan_index );
          tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);
          tmp0 = lp_build_add( &bld->base, tmp0, tmp2);
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = tmp0;
       }
       break;
 
@@ -666,8 +732,7 @@ emit_instruction(
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
          tmp0 = emit_fetch( bld, inst, 0, chan_index );
          tmp1 = emit_fetch( bld, inst, 1, chan_index );
-         tmp0 = lp_build_sub( &bld->base, tmp0, tmp1);
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = lp_build_sub( &bld->base, tmp0, tmp1);
       }
       break;
 
@@ -678,13 +743,19 @@ emit_instruction(
          src2 = emit_fetch( bld, inst, 2, chan_index );
          tmp0 = lp_build_sub( &bld->base, src1, src2 );
          tmp0 = lp_build_mul( &bld->base, src0, tmp0 );
-         dst0 = lp_build_add( &bld->base, tmp0, src2 );
-         emit_store( bld, inst, 0, chan_index, dst0 );
+         dst0[chan_index] = lp_build_add( &bld->base, tmp0, src2 );
       }
       break;
 
    case TGSI_OPCODE_CND:
-      return 0;
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         src0 = emit_fetch( bld, inst, 0, chan_index );
+         src1 = emit_fetch( bld, inst, 1, chan_index );
+         src2 = emit_fetch( bld, inst, 2, chan_index );
+         tmp1 = lp_build_const_scalar(bld->base.type, 0.5);
+         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_GREATER, src2, tmp1);
+         dst0[chan_index] = lp_build_select( &bld->base, tmp0, src0, src1 );
+      }
       break;
 
    case TGSI_OPCODE_DP2A:
@@ -698,45 +769,49 @@ emit_instruction(
       tmp1 = emit_fetch( bld, inst, 2, CHAN_X );  /* xmm1 = src[2].x */
       tmp0 = lp_build_add( &bld->base, tmp0, tmp1);              /* xmm0 = xmm0 + xmm1 */
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         emit_store( bld, inst, 0, chan_index, tmp0);  /* dest[ch] = xmm0 */
+         dst0[chan_index] = tmp0;  /* dest[ch] = xmm0 */
       }
       break;
 
-#if 0
    case TGSI_OPCODE_FRC:
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         tmp0 = emit_fetch( bld, inst, 0, chan_index );
-         emit_frc( bld, 0, 0 );
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         src0 = emit_fetch( bld, inst, 0, chan_index );
+         tmp0 = lp_build_floor(&bld->base, src0);
+         tmp0 = lp_build_sub(&bld->base, tmp0, src0);
+         dst0[chan_index] = tmp0;
       }
       break;
 
    case TGSI_OPCODE_CLAMP:
-      return 0;
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         tmp0 = emit_fetch( bld, inst, 0, chan_index );
+         src1 = emit_fetch( bld, inst, 1, chan_index );
+         src2 = emit_fetch( bld, inst, 2, chan_index );
+         tmp0 = lp_build_max(&bld->base, tmp0, src1);
+         tmp0 = lp_build_min(&bld->base, tmp0, src2);
+         dst0[chan_index] = tmp0;
+      }
       break;
 
    case TGSI_OPCODE_FLR:
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
          tmp0 = emit_fetch( bld, inst, 0, chan_index );
-         emit_flr( bld, 0, 0 );
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = lp_build_floor(&bld->base, tmp0);
       }
       break;
 
    case TGSI_OPCODE_ROUND:
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
          tmp0 = emit_fetch( bld, inst, 0, chan_index );
-         emit_rnd( bld, 0, 0 );
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = lp_build_round(&bld->base, tmp0);
       }
       break;
-#endif
 
    case TGSI_OPCODE_EX2: {
       tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
       tmp0 = lp_build_exp2( &bld->base, tmp0);
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = tmp0;
       }
       break;
    }
@@ -745,16 +820,16 @@ emit_instruction(
       tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
       tmp0 = lp_build_log2( &bld->base, tmp0);
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = tmp0;
       }
       break;
 
    case TGSI_OPCODE_POW:
       src0 = emit_fetch( bld, inst, 0, CHAN_X );
       src1 = emit_fetch( bld, inst, 1, CHAN_X );
-      dst0 = lp_build_pow( &bld->base, src0, src1 );
+      res = lp_build_pow( &bld->base, src0, src1 );
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         emit_store( bld, inst, 0, chan_index, dst0 );
+         dst0[chan_index] = res;
       }
       break;
 
@@ -775,7 +850,7 @@ emit_instruction(
          tmp5 = tmp3;
          tmp5 = lp_build_mul( &bld->base, tmp5, tmp4);
          tmp2 = lp_build_sub( &bld->base, tmp2, tmp5);
-         emit_store( bld, inst, 0, CHAN_X, tmp2);
+         dst0[CHAN_X] = tmp2;
       }
       if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) ||
           IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) ) {
@@ -786,31 +861,30 @@ emit_instruction(
          tmp3 = lp_build_mul( &bld->base, tmp3, tmp2);
          tmp1 = lp_build_mul( &bld->base, tmp1, tmp5);
          tmp3 = lp_build_sub( &bld->base, tmp3, tmp1);
-         emit_store( bld, inst, 0, CHAN_Y, tmp3);
+         dst0[CHAN_Y] = tmp3;
       }
       IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) {
          tmp5 = lp_build_mul( &bld->base, tmp5, tmp4);
          tmp0 = lp_build_mul( &bld->base, tmp0, tmp2);
          tmp5 = lp_build_sub( &bld->base, tmp5, tmp0);
-         emit_store( bld, inst, 0, CHAN_Z, tmp5);
+         dst0[CHAN_Z] = tmp5;
       }
       IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_W ) {
-         tmp0 = bld->base.one;
-         emit_store( bld, inst, 0, CHAN_W, tmp0);
+         dst0[CHAN_W] = bld->base.one;
       }
       break;
 
    case TGSI_OPCODE_ABS:
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
          tmp0 = emit_fetch( bld, inst, 0, chan_index );
-         tmp0 = lp_build_abs( &bld->base, tmp0 ) ;
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = lp_build_abs( &bld->base, tmp0 );
       }
       break;
 
    case TGSI_OPCODE_RCC:
+      /* deprecated? */
+      assert(0);
       return 0;
-      break;
 
    case TGSI_OPCODE_DPH:
       tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
@@ -827,7 +901,7 @@ emit_instruction(
       tmp1 = emit_fetch( bld, inst, 1, CHAN_W );
       tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = tmp0;
       }
       break;
 
@@ -835,25 +909,27 @@ emit_instruction(
       tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
       tmp0 = lp_build_cos( &bld->base, tmp0 );
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = tmp0;
       }
       break;
 
    case TGSI_OPCODE_DDX:
-      return 0;
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         emit_fetch_deriv( bld, inst, 0, chan_index, NULL, &dst0[chan_index], NULL);
+      }
       break;
 
    case TGSI_OPCODE_DDY:
-      return 0;
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         emit_fetch_deriv( bld, inst, 0, chan_index, NULL, NULL, &dst0[chan_index]);
+      }
       break;
 
-#if 0
    case TGSI_OPCODE_KILP:
       /* predicated kill */
-      emit_kilp( bld );
-      return 0; /* XXX fix me */
+      /* FIXME */
+      return 0;
       break;
-#endif
 
    case TGSI_OPCODE_KIL:
       /* conditional kill */
@@ -885,13 +961,14 @@ emit_instruction(
          src0 = emit_fetch( bld, inst, 0, chan_index );
          src1 = emit_fetch( bld, inst, 1, chan_index );
          tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_EQUAL, src0, src1 );
-         dst0 = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
-         emit_store( bld, inst, 0, chan_index, dst0);
+         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
       }
       break;
 
    case TGSI_OPCODE_SFL:
-      return 0;
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = bld->base.zero;
+      }
       break;
 
    case TGSI_OPCODE_SGT:
@@ -899,8 +976,7 @@ emit_instruction(
          src0 = emit_fetch( bld, inst, 0, chan_index );
          src1 = emit_fetch( bld, inst, 1, chan_index );
          tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_GREATER, src0, src1 );
-         dst0 = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
-         emit_store( bld, inst, 0, chan_index, dst0);
+         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
       }
       break;
 
@@ -908,7 +984,7 @@ emit_instruction(
       tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
       tmp0 = lp_build_sin( &bld->base, tmp0 );
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = tmp0;
       }
       break;
 
@@ -917,8 +993,7 @@ emit_instruction(
          src0 = emit_fetch( bld, inst, 0, chan_index );
          src1 = emit_fetch( bld, inst, 1, chan_index );
          tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_LEQUAL, src0, src1 );
-         dst0 = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
-         emit_store( bld, inst, 0, chan_index, dst0);
+         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
       }
       break;
 
@@ -927,85 +1002,99 @@ emit_instruction(
          src0 = emit_fetch( bld, inst, 0, chan_index );
          src1 = emit_fetch( bld, inst, 1, chan_index );
          tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_NOTEQUAL, src0, src1 );
-         dst0 = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
-         emit_store( bld, inst, 0, chan_index, dst0);
+         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
       }
       break;
 
    case TGSI_OPCODE_STR:
-      return 0;
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = bld->base.one;
+      }
       break;
 
    case TGSI_OPCODE_TEX:
-      emit_tex( bld, inst, FALSE, FALSE );
+      emit_tex( bld, inst, FALSE, FALSE, dst0 );
       break;
 
    case TGSI_OPCODE_TXD:
+      /* FIXME */
       return 0;
       break;
 
    case TGSI_OPCODE_UP2H:
+      /* deprecated */
+      assert (0);
       return 0;
       break;
 
    case TGSI_OPCODE_UP2US:
+      /* deprecated */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_UP4B:
+      /* deprecated */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_UP4UB:
+      /* deprecated */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_X2D:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_ARA:
+      /* deprecated */
+      assert(0);
       return 0;
       break;
 
 #if 0
    case TGSI_OPCODE_ARR:
+      /* FIXME */
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
          tmp0 = emit_fetch( bld, inst, 0, chan_index );
          emit_rnd( bld, 0, 0 );
          emit_f2it( bld, 0 );
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = tmp0;
       }
       break;
 #endif
 
    case TGSI_OPCODE_BRA:
+      /* deprecated */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_CAL:
+      /* FIXME */
       return 0;
       break;
 
-#if 0
    case TGSI_OPCODE_RET:
-      emit_ret( bld );
+      /* FIXME */
+      return 0;
       break;
-#endif
 
    case TGSI_OPCODE_END:
       break;
 
-#if 0
    case TGSI_OPCODE_SSG:
    /* TGSI_OPCODE_SGN */
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
          tmp0 = emit_fetch( bld, inst, 0, chan_index );
-         emit_sgn( bld, 0, 0 );
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = lp_build_sgn( &bld->base, tmp0 );
       }
       break;
-#endif
 
    case TGSI_OPCODE_CMP:
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
@@ -1013,34 +1102,29 @@ emit_instruction(
          src1 = emit_fetch( bld, inst, 1, chan_index );
          src2 = emit_fetch( bld, inst, 2, chan_index );
          tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_LESS, src0, bld->base.zero );
-         dst0 = lp_build_select( &bld->base, tmp0, src1, src2);
-         emit_store( bld, inst, 0, chan_index, dst0);
+         dst0[chan_index] = lp_build_select( &bld->base, tmp0, src1, src2);
       }
       break;
 
    case TGSI_OPCODE_SCS:
       IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) {
          tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
-         tmp0 = lp_build_cos( &bld->base, tmp0 );
-         emit_store( bld, inst, 0, CHAN_X, tmp0);
+         dst0[CHAN_X] = lp_build_cos( &bld->base, tmp0 );
       }
       IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) {
          tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
-         tmp0 = lp_build_sin( &bld->base, tmp0 );
-         emit_store( bld, inst, 0, CHAN_Y, tmp0);
+         dst0[CHAN_Y] = lp_build_sin( &bld->base, tmp0 );
       }
       IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) {
-         tmp0 = bld->base.zero;
-         emit_store( bld, inst, 0, CHAN_Z, tmp0);
+         dst0[CHAN_Z] = bld->base.zero;
       }
       IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_W ) {
-         tmp0 = bld->base.one;
-         emit_store( bld, inst, 0, CHAN_W, tmp0);
+         dst0[CHAN_W] = bld->base.one;
       }
       break;
 
    case TGSI_OPCODE_TXB:
-      emit_tex( bld, inst, TRUE, FALSE );
+      emit_tex( bld, inst, TRUE, FALSE, dst0 );
       break;
 
    case TGSI_OPCODE_NRM:
@@ -1099,38 +1183,35 @@ emit_instruction(
 
             /* dst.x = xmm1 * src.x */
             if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_X)) {
-               tmp4 = lp_build_mul( &bld->base, tmp4, tmp1);
-               emit_store(bld, inst, 0, CHAN_X, tmp4);
+               dst0[CHAN_X] = lp_build_mul( &bld->base, tmp4, tmp1);
             }
 
             /* dst.y = xmm1 * src.y */
             if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_Y)) {
-               tmp5 = lp_build_mul( &bld->base, tmp5, tmp1);
-               emit_store(bld, inst, 0, CHAN_Y, tmp5);
+               dst0[CHAN_Y] = lp_build_mul( &bld->base, tmp5, tmp1);
             }
 
             /* dst.z = xmm1 * src.z */
             if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_Z)) {
-               tmp6 = lp_build_mul( &bld->base, tmp6, tmp1);
-               emit_store(bld, inst, 0, CHAN_Z, tmp6);
+               dst0[CHAN_Z] = lp_build_mul( &bld->base, tmp6, tmp1);
             }
 
             /* dst.w = xmm1 * src.w */
             if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_X) && dims == 4) {
-               tmp7 = lp_build_mul( &bld->base, tmp7, tmp1);
-               emit_store(bld, inst, 0, CHAN_W, tmp7);
+               dst0[CHAN_W] = lp_build_mul( &bld->base, tmp7, tmp1);
             }
          }
 
-         /* dst0.w = 1.0 */
+         /* dst.w = 1.0 */
          if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_W) && dims == 3) {
-            tmp0 = bld->base.one;
-            emit_store(bld, inst, 0, CHAN_W, tmp0);
+            dst0[CHAN_W] = bld->base.one;
          }
       }
       break;
 
    case TGSI_OPCODE_DIV:
+      /* deprecated */
+      assert( 0 );
       return 0;
       break;
 
@@ -1143,118 +1224,157 @@ emit_instruction(
       tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);              /* xmm1 = xmm1 * xmm2 */
       tmp0 = lp_build_add( &bld->base, tmp0, tmp1);              /* xmm0 = xmm0 + xmm1 */
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         emit_store( bld, inst, 0, chan_index, tmp0);  /* dest[ch] = xmm0 */
+         dst0[chan_index] = tmp0;  /* dest[ch] = xmm0 */
       }
       break;
 
    case TGSI_OPCODE_TXL:
-      emit_tex( bld, inst, TRUE, FALSE );
+      emit_tex( bld, inst, TRUE, FALSE, dst0 );
       break;
 
    case TGSI_OPCODE_TXP:
-      emit_tex( bld, inst, FALSE, TRUE );
+      emit_tex( bld, inst, FALSE, TRUE, dst0 );
       break;
       
    case TGSI_OPCODE_BRK:
+      /* FIXME */
       return 0;
       break;
 
    case TGSI_OPCODE_IF:
+      /* FIXME */
       return 0;
       break;
 
    case TGSI_OPCODE_BGNFOR:
+      /* deprecated */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_REP:
+      /* deprecated */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_ELSE:
+      /* FIXME */
       return 0;
       break;
 
    case TGSI_OPCODE_ENDIF:
+      /* FIXME */
       return 0;
       break;
 
    case TGSI_OPCODE_ENDFOR:
+      /* deprecated */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_ENDREP:
+      /* deprecated */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_PUSHA:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_POPA:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_CEIL:
-      return 0;
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         tmp0 = emit_fetch( bld, inst, 0, chan_index );
+         dst0[chan_index] = lp_build_ceil(&bld->base, tmp0);
+      }
       break;
 
    case TGSI_OPCODE_I2F:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_NOT:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
-#if 0
    case TGSI_OPCODE_TRUNC:
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
          tmp0 = emit_fetch( bld, inst, 0, chan_index );
-         emit_f2it( bld, 0 );
-         emit_i2f( bld, 0 );
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = lp_build_trunc(&bld->base, tmp0);
       }
       break;
-#endif
 
    case TGSI_OPCODE_SHL:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_SHR:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_AND:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_OR:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_MOD:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_XOR:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_SAD:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_TXF:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_TXQ:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_CONT:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
@@ -1266,10 +1386,28 @@ emit_instruction(
       return 0;
       break;
 
+   case TGSI_OPCODE_NOISE1:
+   case TGSI_OPCODE_NOISE2:
+   case TGSI_OPCODE_NOISE3:
+   case TGSI_OPCODE_NOISE4:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = bld->base.zero;
+      }
+      break;
+
+   case TGSI_OPCODE_NOP:
+      break;
+
    default:
       return 0;
    }
    
+   if(info->num_dst) {
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         emit_store( bld, inst, 0, chan_index, dst0[chan_index]);
+      }
+   }
+
    return 1;
 }
 
@@ -1277,14 +1415,13 @@ emit_instruction(
 void
 lp_build_tgsi_soa(LLVMBuilderRef builder,
                   const struct tgsi_token *tokens,
-                  union lp_type type,
+                  struct lp_type type,
                   struct lp_build_mask_context *mask,
                   LLVMValueRef consts_ptr,
                   const LLVMValueRef *pos,
                   const LLVMValueRef (*inputs)[NUM_CHANNELS],
                   LLVMValueRef (*outputs)[NUM_CHANNELS],
-                  lp_emit_fetch_texel_soa_callback emit_fetch_texel,
-                  void *emit_fetch_texel_context)
+                  struct lp_build_sampler_soa *sampler)
 {
    struct lp_build_tgsi_soa_context bld;
    struct tgsi_parse_context parse;
@@ -1299,8 +1436,7 @@ lp_build_tgsi_soa(LLVMBuilderRef builder,
    bld.inputs = inputs;
    bld.outputs = outputs;
    bld.consts_ptr = consts_ptr;
-   bld.emit_fetch_texel = emit_fetch_texel;
-   bld.emit_fetch_texel_context = emit_fetch_texel_context;
+   bld.sampler = sampler;
 
    tgsi_parse_init( &parse, tokens );
 
@@ -1309,16 +1445,18 @@ lp_build_tgsi_soa(LLVMBuilderRef builder,
 
       switch( parse.FullToken.Token.Type ) {
       case TGSI_TOKEN_TYPE_DECLARATION:
-         /* Input already interpolated */
+         /* Inputs already interpolated */
          break;
 
       case TGSI_TOKEN_TYPE_INSTRUCTION:
-         if (!emit_instruction( &bld, &parse.FullToken.FullInstruction )) {
+         {
             unsigned opcode = parse.FullToken.FullInstruction.Instruction.Opcode;
             const struct tgsi_opcode_info *info = tgsi_get_opcode_info(opcode);
-	    _debug_printf("warning: failed to translate tgsi opcode %s to LLVM\n",
-	                  info ? info->mnemonic : "<invalid>");
-	 }
+            if (!emit_instruction( &bld, &parse.FullToken.FullInstruction, info ))
+               _debug_printf("warning: failed to translate tgsi opcode %s to LLVM\n",
+                             info ? info->mnemonic : "<invalid>");
+         }
+
          break;
 
       case TGSI_TOKEN_TYPE_IMMEDIATE:
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_type.c b/src/gallium/drivers/llvmpipe/lp_bld_type.c
index 8e0026fd97..606243d6c5 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_type.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_type.c
@@ -33,7 +33,7 @@
 
 
 LLVMTypeRef
-lp_build_elem_type(union lp_type type)
+lp_build_elem_type(struct lp_type type)
 {
    if (type.floating) {
       switch(type.width) {
@@ -55,7 +55,7 @@ lp_build_elem_type(union lp_type type)
 
 
 LLVMTypeRef
-lp_build_vec_type(union lp_type type)
+lp_build_vec_type(struct lp_type type)
 {
    LLVMTypeRef elem_type = lp_build_elem_type(type);
    return LLVMVectorType(elem_type, type.length);
@@ -69,7 +69,7 @@ lp_build_vec_type(union lp_type type)
  * type and check for identity.
  */
 boolean
-lp_check_elem_type(union lp_type type, LLVMTypeRef elem_type) 
+lp_check_elem_type(struct lp_type type, LLVMTypeRef elem_type) 
 {
    LLVMTypeKind elem_kind;
 
@@ -107,7 +107,7 @@ lp_check_elem_type(union lp_type type, LLVMTypeRef elem_type)
 
 
 boolean
-lp_check_vec_type(union lp_type type, LLVMTypeRef vec_type) 
+lp_check_vec_type(struct lp_type type, LLVMTypeRef vec_type) 
 {
    LLVMTypeRef elem_type;
 
@@ -128,7 +128,7 @@ lp_check_vec_type(union lp_type type, LLVMTypeRef vec_type)
 
 
 boolean
-lp_check_value(union lp_type type, LLVMValueRef val) 
+lp_check_value(struct lp_type type, LLVMValueRef val) 
 {
    LLVMTypeRef vec_type;
 
@@ -143,24 +143,36 @@ lp_check_value(union lp_type type, LLVMValueRef val)
 
 
 LLVMTypeRef
-lp_build_int_elem_type(union lp_type type)
+lp_build_int_elem_type(struct lp_type type)
 {
    return LLVMIntType(type.width);
 }
 
 
 LLVMTypeRef
-lp_build_int_vec_type(union lp_type type)
+lp_build_int_vec_type(struct lp_type type)
 {
    LLVMTypeRef elem_type = lp_build_int_elem_type(type);
    return LLVMVectorType(elem_type, type.length);
 }
 
 
+struct lp_type
+lp_int_type(struct lp_type type)
+{
+   struct lp_type int_type;
+
+   memset(&int_type, 0, sizeof int_type);
+   int_type.width = type.width;
+   int_type.length = type.length;
+   return int_type;
+}
+
+
 void
 lp_build_context_init(struct lp_build_context *bld,
                       LLVMBuilderRef builder,
-                      union lp_type type)
+                      struct lp_type type)
 {
    bld->builder = builder;
    bld->type = type;
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_type.h b/src/gallium/drivers/llvmpipe/lp_bld_type.h
index 3ce566be64..ee5ca3483c 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_type.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_type.h
@@ -56,58 +56,55 @@
  * on the types used for intermediate computations, such as signed vs unsigned,
  * normalized values, or fixed point.
  */
-union lp_type {
-   struct {
-      /** 
-       * Floating-point. Cannot be used with fixed. Integer numbers are
-       * represented by this zero.
-       */
-      unsigned floating:1;
-
-      /** 
-       * Fixed-point. Cannot be used with floating. Integer numbers are
-       * represented by this zero.
-       */
-      unsigned fixed:1;
-      
-      /** 
-       * Whether it can represent negative values or not.
-       *
-       * If this is not set for floating point, it means that all values are
-       * assumed to be positive.
-       */
-      unsigned sign:1;
-
-      /**
-       * Whether values are normalized to fit [0, 1] interval, or [-1, 1]
-       * interval for signed types.
-       *
-       * For integer types it means the representable integer range should be
-       * interpreted as the interval above.
-       *
-       * For floating and fixed point formats it means the values should be
-       * clamped to the interval above.
-       */
-      unsigned norm:1;
-
-      /**
-       * Element width.
-       *
-       * For fixed point values, the fixed point is assumed to be at half the
-       * width.
-       */
-      unsigned width:14;
-
-      /** 
-       * Vector length.
-       *
-       * width*length should be a power of two greater or equal to eight.
-       *
-       * @sa LP_MAX_VECTOR_LENGTH
-       */
-      unsigned length:14;
-   };
-   uint32_t value;
+struct lp_type {
+   /**
+    * Floating-point. Cannot be used with fixed. Integer numbers are
+    * represented by this zero.
+    */
+   unsigned floating:1;
+
+   /**
+    * Fixed-point. Cannot be used with floating. Integer numbers are
+    * represented by this zero.
+    */
+   unsigned fixed:1;
+
+   /**
+    * Whether it can represent negative values or not.
+    *
+    * If this is not set for floating point, it means that all values are
+    * assumed to be positive.
+    */
+   unsigned sign:1;
+
+   /**
+    * Whether values are normalized to fit [0, 1] interval, or [-1, 1]
+    * interval for signed types.
+    *
+    * For integer types it means the representable integer range should be
+    * interpreted as the interval above.
+    *
+    * For floating and fixed point formats it means the values should be
+    * clamped to the interval above.
+    */
+   unsigned norm:1;
+
+   /**
+    * Element width.
+    *
+    * For fixed point values, the fixed point is assumed to be at half the
+    * width.
+    */
+   unsigned width:14;
+
+   /**
+    * Vector length.
+    *
+    * width*length should be a power of two greater or equal to eight.
+    *
+    * @sa LP_MAX_VECTOR_LENGTH
+    */
+   unsigned length:14;
 };
 
 
@@ -124,7 +121,7 @@ struct lp_build_context
     * This not only describes the input/output LLVM types, but also whether
     * to normalize/clamp the results.
     */
-   union lp_type type;
+   struct lp_type type;
 
    /** Same as lp_build_undef(type) */
    LLVMValueRef undef;
@@ -138,37 +135,41 @@ struct lp_build_context
 
 
 LLVMTypeRef
-lp_build_elem_type(union lp_type type);
+lp_build_elem_type(struct lp_type type);
 
 
 LLVMTypeRef
-lp_build_vec_type(union lp_type type);
+lp_build_vec_type(struct lp_type type);
 
 
 boolean
-lp_check_elem_type(union lp_type type, LLVMTypeRef elem_type);
+lp_check_elem_type(struct lp_type type, LLVMTypeRef elem_type);
 
 
 boolean
-lp_check_vec_type(union lp_type type, LLVMTypeRef vec_type);
+lp_check_vec_type(struct lp_type type, LLVMTypeRef vec_type);
 
 
 boolean
-lp_check_value(union lp_type type, LLVMValueRef val);
+lp_check_value(struct lp_type type, LLVMValueRef val);
 
 
 LLVMTypeRef
-lp_build_int_elem_type(union lp_type type);
+lp_build_int_elem_type(struct lp_type type);
 
 
 LLVMTypeRef
-lp_build_int_vec_type(union lp_type type);
+lp_build_int_vec_type(struct lp_type type);
+
+
+struct lp_type
+lp_int_type(struct lp_type type);
 
 
 void
 lp_build_context_init(struct lp_build_context *bld,
                       LLVMBuilderRef builder,
-                      union lp_type type);
+                      struct lp_type type);
 
 
 #endif /* !LP_BLD_TYPE_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_clear.c b/src/gallium/drivers/llvmpipe/lp_clear.c
index 580cca5b46..bdcff94b9b 100644
--- a/src/gallium/drivers/llvmpipe/lp_clear.c
+++ b/src/gallium/drivers/llvmpipe/lp_clear.c
@@ -67,6 +67,7 @@ llvmpipe_clear(struct pipe_context *pipe, unsigned buffers, const float *rgba,
          util_pack_color(rgba, ps->format, &cv);
          lp_tile_cache_clear(llvmpipe->cbuf_cache[i], rgba, cv);
       }
+      llvmpipe->dirty_render_cache = TRUE;
    }
 
    if (buffers & PIPE_CLEAR_DEPTHSTENCIL) {
diff --git a/src/gallium/drivers/llvmpipe/lp_context.c b/src/gallium/drivers/llvmpipe/lp_context.c
index 233d1df0e1..a4b2bd8c2a 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.c
+++ b/src/gallium/drivers/llvmpipe/lp_context.c
@@ -141,8 +141,6 @@ llvmpipe_is_texture_referenced( struct pipe_context *pipe,
          return PIPE_REFERENCED_FOR_WRITE;
    }
    
-   /* FIXME: we also need to do the same for the texture cache */
-   
    return PIPE_UNREFERENCED;
 }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.c b/src/gallium/drivers/llvmpipe/lp_jit.c
index d288460a1b..b4a22ff4a9 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.c
+++ b/src/gallium/drivers/llvmpipe/lp_jit.c
@@ -44,15 +44,47 @@
 static void
 lp_jit_init_globals(struct llvmpipe_screen *screen)
 {
-   /* struct lp_jit_context */
+   LLVMTypeRef texture_type;
+
+   /* struct lp_jit_texture */
    {
       LLVMTypeRef elem_types[4];
+
+      elem_types[LP_JIT_TEXTURE_WIDTH]  = LLVMInt32Type();
+      elem_types[LP_JIT_TEXTURE_HEIGHT] = LLVMInt32Type();
+      elem_types[LP_JIT_TEXTURE_STRIDE] = LLVMInt32Type();
+      elem_types[LP_JIT_TEXTURE_DATA]   = LLVMPointerType(LLVMInt8Type(), 0);
+
+      texture_type = LLVMStructType(elem_types, Elements(elem_types), 0);
+
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, width,
+                             screen->target, texture_type,
+                             LP_JIT_TEXTURE_WIDTH);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, height,
+                             screen->target, texture_type,
+                             LP_JIT_TEXTURE_HEIGHT);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, stride,
+                             screen->target, texture_type,
+                             LP_JIT_TEXTURE_STRIDE);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, data,
+                             screen->target, texture_type,
+                             LP_JIT_TEXTURE_DATA);
+      LP_CHECK_STRUCT_SIZE(struct lp_jit_texture,
+                           screen->target, texture_type);
+
+      LLVMAddTypeName(screen->module, "texture", texture_type);
+   }
+
+   /* struct lp_jit_context */
+   {
+      LLVMTypeRef elem_types[5];
       LLVMTypeRef context_type;
 
       elem_types[0] = LLVMPointerType(LLVMFloatType(), 0); /* constants */
       elem_types[1] = LLVMPointerType(LLVMInt8Type(), 0);  /* samplers */
       elem_types[2] = LLVMFloatType();                     /* alpha_ref_value */
       elem_types[3] = LLVMPointerType(LLVMInt8Type(), 0);  /* blend_color */
+      elem_types[4] = LLVMArrayType(texture_type, PIPE_MAX_SAMPLERS); /* textures */
 
       context_type = LLVMStructType(elem_types, Elements(elem_types), 0);
 
@@ -64,6 +96,9 @@ lp_jit_init_globals(struct llvmpipe_screen *screen)
                              screen->target, context_type, 2);
       LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, blend_color,
                              screen->target, context_type, 3);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, textures,
+                             screen->target, context_type,
+                             LP_JIT_CONTEXT_TEXTURES_INDEX);
       LP_CHECK_STRUCT_SIZE(struct lp_jit_context,
                            screen->target, context_type);
 
@@ -117,7 +152,7 @@ lp_jit_screen_init(struct llvmpipe_screen *screen)
    screen->provider = LLVMCreateModuleProviderForExistingModule(screen->module);
 
    if (LLVMCreateJITCompiler(&screen->engine, screen->provider, 1, &error)) {
-      fprintf(stderr, "%s\n", error);
+      _debug_printf("%s\n", error);
       LLVMDisposeMessage(error);
       abort();
    }
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.h b/src/gallium/drivers/llvmpipe/lp_jit.h
index a7fb60f9f5..58f716ede2 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.h
+++ b/src/gallium/drivers/llvmpipe/lp_jit.h
@@ -38,11 +38,31 @@
 
 #include "lp_bld_struct.h"
 
+#include "pipe/p_state.h"
+
 
 struct tgsi_sampler;
 struct llvmpipe_screen;
 
 
+struct lp_jit_texture
+{
+   uint32_t width;
+   uint32_t height;
+   uint32_t stride;
+   const void *data;
+};
+
+
+enum {
+   LP_JIT_TEXTURE_WIDTH = 0,
+   LP_JIT_TEXTURE_HEIGHT,
+   LP_JIT_TEXTURE_STRIDE,
+   LP_JIT_TEXTURE_DATA
+};
+
+
+
 /**
  * This structure is passed directly to the generated fragment shader.
  *
@@ -60,11 +80,12 @@ struct lp_jit_context
 
    struct tgsi_sampler **samplers;
 
-   /* TODO: alpha reference value */
    float alpha_ref_value;
 
-   /* TODO: blend constant color */
+   /* FIXME: store (also?) in floats */
    uint8_t *blend_color;
+
+   struct lp_jit_texture textures[PIPE_MAX_SAMPLERS];
 };
 
 
@@ -80,6 +101,11 @@ struct lp_jit_context
 #define lp_jit_context_blend_color(_builder, _ptr) \
    lp_build_struct_get(_builder, _ptr, 3, "blend_color")
 
+#define LP_JIT_CONTEXT_TEXTURES_INDEX 4
+
+#define lp_jit_context_textures(_builder, _ptr) \
+   lp_build_struct_get_ptr(_builder, _ptr, LP_JIT_CONTEXT_TEXTURES_INDEX, "textures")
+
 
 typedef void
 (*lp_jit_frag_func)(struct lp_jit_context *context,
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index 125035771e..ff7ef8658a 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -27,8 +27,6 @@
 
 
 #include "util/u_memory.h"
-#include "util/u_simple_screen.h"
-#include "pipe/internal/p_winsys_screen.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_screen.h"
 
@@ -67,8 +65,6 @@ llvmpipe_get_param(struct pipe_screen *screen, int param)
       return 1;
    case PIPE_CAP_GLSL:
       return 1;
-   case PIPE_CAP_S3TC:
-      return 0;
    case PIPE_CAP_ANISOTROPIC_FILTER:
       return 0;
    case PIPE_CAP_POINT_SPRITE:
@@ -196,8 +192,7 @@ static void
 llvmpipe_destroy_screen( struct pipe_screen *_screen )
 {
    struct llvmpipe_screen *screen = llvmpipe_screen(_screen);
-
-   struct pipe_winsys *winsys = _screen->winsys;
+   struct llvmpipe_winsys *winsys = screen->winsys;
 
    lp_jit_screen_cleanup(screen);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
index d145f6d6bb..2d2fc19a65 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -44,6 +44,7 @@
 #include "pipe/p_thread.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
+#include "lp_bld_debug.h"
 #include "lp_tile_cache.h"
 #include "lp_tile_soa.h"
 
@@ -162,12 +163,12 @@ shade_quads(struct llvmpipe_context *llvmpipe,
    else
       depth = NULL;
 
-   /* TODO: blend color */
+   /* XXX: This will most likely fail on 32bit x86 without -mstackrealign */
+   assert(lp_check_alignment(mask, 16));
 
-   assert((((uintptr_t)mask) & 0xf) == 0);
-   assert((((uintptr_t)depth) & 0xf) == 0);
-   assert((((uintptr_t)color) & 0xf) == 0);
-   assert((((uintptr_t)llvmpipe->jit_context.blend_color) & 0xf) == 0);
+   assert(lp_check_alignment(depth, 16));
+   assert(lp_check_alignment(color, 16));
+   assert(lp_check_alignment(llvmpipe->jit_context.blend_color, 16));
 
    /* run shader */
    fs->current->jit_function( &llvmpipe->jit_context,
diff --git a/src/gallium/drivers/llvmpipe/lp_state.h b/src/gallium/drivers/llvmpipe/lp_state.h
index fb10329887..7b26ce61a3 100644
--- a/src/gallium/drivers/llvmpipe/lp_state.h
+++ b/src/gallium/drivers/llvmpipe/lp_state.h
@@ -36,6 +36,7 @@
 #include "pipe/p_state.h"
 #include "tgsi/tgsi_scan.h"
 #include "lp_jit.h"
+#include "lp_bld_sample.h" /* for struct lp_sampler_static_state */
 
 
 #define LP_NEW_VIEWPORT      0x1
@@ -57,16 +58,20 @@
 
 struct tgsi_sampler;
 struct vertex_info;
-
+struct pipe_context;
+struct llvmpipe_context;
 
 struct lp_fragment_shader;
 
 
 struct lp_fragment_shader_variant_key
 {
+   enum pipe_format zsbuf_format;
    struct pipe_depth_state depth;
    struct pipe_alpha_state alpha;
    struct pipe_blend_state blend;
+
+   struct lp_sampler_static_state sampler[PIPE_MAX_SAMPLERS];
 };
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state_derived.c b/src/gallium/drivers/llvmpipe/lp_state_derived.c
index 6fbb057937..30fb41ea65 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_derived.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_derived.c
@@ -93,6 +93,23 @@ llvmpipe_get_vertex_info(struct llvmpipe_context *llvmpipe)
       vinfo->num_attribs = 0;
       for (i = 0; i < lpfs->info.num_inputs; i++) {
          int src;
+         enum interp_mode interp;
+
+         switch (lpfs->info.input_interpolate[i]) {
+         case TGSI_INTERPOLATE_CONSTANT:
+            interp = INTERP_CONSTANT;
+            break;
+         case TGSI_INTERPOLATE_LINEAR:
+            interp = INTERP_LINEAR;
+            break;
+         case TGSI_INTERPOLATE_PERSPECTIVE:
+            interp = INTERP_PERSPECTIVE;
+            break;
+         default:
+            assert(0);
+            interp = INTERP_LINEAR;
+         }
+
          switch (lpfs->info.input_semantic_name[i]) {
          case TGSI_SEMANTIC_POSITION:
             src = draw_find_vs_output(llvmpipe->draw,
@@ -108,7 +125,7 @@ llvmpipe_get_vertex_info(struct llvmpipe_context *llvmpipe)
 
          case TGSI_SEMANTIC_FOG:
             src = draw_find_vs_output(llvmpipe->draw, TGSI_SEMANTIC_FOG, 0);
-            draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src);
+            draw_emit_vertex_attr(vinfo, EMIT_4F, interp, src);
             break;
 
          case TGSI_SEMANTIC_GENERIC:
@@ -116,7 +133,7 @@ llvmpipe_get_vertex_info(struct llvmpipe_context *llvmpipe)
             /* this includes texcoords and varying vars */
             src = draw_find_vs_output(llvmpipe->draw, TGSI_SEMANTIC_GENERIC,
                                       lpfs->info.input_semantic_index[i]);
-            draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src);
+            draw_emit_vertex_attr(vinfo, EMIT_4F, interp, src);
             break;
 
          default:
@@ -250,7 +267,9 @@ void llvmpipe_update_derived( struct llvmpipe_context *llvmpipe )
 
    if (llvmpipe->dirty & (LP_NEW_FS |
                           LP_NEW_BLEND |
-                          LP_NEW_DEPTH_STENCIL_ALPHA))
+                          LP_NEW_DEPTH_STENCIL_ALPHA |
+                          LP_NEW_SAMPLER |
+                          LP_NEW_TEXTURE))
       llvmpipe_update_fs( llvmpipe );
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index 94170bd716..9faed5a0b1 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -85,6 +85,7 @@
 #include "lp_context.h"
 #include "lp_state.h"
 #include "lp_quad.h"
+#include "lp_tex_sample.h"
 
 
 static const unsigned char quad_offset_x[4] = {0, 1, 0, 1};
@@ -130,21 +131,20 @@ generate_pos0(LLVMBuilderRef builder,
  * Generate the depth test.
  */
 static void
-generate_depth(struct llvmpipe_context *lp,
-               LLVMBuilderRef builder,
-               const struct pipe_depth_state *state,
-               union lp_type src_type,
+generate_depth(LLVMBuilderRef builder,
+               const struct lp_fragment_shader_variant_key *key,
+               struct lp_type src_type,
                struct lp_build_mask_context *mask,
                LLVMValueRef src,
                LLVMValueRef dst_ptr)
 {
    const struct util_format_description *format_desc;
-   union lp_type dst_type;
+   struct lp_type dst_type;
 
-   if(!lp->framebuffer.zsbuf)
+   if(!key->depth.enabled)
       return;
 
-   format_desc = util_format_description(lp->framebuffer.zsbuf->format);
+   format_desc = util_format_description(key->zsbuf_format);
    assert(format_desc);
 
    /* Pick the depth type. */
@@ -164,7 +164,7 @@ generate_depth(struct llvmpipe_context *lp,
 #endif
 
    lp_build_depth_test(builder,
-                       state,
+                       &key->depth,
                        dst_type,
                        format_desc,
                        mask,
@@ -173,107 +173,6 @@ generate_depth(struct llvmpipe_context *lp,
 }
 
 
-struct build_fetch_texel_context
-{
-   LLVMValueRef context_ptr;
-
-   LLVMValueRef samplers_ptr;
-
-   /** Coords/texels store */
-   LLVMValueRef store_ptr;
-};
-
-
-void PIPE_CDECL
-lp_fetch_texel_soa( struct tgsi_sampler **samplers,
-                    uint32_t unit,
-                    float *store )
-{
-   struct tgsi_sampler *sampler = samplers[unit];
-
-#if 0
-   uint j;
-
-   debug_printf("%s sampler: %p (%p) store: %p\n",
-                __FUNCTION__,
-                sampler, *sampler,
-                store );
-
-   debug_printf("lodbias %f\n", store[12]);
-
-   for (j = 0; j < 4; j++)
-      debug_printf("sample %d texcoord %f %f\n",
-                   j,
-                   store[0+j],
-                   store[4+j]);
-#endif
-
-   {
-      float rgba[NUM_CHANNELS][QUAD_SIZE];
-      sampler->get_samples(sampler,
-                           &store[0],
-                           &store[4],
-                           &store[8],
-                           0.0f, /*store[12],  lodbias */
-                           rgba);
-      memcpy(store, rgba, sizeof rgba);
-   }
-
-#if 0
-   for (j = 0; j < 4; j++)
-      debug_printf("sample %d result %f %f %f %f\n",
-                   j,
-                   store[0+j],
-                   store[4+j],
-                   store[8+j],
-                   store[12+j]);
-#endif
-}
-
-
-static void
-emit_fetch_texel( LLVMBuilderRef builder,
-                  void *context,
-                  unsigned unit,
-                  unsigned num_coords,
-                  const LLVMValueRef *coords,
-                  LLVMValueRef lodbias,
-                  LLVMValueRef *texel)
-{
-   struct build_fetch_texel_context *bld = context;
-   LLVMTypeRef vec_type = LLVMTypeOf(coords[0]);
-   LLVMValueRef args[3];
-   unsigned i;
-
-   if(!bld->samplers_ptr)
-      bld->samplers_ptr = lp_jit_context_samplers(builder, bld->context_ptr);
-
-   if(!bld->store_ptr)
-      bld->store_ptr = LLVMBuildArrayAlloca(builder,
-                                            vec_type,
-                                            LLVMConstInt(LLVMInt32Type(), 4, 0),
-                                            "texel_store");
-
-   for (i = 0; i < num_coords; i++) {
-      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
-      LLVMValueRef coord_ptr = LLVMBuildGEP(builder, bld->store_ptr, &index, 1, "");
-      LLVMBuildStore(builder, coords[i], coord_ptr);
-   }
-
-   args[0] = bld->samplers_ptr;
-   args[1] = LLVMConstInt(LLVMInt32Type(), unit, 0);
-   args[2] = bld->store_ptr;
-
-   lp_build_intrinsic(builder, "fetch_texel", LLVMVoidType(), args, 3);
-
-   for (i = 0; i < NUM_CHANNELS; ++i) {
-      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
-      LLVMValueRef texel_ptr = LLVMBuildGEP(builder, bld->store_ptr, &index, 1, "");
-      texel[i] = LLVMBuildLoad(builder, texel_ptr, "");
-   }
-}
-
-
 /**
  * Generate the fragment shader, depth/stencil test, and alpha tests.
  */
@@ -282,11 +181,11 @@ generate_fs(struct llvmpipe_context *lp,
             struct lp_fragment_shader *shader,
             const struct lp_fragment_shader_variant_key *key,
             LLVMBuilderRef builder,
-            union lp_type type,
+            struct lp_type type,
             LLVMValueRef context_ptr,
             unsigned i,
             const struct lp_build_interp_soa_context *interp,
-            struct build_fetch_texel_context *sampler,
+            struct lp_build_sampler_soa *sampler,
             LLVMValueRef *pmask,
             LLVMValueRef *color,
             LLVMValueRef depth_ptr)
@@ -298,6 +197,7 @@ generate_fs(struct llvmpipe_context *lp,
    LLVMValueRef consts_ptr;
    LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][NUM_CHANNELS];
    LLVMValueRef z = interp->pos[2];
+   struct lp_build_flow_context *flow;
    struct lp_build_mask_context mask;
    boolean early_depth_test;
    unsigned attrib;
@@ -309,25 +209,35 @@ generate_fs(struct llvmpipe_context *lp,
 
    consts_ptr = lp_jit_context_constants(builder, context_ptr);
 
-   lp_build_mask_begin(&mask, builder, type, *pmask);
+   flow = lp_build_flow_create(builder);
+
+   memset(outputs, 0, sizeof outputs);
+
+   lp_build_flow_scope_begin(flow);
+
+   /* Declare the color and z variables */
+   for(chan = 0; chan < NUM_CHANNELS; ++chan) {
+      color[chan] = LLVMGetUndef(vec_type);
+      lp_build_flow_scope_declare(flow, &color[chan]);
+   }
+   lp_build_flow_scope_declare(flow, &z);
+
+   lp_build_mask_begin(&mask, flow, type, *pmask);
 
    early_depth_test =
-      lp->depth_stencil->depth.enabled &&
-      lp->framebuffer.zsbuf &&
-      !lp->depth_stencil->alpha.enabled &&
-      !lp->fs->info.uses_kill &&
-      !lp->fs->info.writes_z;
+      key->depth.enabled &&
+      !key->alpha.enabled &&
+      !shader->info.uses_kill &&
+      !shader->info.writes_z;
 
    if(early_depth_test)
-      generate_depth(lp, builder, &key->depth,
+      generate_depth(builder, key,
                      type, &mask,
                      z, depth_ptr);
 
-   memset(outputs, 0, sizeof outputs);
-
    lp_build_tgsi_soa(builder, tokens, type, &mask,
                      consts_ptr, interp->pos, interp->inputs,
-                     outputs, emit_fetch_texel, sampler);
+                     outputs, sampler);
 
    for (attrib = 0; attrib < shader->info.num_outputs; ++attrib) {
       for(chan = 0; chan < NUM_CHANNELS; ++chan) {
@@ -368,12 +278,16 @@ generate_fs(struct llvmpipe_context *lp,
    }
 
    if(!early_depth_test)
-      generate_depth(lp, builder, &key->depth,
+      generate_depth(builder, key,
                      type, &mask,
                      z, depth_ptr);
 
    lp_build_mask_end(&mask);
 
+   lp_build_flow_scope_end(flow);
+
+   lp_build_flow_destroy(flow);
+
    *pmask = mask.value;
 
 }
@@ -385,13 +299,15 @@ generate_fs(struct llvmpipe_context *lp,
 static void
 generate_blend(const struct pipe_blend_state *blend,
                LLVMBuilderRef builder,
-               union lp_type type,
+               struct lp_type type,
                LLVMValueRef context_ptr,
                LLVMValueRef mask,
                LLVMValueRef *src,
                LLVMValueRef dst_ptr)
 {
    struct lp_build_context bld;
+   struct lp_build_flow_context *flow;
+   struct lp_build_mask_context mask_ctx;
    LLVMTypeRef vec_type;
    LLVMTypeRef int_vec_type;
    LLVMValueRef const_ptr;
@@ -400,11 +316,14 @@ generate_blend(const struct pipe_blend_state *blend,
    LLVMValueRef res[4];
    unsigned chan;
 
+   lp_build_context_init(&bld, builder, type);
+
+   flow = lp_build_flow_create(builder);
+   lp_build_mask_begin(&mask_ctx, flow, type, mask);
+
    vec_type = lp_build_vec_type(type);
    int_vec_type = lp_build_int_vec_type(type);
 
-   lp_build_context_init(&bld, builder, type);
-
    const_ptr = lp_jit_context_blend_color(builder, context_ptr);
    const_ptr = LLVMBuildBitCast(builder, const_ptr,
                                 LLVMPointerType(vec_type, 0), "");
@@ -422,11 +341,16 @@ generate_blend(const struct pipe_blend_state *blend,
    lp_build_blend_soa(builder, blend, type, src, dst, con, res);
 
    for(chan = 0; chan < 4; ++chan) {
-      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), chan, 0);
-      lp_build_name(res[chan], "res.%c", "rgba"[chan]);
-      res[chan] = lp_build_select(&bld, mask, res[chan], dst[chan]);
-      LLVMBuildStore(builder, res[chan], LLVMBuildGEP(builder, dst_ptr, &index, 1, ""));
+      if(blend->colormask & (1 << chan)) {
+         LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), chan, 0);
+         lp_build_name(res[chan], "res.%c", "rgba"[chan]);
+         res[chan] = lp_build_select(&bld, mask, res[chan], dst[chan]);
+         LLVMBuildStore(builder, res[chan], LLVMBuildGEP(builder, dst_ptr, &index, 1, ""));
+      }
    }
+
+   lp_build_mask_end(&mask_ctx);
+   lp_build_flow_destroy(flow);
 }
 
 
@@ -440,8 +364,8 @@ generate_fragment(struct llvmpipe_context *lp,
 {
    struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen);
    struct lp_fragment_shader_variant *variant;
-   union lp_type fs_type;
-   union lp_type blend_type;
+   struct lp_type fs_type;
+   struct lp_type blend_type;
    LLVMTypeRef fs_elem_type;
    LLVMTypeRef fs_vec_type;
    LLVMTypeRef fs_int_vec_type;
@@ -462,7 +386,7 @@ generate_fragment(struct llvmpipe_context *lp,
    LLVMBuilderRef builder;
    LLVMValueRef x0;
    LLVMValueRef y0;
-   struct build_fetch_texel_context sampler;
+   struct lp_build_sampler_soa *sampler;
    struct lp_build_interp_soa_context interp;
    LLVMValueRef fs_mask[LP_MAX_VECTOR_LENGTH];
    LLVMValueRef fs_out_color[NUM_CHANNELS][LP_MAX_VECTOR_LENGTH];
@@ -507,7 +431,7 @@ generate_fragment(struct llvmpipe_context *lp,
    /* TODO: actually pick these based on the fs and color buffer
     * characteristics. */
 
-   fs_type.value = 0;
+   memset(&fs_type, 0, sizeof fs_type);
    fs_type.floating = TRUE; /* floating point values */
    fs_type.sign = TRUE;     /* values are signed */
    fs_type.norm = FALSE;    /* values are not limited to [0,1] or [-1,1] */
@@ -515,7 +439,7 @@ generate_fragment(struct llvmpipe_context *lp,
    fs_type.length = 4;      /* 4 element per vector */
    num_fs = 4;
 
-   blend_type.value = 0;
+   memset(&blend_type, 0, sizeof blend_type);
    blend_type.floating = FALSE; /* values are integers */
    blend_type.sign = FALSE;     /* values are unsigned */
    blend_type.norm = TRUE;      /* values are in [0,1] or [-1,1] */
@@ -586,8 +510,13 @@ generate_fragment(struct llvmpipe_context *lp,
                             a0_ptr, dadx_ptr, dady_ptr,
                             x0, y0, 2, 0);
 
-   memset(&sampler, 0, sizeof sampler);
-   sampler.context_ptr = context_ptr;
+#if 0
+   /* C texture sampling */
+   sampler = lp_c_sampler_soa_create(context_ptr);
+#else
+   /* code generated texture sampling */
+   sampler = lp_llvm_sampler_soa_create(key->sampler, context_ptr);
+#endif
 
    for(i = 0; i < num_fs; ++i) {
       LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
@@ -606,7 +535,7 @@ generate_fragment(struct llvmpipe_context *lp,
                   context_ptr,
                   i,
                   &interp,
-                  &sampler,
+                  sampler,
                   &fs_mask[i],
                   out_color,
                   depth_ptr_i);
@@ -615,6 +544,8 @@ generate_fragment(struct llvmpipe_context *lp,
          fs_out_color[chan][i] = out_color[chan];
    }
 
+   sampler->destroy(sampler);
+
    /* 
     * Convert the fs's output color and mask to fit to the blending type. 
     */
@@ -765,18 +696,45 @@ llvmpipe_set_constant_buffer(struct pipe_context *pipe,
  */
 static void
 make_variant_key(struct llvmpipe_context *lp,
+                 struct lp_fragment_shader *shader,
                  struct lp_fragment_shader_variant_key *key)
 {
+   unsigned i;
+
    memset(key, 0, sizeof *key);
 
-   memcpy(&key->depth, &lp->depth_stencil->depth, sizeof key->depth);
+   if(lp->framebuffer.zsbuf &&
+      lp->depth_stencil->depth.enabled) {
+      key->zsbuf_format = lp->framebuffer.zsbuf->format;
+      memcpy(&key->depth, &lp->depth_stencil->depth, sizeof key->depth);
+   }
 
    key->alpha.enabled = lp->depth_stencil->alpha.enabled;
    if(key->alpha.enabled)
       key->alpha.func = lp->depth_stencil->alpha.func;
    /* alpha.ref_value is passed in jit_context */
 
-   memcpy(&key->blend, lp->blend, sizeof key->blend);
+   if(lp->framebuffer.cbufs[0]) {
+      const struct util_format_description *format_desc;
+      unsigned chan;
+
+      memcpy(&key->blend, lp->blend, sizeof key->blend);
+
+      format_desc = util_format_description(lp->framebuffer.cbufs[0]->format);
+      assert(format_desc->layout == UTIL_FORMAT_COLORSPACE_RGB ||
+             format_desc->layout == UTIL_FORMAT_COLORSPACE_SRGB);
+
+      /* mask out color channels not present in the color buffer */
+      for(chan = 0; chan < 4; ++chan) {
+         enum util_format_swizzle swizzle = format_desc->swizzle[chan];
+         if(swizzle > 4)
+            key->blend.colormask &= ~(1 << chan);
+      }
+   }
+
+   for(i = 0; i < PIPE_MAX_SAMPLERS; ++i)
+      if(shader->info.file_mask[TGSI_FILE_SAMPLER] & (1 << i))
+         lp_sampler_static_state(&key->sampler[i], lp->texture[i], lp->sampler[i]);
 }
 
 
@@ -787,7 +745,7 @@ llvmpipe_update_fs(struct llvmpipe_context *lp)
    struct lp_fragment_shader_variant_key key;
    struct lp_fragment_shader_variant *variant;
 
-   make_variant_key(lp, &key);
+   make_variant_key(lp, shader, &key);
 
    variant = shader->variants;
    while(variant) {
diff --git a/src/gallium/drivers/llvmpipe/lp_state_sampler.c b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
index 4fef541b1e..c69d90c723 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_sampler.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
@@ -98,6 +98,16 @@ llvmpipe_set_sampler_textures(struct pipe_context *pipe,
 
       pipe_texture_reference(&llvmpipe->texture[i], tex);
       lp_tex_tile_cache_set_texture(llvmpipe->tex_cache[i], tex);
+
+      if(tex) {
+         struct llvmpipe_texture *lp_tex = llvmpipe_texture(tex);
+         struct lp_jit_texture *jit_tex = &llvmpipe->jit_context.textures[i];
+         jit_tex->width = tex->width[0];
+         jit_tex->height = tex->height[0];
+         jit_tex->stride = lp_tex->stride[0];
+         if(!lp_tex->dt)
+            jit_tex->data = lp_tex->data;
+      }
    }
 
    llvmpipe->num_textures = num;
diff --git a/src/gallium/drivers/llvmpipe/lp_test.h b/src/gallium/drivers/llvmpipe/lp_test.h
index 69aaae26e0..a88e110c66 100644
--- a/src/gallium/drivers/llvmpipe/lp_test.h
+++ b/src/gallium/drivers/llvmpipe/lp_test.h
@@ -86,43 +86,43 @@ random_float(void);
 
 
 void
-dump_type(FILE *fp, union lp_type type);
+dump_type(FILE *fp, struct lp_type type);
 
 
 double
-read_elem(union lp_type type, const void *src, unsigned index);
+read_elem(struct lp_type type, const void *src, unsigned index);
 
 
 void
-write_elem(union lp_type type, void *dst, unsigned index, double src);
+write_elem(struct lp_type type, void *dst, unsigned index, double src);
 
 
 void
-random_elem(union lp_type type, void *dst, unsigned index);
+random_elem(struct lp_type type, void *dst, unsigned index);
 
 
 void
-read_vec(union lp_type type, const void *src, double *dst);
+read_vec(struct lp_type type, const void *src, double *dst);
 
 
 void
-write_vec(union lp_type type, void *dst, const double *src);
+write_vec(struct lp_type type, void *dst, const double *src);
 
 
 void
-random_vec(union lp_type type, void *dst);
+random_vec(struct lp_type type, void *dst);
 
 
 boolean
-compare_vec_with_eps(union lp_type type, const void *res, const void *ref, double eps);
+compare_vec_with_eps(struct lp_type type, const void *res, const void *ref, double eps);
 
 
 boolean
-compare_vec(union lp_type type, const void *res, const void *ref);
+compare_vec(struct lp_type type, const void *res, const void *ref);
 
 
 void
-dump_vec(FILE *fp, union lp_type type, const void *src);
+dump_vec(FILE *fp, struct lp_type type, const void *src);
 
 
 #endif /* !LP_TEST_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_test_blend.c b/src/gallium/drivers/llvmpipe/lp_test_blend.c
index 8dfad468e3..94b661dcba 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_blend.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_blend.c
@@ -80,7 +80,7 @@ static void
 write_tsv_row(FILE *fp,
               const struct pipe_blend_state *blend,
               enum vector_mode mode,
-              union lp_type type,
+              struct lp_type type,
               double cycles,
               boolean success)
 {
@@ -125,7 +125,7 @@ static void
 dump_blend_type(FILE *fp,
                 const struct pipe_blend_state *blend,
                 enum vector_mode mode,
-                union lp_type type)
+                struct lp_type type)
 {
    fprintf(fp, "%s", mode ? "soa" : "aos");
 
@@ -153,7 +153,7 @@ static LLVMValueRef
 add_blend_test(LLVMModuleRef module,
                const struct pipe_blend_state *blend,
                enum vector_mode mode,
-               union lp_type type)
+               struct lp_type type)
 {
    LLVMTypeRef ret_type;
    LLVMTypeRef vec_type;
@@ -467,7 +467,7 @@ test_one(unsigned verbose,
          FILE *fp,
          const struct pipe_blend_state *blend,
          enum vector_mode mode,
-         union lp_type type)
+         struct lp_type type)
 {
    LLVMModuleRef module = NULL;
    LLVMValueRef func = NULL;
@@ -765,10 +765,10 @@ blend_funcs[] = {
 };
 
 
-const union lp_type blend_types[] = {
+const struct lp_type blend_types[] = {
    /* float, fixed,  sign,  norm, width, len */
-   {{  TRUE, FALSE, FALSE,  TRUE,    32,   4 }}, /* f32 x 4 */
-   {{ FALSE, FALSE, FALSE,  TRUE,     8,  16 }}, /* u8n x 16 */
+   {   TRUE, FALSE, FALSE,  TRUE,    32,   4 }, /* f32 x 4 */
+   {  FALSE, FALSE, FALSE,  TRUE,     8,  16 }, /* u8n x 16 */
 };
 
 
@@ -788,7 +788,7 @@ test_all(unsigned verbose, FILE *fp)
    const unsigned *alpha_dst_factor;
    struct pipe_blend_state blend;
    enum vector_mode mode;
-   const union lp_type *type;
+   const struct lp_type *type;
    bool success = TRUE;
 
    for(rgb_func = blend_funcs; rgb_func < &blend_funcs[num_funcs]; ++rgb_func) {
@@ -841,27 +841,27 @@ test_some(unsigned verbose, FILE *fp, unsigned long n)
    const unsigned *alpha_dst_factor;
    struct pipe_blend_state blend;
    enum vector_mode mode;
-   const union lp_type *type;
+   const struct lp_type *type;
    unsigned long i;
    bool success = TRUE;
 
    for(i = 0; i < n; ++i) {
-      rgb_func = &blend_funcs[random() % num_funcs];
-      alpha_func = &blend_funcs[random() % num_funcs];
-      rgb_src_factor = &blend_factors[random() % num_factors];
-      alpha_src_factor = &blend_factors[random() % num_factors];
+      rgb_func = &blend_funcs[rand() % num_funcs];
+      alpha_func = &blend_funcs[rand() % num_funcs];
+      rgb_src_factor = &blend_factors[rand() % num_factors];
+      alpha_src_factor = &blend_factors[rand() % num_factors];
       
       do {
-         rgb_dst_factor = &blend_factors[random() % num_factors];
+         rgb_dst_factor = &blend_factors[rand() % num_factors];
       } while(*rgb_dst_factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE);
 
       do {
-         alpha_dst_factor = &blend_factors[random() % num_factors];
+         alpha_dst_factor = &blend_factors[rand() % num_factors];
       } while(*alpha_dst_factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE);
 
-      mode = random() & 1;
+      mode = rand() & 1;
 
-      type = &blend_types[random() % num_types];
+      type = &blend_types[rand() % num_types];
 
       memset(&blend, 0, sizeof blend);
       blend.blend_enable      = 1;
diff --git a/src/gallium/drivers/llvmpipe/lp_test_conv.c b/src/gallium/drivers/llvmpipe/lp_test_conv.c
index e6489834af..9dcf58e5dc 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_conv.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_conv.c
@@ -59,8 +59,8 @@ write_tsv_header(FILE *fp)
 
 static void
 write_tsv_row(FILE *fp,
-              union lp_type src_type,
-              union lp_type dst_type,
+              struct lp_type src_type,
+              struct lp_type dst_type,
               double cycles,
               boolean success)
 {
@@ -80,8 +80,8 @@ write_tsv_row(FILE *fp,
 
 static void
 dump_conv_types(FILE *fp,
-               union lp_type src_type,
-               union lp_type dst_type)
+               struct lp_type src_type,
+               struct lp_type dst_type)
 {
    fprintf(fp, "src_type=");
    dump_type(fp, src_type);
@@ -96,8 +96,8 @@ dump_conv_types(FILE *fp,
 
 static LLVMValueRef
 add_conv_test(LLVMModuleRef module,
-              union lp_type src_type, unsigned num_srcs,
-              union lp_type dst_type, unsigned num_dsts)
+              struct lp_type src_type, unsigned num_srcs,
+              struct lp_type dst_type, unsigned num_dsts)
 {
    LLVMTypeRef args[2];
    LLVMValueRef func;
@@ -145,8 +145,8 @@ add_conv_test(LLVMModuleRef module,
 static boolean
 test_one(unsigned verbose,
          FILE *fp,
-         union lp_type src_type,
-         union lp_type dst_type)
+         struct lp_type src_type,
+         struct lp_type dst_type)
 {
    LLVMModuleRef module = NULL;
    LLVMValueRef func = NULL;
@@ -343,35 +343,35 @@ test_one(unsigned verbose,
 }
 
 
-const union lp_type conv_types[] = {
+const struct lp_type conv_types[] = {
    /* float, fixed,  sign,  norm, width, len */
 
-   {{  TRUE, FALSE,  TRUE,  TRUE,    32,   4 }},
-   {{  TRUE, FALSE,  TRUE, FALSE,    32,   4 }},
-   {{  TRUE, FALSE, FALSE,  TRUE,    32,   4 }},
-   {{  TRUE, FALSE, FALSE, FALSE,    32,   4 }},
+   {   TRUE, FALSE,  TRUE,  TRUE,    32,   4 },
+   {   TRUE, FALSE,  TRUE, FALSE,    32,   4 },
+   {   TRUE, FALSE, FALSE,  TRUE,    32,   4 },
+   {   TRUE, FALSE, FALSE, FALSE,    32,   4 },
 
    /* TODO: test fixed formats too */
 
-   {{ FALSE, FALSE,  TRUE,  TRUE,    16,   8 }},
-   {{ FALSE, FALSE,  TRUE, FALSE,    16,   8 }},
-   {{ FALSE, FALSE, FALSE,  TRUE,    16,   8 }},
-   {{ FALSE, FALSE, FALSE, FALSE,    16,   8 }},
-
-   {{ FALSE, FALSE,  TRUE,  TRUE,    32,   4 }},
-   {{ FALSE, FALSE,  TRUE, FALSE,    32,   4 }},
-   {{ FALSE, FALSE, FALSE,  TRUE,    32,   4 }},
-   {{ FALSE, FALSE, FALSE, FALSE,    32,   4 }},
-
-   {{ FALSE, FALSE,  TRUE,  TRUE,    16,   8 }},
-   {{ FALSE, FALSE,  TRUE, FALSE,    16,   8 }},
-   {{ FALSE, FALSE, FALSE,  TRUE,    16,   8 }},
-   {{ FALSE, FALSE, FALSE, FALSE,    16,   8 }},
-
-   {{ FALSE, FALSE,  TRUE,  TRUE,     8,  16 }},
-   {{ FALSE, FALSE,  TRUE, FALSE,     8,  16 }},
-   {{ FALSE, FALSE, FALSE,  TRUE,     8,  16 }},
-   {{ FALSE, FALSE, FALSE, FALSE,     8,  16 }},
+   {  FALSE, FALSE,  TRUE,  TRUE,    16,   8 },
+   {  FALSE, FALSE,  TRUE, FALSE,    16,   8 },
+   {  FALSE, FALSE, FALSE,  TRUE,    16,   8 },
+   {  FALSE, FALSE, FALSE, FALSE,    16,   8 },
+
+   {  FALSE, FALSE,  TRUE,  TRUE,    32,   4 },
+   {  FALSE, FALSE,  TRUE, FALSE,    32,   4 },
+   {  FALSE, FALSE, FALSE,  TRUE,    32,   4 },
+   {  FALSE, FALSE, FALSE, FALSE,    32,   4 },
+
+   {  FALSE, FALSE,  TRUE,  TRUE,    16,   8 },
+   {  FALSE, FALSE,  TRUE, FALSE,    16,   8 },
+   {  FALSE, FALSE, FALSE,  TRUE,    16,   8 },
+   {  FALSE, FALSE, FALSE, FALSE,    16,   8 },
+
+   {  FALSE, FALSE,  TRUE,  TRUE,     8,  16 },
+   {  FALSE, FALSE,  TRUE, FALSE,     8,  16 },
+   {  FALSE, FALSE, FALSE,  TRUE,     8,  16 },
+   {  FALSE, FALSE, FALSE, FALSE,     8,  16 },
 };
 
 
@@ -381,8 +381,8 @@ const unsigned num_types = sizeof(conv_types)/sizeof(conv_types[0]);
 boolean
 test_all(unsigned verbose, FILE *fp)
 {
-   const union lp_type *src_type;
-   const union lp_type *dst_type;
+   const struct lp_type *src_type;
+   const struct lp_type *dst_type;
    bool success = TRUE;
 
    for(src_type = conv_types; src_type < &conv_types[num_types]; ++src_type) {
@@ -407,16 +407,16 @@ test_all(unsigned verbose, FILE *fp)
 boolean
 test_some(unsigned verbose, FILE *fp, unsigned long n)
 {
-   const union lp_type *src_type;
-   const union lp_type *dst_type;
+   const struct lp_type *src_type;
+   const struct lp_type *dst_type;
    unsigned long i;
    bool success = TRUE;
 
    for(i = 0; i < n; ++i) {
-      src_type = &conv_types[random() % num_types];
+      src_type = &conv_types[rand() % num_types];
       
       do {
-         dst_type = &conv_types[random() % num_types];
+         dst_type = &conv_types[rand() % num_types];
       } while (src_type == dst_type || src_type->norm != dst_type->norm);
 
       if(!test_one(verbose, fp, *src_type, *dst_type))
diff --git a/src/gallium/drivers/llvmpipe/lp_test_format.c b/src/gallium/drivers/llvmpipe/lp_test_format.c
index 1d192355ee..d8455e5649 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_format.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_format.c
@@ -119,7 +119,7 @@ add_load_rgba_test(LLVMModuleRef module,
 
    lp_build_loop_begin(builder, LLVMConstInt(LLVMInt32Type(), 1, 0), &loop);
 
-   rgba = lp_build_load_rgba(builder, format, ptr);
+   rgba = lp_build_load_rgba_aos(builder, format, ptr);
    LLVMBuildStore(builder, rgba, rgba_ptr);
 
    lp_build_loop_end(builder, LLVMConstInt(LLVMInt32Type(), 4, 0), NULL, &loop);
@@ -160,7 +160,7 @@ add_store_rgba_test(LLVMModuleRef module,
 
    rgba = LLVMBuildLoad(builder, rgba_ptr, "");
 
-   lp_build_store_rgba(builder, format, ptr, rgba);
+   lp_build_store_rgba_aos(builder, format, ptr, rgba);
 
    LLVMBuildRetVoid(builder);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_test_main.c b/src/gallium/drivers/llvmpipe/lp_test_main.c
index 49213fb4f0..4592dc0b2d 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_main.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_main.c
@@ -40,7 +40,7 @@
 
 void
 dump_type(FILE *fp,
-          union lp_type type)
+          struct lp_type type)
 {
    fprintf(fp, "%s%s%u%sx%u",
            type.sign ? (type.floating || type.fixed ? "" : "s") : "u",
@@ -52,7 +52,7 @@ dump_type(FILE *fp,
 
 
 double
-read_elem(union lp_type type, const void *src, unsigned index)
+read_elem(struct lp_type type, const void *src, unsigned index)
 {
    double scale = lp_const_scale(type);
    double value;
@@ -115,7 +115,7 @@ read_elem(union lp_type type, const void *src, unsigned index)
 
 
 void
-write_elem(union lp_type type, void *dst, unsigned index, double value)
+write_elem(struct lp_type type, void *dst, unsigned index, double value)
 {
    assert(index < type.length);
    if(!type.sign && value < 0.0)
@@ -184,11 +184,11 @@ write_elem(union lp_type type, void *dst, unsigned index, double value)
 
 
 void
-random_elem(union lp_type type, void *dst, unsigned index)
+random_elem(struct lp_type type, void *dst, unsigned index)
 {
    double value;
    assert(index < type.length);
-   value = (double)random()/(double)RAND_MAX;
+   value = (double)rand()/(double)RAND_MAX;
    if(!type.norm) {
       unsigned long long mask;
       if (type.floating)
@@ -199,17 +199,17 @@ random_elem(union lp_type type, void *dst, unsigned index)
          mask = ((unsigned long long)1 << (type.width - 1)) - 1;
       else
          mask = ((unsigned long long)1 << type.width) - 1;
-      value += (double)(mask & random());
+      value += (double)(mask & rand());
    }
    if(!type.sign)
-      if(random() & 1)
+      if(rand() & 1)
          value = -value;
    write_elem(type, dst, index, value);
 }
 
 
 void
-read_vec(union lp_type type, const void *src, double *dst)
+read_vec(struct lp_type type, const void *src, double *dst)
 {
    unsigned i;
    for (i = 0; i < type.length; ++i)
@@ -218,7 +218,7 @@ read_vec(union lp_type type, const void *src, double *dst)
 
 
 void
-write_vec(union lp_type type, void *dst, const double *src)
+write_vec(struct lp_type type, void *dst, const double *src)
 {
    unsigned i;
    for (i = 0; i < type.length; ++i)
@@ -229,12 +229,12 @@ write_vec(union lp_type type, void *dst, const double *src)
 float
 random_float(void)
 {
-    return (float)((double)random()/(double)RAND_MAX);
+    return (float)((double)rand()/(double)RAND_MAX);
 }
 
 
 void
-random_vec(union lp_type type, void *dst)
+random_vec(struct lp_type type, void *dst)
 {
    unsigned i;
    for (i = 0; i < type.length; ++i)
@@ -243,7 +243,7 @@ random_vec(union lp_type type, void *dst)
 
 
 boolean
-compare_vec_with_eps(union lp_type type, const void *res, const void *ref, double eps)
+compare_vec_with_eps(struct lp_type type, const void *res, const void *ref, double eps)
 {
    unsigned i;
    for (i = 0; i < type.length; ++i) {
@@ -259,7 +259,7 @@ compare_vec_with_eps(union lp_type type, const void *res, const void *ref, doubl
 
 
 boolean
-compare_vec(union lp_type type, const void *res, const void *ref)
+compare_vec(struct lp_type type, const void *res, const void *ref)
 {
    double eps = lp_const_eps(type);
    return compare_vec_with_eps(type, res, ref, eps);
@@ -267,7 +267,7 @@ compare_vec(union lp_type type, const void *res, const void *ref)
 
 
 void
-dump_vec(FILE *fp, union lp_type type, const void *src)
+dump_vec(FILE *fp, struct lp_type type, const void *src)
 {
    unsigned i;
    for (i = 0; i < type.length; ++i) {
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_cache.c b/src/gallium/drivers/llvmpipe/lp_tex_cache.c
index 23a94b5b0d..773e848242 100644
--- a/src/gallium/drivers/llvmpipe/lp_tex_cache.c
+++ b/src/gallium/drivers/llvmpipe/lp_tex_cache.c
@@ -154,7 +154,7 @@ lp_tex_tile_cache_validate_texture(struct llvmpipe_tex_tile_cache *tc)
       if (lpt->timestamp != tc->timestamp) {
          /* texture was modified, invalidate all cached tiles */
          uint i;
-         _debug_printf("INV %d %d\n", tc->timestamp, lpt->timestamp);
+         debug_printf("INV %d %d\n", tc->timestamp, lpt->timestamp);
          for (i = 0; i < NUM_ENTRIES; i++) {
             tc->entries[i].addr.bits.invalid = 1;
          }
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_sample.h b/src/gallium/drivers/llvmpipe/lp_tex_sample.h
index 628ec3f1ef..9ad1bde956 100644
--- a/src/gallium/drivers/llvmpipe/lp_tex_sample.h
+++ b/src/gallium/drivers/llvmpipe/lp_tex_sample.h
@@ -29,10 +29,13 @@
 #define LP_TEX_SAMPLE_H
 
 
+#include <llvm-c/Core.h>
+
 #include "tgsi/tgsi_exec.h"
 
 
 struct llvmpipe_tex_tile_cache;
+struct lp_sampler_static_state;
 
 
 /**
@@ -75,4 +78,24 @@ lp_get_samples(struct tgsi_sampler *tgsi_sampler,
                float rgba[NUM_CHANNELS][QUAD_SIZE]);
 
 
+/**
+ * Texture sampling code generator that just calls lp_get_samples C function
+ * for the actual sampling computation.
+ *
+ * @param context_ptr LLVM value with the pointer to the struct lp_jit_context.
+ */
+struct lp_build_sampler_soa *
+lp_c_sampler_soa_create(LLVMValueRef context_ptr);
+
+
+/**
+ * Pure-LLVM texture sampling code generator.
+ *
+ * @param context_ptr LLVM value with the pointer to the struct lp_jit_context.
+ */
+struct lp_build_sampler_soa *
+lp_llvm_sampler_soa_create(const struct lp_sampler_static_state *key,
+                           LLVMValueRef context_ptr);
+
+
 #endif /* LP_TEX_SAMPLE_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_sample.c b/src/gallium/drivers/llvmpipe/lp_tex_sample_c.c
index 94eb6dad5a..a1365a045f 100644
--- a/src/gallium/drivers/llvmpipe/lp_tex_sample.c
+++ b/src/gallium/drivers/llvmpipe/lp_tex_sample_c.c
@@ -1578,3 +1578,136 @@ out:
    tgsi_sampler->get_samples( tgsi_sampler, s, t, p, lodbias, rgba );
 }
 
+
+void PIPE_CDECL
+lp_fetch_texel_soa( struct tgsi_sampler **samplers,
+                    uint32_t unit,
+                    float *store )
+{
+   struct tgsi_sampler *sampler = samplers[unit];
+
+#if 0
+   uint j;
+
+   debug_printf("%s sampler: %p (%p) store: %p\n",
+                __FUNCTION__,
+                sampler, *sampler,
+                store );
+
+   debug_printf("lodbias %f\n", store[12]);
+
+   for (j = 0; j < 4; j++)
+      debug_printf("sample %d texcoord %f %f\n",
+                   j,
+                   store[0+j],
+                   store[4+j]);
+#endif
+
+   {
+      float rgba[NUM_CHANNELS][QUAD_SIZE];
+      sampler->get_samples(sampler,
+                           &store[0],
+                           &store[4],
+                           &store[8],
+                           0.0f, /*store[12],  lodbias */
+                           rgba);
+      memcpy(store, rgba, sizeof rgba);
+   }
+
+#if 0
+   for (j = 0; j < 4; j++)
+      debug_printf("sample %d result %f %f %f %f\n",
+                   j,
+                   store[0+j],
+                   store[4+j],
+                   store[8+j],
+                   store[12+j]);
+#endif
+}
+
+
+#include "lp_bld_type.h"
+#include "lp_bld_intr.h"
+#include "lp_bld_tgsi.h"
+
+
+struct lp_c_sampler_soa
+{
+   struct lp_build_sampler_soa base;
+
+   LLVMValueRef context_ptr;
+
+   LLVMValueRef samplers_ptr;
+
+   /** Coords/texels store */
+   LLVMValueRef store_ptr;
+};
+
+
+static void
+lp_c_sampler_soa_destroy(struct lp_build_sampler_soa *sampler)
+{
+   FREE(sampler);
+}
+
+
+static void
+lp_c_sampler_soa_emit_fetch_texel(struct lp_build_sampler_soa *_sampler,
+                                  LLVMBuilderRef builder,
+                                  struct lp_type type,
+                                  unsigned unit,
+                                  unsigned num_coords,
+                                  const LLVMValueRef *coords,
+                                  LLVMValueRef lodbias,
+                                  LLVMValueRef *texel)
+{
+   struct lp_c_sampler_soa *sampler = (struct lp_c_sampler_soa *)_sampler;
+   LLVMTypeRef vec_type = LLVMTypeOf(coords[0]);
+   LLVMValueRef args[3];
+   unsigned i;
+
+   if(!sampler->samplers_ptr)
+      sampler->samplers_ptr = lp_jit_context_samplers(builder, sampler->context_ptr);
+
+   if(!sampler->store_ptr)
+      sampler->store_ptr = LLVMBuildArrayAlloca(builder,
+                                            vec_type,
+                                            LLVMConstInt(LLVMInt32Type(), 4, 0),
+                                            "texel_store");
+
+   for (i = 0; i < num_coords; i++) {
+      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+      LLVMValueRef coord_ptr = LLVMBuildGEP(builder, sampler->store_ptr, &index, 1, "");
+      LLVMBuildStore(builder, coords[i], coord_ptr);
+   }
+
+   args[0] = sampler->samplers_ptr;
+   args[1] = LLVMConstInt(LLVMInt32Type(), unit, 0);
+   args[2] = sampler->store_ptr;
+
+   lp_build_intrinsic(builder, "fetch_texel", LLVMVoidType(), args, 3);
+
+   for (i = 0; i < NUM_CHANNELS; ++i) {
+      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+      LLVMValueRef texel_ptr = LLVMBuildGEP(builder, sampler->store_ptr, &index, 1, "");
+      texel[i] = LLVMBuildLoad(builder, texel_ptr, "");
+   }
+}
+
+
+struct lp_build_sampler_soa *
+lp_c_sampler_soa_create(LLVMValueRef context_ptr)
+{
+   struct lp_c_sampler_soa *sampler;
+
+   sampler = CALLOC_STRUCT(lp_c_sampler_soa);
+   if(!sampler)
+      return NULL;
+
+   sampler->base.destroy = lp_c_sampler_soa_destroy;
+   sampler->base.emit_fetch_texel = lp_c_sampler_soa_emit_fetch_texel;
+   sampler->context_ptr = context_ptr;
+
+   return &sampler->base;
+}
+
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_sample_llvm.c b/src/gallium/drivers/llvmpipe/lp_tex_sample_llvm.c
new file mode 100644
index 0000000000..d2a6ae21f5
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_tex_sample_llvm.c
@@ -0,0 +1,196 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 VMware, Inc.
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * Texture sampling code generation
+ *
+ * This file is nothing more than ugly glue between three largely independent
+ * entities:
+ * - TGSI -> LLVM translation (i.e., lp_build_tgsi_soa)
+ * - texture sampling code generation (i.e., lp_build_sample_soa)
+ * - LLVM pipe driver
+ *
+ * All interesting code is in the functions mentioned above. There is really
+ * nothing to see here.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#include "pipe/p_defines.h"
+#include "pipe/p_shader_tokens.h"
+#include "lp_bld_debug.h"
+#include "lp_bld_type.h"
+#include "lp_bld_intr.h"
+#include "lp_bld_sample.h"
+#include "lp_bld_tgsi.h"
+#include "lp_state.h"
+#include "lp_tex_sample.h"
+
+
+/**
+ * This provides the bridge between the sampler state store in lp_jit_context
+ * and lp_jit_texture and the sampler code generator. It provides the
+ * texture layout information required by the texture sampler code generator
+ * in terms of the state stored in lp_jit_context and lp_jit_texture in runtime.
+ */
+struct llvmpipe_sampler_dynamic_state
+{
+   struct lp_sampler_dynamic_state base;
+
+   const struct lp_sampler_static_state *static_state;
+
+   LLVMValueRef context_ptr;
+};
+
+
+/**
+ * This is the bridge between our sampler and the TGSI translator.
+ */
+struct lp_llvm_sampler_soa
+{
+   struct lp_build_sampler_soa base;
+
+   struct llvmpipe_sampler_dynamic_state dynamic_state;
+};
+
+
+/**
+ * Fetch the specified member of the lp_jit_texture structure.
+ *
+ * @sa http://llvm.org/docs/GetElementPtr.html
+ */
+static LLVMValueRef
+lp_llvm_texture_member(struct lp_sampler_dynamic_state *base,
+                       LLVMBuilderRef builder,
+                       unsigned unit,
+                       unsigned member_index,
+                       const char *member_name)
+{
+   struct llvmpipe_sampler_dynamic_state *state = (struct llvmpipe_sampler_dynamic_state *)base;
+   LLVMValueRef indices[4];
+   LLVMValueRef ptr;
+   LLVMValueRef res;
+
+   assert(unit < PIPE_MAX_SAMPLERS);
+
+   /* context[0] */
+   indices[0] = LLVMConstInt(LLVMInt32Type(), 0, 0);
+   /* context[0].textures */
+   indices[1] = LLVMConstInt(LLVMInt32Type(), LP_JIT_CONTEXT_TEXTURES_INDEX, 0);
+   /* context[0].textures[unit] */
+   indices[2] = LLVMConstInt(LLVMInt32Type(), unit, 0);
+   /* context[0].textures[unit].member */
+   indices[3] = LLVMConstInt(LLVMInt32Type(), member_index, 0);
+
+   ptr = LLVMBuildGEP(builder, state->context_ptr, indices, Elements(indices), "");
+
+   res = LLVMBuildLoad(builder, ptr, "");
+
+   lp_build_name(res, "context.texture%u.%s", unit, member_name);
+
+   return res;
+}
+
+
+/**
+ * Helper macro to instantiate the functions that generate the code to fetch
+ * the members of lp_jit_texture to fulfill the sampler code generator requests.
+ *
+ * This complexity is the price we have to pay to keep the texture sampler code
+ * generator a reusable module without dependencies to llvmpipe internals.
+ */
+#define LP_LLVM_TEXTURE_MEMBER(_name, _index) \
+   static LLVMValueRef \
+   lp_llvm_texture_##_name( struct lp_sampler_dynamic_state *base, \
+                            LLVMBuilderRef builder, \
+                            unsigned unit) \
+   { \
+      return lp_llvm_texture_member(base, builder, unit, _index, #_name ); \
+   }
+
+
+LP_LLVM_TEXTURE_MEMBER(width,    LP_JIT_TEXTURE_WIDTH)
+LP_LLVM_TEXTURE_MEMBER(height,   LP_JIT_TEXTURE_HEIGHT)
+LP_LLVM_TEXTURE_MEMBER(stride,   LP_JIT_TEXTURE_STRIDE)
+LP_LLVM_TEXTURE_MEMBER(data_ptr, LP_JIT_TEXTURE_DATA)
+
+
+static void
+lp_llvm_sampler_soa_destroy(struct lp_build_sampler_soa *sampler)
+{
+   FREE(sampler);
+}
+
+
+static void
+lp_llvm_sampler_soa_emit_fetch_texel(struct lp_build_sampler_soa *base,
+                                     LLVMBuilderRef builder,
+                                     struct lp_type type,
+                                     unsigned unit,
+                                     unsigned num_coords,
+                                     const LLVMValueRef *coords,
+                                     LLVMValueRef lodbias,
+                                     LLVMValueRef *texel)
+{
+   struct lp_llvm_sampler_soa *sampler = (struct lp_llvm_sampler_soa *)base;
+
+   assert(unit < PIPE_MAX_SAMPLERS);
+
+   lp_build_sample_soa(builder,
+                       &sampler->dynamic_state.static_state[unit],
+                       &sampler->dynamic_state.base,
+                       type,
+                       unit,
+                       num_coords,
+                       coords,
+                       lodbias,
+                       texel);
+}
+
+
+struct lp_build_sampler_soa *
+lp_llvm_sampler_soa_create(const struct lp_sampler_static_state *static_state,
+                           LLVMValueRef context_ptr)
+{
+   struct lp_llvm_sampler_soa *sampler;
+
+   sampler = CALLOC_STRUCT(lp_llvm_sampler_soa);
+   if(!sampler)
+      return NULL;
+
+   sampler->base.destroy = lp_llvm_sampler_soa_destroy;
+   sampler->base.emit_fetch_texel = lp_llvm_sampler_soa_emit_fetch_texel;
+   sampler->dynamic_state.base.width = lp_llvm_texture_width;
+   sampler->dynamic_state.base.height = lp_llvm_texture_height;
+   sampler->dynamic_state.base.stride = lp_llvm_texture_stride;
+   sampler->dynamic_state.base.data_ptr = lp_llvm_texture_data_ptr;
+   sampler->dynamic_state.static_state = static_state;
+   sampler->dynamic_state.context_ptr = context_ptr;
+
+   return &sampler->base;
+}
+
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_cache.c b/src/gallium/drivers/llvmpipe/lp_tile_cache.c
index 143afec3d3..0c06b659a1 100644
--- a/src/gallium/drivers/llvmpipe/lp_tile_cache.c
+++ b/src/gallium/drivers/llvmpipe/lp_tile_cache.c
@@ -44,10 +44,53 @@
 #include "lp_tile_cache.h"
 
 
+#define MAX_WIDTH 4096
+#define MAX_HEIGHT 4096
+
+
+enum llvmpipe_tile_status
+{
+   LP_TILE_STATUS_UNDEFINED = 0,
+   LP_TILE_STATUS_CLEAR = 1,
+   LP_TILE_STATUS_DEFINED = 2
+};
+
+
+struct llvmpipe_cached_tile
+{
+   enum llvmpipe_tile_status status;
+
+   /** color in SOA format */
+   uint8_t *color;
+};
+
+
+struct llvmpipe_tile_cache
+{
+   struct pipe_screen *screen;
+   struct pipe_surface *surface;  /**< the surface we're caching */
+   struct pipe_transfer *transfer;
+   void *transfer_map;
+
+   struct llvmpipe_cached_tile entries[MAX_WIDTH/TILE_SIZE][MAX_HEIGHT/TILE_SIZE];
+
+   uint8_t clear_color[4];  /**< for color bufs */
+   uint clear_val;        /**< for z+stencil, or packed color clear value */
+
+   struct llvmpipe_cached_tile *last_tile;  /**< most recently retrieved tile */
+};
+
+
 struct llvmpipe_tile_cache *
 lp_create_tile_cache( struct pipe_screen *screen )
 {
    struct llvmpipe_tile_cache *tc;
+   int maxLevels, maxTexSize;
+
+   /* sanity checking: max sure MAX_WIDTH/HEIGHT >= largest texture image */
+   maxLevels = screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_2D_LEVELS);
+   maxTexSize = 1 << (maxLevels - 1);
+   assert(MAX_WIDTH >= maxTexSize);
 
    tc = CALLOC_STRUCT( llvmpipe_tile_cache );
    if(!tc)
@@ -225,11 +268,14 @@ lp_flush_tile_cache(struct llvmpipe_tile_cache *tc)
                            tc->clear_val);
 
             screen->transfer_unmap(screen, pt);
+
+            tile->status = LP_TILE_STATUS_UNDEFINED;
             break;
          }
 
          case LP_TILE_STATUS_DEFINED:
             lp_put_tile_rgba_soa(pt, x, y, tile->color);
+            tile->status = LP_TILE_STATUS_UNDEFINED;
             break;
          }
       }
@@ -257,7 +303,7 @@ lp_get_cached_tile(struct llvmpipe_tile_cache *tc,
 
    case LP_TILE_STATUS_UNDEFINED:
       /* get new tile data from transfer */
-      lp_get_tile_rgba_soa(pt, x, y, tile->color);
+      lp_get_tile_rgba_soa(pt, x & ~(TILE_SIZE - 1), y & ~(TILE_SIZE - 1), tile->color);
       tile->status = LP_TILE_STATUS_DEFINED;
       break;
 
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_cache.h b/src/gallium/drivers/llvmpipe/lp_tile_cache.h
index 6d8ba5ece7..161bab3799 100644
--- a/src/gallium/drivers/llvmpipe/lp_tile_cache.h
+++ b/src/gallium/drivers/llvmpipe/lp_tile_cache.h
@@ -33,42 +33,7 @@
 #include "lp_tile_soa.h"
 
 
-enum llvmpipe_tile_status
-{
-   LP_TILE_STATUS_UNDEFINED = 0,
-   LP_TILE_STATUS_CLEAR = 1,
-   LP_TILE_STATUS_DEFINED = 2
-};
-
-
-struct llvmpipe_cached_tile
-{
-   enum llvmpipe_tile_status status;
-
-   /** color in SOA format */
-   uint8_t *color;
-};
-
-
-/** XXX move these */
-#define MAX_WIDTH 2048
-#define MAX_HEIGHT 2048
-
-
-struct llvmpipe_tile_cache
-{
-   struct pipe_screen *screen;
-   struct pipe_surface *surface;  /**< the surface we're caching */
-   struct pipe_transfer *transfer;
-   void *transfer_map;
-
-   struct llvmpipe_cached_tile entries[MAX_WIDTH/TILE_SIZE][MAX_HEIGHT/TILE_SIZE];
-
-   uint8_t clear_color[4];  /**< for color bufs */
-   uint clear_val;        /**< for z+stencil, or packed color clear value */
-
-   struct llvmpipe_cached_tile *last_tile;  /**< most recently retrieved tile */
-};
+struct llvmpipe_tile_cache;  /* opaque */
 
 
 extern struct llvmpipe_tile_cache *
diff --git a/src/gallium/drivers/nv04/nv04_screen.c b/src/gallium/drivers/nv04/nv04_screen.c
index ff2febb668..170ce3eb7e 100644
--- a/src/gallium/drivers/nv04/nv04_screen.c
+++ b/src/gallium/drivers/nv04/nv04_screen.c
@@ -16,8 +16,6 @@ nv04_screen_get_param(struct pipe_screen *screen, int param)
 		return 0;
 	case PIPE_CAP_GLSL:
 		return 0;
-	case PIPE_CAP_S3TC:
-		return 0;
 	case PIPE_CAP_ANISOTROPIC_FILTER:
 		return 0;
 	case PIPE_CAP_POINT_SPRITE:
diff --git a/src/gallium/drivers/nv10/nv10_screen.c b/src/gallium/drivers/nv10/nv10_screen.c
index 4469b22d91..ee5901e743 100644
--- a/src/gallium/drivers/nv10/nv10_screen.c
+++ b/src/gallium/drivers/nv10/nv10_screen.c
@@ -15,8 +15,6 @@ nv10_screen_get_param(struct pipe_screen *screen, int param)
 		return 0;
 	case PIPE_CAP_GLSL:
 		return 0;
-	case PIPE_CAP_S3TC:
-		return 0;
 	case PIPE_CAP_ANISOTROPIC_FILTER:
 		return 1;
 	case PIPE_CAP_POINT_SPRITE:
diff --git a/src/gallium/drivers/nv20/nv20_screen.c b/src/gallium/drivers/nv20/nv20_screen.c
index e6924ad71e..4eeacd1afd 100644
--- a/src/gallium/drivers/nv20/nv20_screen.c
+++ b/src/gallium/drivers/nv20/nv20_screen.c
@@ -15,8 +15,6 @@ nv20_screen_get_param(struct pipe_screen *screen, int param)
 		return 0;
 	case PIPE_CAP_GLSL:
 		return 0;
-	case PIPE_CAP_S3TC:
-		return 0;
 	case PIPE_CAP_ANISOTROPIC_FILTER:
 		return 1;
 	case PIPE_CAP_POINT_SPRITE:
diff --git a/src/gallium/drivers/nv30/nv30_screen.c b/src/gallium/drivers/nv30/nv30_screen.c
index f8285e4455..41af38450b 100644
--- a/src/gallium/drivers/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nv30/nv30_screen.c
@@ -22,8 +22,6 @@ nv30_screen_get_param(struct pipe_screen *pscreen, int param)
 		return 1;
 	case PIPE_CAP_GLSL:
 		return 0;
-	case PIPE_CAP_S3TC:
-		return 0;
 	case PIPE_CAP_ANISOTROPIC_FILTER:
 		return 1;
 	case PIPE_CAP_POINT_SPRITE:
diff --git a/src/gallium/drivers/nv40/nv40_screen.c b/src/gallium/drivers/nv40/nv40_screen.c
index 5d2a4216c5..bd13dfddd1 100644
--- a/src/gallium/drivers/nv40/nv40_screen.c
+++ b/src/gallium/drivers/nv40/nv40_screen.c
@@ -21,8 +21,6 @@ nv40_screen_get_param(struct pipe_screen *pscreen, int param)
 		return 1;
 	case PIPE_CAP_GLSL:
 		return 0;
-	case PIPE_CAP_S3TC:
-		return 1;
 	case PIPE_CAP_ANISOTROPIC_FILTER:
 		return 1;
 	case PIPE_CAP_POINT_SPRITE:
diff --git a/src/gallium/drivers/nv50/nv50_context.c b/src/gallium/drivers/nv50/nv50_context.c
index 6e8f4f9750..fca078b174 100644
--- a/src/gallium/drivers/nv50/nv50_context.c
+++ b/src/gallium/drivers/nv50/nv50_context.c
@@ -37,11 +37,12 @@ nv50_flush(struct pipe_context *pipe, unsigned flags,
 
 	/* We need this in the ddx for reliable composite, not sure what we're
 	 * actually flushing. We generate all our own flushes with flags = 0. */
-	WAIT_RING(chan, 3);
+	WAIT_RING(chan, 2);
 	BEGIN_RING(chan, eng2d, 0x0110, 1);
 	OUT_RING  (chan, 0);
 
-	FIRE_RING(chan);
+	if (flags & PIPE_FLUSH_FRAME)
+		FIRE_RING(chan);
 }
 
 static void
@@ -110,6 +111,9 @@ nv50_create(struct pipe_screen *pscreen, unsigned pctx_id)
 	nv50->pipe.is_texture_referenced = nv50_is_texture_referenced;
 	nv50->pipe.is_buffer_referenced = nv50_is_buffer_referenced;
 
+	screen->base.channel->user_private = nv50;
+	screen->base.channel->flush_notify = nv50_state_flush_notify;
+
 	nv50_init_surface_functions(nv50);
 	nv50_init_state_functions(nv50);
 	nv50_init_query_functions(nv50);
diff --git a/src/gallium/drivers/nv50/nv50_context.h b/src/gallium/drivers/nv50/nv50_context.h
index 1e9e8e49bf..4608854d71 100644
--- a/src/gallium/drivers/nv50/nv50_context.h
+++ b/src/gallium/drivers/nv50/nv50_context.h
@@ -116,6 +116,7 @@ struct nv50_state {
 	unsigned miptree_nr;
 	struct nouveau_stateobj *vertprog;
 	struct nouveau_stateobj *fragprog;
+	struct nouveau_stateobj *programs;
 	struct nouveau_stateobj *vtxfmt;
 	struct nouveau_stateobj *vtxbuf;
 	struct nouveau_stateobj *vtxattr;
@@ -190,10 +191,12 @@ extern void nv50_clear(struct pipe_context *pipe, unsigned buffers,
 /* nv50_program.c */
 extern void nv50_vertprog_validate(struct nv50_context *nv50);
 extern void nv50_fragprog_validate(struct nv50_context *nv50);
+extern void nv50_linkage_validate(struct nv50_context *nv50);
 extern void nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p);
 
 /* nv50_state_validate.c */
 extern boolean nv50_state_validate(struct nv50_context *nv50);
+extern void nv50_state_flush_notify(struct nouveau_channel *chan);
 
 /* nv50_tex.c */
 extern void nv50_tex_validate(struct nv50_context *);
diff --git a/src/gallium/drivers/nv50/nv50_miptree.c b/src/gallium/drivers/nv50/nv50_miptree.c
index 03b9243b82..93479a0314 100644
--- a/src/gallium/drivers/nv50/nv50_miptree.c
+++ b/src/gallium/drivers/nv50/nv50_miptree.c
@@ -148,6 +148,7 @@ nv50_miptree_blanket(struct pipe_screen *pscreen, const struct pipe_texture *pt,
 	mt->image_nr = 1;
 	mt->level[0].pitch = *stride;
 	mt->level[0].image_offset = CALLOC(1, sizeof(unsigned));
+	mt->level[0].tile_mode = bo->tile_mode;
 
 	nouveau_bo_ref(bo, &mt->base.bo);
 	return &mt->base.base;
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c
index 4a838529de..eb90d5e66f 100644
--- a/src/gallium/drivers/nv50/nv50_program.c
+++ b/src/gallium/drivers/nv50/nv50_program.c
@@ -112,6 +112,10 @@ struct nv50_pc {
 	struct nv50_reg *temp_temp[16];
 	unsigned temp_temp_nr;
 
+	/* broadcast and destination replacement regs */
+	struct nv50_reg *r_brdc;
+	struct nv50_reg *r_dst[4];
+
 	unsigned interp_mode[32];
 	/* perspective interpolation registers */
 	struct nv50_reg *iv_p;
@@ -124,6 +128,25 @@ struct nv50_pc {
 	boolean allow32;
 };
 
+static INLINE void
+ctor_reg(struct nv50_reg *reg, unsigned type, int index, int hw)
+{
+	reg->type = type;
+	reg->index = index;
+	reg->hw = hw;
+	reg->neg = 0;
+	reg->rhw = -1;
+	reg->acc = 0;
+}
+
+static INLINE unsigned
+popcnt4(uint32_t val)
+{
+	static const unsigned cnt[16]
+	= { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
+	return cnt[val & 0xf];
+}
+
 static void
 alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
 {
@@ -184,11 +207,8 @@ alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
 
 	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
 		if (!pc->r_temp[i]) {
-			r = CALLOC_STRUCT(nv50_reg);
-			r->type = P_TEMP;
-			r->index = -1;
-			r->hw = i;
-			r->rhw = -1;
+			r = MALLOC_STRUCT(nv50_reg);
+			ctor_reg(r, P_TEMP, -1, i);
 			pc->r_temp[i] = r;
 			return r;
 		}
@@ -254,10 +274,8 @@ alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx)
 		return alloc_temp4(pc, dst, idx + 4);
 
 	for (i = 0; i < 4; i++) {
-		dst[i] = CALLOC_STRUCT(nv50_reg);
-		dst[i]->type = P_TEMP;
-		dst[i]->index = -1;
-		dst[i]->hw = idx + i;
+		dst[i] = MALLOC_STRUCT(nv50_reg);
+		ctor_reg(dst[i], P_TEMP, -1, idx + i);
 		pc->r_temp[idx + i] = dst[i];
 	}
 
@@ -309,7 +327,7 @@ ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w)
 static struct nv50_reg *
 alloc_immd(struct nv50_pc *pc, float f)
 {
-	struct nv50_reg *r = CALLOC_STRUCT(nv50_reg);
+	struct nv50_reg *r = MALLOC_STRUCT(nv50_reg);
 	unsigned hw;
 
 	for (hw = 0; hw < pc->immd_nr * 4; hw++)
@@ -319,9 +337,7 @@ alloc_immd(struct nv50_pc *pc, float f)
 	if (hw == pc->immd_nr * 4)
 		hw = ctor_immd(pc, f, -f, 0.5 * f, 0) * 4;
 
-	r->type = P_IMMD;
-	r->hw = hw;
-	r->index = -1;
+	ctor_reg(r, P_IMMD, -1, hw);
 	return r;
 }
 
@@ -786,6 +802,9 @@ emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 #define CVTOP_SAT	0x08
 #define CVTOP_ABS	0x10
 
+/* 0x04 == 32 bit */
+/* 0x40 == dst is float */
+/* 0x80 == src is float */
 #define CVT_F32_F32 0xc4
 #define CVT_F32_S32 0x44
 #define CVT_F32_U32 0x64
@@ -795,7 +814,7 @@ emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 
 static void
 emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
-	 int wp, unsigned cop, unsigned fmt)
+	 int wp, unsigned cvn, unsigned fmt)
 {
 	struct nv50_program_exec *e;
 
@@ -804,7 +823,7 @@ emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
 
 	e->inst[0] |= 0xa0000000;
 	e->inst[1] |= 0x00004000;
-	e->inst[1] |= (cop << 16);
+	e->inst[1] |= (cvn << 16);
 	e->inst[1] |= (fmt << 24);
 	set_src_0(pc, src, e);
 
@@ -821,49 +840,80 @@ emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
 	emit(pc, e);
 }
 
+/* nv50 Condition codes:
+ *  0x1 = LT
+ *  0x2 = EQ
+ *  0x3 = LE
+ *  0x4 = GT
+ *  0x5 = NE
+ *  0x6 = GE
+ *  0x7 = set condition code ? (used before bra.lt/le/gt/ge)
+ *  0x8 = unordered bit (allows NaN)
+ */
 static void
-emit_set(struct nv50_pc *pc, unsigned c_op, struct nv50_reg *dst,
+emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp,
 	 struct nv50_reg *src0, struct nv50_reg *src1)
 {
+	static const unsigned cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
+
 	struct nv50_program_exec *e = exec(pc);
-	unsigned inv_cop[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
 	struct nv50_reg *rdst;
 
-	assert(c_op <= 7);
+	assert(ccode < 16);
 	if (check_swap_src_0_1(pc, &src0, &src1))
-		c_op = inv_cop[c_op];
+		ccode = cc_swapped[ccode & 7] | (ccode & 8);
 
 	rdst = dst;
-	if (dst->type != P_TEMP)
+	if (dst && dst->type != P_TEMP)
 		dst = alloc_temp(pc, NULL);
 
 	/* set.u32 */
 	set_long(pc, e);
 	e->inst[0] |= 0xb0000000;
-	e->inst[1] |= (3 << 29);
-	e->inst[1] |= (c_op << 14);
-	/*XXX: breaks things, .u32 by default?
-	 *     decuda will disasm as .u16 and use .lo/.hi regs, but this
-	 *     doesn't seem to match what the hw actually does.
-	inst[1] |= 0x04000000; << breaks things.. .u32 by default?
+	e->inst[1] |= 0x60000000 | (ccode << 14);
+
+	/* XXX: decuda will disasm as .u16 and use .lo/.hi regs, but
+	 * that doesn't seem to match what the hw actually does
+	e->inst[1] |= 0x04000000; << breaks things, u32 by default ?
 	 */
-	set_dst(pc, dst, e);
+
+	if (wp >= 0)
+		set_pred_wr(pc, 1, wp, e);
+	if (dst)
+		set_dst(pc, dst, e);
+	else {
+		e->inst[0] |= 0x000001fc;
+		e->inst[1] |= 0x00000008;
+	}
+
 	set_src_0(pc, src0, e);
 	set_src_1(pc, src1, e);
-	emit(pc, e);
 
-	/* cvt.f32.u32 */
-	e = exec(pc);
-	e->inst[0] = 0xa0000001;
-	e->inst[1] = 0x64014780;
-	set_dst(pc, rdst, e);
-	set_src_0(pc, dst, e);
 	emit(pc, e);
 
-	if (dst != rdst)
+	/* cvt.f32.u32/s32 (?) if we didn't only write the predicate */
+	if (rdst)
+		emit_cvt(pc, rdst, dst, -1, CVTOP_ABS | CVTOP_RN, CVT_F32_S32);
+	if (rdst && rdst != dst)
 		free_temp(pc, dst);
 }
 
+static INLINE unsigned
+map_tgsi_setop_cc(unsigned op)
+{
+	switch (op) {
+	case TGSI_OPCODE_SLT: return 0x1;
+	case TGSI_OPCODE_SGE: return 0x6;
+	case TGSI_OPCODE_SEQ: return 0x2;
+	case TGSI_OPCODE_SGT: return 0x4;
+	case TGSI_OPCODE_SLE: return 0x3;
+	case TGSI_OPCODE_SNE: return 0xd;
+	default:
+		assert(0);
+		return 0;
+	}
+}
+
 static INLINE void
 emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 {
@@ -890,6 +940,12 @@ emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 	emit_cvt(pc, dst, src, -1, CVTOP_ABS, CVT_F32_F32);
 }
 
+static INLINE void
+emit_sat(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
+{
+	emit_cvt(pc, dst, src, -1, CVTOP_SAT, CVT_F32_F32);
+}
+
 static void
 emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
 	 struct nv50_reg **src)
@@ -1159,6 +1215,70 @@ negate_supported(const struct tgsi_full_instruction *insn, int i)
 	}
 }
 
+/* Return a read mask for source registers deduced from opcode & write mask. */
+static unsigned
+nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c)
+{
+	unsigned x, mask = insn->FullDstRegisters[0].DstRegister.WriteMask;
+
+	switch (insn->Instruction.Opcode) {
+	case TGSI_OPCODE_COS:
+	case TGSI_OPCODE_SIN:
+		return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0);
+	case TGSI_OPCODE_DP3:
+		return 0x7;
+	case TGSI_OPCODE_DP4:
+	case TGSI_OPCODE_DPH:
+	case TGSI_OPCODE_KIL: /* WriteMask ignored */
+		return 0xf;
+	case TGSI_OPCODE_DST:
+		return mask & (c ? 0xa : 0x6);
+	case TGSI_OPCODE_EX2:
+	case TGSI_OPCODE_LG2:
+	case TGSI_OPCODE_POW:
+	case TGSI_OPCODE_RCP:
+	case TGSI_OPCODE_RSQ:
+	case TGSI_OPCODE_SCS:
+		return 0x1;
+	case TGSI_OPCODE_LIT:
+		return 0xb;
+	case TGSI_OPCODE_TEX:
+	case TGSI_OPCODE_TXP:
+	{
+		const struct tgsi_instruction_ext_texture *tex;
+
+		assert(insn->Instruction.Extended);
+		tex = &insn->InstructionExtTexture;
+
+		mask = 0x7;
+		if (insn->Instruction.Opcode == TGSI_OPCODE_TXP)
+			mask |= 0x8;
+
+		switch (tex->Texture) {
+		case TGSI_TEXTURE_1D:
+			mask &= 0x9;
+			break;
+		case TGSI_TEXTURE_2D:
+			mask &= 0xb;
+			break;
+		default:
+			break;
+		}
+	}
+		return mask;
+	case TGSI_OPCODE_XPD:
+		x = 0;
+		if (mask & 1) x |= 0x6;
+		if (mask & 2) x |= 0x5;
+		if (mask & 4) x |= 0x3;
+		return x;
+	default:
+		break;
+	}
+
+	return mask;
+}
+
 static struct nv50_reg *
 tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
 {
@@ -1258,93 +1378,126 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
 	return r;
 }
 
-/* returns TRUE if instruction can overwrite sources before they're read */
+/* return TRUE for ops that produce only a single result */
 static boolean
-direct2dest_op(const struct tgsi_full_instruction *insn)
+is_scalar_op(unsigned op)
 {
-	if (insn->Instruction.Saturate)
-		return FALSE;
-
-	switch (insn->Instruction.Opcode) {
+	switch (op) {
 	case TGSI_OPCODE_COS:
+	case TGSI_OPCODE_DP2:
 	case TGSI_OPCODE_DP3:
 	case TGSI_OPCODE_DP4:
 	case TGSI_OPCODE_DPH:
-	case TGSI_OPCODE_KIL:
-	case TGSI_OPCODE_LIT:
+	case TGSI_OPCODE_EX2:
+	case TGSI_OPCODE_LG2:
 	case TGSI_OPCODE_POW:
 	case TGSI_OPCODE_RCP:
 	case TGSI_OPCODE_RSQ:
-	case TGSI_OPCODE_SCS:
 	case TGSI_OPCODE_SIN:
+		/*
+	case TGSI_OPCODE_KIL:
+	case TGSI_OPCODE_LIT:
+	case TGSI_OPCODE_SCS:
+		*/
+		return TRUE;
+	default:
+		return FALSE;
+	}
+}
+
+/* Returns a bitmask indicating which dst components depend
+ * on source s, component c (reverse of nv50_tgsi_src_mask).
+ */
+static unsigned
+nv50_tgsi_dst_revdep(unsigned op, int s, int c)
+{
+	if (is_scalar_op(op))
+		return 0x1;
+
+	switch (op) {
+	case TGSI_OPCODE_DST:
+		return (1 << c) & (s ? 0xa : 0x6);
+	case TGSI_OPCODE_XPD:
+		switch (c) {
+		case 0: return 0x6;
+		case 1: return 0x5;
+		case 2: return 0x3;
+		case 3: return 0x0;
+		default:
+			assert(0);
+			return 0x0;
+		}
+	case TGSI_OPCODE_LIT:
+	case TGSI_OPCODE_SCS:
 	case TGSI_OPCODE_TEX:
 	case TGSI_OPCODE_TXP:
-		return FALSE;
+		/* these take care of dangerous swizzles themselves */
+		return 0x0;
+	case TGSI_OPCODE_IF:
+	case TGSI_OPCODE_KIL:
+		/* don't call this function for these ops */
+		assert(0);
+		return 0;
 	default:
-		return TRUE;
+		/* linear vector instruction */
+		return (1 << c);
 	}
 }
 
 static boolean
-nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
+nv50_program_tx_insn(struct nv50_pc *pc,
+		     const struct tgsi_full_instruction *inst)
 {
-	const struct tgsi_full_instruction *inst = &tok->FullInstruction;
-	struct nv50_reg *rdst[4], *dst[4], *src[3][4], *temp;
+	struct nv50_reg *rdst[4], *dst[4], *brdc, *src[3][4], *temp;
 	unsigned mask, sat, unit;
-	boolean assimilate = FALSE;
 	int i, c;
 
 	mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
 	sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;
 
+	memset(src, 0, sizeof(src));
+
 	for (c = 0; c < 4; c++) {
-		if (mask & (1 << c))
+		if ((mask & (1 << c)) && !pc->r_dst[c])
 			dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);
 		else
-			dst[c] = NULL;
-		rdst[c] = NULL;
-		src[0][c] = NULL;
-		src[1][c] = NULL;
-		src[2][c] = NULL;
+			dst[c] = pc->r_dst[c];
+		rdst[c] = dst[c];
 	}
 
 	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
 		const struct tgsi_full_src_register *fs = &inst->FullSrcRegisters[i];
+		unsigned src_mask;
+		boolean neg_supp;
+
+		src_mask = nv50_tgsi_src_mask(inst, i);
+		neg_supp = negate_supported(inst, i);
 
 		if (fs->SrcRegister.File == TGSI_FILE_SAMPLER)
 			unit = fs->SrcRegister.Index;
 
 		for (c = 0; c < 4; c++)
-			src[i][c] = tgsi_src(pc, c, fs,
-					     negate_supported(inst, i));
+			if (src_mask & (1 << c))
+				src[i][c] = tgsi_src(pc, c, fs, neg_supp);
 	}
 
-	if (sat) {
-		for (c = 0; c < 4; c++) {
-			rdst[c] = dst[c];
-			dst[c] = temp_temp(pc);
-		}
+	brdc = temp = pc->r_brdc;
+	if (brdc && brdc->type != P_TEMP) {
+		temp = temp_temp(pc);
+		if (sat)
+			brdc = temp;
 	} else
-	if (direct2dest_op(inst)) {
+	if (sat) {
 		for (c = 0; c < 4; c++) {
-			if (!dst[c] || dst[c]->type != P_TEMP)
+			if (!(mask & (1 << c)) || dst[c]->type == P_TEMP)
 				continue;
-
-			for (i = c + 1; i < 4; i++) {
-				if (dst[c] == src[0][i] ||
-				    dst[c] == src[1][i] ||
-				    dst[c] == src[2][i])
-					break;
-			}
-			if (i == 4)
-				continue;
-
-			assimilate = TRUE;
 			rdst[c] = dst[c];
-			dst[c] = alloc_temp(pc, NULL);
+			dst[c] = temp_temp(pc);
 		}
 	}
 
+	assert(brdc || !is_scalar_op(inst->Instruction.Opcode));
+
 	switch (inst->Instruction.Opcode) {
 	case TGSI_OPCODE_ABS:
 		for (c = 0; c < 4; c++) {
@@ -1360,74 +1513,56 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 			emit_add(pc, dst[c], src[0][c], src[1][c]);
 		}
 		break;
-	case TGSI_OPCODE_COS:
-		temp = temp_temp(pc);
-		emit_precossin(pc, temp, src[0][0]);
-		emit_flop(pc, 5, temp, temp);
+	case TGSI_OPCODE_CEIL:
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
-			emit_mov(pc, dst[c], temp);
+			emit_cvt(pc, dst[c], src[0][c], -1,
+				 CVTOP_CEIL, CVT_F32_F32);
+		}
+		break;
+	case TGSI_OPCODE_COS:
+		if (mask & 8) {
+			emit_precossin(pc, temp, src[0][3]);
+			emit_flop(pc, 5, dst[3], temp);
+			if (!(mask &= 7))
+				break;
+			if (temp == dst[3])
+				temp = brdc = temp_temp(pc);
 		}
+		emit_precossin(pc, temp, src[0][0]);
+		emit_flop(pc, 5, brdc, temp);
 		break;
 	case TGSI_OPCODE_DP3:
-		temp = temp_temp(pc);
 		emit_mul(pc, temp, src[0][0], src[1][0]);
 		emit_mad(pc, temp, src[0][1], src[1][1], temp);
-		emit_mad(pc, temp, src[0][2], src[1][2], temp);
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_mov(pc, dst[c], temp);
-		}
+		emit_mad(pc, brdc, src[0][2], src[1][2], temp);
 		break;
 	case TGSI_OPCODE_DP4:
-		temp = temp_temp(pc);
 		emit_mul(pc, temp, src[0][0], src[1][0]);
 		emit_mad(pc, temp, src[0][1], src[1][1], temp);
 		emit_mad(pc, temp, src[0][2], src[1][2], temp);
-		emit_mad(pc, temp, src[0][3], src[1][3], temp);
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_mov(pc, dst[c], temp);
-		}
+		emit_mad(pc, brdc, src[0][3], src[1][3], temp);
 		break;
 	case TGSI_OPCODE_DPH:
-		temp = temp_temp(pc);
 		emit_mul(pc, temp, src[0][0], src[1][0]);
 		emit_mad(pc, temp, src[0][1], src[1][1], temp);
 		emit_mad(pc, temp, src[0][2], src[1][2], temp);
-		emit_add(pc, temp, src[1][3], temp);
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_mov(pc, dst[c], temp);
-		}
+		emit_add(pc, brdc, src[1][3], temp);
 		break;
 	case TGSI_OPCODE_DST:
-	{
-		struct nv50_reg *one = alloc_immd(pc, 1.0);
-		if (mask & (1 << 0))
-			emit_mov(pc, dst[0], one);
 		if (mask & (1 << 1))
 			emit_mul(pc, dst[1], src[0][1], src[1][1]);
 		if (mask & (1 << 2))
 			emit_mov(pc, dst[2], src[0][2]);
 		if (mask & (1 << 3))
 			emit_mov(pc, dst[3], src[1][3]);
-		FREE(one);
-	}
+		if (mask & (1 << 0))
+			emit_mov_immdval(pc, dst[0], 1.0f);
 		break;
 	case TGSI_OPCODE_EX2:
-		temp = temp_temp(pc);
 		emit_preex2(pc, temp, src[0][0]);
-		emit_flop(pc, 6, temp, temp);
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_mov(pc, dst[c], temp);
-		}
+		emit_flop(pc, 6, brdc, temp);
 		break;
 	case TGSI_OPCODE_FLR:
 		for (c = 0; c < 4; c++) {
@@ -1450,19 +1585,12 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		emit_kil(pc, src[0][1]);
 		emit_kil(pc, src[0][2]);
 		emit_kil(pc, src[0][3]);
-		pc->p->cfg.fp.regs[2] |= 0x00100000;
 		break;
 	case TGSI_OPCODE_LIT:
 		emit_lit(pc, &dst[0], mask, &src[0][0]);
 		break;
 	case TGSI_OPCODE_LG2:
-		temp = temp_temp(pc);
-		emit_flop(pc, 3, temp, src[0][0]);
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_mov(pc, dst[c], temp);
-		}
+		emit_flop(pc, 3, brdc, src[0][0]);
 		break;
 	case TGSI_OPCODE_LRP:
 		temp = temp_temp(pc);
@@ -1510,31 +1638,18 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		}
 		break;
 	case TGSI_OPCODE_POW:
-		temp = temp_temp(pc);
-		emit_pow(pc, temp, src[0][0], src[1][0]);
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_mov(pc, dst[c], temp);
-		}
+		emit_pow(pc, brdc, src[0][0], src[1][0]);
 		break;
 	case TGSI_OPCODE_RCP:
-		for (c = 3; c >= 0; c--) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_flop(pc, 0, dst[c], src[0][0]);
-		}
+		emit_flop(pc, 0, brdc, src[0][0]);
 		break;
 	case TGSI_OPCODE_RSQ:
-		for (c = 3; c >= 0; c--) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_flop(pc, 2, dst[c], src[0][0]);
-		}
+		emit_flop(pc, 2, brdc, src[0][0]);
 		break;
 	case TGSI_OPCODE_SCS:
 		temp = temp_temp(pc);
-		emit_precossin(pc, temp, src[0][0]);
+		if (mask & 3)
+			emit_precossin(pc, temp, src[0][0]);
 		if (mask & (1 << 0))
 			emit_flop(pc, 5, dst[0], temp);
 		if (mask & (1 << 1))
@@ -1544,28 +1659,29 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		if (mask & (1 << 3))
 			emit_mov_immdval(pc, dst[3], 1.0);
 		break;
-	case TGSI_OPCODE_SGE:
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_set(pc, 6, dst[c], src[0][c], src[1][c]);
-		}
-		break;
 	case TGSI_OPCODE_SIN:
-		temp = temp_temp(pc);
-		emit_precossin(pc, temp, src[0][0]);
-		emit_flop(pc, 4, temp, temp);
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_mov(pc, dst[c], temp);
+		if (mask & 8) {
+			emit_precossin(pc, temp, src[0][3]);
+			emit_flop(pc, 4, dst[3], temp);
+			if (!(mask &= 7))
+				break;
+			if (temp == dst[3])
+				temp = brdc = temp_temp(pc);
 		}
+		emit_precossin(pc, temp, src[0][0]);
+		emit_flop(pc, 4, brdc, temp);
 		break;
 	case TGSI_OPCODE_SLT:
+	case TGSI_OPCODE_SGE:
+	case TGSI_OPCODE_SEQ:
+	case TGSI_OPCODE_SGT:
+	case TGSI_OPCODE_SLE:
+	case TGSI_OPCODE_SNE:
+		i = map_tgsi_setop_cc(inst->Instruction.Opcode);
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
-			emit_set(pc, 1, dst[c], src[0][c], src[1][c]);
+			emit_set(pc, i, dst[c], -1, src[0][c], src[1][c]);
 		}
 		break;
 	case TGSI_OPCODE_SUB:
@@ -1583,6 +1699,14 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		emit_tex(pc, dst, mask, src[0], unit,
 			 inst->InstructionExtTexture.Texture, TRUE);
 		break;
+	case TGSI_OPCODE_TRUNC:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_cvt(pc, dst[c], src[0][c], -1,
+				 CVTOP_TRUNC, CVT_F32_F32);
+		}
+		break;
 	case TGSI_OPCODE_XPD:
 		temp = temp_temp(pc);
 		if (mask & (1 << 0)) {
@@ -1607,17 +1731,22 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		return FALSE;
 	}
 
+	if (brdc) {
+		if (sat)
+			emit_sat(pc, brdc, brdc);
+		for (c = 0; c < 4; c++)
+			if ((mask & (1 << c)) && dst[c] != brdc)
+				emit_mov(pc, dst[c], brdc);
+	} else
 	if (sat) {
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
-			emit_cvt(pc, rdst[c], dst[c], -1, CVTOP_SAT,
-				 CVT_F32_F32);
+			/* in this case we saturate later */
+			if (dst[c]->type == P_TEMP && dst[c]->index < 0)
+				continue;
+			emit_sat(pc, rdst[c], dst[c]);
 		}
-	} else if (assimilate) {
-		for (c = 0; c < 4; c++)
-			if (rdst[c])
-				assimilate_temp(pc, rdst[c], dst[c]);
 	}
 
 	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
@@ -1626,9 +1755,6 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 				continue;
 			if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD)
 				FREE(src[i][c]);
-			else
-			if (src[i][c]->acc == pc->insn_cur)
-				release_hw(pc, src[i][c]);
 		}
 	}
 
@@ -1636,180 +1762,271 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 	return TRUE;
 }
 
-/* Adjust a bitmask that indicates what components of a source are used,
- * we use this in tx_prep so we only load interpolants that are needed.
- */
 static void
-insn_adjust_mask(const struct tgsi_full_instruction *insn, unsigned *mask)
+prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn)
 {
-	const struct tgsi_instruction_ext_texture *tex;
-
-	switch (insn->Instruction.Opcode) {
-	case TGSI_OPCODE_DP3:
-		*mask = 0x7;
-		break;
-	case TGSI_OPCODE_DP4:
-	case TGSI_OPCODE_DPH:
-		*mask = 0xF;
-		break;
-	case TGSI_OPCODE_LIT:
-		*mask = 0xB;
-		break;
-	case TGSI_OPCODE_RCP:
-	case TGSI_OPCODE_RSQ:
-		*mask = 0x1;
-		break;
-	case TGSI_OPCODE_TEX:
-	case TGSI_OPCODE_TXP:
-		assert(insn->Instruction.Extended);
-		tex = &insn->InstructionExtTexture;
-
-		*mask = 0x7;
-		if (tex->Texture == TGSI_TEXTURE_1D)
-			*mask = 0x1;
-		else
-		if (tex->Texture == TGSI_TEXTURE_2D)
-			*mask = 0x3;
-
-		if (insn->Instruction.Opcode == TGSI_OPCODE_TXP)
-			*mask |= 0x8;
-		break;
-	default:
-		break;
-	}
-}
-
-static void
-prep_inspect_insn(struct nv50_pc *pc, const union tgsi_full_token *tok,
-		  unsigned *r_usage[2])
-{
-	const struct tgsi_full_instruction *insn;
+	struct nv50_reg *reg = NULL;
 	const struct tgsi_full_src_register *src;
 	const struct tgsi_dst_register *dst;
+	unsigned i, c, k, mask;
 
-	unsigned i, c, k, n, mask, *acc_p;
-
-	insn = &tok->FullInstruction;
 	dst = &insn->FullDstRegisters[0].DstRegister;
 	mask = dst->WriteMask;
 
-	if (!r_usage[0])
-		r_usage[0] = CALLOC(pc->temp_nr * 4, sizeof(unsigned));
-	if (!r_usage[1])
-		r_usage[1] = CALLOC(pc->attr_nr * 4, sizeof(unsigned));
+        if (dst->File == TGSI_FILE_TEMPORARY)
+                reg = pc->temp;
+        else
+        if (dst->File == TGSI_FILE_OUTPUT)
+                reg = pc->result;
 
-	if (dst->File == TGSI_FILE_TEMPORARY) {
+	if (reg) {
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
-			r_usage[0][dst->Index * 4 + c] = pc->insn_nr;
+			reg[dst->Index * 4 + c].acc = pc->insn_nr;
 		}
 	}
 
 	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
 		src = &insn->FullSrcRegisters[i];
 
-		switch (src->SrcRegister.File) {
-		case TGSI_FILE_TEMPORARY:
-			acc_p = r_usage[0];
-			break;
-		case TGSI_FILE_INPUT:
-			acc_p = r_usage[1];
-			break;
-		default:
+		if (src->SrcRegister.File == TGSI_FILE_TEMPORARY)
+			reg = pc->temp;
+		else
+		if (src->SrcRegister.File == TGSI_FILE_INPUT)
+			reg = pc->attr;
+		else
 			continue;
-		}
 
-		insn_adjust_mask(insn, &mask);
+		mask = nv50_tgsi_src_mask(insn, i);
 
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
-
 			k = tgsi_util_get_full_src_register_extswizzle(src, c);
-			switch (k) {
-			case TGSI_EXTSWIZZLE_X:
-			case TGSI_EXTSWIZZLE_Y:
-			case TGSI_EXTSWIZZLE_Z:
-			case TGSI_EXTSWIZZLE_W:
-				n = src->SrcRegister.Index * 4 + k;
-				acc_p[n] = pc->insn_nr;
-				break;
-			default:
-				break;
-			}
+
+			if (k > TGSI_EXTSWIZZLE_W)
+				continue;
+
+			reg[src->SrcRegister.Index * 4 + k].acc = pc->insn_nr;
 		}
 	}
 }
 
+/* Returns a bitmask indicating which dst components need to be
+ * written to temporaries first to avoid 'corrupting' sources.
+ *
+ * m[i]   (out) indicate component to write in the i-th position
+ * rdep[c] (in) bitmasks of dst[i] that require dst[c] as source
+ */
 static unsigned
-load_fp_attrib(struct nv50_pc *pc, int i, unsigned *acc, int *mid,
-	       int *aid, int *p_oid)
+nv50_revdep_reorder(unsigned m[4], unsigned rdep[4])
 {
-	struct nv50_reg *iv;
-	int oid, c, n;
-	unsigned mask = 0;
+	unsigned i, c, x, unsafe;
 
-	iv = (pc->interp_mode[i] & INTERP_CENTROID) ? pc->iv_c : pc->iv_p;
+	for (c = 0; c < 4; c++)
+		m[c] = c;
 
-	for (c = 0, n = i * 4; c < 4; c++, n++) {
-		oid = (*p_oid)++;
-		pc->attr[n].type = P_TEMP;
-		pc->attr[n].index = i;
+	/* Swap as long as a dst component written earlier is depended on
+	 * by one written later, but the next one isn't depended on by it.
+	 */
+	for (c = 0; c < 3; c++) {
+		if (rdep[m[c + 1]] & (1 << m[c]))
+			continue; /* if next one is depended on by us */
+		for (i = c + 1; i < 4; i++)
+			/* if we are depended on by a later one */
+			if (rdep[m[c]] & (1 << m[i]))
+				break;
+		if (i == 4)
+			continue;
+		/* now, swap */
+		x = m[c];
+		m[c] = m[c + 1];
+		m[c + 1] = x;
+
+		/* restart */
+		c = 0;
+	}
+
+	/* mark dependencies that could not be resolved by reordering */
+	for (i = 0; i < 3; ++i)
+		for (c = i + 1; c < 4; ++c)
+			if (rdep[m[i]] & (1 << m[c]))
+				unsafe |= (1 << i);
+
+	/* NOTE: $unsafe is with respect to order, not component */
+	return unsafe;
+}
 
-		if (pc->attr[n].acc == acc[n])
+/* Select a suitable dst register for broadcasting scalar results,
+ * or return NULL if we have to allocate an extra TEMP.
+ *
+ * If e.g. only 1 component is written, we may also emit the final
+ * result to a write-only register.
+ */
+static struct nv50_reg *
+tgsi_broadcast_dst(struct nv50_pc *pc,
+		   const struct tgsi_full_dst_register *fd, unsigned mask)
+{
+	if (fd->DstRegister.File == TGSI_FILE_TEMPORARY) {
+		int c = ffs(~mask & fd->DstRegister.WriteMask);
+		if (c)
+			return tgsi_dst(pc, c - 1, fd);
+	} else {
+		int c = ffs(fd->DstRegister.WriteMask) - 1;
+		if ((1 << c) == fd->DstRegister.WriteMask)
+			return tgsi_dst(pc, c, fd);
+	}
+
+	return NULL;
+}
+
+/* Scan source swizzles and return a bitmask indicating dst regs that
+ * also occur among the src regs, and fill rdep for nv50_revdep_reoder.
+ */
+static unsigned
+nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn,
+		       unsigned rdep[4])
+{
+	const struct tgsi_full_dst_register *fd = &insn->FullDstRegisters[0];
+	const struct tgsi_full_src_register *fs;
+	unsigned i, deqs = 0;
+
+	for (i = 0; i < 4; ++i)
+		rdep[i] = 0;
+
+	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
+		unsigned chn, mask = nv50_tgsi_src_mask(insn, i);
+		boolean neg_supp = negate_supported(insn, i);
+
+		fs = &insn->FullSrcRegisters[i];
+		if (fs->SrcRegister.File != fd->DstRegister.File ||
+		    fs->SrcRegister.Index != fd->DstRegister.Index)
 			continue;
-		mask |= (1 << c);
 
-		pc->attr[n].acc = acc[n];
-		pc->attr[n].rhw = pc->attr[n].hw = -1;
-		alloc_reg(pc, &pc->attr[n]);
+		for (chn = 0; chn < 4; ++chn) {
+			unsigned s, c;
+
+			if (!(mask & (1 << chn))) /* src is not read */
+				continue;
+			c = tgsi_util_get_full_src_register_extswizzle(fs, chn);
+			s = tgsi_util_get_full_src_register_sign_mode(fs, chn);
 
-		pc->attr[n].rhw = (*aid)++;
-		emit_interp(pc, &pc->attr[n], iv, pc->interp_mode[i]);
+			if (c > TGSI_EXTSWIZZLE_W ||
+			    !(fd->DstRegister.WriteMask & (1 << c)))
+				continue;
 
-		pc->p->cfg.fp.map[(*mid) / 4] |= oid << (8 * ((*mid) % 4));
-		(*mid)++;
-		pc->p->cfg.fp.regs[1] += 0x00010001;
+			/* no danger if src is copied to TEMP first */
+			if ((s != TGSI_UTIL_SIGN_KEEP) &&
+			    (s != TGSI_UTIL_SIGN_TOGGLE || !neg_supp))
+				continue;
+
+			rdep[c] |= nv50_tgsi_dst_revdep(
+				insn->Instruction.Opcode, i, chn);
+			deqs |= (1 << c);
+		}
 	}
 
-	return mask;
+	return deqs;
 }
 
 static boolean
-nv50_program_tx_prep(struct nv50_pc *pc)
+nv50_tgsi_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 {
-	struct tgsi_parse_context p;
-	boolean ret = FALSE;
-	unsigned i, c;
-	unsigned fcol, bcol, fcrd, depr;
+	struct tgsi_full_instruction insn = tok->FullInstruction;
+	const struct tgsi_full_dst_register *fd;
+	unsigned i, deqs, rdep[4], m[4];
+
+	fd = &tok->FullInstruction.FullDstRegisters[0];
+	deqs = nv50_tgsi_scan_swizzle(&insn, rdep);
+
+	if (is_scalar_op(insn.Instruction.Opcode)) {
+		pc->r_brdc = tgsi_broadcast_dst(pc, fd, deqs);
+		if (!pc->r_brdc)
+			pc->r_brdc = temp_temp(pc);
+		return nv50_program_tx_insn(pc, &insn);
+	}
+	pc->r_brdc = NULL;
+
+	if (!deqs)
+		return nv50_program_tx_insn(pc, &insn);
+
+	deqs = nv50_revdep_reorder(m, rdep);
 
-	/* count (centroid) perspective interpolations */
-	unsigned centroid_loads = 0;
-	unsigned perspect_loads = 0;
+	for (i = 0; i < 4; ++i) {
+		assert(pc->r_dst[m[i]] == NULL);
 
-	/* track register access for temps and attrs */
-	unsigned *r_usage[2];
-	r_usage[0] = NULL;
-	r_usage[1] = NULL;
+		insn.FullDstRegisters[0].DstRegister.WriteMask =
+			fd->DstRegister.WriteMask & (1 << m[i]);
 
-	depr = fcol = bcol = fcrd = 0xffff;
+		if (!insn.FullDstRegisters[0].DstRegister.WriteMask)
+			continue;
+
+		if (deqs & (1 << i))
+			pc->r_dst[m[i]] = alloc_temp(pc, NULL);
 
-	if (pc->p->type == PIPE_SHADER_FRAGMENT) {
-		pc->p->cfg.fp.regs[0] = 0x01000404;
-		pc->p->cfg.fp.regs[1] = 0x00000400;
+		if (!nv50_program_tx_insn(pc, &insn))
+			return FALSE;
 	}
 
-	tgsi_parse_init(&p, pc->p->pipe.tokens);
-	while (!tgsi_parse_end_of_tokens(&p)) {
-		const union tgsi_full_token *tok = &p.FullToken;
+	for (i = 0; i < 4; i++) {
+		struct nv50_reg *reg = pc->r_dst[i];
+		if (!reg)
+			continue;
+		pc->r_dst[i] = NULL;
+
+		if (insn.Instruction.Saturate == TGSI_SAT_ZERO_ONE)
+			emit_sat(pc, tgsi_dst(pc, i, fd), reg);
+		else
+			emit_mov(pc, tgsi_dst(pc, i, fd), reg);
+		free_temp(pc, reg);
+	}
 
-		tgsi_parse_token(&p);
+	return TRUE;
+}
+
+static void
+load_interpolant(struct nv50_pc *pc, struct nv50_reg *reg)
+{
+	struct nv50_reg *iv, **ppiv;
+	unsigned mode = pc->interp_mode[reg->index];
+
+	ppiv = (mode & INTERP_CENTROID) ? &pc->iv_c : &pc->iv_p;
+	iv = *ppiv;
+
+	if ((mode & INTERP_PERSPECTIVE) && !iv) {
+		iv = *ppiv = alloc_temp(pc, NULL);
+		iv->rhw = popcnt4(pc->p->cfg.regs[1] >> 24) - 1;
+
+		emit_interp(pc, iv, NULL, mode & INTERP_CENTROID);
+		emit_flop(pc, 0, iv, iv);
+
+		/* XXX: when loading interpolants dynamically, move these
+		 * to the program head, or make sure it can't be skipped.
+		 */
+	}
+
+	emit_interp(pc, reg, iv, mode);
+}
+
+static boolean
+nv50_program_tx_prep(struct nv50_pc *pc)
+{
+	struct tgsi_parse_context tp;
+	struct nv50_program *p = pc->p;
+	boolean ret = FALSE;
+	unsigned i, c, flat_nr = 0;
+
+	tgsi_parse_init(&tp, pc->p->pipe.tokens);
+	while (!tgsi_parse_end_of_tokens(&tp)) {
+		const union tgsi_full_token *tok = &tp.FullToken;
+
+		tgsi_parse_token(&tp);
 		switch (tok->Token.Type) {
 		case TGSI_TOKEN_TYPE_IMMEDIATE:
 		{
 			const struct tgsi_full_immediate *imm =
-				&p.FullToken.FullImmediate;
+				&tp.FullToken.FullImmediate;
 
 			ctor_immd(pc, imm->u[0].Float,
 				      imm->u[1].Float,
@@ -1820,78 +2037,61 @@ nv50_program_tx_prep(struct nv50_pc *pc)
 		case TGSI_TOKEN_TYPE_DECLARATION:
 		{
 			const struct tgsi_full_declaration *d;
-			unsigned last, first, mode;
+			unsigned si, last, first, mode;
 
-			d = &p.FullToken.FullDeclaration;
+			d = &tp.FullToken.FullDeclaration;
 			first = d->DeclarationRange.First;
 			last = d->DeclarationRange.Last;
 
 			switch (d->Declaration.File) {
 			case TGSI_FILE_TEMPORARY:
-				if (pc->temp_nr < (last + 1))
-					pc->temp_nr = last + 1;
 				break;
 			case TGSI_FILE_OUTPUT:
-				if (pc->result_nr < (last + 1))
-					pc->result_nr = last + 1;
-
-				if (!d->Declaration.Semantic)
+				if (!d->Declaration.Semantic ||
+				    p->type == PIPE_SHADER_FRAGMENT)
 					break;
 
+				si = d->Semantic.SemanticIndex;
 				switch (d->Semantic.SemanticName) {
-				case TGSI_SEMANTIC_POSITION:
-					depr = first;
-					pc->p->cfg.fp.regs[2] |= 0x00000100;
-					pc->p->cfg.fp.regs[3] |= 0x00000011;
+				case TGSI_SEMANTIC_BCOLOR:
+					p->cfg.two_side[si].hw = first;
+					if (p->cfg.io_nr > first)
+						p->cfg.io_nr = first;
+					break;
+				case TGSI_SEMANTIC_PSIZE:
+					p->cfg.psiz = first;
+					if (p->cfg.io_nr > first)
+						p->cfg.io_nr = first;
+					break;
+					/*
+				case TGSI_SEMANTIC_CLIP_DISTANCE:
+					p->cfg.clpd = MIN2(p->cfg.clpd, first);
 					break;
+					*/
 				default:
 					break;
 				}
-
 				break;
 			case TGSI_FILE_INPUT:
 			{
-				if (pc->attr_nr < (last + 1))
-					pc->attr_nr = last + 1;
-
-				if (pc->p->type != PIPE_SHADER_FRAGMENT)
+				if (p->type != PIPE_SHADER_FRAGMENT)
 					break;
 
 				switch (d->Declaration.Interpolate) {
 				case TGSI_INTERPOLATE_CONSTANT:
 					mode = INTERP_FLAT;
+					flat_nr++;
 					break;
 				case TGSI_INTERPOLATE_PERSPECTIVE:
 					mode = INTERP_PERSPECTIVE;
+					p->cfg.regs[1] |= 0x08 << 24;
 					break;
 				default:
 					mode = INTERP_LINEAR;
 					break;
 				}
-
-				if (d->Declaration.Semantic) {
-					switch (d->Semantic.SemanticName) {
-					case TGSI_SEMANTIC_POSITION:
-						fcrd = first;
-						break;
-					case TGSI_SEMANTIC_COLOR:
-						fcol = first;
-						mode = INTERP_PERSPECTIVE;
-						break;
-					case TGSI_SEMANTIC_BCOLOR:
-						bcol = first;
-						mode = INTERP_PERSPECTIVE;
-						break;
-					}
-				}
-
-				if (d->Declaration.Centroid) {
+				if (d->Declaration.Centroid)
 					mode |= INTERP_CENTROID;
-					if (mode & INTERP_PERSPECTIVE)
-						centroid_loads++;
-				} else
-				if (mode & INTERP_PERSPECTIVE)
-					perspect_loads++;
 
 				assert(last < 32);
 				for (i = first; i <= last; i++)
@@ -1899,8 +2099,6 @@ nv50_program_tx_prep(struct nv50_pc *pc)
 			}
 				break;
 			case TGSI_FILE_CONSTANT:
-				if (pc->param_nr < (last + 1))
-					pc->param_nr = last + 1;
 				break;
 			case TGSI_FILE_SAMPLER:
 				break;
@@ -1913,182 +2111,155 @@ nv50_program_tx_prep(struct nv50_pc *pc)
 			break;
 		case TGSI_TOKEN_TYPE_INSTRUCTION:
 			pc->insn_nr++;
-			prep_inspect_insn(pc, tok, r_usage);
+			prep_inspect_insn(pc, &tok->FullInstruction);
 			break;
 		default:
 			break;
 		}
 	}
 
-	if (pc->temp_nr) {
-		pc->temp = CALLOC(pc->temp_nr * 4, sizeof(struct nv50_reg));
-		if (!pc->temp)
-			goto out_err;
+	if (p->type == PIPE_SHADER_VERTEX) {
+		int rid = 0;
 
-		for (i = 0; i < pc->temp_nr; i++) {
-			for (c = 0; c < 4; c++) {
-				pc->temp[i*4+c].type = P_TEMP;
-				pc->temp[i*4+c].hw = -1;
-				pc->temp[i*4+c].rhw = -1;
-				pc->temp[i*4+c].index = i;
-				pc->temp[i*4+c].acc = r_usage[0][i*4+c];
+		for (i = 0; i < pc->attr_nr * 4; ++i) {
+			if (pc->attr[i].acc) {
+				pc->attr[i].hw = rid++;
+				p->cfg.attr[i / 32] |= 1 << (i % 32);
 			}
 		}
-	}
-
-	if (pc->attr_nr) {
-		int oid = 4, mid = 4, aid = 0;
-		/* oid = VP output id
-		 * aid = FP attribute/interpolant id
-		 * mid = VP output mapping field ID
-		 */
 
-		pc->attr = CALLOC(pc->attr_nr * 4, sizeof(struct nv50_reg));
-		if (!pc->attr)
-			goto out_err;
+		for (i = 0, rid = 0; i < pc->result_nr; ++i) {
+			p->cfg.io[i].hw = rid;
+			p->cfg.io[i].id_vp = i;
 
-		if (pc->p->type == PIPE_SHADER_FRAGMENT) {
-			/* position should be loaded first */
-			if (fcrd != 0xffff) {
-				unsigned mask;
-				mid = 0;
-				mask = load_fp_attrib(pc, fcrd, r_usage[1],
-						      &mid, &aid, &oid);
-				oid = 0;
-				pc->p->cfg.fp.regs[1] |= (mask << 24);
-				pc->p->cfg.fp.map[0] = 0x04040404 * fcrd;
-			}
-			pc->p->cfg.fp.map[0] += 0x03020100;
-
-			/* should do MAD fcrd.xy, fcrd, SOME_CONST, fcrd */
-
-			if (perspect_loads) {
-				pc->iv_p = alloc_temp(pc, NULL);
-
-				if (!(pc->p->cfg.fp.regs[1] & 0x08000000)) {
-					pc->p->cfg.fp.regs[1] |= 0x08000000;
-					pc->iv_p->rhw = aid++;
-					emit_interp(pc, pc->iv_p, NULL,
-						    INTERP_LINEAR);
-					emit_flop(pc, 0, pc->iv_p, pc->iv_p);
-				} else {
-					pc->iv_p->rhw = aid - 1;
-					emit_flop(pc, 0, pc->iv_p,
-						  &pc->attr[fcrd * 4 + 3]);
-				}
+			for (c = 0; c < 4; ++c) {
+				int n = i * 4 + c;
+				if (!pc->result[n].acc)
+					continue;
+				pc->result[n].hw = rid++;
+				p->cfg.io[i].mask |= 1 << c;
 			}
+		}
 
-			if (centroid_loads) {
-				pc->iv_c = alloc_temp(pc, NULL);
-				pc->iv_c->rhw = pc->iv_p ? aid - 1 : aid++;
-				emit_interp(pc, pc->iv_c, NULL,
-					    INTERP_CENTROID);
-				emit_flop(pc, 0, pc->iv_c, pc->iv_c);
-				pc->p->cfg.fp.regs[1] |= 0x08000000;
-			}
+		for (c = 0; c < 2; ++c)
+			if (p->cfg.two_side[c].hw < 0x40)
+				p->cfg.two_side[c] = p->cfg.io[
+					p->cfg.two_side[c].hw];
 
-			for (c = 0; c < 4; c++) {
-				/* I don't know what these values do, but
-				 * let's set them like the blob does:
-				 */
-				if (fcol != 0xffff && r_usage[1][fcol * 4 + c])
-					pc->p->cfg.fp.regs[0] += 0x00010000;
-				if (bcol != 0xffff && r_usage[1][bcol * 4 + c])
-					pc->p->cfg.fp.regs[0] += 0x00010000;
-			}
-
-			for (i = 0; i < pc->attr_nr; i++)
-				load_fp_attrib(pc, i, r_usage[1],
-					       &mid, &aid, &oid);
+		if (p->cfg.psiz < 0x40)
+			p->cfg.psiz = p->cfg.io[p->cfg.psiz].hw;
+	} else
+	if (p->type == PIPE_SHADER_FRAGMENT) {
+		int rid, aid;
+		unsigned n = 0, m = pc->attr_nr - flat_nr;
 
-			if (pc->iv_p)
-				free_temp(pc, pc->iv_p);
-			if (pc->iv_c)
-				free_temp(pc, pc->iv_c);
+		int base = (TGSI_SEMANTIC_POSITION ==
+			    p->info.input_semantic_name[0]) ? 0 : 1;
 
-			pc->p->cfg.fp.high_map = (mid / 4);
-			pc->p->cfg.fp.high_map += ((mid % 4) ? 1 : 0);
-		} else {
-			/* vertex program */
-			for (i = 0; i < pc->attr_nr * 4; i++) {
-				pc->p->cfg.vp.attr[aid / 32] |=
-					(1 << (aid % 32));
-				pc->attr[i].type = P_ATTR;
-				pc->attr[i].hw = aid++;
-				pc->attr[i].index = i / 4;
+		/* non-flat interpolants have to be mapped to
+		 * the lower hardware IDs, so sort them:
+		 */
+		for (i = 0; i < pc->attr_nr; i++) {
+			if (pc->interp_mode[i] == INTERP_FLAT) {
+				p->cfg.io[m].id_vp = i + base;
+				p->cfg.io[m++].id_fp = i;
+			} else {
+				if (!(pc->interp_mode[i] & INTERP_PERSPECTIVE))
+					p->cfg.io[n].linear = TRUE;
+				p->cfg.io[n].id_vp = i + base;
+				p->cfg.io[n++].id_fp = i;
 			}
 		}
-	}
 
-	if (pc->result_nr) {
-		int rid = 0;
+		if (!base) /* set w-coordinate mask from perspective interp */
+			p->cfg.io[0].mask |= p->cfg.regs[1] >> 24;
 
-		pc->result = CALLOC(pc->result_nr * 4, sizeof(struct nv50_reg));
-		if (!pc->result)
-			goto out_err;
+		aid = popcnt4( /* if fcrd isn't contained in cfg.io */
+			base ? (p->cfg.regs[1] >> 24) : p->cfg.io[0].mask);
 
-		for (i = 0; i < pc->result_nr; i++) {
-			for (c = 0; c < 4; c++) {
-				if (pc->p->type == PIPE_SHADER_FRAGMENT) {
-					pc->result[i*4+c].type = P_TEMP;
-					pc->result[i*4+c].hw = -1;
-					pc->result[i*4+c].rhw = (i == depr) ?
-						-1 : rid++;
-				} else {
-					pc->result[i*4+c].type = P_RESULT;
-					pc->result[i*4+c].hw = rid++;
-				}
-				pc->result[i*4+c].index = i;
-			}
+		for (n = 0; n < pc->attr_nr; ++n) {
+			p->cfg.io[n].hw = rid = aid;
+			i = p->cfg.io[n].id_fp;
+
+			for (c = 0; c < 4; ++c) {
+				if (!pc->attr[i * 4 + c].acc)
+					continue;
+				pc->attr[i * 4 + c].rhw = rid++;
+				p->cfg.io[n].mask |= 1 << c;
 
-			if (pc->p->type == PIPE_SHADER_FRAGMENT &&
-			    depr != 0xffff) {
-				pc->result[depr * 4 + 2].rhw =
-					(pc->result_nr - 1) * 4;
+				load_interpolant(pc, &pc->attr[i * 4 + c]);
 			}
+			aid += popcnt4(p->cfg.io[n].mask);
 		}
-	}
 
-	if (pc->param_nr) {
-		int rid = 0;
+		if (!base)
+			p->cfg.regs[1] |= p->cfg.io[0].mask << 24;
 
-		pc->param = CALLOC(pc->param_nr * 4, sizeof(struct nv50_reg));
-		if (!pc->param)
-			goto out_err;
+		m = popcnt4(p->cfg.regs[1] >> 24);
 
-		for (i = 0; i < pc->param_nr; i++) {
-			for (c = 0; c < 4; c++) {
-				pc->param[i*4+c].type = P_CONST;
-				pc->param[i*4+c].hw = rid++;
-				pc->param[i*4+c].index = i;
+		/* set count of non-position inputs and of non-flat
+		 * non-position inputs for FP_INTERPOLANT_CTRL
+		 */
+		p->cfg.regs[1] |= aid - m;
+
+		if (flat_nr) {
+			i = p->cfg.io[pc->attr_nr - flat_nr].hw;
+			p->cfg.regs[1] |= (i - m) << 16;
+		} else
+			p->cfg.regs[1] |= p->cfg.regs[1] << 16;
+
+		/* mark color semantic for light-twoside */
+		n = 0x40;
+		for (i = 0; i < pc->attr_nr; i++) {
+			ubyte si, sn;
+
+			sn = p->info.input_semantic_name[p->cfg.io[i].id_fp];
+			si = p->info.input_semantic_index[p->cfg.io[i].id_fp];
+
+			if (sn == TGSI_SEMANTIC_COLOR) {
+				p->cfg.two_side[si] = p->cfg.io[i];
+
+				/* increase colour count */
+				p->cfg.regs[0] += popcnt4(
+					p->cfg.two_side[si].mask) << 16;
+
+				n = MIN2(n, p->cfg.io[i].hw - m);
 			}
 		}
+		if (n < 0x40)
+			p->cfg.regs[0] += n;
+
+		/* Initialize FP results:
+		 * FragDepth is always first TGSI and last hw output
+		 */
+		i = p->info.writes_z ? 4 : 0;
+		for (rid = 0; i < pc->result_nr * 4; i++)
+			pc->result[i].rhw = rid++;
+		if (p->info.writes_z)
+			pc->result[2].rhw = rid;
 	}
 
 	if (pc->immd_nr) {
 		int rid = 0;
 
-		pc->immd = CALLOC(pc->immd_nr * 4, sizeof(struct nv50_reg));
+		pc->immd = MALLOC(pc->immd_nr * 4 * sizeof(struct nv50_reg));
 		if (!pc->immd)
 			goto out_err;
 
 		for (i = 0; i < pc->immd_nr; i++) {
-			for (c = 0; c < 4; c++) {
-				pc->immd[i*4+c].type = P_IMMD;
-				pc->immd[i*4+c].hw = rid++;
-				pc->immd[i*4+c].index = i;
-			}
+			for (c = 0; c < 4; c++, rid++)
+				ctor_reg(&pc->immd[rid], P_IMMD, i, rid);
 		}
 	}
 
 	ret = TRUE;
 out_err:
-	if (r_usage[0])
-		FREE(r_usage[0]);
-	if (r_usage[1])
-		FREE(r_usage[1]);
+	if (pc->iv_p)
+		free_temp(pc, pc->iv_p);
+	if (pc->iv_c)
+		free_temp(pc, pc->iv_c);
 
-	tgsi_parse_free(&p);
+	tgsi_parse_free(&tp);
 	return ret;
 }
 
@@ -2110,6 +2281,88 @@ free_nv50_pc(struct nv50_pc *pc)
 }
 
 static boolean
+ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p)
+{
+	int i, c;
+	unsigned rtype[2] = { P_ATTR, P_RESULT };
+
+	pc->p = p;
+	pc->temp_nr = p->info.file_max[TGSI_FILE_TEMPORARY] + 1;
+	pc->attr_nr = p->info.file_max[TGSI_FILE_INPUT] + 1;
+	pc->result_nr = p->info.file_max[TGSI_FILE_OUTPUT] + 1;
+	pc->param_nr = p->info.file_max[TGSI_FILE_CONSTANT] + 1;
+
+	p->cfg.high_temp = 4;
+
+	p->cfg.two_side[0].hw = 0x40;
+	p->cfg.two_side[1].hw = 0x40;
+
+	switch (p->type) {
+	case PIPE_SHADER_VERTEX:
+		p->cfg.psiz = 0x40;
+		p->cfg.clpd = 0x40;
+		p->cfg.io_nr = pc->result_nr;
+		break;
+	case PIPE_SHADER_FRAGMENT:
+		rtype[0] = rtype[1] = P_TEMP;
+
+		p->cfg.regs[0] = 0x01000004;
+		p->cfg.io_nr = pc->attr_nr;
+
+		if (p->info.writes_z) {
+			p->cfg.regs[2] |= 0x00000100;
+			p->cfg.regs[3] |= 0x00000011;
+		}
+		if (p->info.uses_kill)
+			p->cfg.regs[2] |= 0x00100000;
+		break;
+	}
+
+	if (pc->temp_nr) {
+		pc->temp = MALLOC(pc->temp_nr * 4 * sizeof(struct nv50_reg));
+		if (!pc->temp)
+			return FALSE;
+
+		for (i = 0; i < pc->temp_nr * 4; ++i)
+			ctor_reg(&pc->temp[i], P_TEMP, i / 4, -1);
+	}
+
+	if (pc->attr_nr) {
+		pc->attr = MALLOC(pc->attr_nr * 4 * sizeof(struct nv50_reg));
+		if (!pc->attr)
+			return FALSE;
+
+		for (i = 0; i < pc->attr_nr * 4; ++i)
+			ctor_reg(&pc->attr[i], rtype[0], i / 4, -1);
+	}
+
+	if (pc->result_nr) {
+		unsigned nr = pc->result_nr * 4;
+
+		pc->result = MALLOC(nr * sizeof(struct nv50_reg));
+		if (!pc->result)
+			return FALSE;
+
+		for (i = 0; i < nr; ++i)
+			ctor_reg(&pc->result[i], rtype[1], i / 4, -1);
+	}
+
+	if (pc->param_nr) {
+		int rid = 0;
+
+		pc->param = MALLOC(pc->param_nr * 4 * sizeof(struct nv50_reg));
+		if (!pc->param)
+			return FALSE;
+
+		for (i = 0; i < pc->param_nr; ++i)
+			for (c = 0; c < 4; ++c, ++rid)
+				ctor_reg(&pc->param[rid], P_CONST, i, rid);
+	}
+
+	return TRUE;
+}
+
+static boolean
 nv50_program_tx(struct nv50_program *p)
 {
 	struct tgsi_parse_context parse;
@@ -2120,8 +2373,10 @@ nv50_program_tx(struct nv50_program *p)
 	pc = CALLOC_STRUCT(nv50_pc);
 	if (!pc)
 		return FALSE;
-	pc->p = p;
-	pc->p->cfg.high_temp = 4;
+
+	ret = ctor_nv50_pc(pc, p);
+	if (ret == FALSE)
+		goto out_cleanup;
 
 	ret = nv50_program_tx_prep(pc);
 	if (ret == FALSE)
@@ -2141,7 +2396,7 @@ nv50_program_tx(struct nv50_program *p)
 		switch (tok->Token.Type) {
 		case TGSI_TOKEN_TYPE_INSTRUCTION:
 			++pc->insn_cur;
-			ret = nv50_program_tx_insn(pc, tok);
+			ret = nv50_tgsi_insn(pc, tok);
 			if (ret == FALSE)
 				goto out_err;
 			break;
@@ -2152,8 +2407,8 @@ nv50_program_tx(struct nv50_program *p)
 
 	if (p->type == PIPE_SHADER_FRAGMENT) {
 		struct nv50_reg out;
+		ctor_reg(&out, P_TEMP, -1, -1);
 
-		out.type = P_TEMP;
 		for (k = 0; k < pc->result_nr * 4; k++) {
 			if (pc->result[k].rhw == -1)
 				continue;
@@ -2258,30 +2513,19 @@ nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
 					 p->immd_nr, NV50_CB_PMISC);
 	}
 
-	if (!p->data[1] && p->param_nr) {
-		struct nouveau_resource *heap =
-			nv50->screen->parm_heap[p->type];
-
-		if (nouveau_resource_alloc(heap, p->param_nr, p, &p->data[1])) {
-			while (heap->next && heap->size < p->param_nr) {
-				struct nv50_program *evict = heap->next->priv;
-				nouveau_resource_free(&evict->data[1]);
-			}
-
-			if (nouveau_resource_alloc(heap, p->param_nr, p,
-						   &p->data[1]))
-				assert(0);
-		}
-	}
+	assert(p->param_nr <= 128);
 
 	if (p->param_nr) {
-		unsigned cbuf = NV50_CB_PVP;
+		unsigned cb;
 		float *map = pipe_buffer_map(pscreen, nv50->constbuf[p->type],
 					     PIPE_BUFFER_USAGE_CPU_READ);
-		if (p->type == PIPE_SHADER_FRAGMENT)
-			cbuf = NV50_CB_PFP;
-		nv50_program_upload_data(nv50, map, p->data[1]->start,
-					 p->param_nr, cbuf);
+
+		if (p->type == PIPE_SHADER_VERTEX)
+			cb = NV50_CB_PVP;
+		else
+			cb = NV50_CB_PFP;
+
+		nv50_program_upload_data(nv50, map, 0, p->param_nr, cb);
 		pipe_buffer_unmap(pscreen, nv50->constbuf[p->type]);
 	}
 }
@@ -2303,32 +2547,30 @@ nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
 		upload = TRUE;
 	}
 
-	if ((p->data[0] && p->data[0]->start != p->data_start[0]) ||
-		(p->data[1] && p->data[1]->start != p->data_start[1])) {
-		for (e = p->exec_head; e; e = e->next) {
-			unsigned ei, ci, bs;
-
-			if (e->param.index < 0)
-				continue;
-			bs = (e->inst[1] >> 22) & 0x07;
-			assert(bs < 2);
-			ei = e->param.shift >> 5;
-			ci = e->param.index + p->data[bs]->start;
+	if (p->data[0] && p->data[0]->start != p->data_start[0])
+		upload = TRUE;
 
-			e->inst[ei] &= ~e->param.mask;
-			e->inst[ei] |= (ci << e->param.shift);
-		}
+	if (!upload)
+		return;
 
-		if (p->data[0])
-			p->data_start[0] = p->data[0]->start;
-		if (p->data[1])
-			p->data_start[1] = p->data[1]->start;
+	for (e = p->exec_head; e; e = e->next) {
+		unsigned ei, ci, bs;
 
-		upload = TRUE;
+		if (e->param.index < 0)
+			continue;
+		bs = (e->inst[1] >> 22) & 0x07;
+		assert(bs < 2);
+		ei = e->param.shift >> 5;
+		ci = e->param.index;
+		if (bs == 0)
+			ci += p->data[bs]->start;
+
+		e->inst[ei] &= ~e->param.mask;
+		e->inst[ei] |= (ci << e->param.shift);
 	}
 
-	if (!upload)
-		return;
+	if (p->data[0])
+		p->data_start[0] = p->data[0]->start;
 
 #ifdef NV50_PROGRAM_DUMP
 	NOUVEAU_ERR("-------\n");
@@ -2402,8 +2644,8 @@ nv50_vertprog_validate(struct nv50_context *nv50)
 	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
 		      NOUVEAU_BO_LOW, 0, 0);
 	so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2);
-	so_data  (so, p->cfg.vp.attr[0]);
-	so_data  (so, p->cfg.vp.attr[1]);
+	so_data  (so, p->cfg.attr[0]);
+	so_data  (so, p->cfg.attr[1]);
 	so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1);
 	so_data  (so, p->cfg.high_result);
 	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 2);
@@ -2421,7 +2663,6 @@ nv50_fragprog_validate(struct nv50_context *nv50)
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
 	struct nv50_program *p = nv50->fragprog;
 	struct nouveau_stateobj *so;
-	unsigned i;
 
 	if (!p->translated) {
 		nv50_program_validate(nv50, p);
@@ -2438,29 +2679,186 @@ nv50_fragprog_validate(struct nv50_context *nv50)
 		      NOUVEAU_BO_HIGH, 0, 0);
 	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
 		      NOUVEAU_BO_LOW, 0, 0);
-	so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4);
-	so_data  (so, p->cfg.fp.regs[0]); /* 0x01000404 / 0x00040404 */
-	so_data  (so, 0x00000004);
-	so_data  (so, 0x00000000);
-	so_data  (so, 0x00000000);
-	so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), p->cfg.fp.high_map);
-	for (i = 0; i < p->cfg.fp.high_map; i++)
-		so_data(so, p->cfg.fp.map[i]);
-	so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 2);
-	so_data  (so, p->cfg.fp.regs[1]); /* 0x08040404 / 0x0f000401 */
+	so_method(so, tesla, NV50TCL_FP_REG_ALLOC_TEMP, 1);
 	so_data  (so, p->cfg.high_temp);
 	so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1);
 	so_data  (so, p->cfg.high_result);
 	so_method(so, tesla, NV50TCL_FP_CTRL_UNK19A8, 1);
-	so_data  (so, p->cfg.fp.regs[2]);
+	so_data  (so, p->cfg.regs[2]);
 	so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1);
-	so_data  (so, p->cfg.fp.regs[3]);
+	so_data  (so, p->cfg.regs[3]);
 	so_method(so, tesla, NV50TCL_FP_START_ID, 1);
 	so_data  (so, 0); /* program start offset */
 	so_ref(so, &nv50->state.fragprog);
 	so_ref(NULL, &so);
 }
 
+static void
+nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base)
+{
+	struct nv50_program *fp = nv50->fragprog;
+	struct nv50_program *vp = nv50->vertprog;
+	unsigned i, c, m = base;
+
+	/* XXX: This can't work correctly in all cases yet, we either
+	 * have to create TGSI_SEMANTIC_PNTC or sprite_coord_mode has
+	 * to be per FP input instead of per VP output
+	 */
+	memset(pntc, 0, 8 * sizeof(uint32_t));
+
+	for (i = 0; i < fp->cfg.io_nr; i++) {
+		uint8_t sn, si;
+		uint8_t j = fp->cfg.io[i].id_vp, k = fp->cfg.io[i].id_fp;
+		unsigned n = popcnt4(fp->cfg.io[i].mask);
+
+		if (fp->info.input_semantic_name[k] != TGSI_SEMANTIC_GENERIC) {
+			m += n;
+			continue;
+		}
+
+		sn = vp->info.input_semantic_name[j];
+		si = vp->info.input_semantic_index[j];
+
+		if (j < fp->cfg.io_nr && sn == TGSI_SEMANTIC_GENERIC) {
+			ubyte mode =
+				nv50->rasterizer->pipe.sprite_coord_mode[si];
+
+			if (mode == PIPE_SPRITE_COORD_NONE) {
+				m += n;
+				continue;
+			}
+		}
+
+		/* this is either PointCoord or replaced by sprite coords */
+		for (c = 0; c < 4; c++) {
+			if (!(fp->cfg.io[i].mask & (1 << c)))
+				continue;
+			pntc[m / 8] |= (c + 1) << ((m % 8) * 4);
+			++m;
+		}
+	}
+}
+
+static int
+nv50_sreg4_map(uint32_t *p_map, int mid, uint32_t lin[4],
+	       struct nv50_sreg4 *fpi, struct nv50_sreg4 *vpo)
+{
+	int c;
+	uint8_t mv = vpo->mask, mf = fpi->mask, oid = vpo->hw;
+	uint8_t *map = (uint8_t *)p_map;
+
+	for (c = 0; c < 4; ++c) {
+		if (mf & 1) {
+			if (fpi->linear == TRUE)
+				lin[mid / 32] |= 1 << (mid % 32);
+			map[mid++] = (mv & 1) ? oid : ((c == 3) ? 0x41 : 0x40);
+		}
+
+		oid += mv & 1;
+		mf >>= 1;
+		mv >>= 1;
+	}
+
+	return mid;
+}
+
+void
+nv50_linkage_validate(struct nv50_context *nv50)
+{
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct nv50_program *vp = nv50->vertprog;
+	struct nv50_program *fp = nv50->fragprog;
+	struct nouveau_stateobj *so;
+	struct nv50_sreg4 dummy, *vpo;
+	int i, n, c, m = 0;
+	uint32_t map[16], lin[4], reg[5], pcrd[8];
+
+	memset(map, 0, sizeof(map));
+	memset(lin, 0, sizeof(lin));
+
+	reg[1] = 0x00000004; /* low and high clip distance map ids */
+	reg[2] = 0x00000000; /* layer index map id (disabled, GP only) */
+	reg[3] = 0x00000000; /* point size map id & enable */
+	reg[0] = fp->cfg.regs[0]; /* colour semantic reg */
+	reg[4] = fp->cfg.regs[1]; /* interpolant info */
+
+	dummy.linear = FALSE;
+	dummy.mask = 0xf; /* map all components of HPOS */
+	m = nv50_sreg4_map(map, m, lin, &dummy, &vp->cfg.io[0]);
+
+	dummy.mask = 0x0;
+
+	if (vp->cfg.clpd < 0x40) {
+		for (c = 0; c < vp->cfg.clpd_nr; ++c)
+			map[m++] = vp->cfg.clpd + c;
+		reg[1] = (m << 8);
+	}
+
+	reg[0] |= m << 8; /* adjust BFC0 id */
+
+	/* if light_twoside is active, it seems FFC0_ID == BFC0_ID is bad */
+	if (nv50->rasterizer->pipe.light_twoside) {
+		vpo = &vp->cfg.two_side[0];
+
+		m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[0], &vpo[0]);
+		m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[1], &vpo[1]);
+	}
+
+	reg[0] += m - 4; /* adjust FFC0 id */
+	reg[4] |= m << 8; /* set mid where 'normal' FP inputs start */
+
+	i = 0;
+	if (fp->info.input_semantic_name[0] == TGSI_SEMANTIC_POSITION)
+		i = 1;
+	for (; i < fp->cfg.io_nr; i++) {
+		ubyte sn = fp->info.input_semantic_name[fp->cfg.io[i].id_fp];
+		ubyte si = fp->info.input_semantic_index[fp->cfg.io[i].id_fp];
+
+		n = fp->cfg.io[i].id_vp;
+		if (n >= vp->cfg.io_nr ||
+		    vp->info.output_semantic_name[n] != sn ||
+		    vp->info.output_semantic_index[n] != si)
+			vpo = &dummy;
+		else
+			vpo = &vp->cfg.io[n];
+
+		m = nv50_sreg4_map(map, m, lin, &fp->cfg.io[i], vpo);
+	}
+
+	if (nv50->rasterizer->pipe.point_size_per_vertex) {
+		map[m / 4] |= vp->cfg.psiz << ((m % 4) * 8);
+		reg[3] = (m++ << 4) | 1;
+	}
+
+	/* now fill the stateobj */
+	so = so_new(64, 0);
+
+	n = (m + 3) / 4;
+	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1);
+	so_data  (so, m);
+	so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n);
+	so_datap (so, map, n);
+
+	so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4);
+	so_datap (so, reg, 4);
+
+	so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 1);
+	so_data  (so, reg[4]);
+
+	so_method(so, tesla, 0x1540, 4);
+	so_datap (so, lin, 4);
+
+	if (nv50->rasterizer->pipe.point_sprite) {
+		nv50_pntc_replace(nv50, pcrd, (reg[4] >> 8) & 0xff);
+
+		so_method(so, tesla, NV50TCL_POINT_COORD_REPLACE_MAP(0), 8);
+		so_datap (so, pcrd, 8);
+	}
+
+        so_ref(so, &nv50->state.programs);
+        so_ref(NULL, &so);
+}
+
 void
 nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
 {
@@ -2476,7 +2874,6 @@ nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
 	nouveau_bo_ref(NULL, &p->bo);
 
 	nouveau_resource_free(&p->data[0]);
-	nouveau_resource_free(&p->data[1]);
 
 	p->translated = 0;
 }
diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h
index 096e0476aa..d78dee083f 100644
--- a/src/gallium/drivers/nv50/nv50_program.h
+++ b/src/gallium/drivers/nv50/nv50_program.h
@@ -15,6 +15,15 @@ struct nv50_program_exec {
 	} param;
 };
 
+struct nv50_sreg4 {
+	uint8_t hw;
+	uint8_t id_vp;
+	uint8_t id_fp;
+
+	uint8_t mask;
+	boolean linear;
+};
+
 struct nv50_program {
 	struct pipe_shader_state pipe;
 	struct tgsi_shader_info info;
@@ -24,8 +33,8 @@ struct nv50_program {
 	struct nv50_program_exec *exec_head;
 	struct nv50_program_exec *exec_tail;
 	unsigned exec_size;
-	struct nouveau_resource *data[2];
-	unsigned data_start[2];
+	struct nouveau_resource *data[1];
+	unsigned data_start[1];
 
 	struct nouveau_bo *bo;
 
@@ -36,14 +45,20 @@ struct nv50_program {
 	struct {
 		unsigned high_temp;
 		unsigned high_result;
-		struct {
-			unsigned attr[2];
-		} vp;
-		struct {
-			unsigned regs[4];
-			unsigned map[5];
-			unsigned high_map;
-		} fp;
+
+		uint32_t attr[2];
+		uint32_t regs[4];
+
+		/* for VPs, io_nr doesn't count 'private' results (PSIZ etc.) */
+		unsigned io_nr;
+		struct nv50_sreg4 io[PIPE_MAX_SHADER_OUTPUTS];
+
+		/* FP colour inputs, VP/GP back colour outputs */
+		struct nv50_sreg4 two_side[2];
+
+		/* VP only */
+		uint8_t clpd, clpd_nr;
+		uint8_t psiz;
 	} cfg;
 };
 
diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c
index c7f80a2203..3b08e1b89f 100644
--- a/src/gallium/drivers/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nv50/nv50_screen.c
@@ -87,12 +87,10 @@ nv50_screen_get_param(struct pipe_screen *pscreen, int param)
 		return 1;
 	case PIPE_CAP_GLSL:
 		return 0;
-	case PIPE_CAP_S3TC:
-		return 1;
 	case PIPE_CAP_ANISOTROPIC_FILTER:
 		return 1;
 	case PIPE_CAP_POINT_SPRITE:
-		return 0;
+		return 1;
 	case PIPE_CAP_MAX_RENDER_TARGETS:
 		return 8;
 	case PIPE_CAP_OCCLUSION_QUERY:
diff --git a/src/gallium/drivers/nv50/nv50_state.c b/src/gallium/drivers/nv50/nv50_state.c
index 4283808ed9..81fa3e34c5 100644
--- a/src/gallium/drivers/nv50/nv50_state.c
+++ b/src/gallium/drivers/nv50/nv50_state.c
@@ -276,6 +276,9 @@ nv50_rasterizer_state_create(struct pipe_context *pipe,
 	so_method(so, tesla, 0x1684, 1);
 	so_data  (so, cso->flatshade_first ? 0 : 1);
 
+	so_method(so, tesla, NV50TCL_VERTEX_TWO_SIDE_ENABLE, 1);
+	so_data  (so, cso->light_twoside);
+
 	so_method(so, tesla, NV50TCL_LINE_WIDTH, 1);
 	so_data  (so, fui(cso->line_width));
 	so_method(so, tesla, NV50TCL_LINE_SMOOTH_ENABLE, 1);
@@ -294,6 +297,9 @@ nv50_rasterizer_state_create(struct pipe_context *pipe,
 	so_method(so, tesla, NV50TCL_POINT_SIZE, 1);
 	so_data  (so, fui(cso->point_size));
 
+	so_method(so, tesla, NV50TCL_POINT_SPRITE_ENABLE, 1);
+	so_data  (so, cso->point_sprite);
+
 	so_method(so, tesla, NV50TCL_POLYGON_MODE_FRONT, 3);
 	if (cso->front_winding == PIPE_WINDING_CCW) {
 		so_data(so, nvgl_polygon_mode(cso->fill_ccw));
diff --git a/src/gallium/drivers/nv50/nv50_state_validate.c b/src/gallium/drivers/nv50/nv50_state_validate.c
index 344c2cf6dd..5a3559ed18 100644
--- a/src/gallium/drivers/nv50/nv50_state_validate.c
+++ b/src/gallium/drivers/nv50/nv50_state_validate.c
@@ -66,7 +66,8 @@ nv50_state_validate_fb(struct nv50_context *nv50)
 			so_data(so, NV50TCL_RT_FORMAT_X8R8G8B8_UNORM);
 			break;
 		}
-		so_data(so, bo->tile_mode << 4);
+		so_data(so, nv50_miptree(pt)->
+				level[fb->cbufs[i]->level].tile_mode << 4);
 		so_data(so, 0x00000000);
 
 		so_method(so, tesla, 0x1224, 1);
@@ -110,7 +111,8 @@ nv50_state_validate_fb(struct nv50_context *nv50)
 			so_data(so, NV50TCL_ZETA_FORMAT_S8Z24_UNORM);
 			break;
 		}
-		so_data(so, bo->tile_mode << 4);
+		so_data(so, nv50_miptree(pt)->
+				level[fb->zsbuf->level].tile_mode << 4);
 		so_data(so, 0x00000000);
 
 		so_method(so, tesla, 0x1538, 1);
@@ -187,6 +189,8 @@ nv50_state_emit(struct nv50_context *nv50)
 		so_emit(chan, nv50->state.vertprog);
 	if (nv50->state.dirty & NV50_NEW_FRAGPROG)
 		so_emit(chan, nv50->state.fragprog);
+	if (nv50->state.dirty & (NV50_NEW_FRAGPROG | NV50_NEW_VERTPROG))
+		so_emit(chan, nv50->state.programs);
 	if (nv50->state.dirty & NV50_NEW_RASTERIZER)
 		so_emit(chan, nv50->state.rast);
 	if (nv50->state.dirty & NV50_NEW_BLEND_COLOUR)
@@ -208,6 +212,12 @@ nv50_state_emit(struct nv50_context *nv50)
 			so_emit(chan, nv50->state.vtxattr);
 	}
 	nv50->state.dirty = 0;
+}
+
+void
+nv50_state_flush_notify(struct nouveau_channel *chan)
+{
+	struct nv50_context *nv50 = chan->user_private;
 
 	so_emit_reloc_markers(chan, nv50->state.fb);
 	so_emit_reloc_markers(chan, nv50->state.vertprog);
@@ -238,6 +248,9 @@ nv50_state_validate(struct nv50_context *nv50)
 	if (nv50->dirty & (NV50_NEW_FRAGPROG | NV50_NEW_FRAGPROG_CB))
 		nv50_fragprog_validate(nv50);
 
+	if (nv50->dirty & (NV50_NEW_FRAGPROG | NV50_NEW_VERTPROG))
+		nv50_linkage_validate(nv50);
+
 	if (nv50->dirty & NV50_NEW_RASTERIZER)
 		so_ref(nv50->rasterizer->so, &nv50->state.rast);
 
diff --git a/src/gallium/drivers/nv50/nv50_surface.c b/src/gallium/drivers/nv50/nv50_surface.c
index b266324f58..6bf6f773b0 100644
--- a/src/gallium/drivers/nv50/nv50_surface.c
+++ b/src/gallium/drivers/nv50/nv50_surface.c
@@ -60,13 +60,13 @@ nv50_surface_set(struct nv50_screen *screen, struct pipe_surface *ps, int dst)
  	format = nv50_format(ps->format);
  	if (format < 0)
  		return 1;
-  
+
  	if (!bo->tile_flags) {
  		BEGIN_RING(chan, eng2d, mthd, 2);
  		OUT_RING  (chan, format);
  		OUT_RING  (chan, 1);
  		BEGIN_RING(chan, eng2d, mthd + 0x14, 5);
- 		OUT_RING  (chan, mt->level[0].pitch);
+		OUT_RING  (chan, mt->level[ps->level].pitch);
  		OUT_RING  (chan, ps->width);
  		OUT_RING  (chan, ps->height);
  		OUT_RELOCh(chan, bo, ps->offset, flags);
@@ -75,7 +75,7 @@ nv50_surface_set(struct nv50_screen *screen, struct pipe_surface *ps, int dst)
  		BEGIN_RING(chan, eng2d, mthd, 5);
  		OUT_RING  (chan, format);
  		OUT_RING  (chan, 0);
- 		OUT_RING  (chan, bo->tile_mode << 4);
+		OUT_RING  (chan, mt->level[ps->level].tile_mode << 4);
  		OUT_RING  (chan, 1);
  		OUT_RING  (chan, 0);
  		BEGIN_RING(chan, eng2d, mthd + 0x18, 4);
diff --git a/src/gallium/drivers/nv50/nv50_transfer.c b/src/gallium/drivers/nv50/nv50_transfer.c
index e9c3562194..bb7731855c 100644
--- a/src/gallium/drivers/nv50/nv50_transfer.c
+++ b/src/gallium/drivers/nv50/nv50_transfer.c
@@ -89,14 +89,14 @@ nv50_transfer_rect_m2mf(struct pipe_screen *pscreen,
 		if (src_bo->tile_flags) {
 			BEGIN_RING(chan, m2mf,
 				NV50_MEMORY_TO_MEMORY_FORMAT_TILING_POSITION_IN, 1);
-			OUT_RING  (chan, (sy << 16) | sx);
+			OUT_RING  (chan, (sy << 16) | (sx * cpp));
 		} else {
 			src_offset += (line_count * src_pitch);
 		}
 		if (dst_bo->tile_flags) {
 			BEGIN_RING(chan, m2mf,
 				NV50_MEMORY_TO_MEMORY_FORMAT_TILING_POSITION_OUT, 1);
-			OUT_RING  (chan, (dy << 16) | dx);
+			OUT_RING  (chan, (dy << 16) | (dx * cpp));
 		} else {
 			dst_offset += (line_count * dst_pitch);
 		}
diff --git a/src/gallium/drivers/r300/Makefile b/src/gallium/drivers/r300/Makefile
index d7a2c8c462..93c2152edc 100644
--- a/src/gallium/drivers/r300/Makefile
+++ b/src/gallium/drivers/r300/Makefile
@@ -9,6 +9,7 @@ C_SOURCES = \
 	r300_chipset.c \
 	r300_clear.c \
 	r300_context.c \
+	r300_debug.c \
 	r300_emit.c \
 	r300_flush.c \
 	r300_fs.c \
diff --git a/src/gallium/drivers/r300/r300_context.c b/src/gallium/drivers/r300/r300_context.c
index da67bc29b8..9cc455135d 100644
--- a/src/gallium/drivers/r300/r300_context.c
+++ b/src/gallium/drivers/r300/r300_context.c
@@ -22,6 +22,9 @@
 
 #include "r300_context.h"
 
+#include "r300_flush.h"
+#include "r300_state_invariant.h"
+
 static boolean r300_draw_range_elements(struct pipe_context* pipe,
                                         struct pipe_buffer* indexBuffer,
                                         unsigned indexSize,
@@ -146,6 +149,8 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
     r300->context.winsys = (struct pipe_winsys*)r300_winsys;
     r300->context.screen = r300_screen(screen);
 
+    r300_init_debug(r300);
+
     r300->context.destroy = r300_destroy_context;
 
     r300->context.clear = r300_clear;
diff --git a/src/gallium/drivers/r300/r300_context.h b/src/gallium/drivers/r300/r300_context.h
index f78492d4aa..52b1c9a6b2 100644
--- a/src/gallium/drivers/r300/r300_context.h
+++ b/src/gallium/drivers/r300/r300_context.h
@@ -184,8 +184,15 @@ struct r300_texture {
     /* Offsets into the buffer. */
     unsigned offset[PIPE_MAX_TEXTURE_LEVELS];
 
-    /* Stride (pitch?) of this texture in bytes */
-    unsigned stride;
+    /**
+     * If non-zero, override the natural texture layout with
+     * a custom stride (in bytes).
+     *
+     * \note Mipmapping fails for textures with a non-natural layout!
+     *
+     * \sa r300_texture_get_stride
+     */
+    unsigned stride_override;
 
     /* Total size of this texture, in bytes. */
     unsigned size;
@@ -211,10 +218,7 @@ struct r300_vertex_format {
     int fs_tab[16];
 };
 
-static struct pipe_viewport_state r300_viewport_identity = {
-    .scale = {1.0, 1.0, 1.0, 1.0},
-    .translate = {0.0, 0.0, 0.0, 0.0},
-};
+extern struct pipe_viewport_state r300_viewport_identity;
 
 struct r300_context {
     /* Parent class */
@@ -275,6 +279,9 @@ struct r300_context {
     uint32_t dirty_state;
     /* Flag indicating whether or not the HW is dirty. */
     uint32_t dirty_hw;
+
+    /** Combination of DBG_xxx flags */
+    unsigned debug;
 };
 
 /* Convenience cast wrapper. */
@@ -288,4 +295,40 @@ struct draw_stage* r300_draw_stage(struct r300_context* r300);
 void r300_init_state_functions(struct r300_context* r300);
 void r300_init_surface_functions(struct r300_context* r300);
 
+/* Debug functionality. */
+
+/**
+ * Debug flags to disable/enable certain groups of debugging outputs.
+ *
+ * \note These may be rather coarse, and the grouping may be impractical.
+ * If you find, while debugging the driver, that a different grouping
+ * of these flags would be beneficial, just feel free to change them
+ * but make sure to update the documentation in r300_debug.c to reflect
+ * those changes.
+ */
+/*@{*/
+#define DBG_HELP    0x0000001
+#define DBG_FP      0x0000002
+#define DBG_VP      0x0000004
+#define DBG_CS      0x0000008
+#define DBG_DRAW    0x0000010
+/*@}*/
+
+static INLINE boolean DBG_ON(struct r300_context * ctx, unsigned flags)
+{
+    return (ctx->debug & flags) ? true : false;
+}
+
+static INLINE void DBG(struct r300_context * ctx, unsigned flags, const char * fmt, ...)
+{
+    if (DBG_ON(ctx, flags)) {
+        va_list va;
+        va_start(va, fmt);
+        debug_vprintf(fmt, va);
+        va_end(va);
+    }
+}
+
+void r300_init_debug(struct r300_context * ctx);
+
 #endif /* R300_CONTEXT_H */
diff --git a/src/gallium/drivers/r300/r300_cs.h b/src/gallium/drivers/r300/r300_cs.h
index 71b142c0db..0a7e470363 100644
--- a/src/gallium/drivers/r300/r300_cs.h
+++ b/src/gallium/drivers/r300/r300_cs.h
@@ -49,7 +49,8 @@
     (RADEON_CP_PACKET0 | ((count) << 16) | ((register) >> 2))
 
 #define CS_LOCALS(context) \
-    struct r300_winsys* cs_winsys = context->winsys; \
+    struct r300_context* const cs_context_copy = (context); \
+    struct r300_winsys* cs_winsys = cs_context_copy->winsys; \
     int cs_count = 0;
 
 #define CHECK_CS(size) \
@@ -58,7 +59,7 @@
 #define BEGIN_CS(size) do { \
     CHECK_CS(size); \
     if (VERY_VERBOSE_CS) { \
-        debug_printf("r300: BEGIN_CS, count %d, in %s (%s:%d)\n", \
+        DBG(cs_context_copy, DBG_CS, "r300: BEGIN_CS, count %d, in %s (%s:%d)\n", \
                 size, __FUNCTION__, __FILE__, __LINE__); \
     } \
     cs_winsys->begin_cs(cs_winsys, (size), \
@@ -78,7 +79,7 @@
 
 #define OUT_CS_REG(register, value) do { \
     if (VERY_VERBOSE_REGISTERS) \
-        debug_printf("r300: writing 0x%08X to register 0x%04X\n", \
+        DBG(cs_context_copy, DBG_CS, "r300: writing 0x%08X to register 0x%04X\n", \
             value, register); \
     assert(register); \
     OUT_CS(CP_PACKET0(register, 0)); \
@@ -89,14 +90,14 @@
  * not the actual packet0 count! */
 #define OUT_CS_REG_SEQ(register, count) do { \
     if (VERY_VERBOSE_REGISTERS) \
-        debug_printf("r300: writing register sequence of %d to 0x%04X\n", \
+        DBG(cs_context_copy, DBG_CS, "r300: writing register sequence of %d to 0x%04X\n", \
             count, register); \
     assert(register); \
     OUT_CS(CP_PACKET0(register, ((count) - 1))); \
 } while (0)
 
 #define OUT_CS_RELOC(bo, offset, rd, wd, flags) do { \
-    debug_printf("r300: writing relocation for buffer %p, offset %d, " \
+    DBG(cs_context_copy, DBG_CS, "r300: writing relocation for buffer %p, offset %d, " \
             "domains (%d, %d, %d)\n", \
         bo, offset, rd, wd, flags); \
     assert(bo); \
@@ -107,7 +108,7 @@
 
 #define END_CS do { \
     if (VERY_VERBOSE_CS) { \
-        debug_printf("r300: END_CS in %s (%s:%d)\n", __FUNCTION__, \
+        DBG(cs_context_copy, DBG_CS, "r300: END_CS in %s (%s:%d)\n", __FUNCTION__, \
                 __FILE__, __LINE__); \
     } \
     if (cs_count != 0) \
@@ -117,7 +118,7 @@
 
 #define FLUSH_CS do { \
     if (VERY_VERBOSE_CS) { \
-        debug_printf("r300: FLUSH_CS in %s (%s:%d)\n\n", __FUNCTION__, \
+        DBG(cs_context_copy, DBG_CS, "r300: FLUSH_CS in %s (%s:%d)\n\n", __FUNCTION__, \
                 __FILE__, __LINE__); \
     } \
     cs_winsys->flush_cs(cs_winsys); \
@@ -127,7 +128,7 @@
 
 #define OUT_CS_ONE_REG(register, count) do { \
     if (VERY_VERBOSE_REGISTERS) \
-        debug_printf("r300: writing data sequence of %d to 0x%04X\n", \
+        DBG(cs_context_copy, DBG_CS, "r300: writing data sequence of %d to 0x%04X\n", \
             count, register); \
     assert(register); \
     OUT_CS(CP_PACKET0(register, ((count) - 1)) | RADEON_ONE_REG_WR); \
@@ -141,7 +142,7 @@
 } while (0)
 
 #define OUT_CS_INDEX_RELOC(bo, offset, count, rd, wd, flags) do { \
-    debug_printf("r300: writing relocation for index buffer %p," \
+    DBG(cs_context_copy, DBG_CS, "r300: writing relocation for index buffer %p," \
             "offset %d\n", bo, offset); \
     assert(bo); \
     OUT_CS(offset); \
diff --git a/src/gallium/drivers/r300/r300_debug.c b/src/gallium/drivers/r300/r300_debug.c
new file mode 100644
index 0000000000..15308dda1d
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_debug.c
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2009 Nicolai Haehnle <nhaehnle@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#include "r300_context.h"
+
+#include <ctype.h>
+
+
+struct debug_option {
+    const char * name;
+    unsigned flag;
+    const char * description;
+};
+
+static struct debug_option debug_options[] = {
+    { "help", DBG_HELP, "Helpful meta-information about the driver" },
+    { "fp", DBG_FP, "Fragment program handling" },
+    { "vp", DBG_VP, "Vertex program handling" },
+    { "cs", DBG_CS, "Command submissions" },
+    { "draw", DBG_DRAW, "Draw and emit" },
+
+    { "all", ~0, "Convenience option that enables all debug flags" },
+
+    /* must be last */
+    { 0, 0, 0 }
+};
+
+void r300_init_debug(struct r300_context * ctx)
+{
+    const char * options = debug_get_option("RADEON_DEBUG", 0);
+    boolean printhint = false;
+
+    if (options) {
+        while(*options) {
+            if (*options == ' ' || *options == ',') {
+                options++;
+                continue;
+            }
+
+            size_t length = strcspn(options, " ,");
+            struct debug_option * opt;
+
+            for(opt = debug_options; opt->name; ++opt) {
+                if (!strncmp(options, opt->name, length)) {
+                    ctx->debug |= opt->flag;
+                    break;
+                }
+            }
+
+            if (!opt->name) {
+                debug_printf("Unknown debug option: %s\n", options);
+                printhint = true;
+            }
+
+            options += length;
+        }
+
+        if (!ctx->debug)
+            printhint = true;
+    }
+
+    if (printhint || ctx->debug & DBG_HELP) {
+        debug_printf("You can enable debug output by setting the RADEON_DEBUG environment variable\n"
+                     "to a comma-separated list of debug options. Available options are:\n");
+        for(struct debug_option * opt = debug_options; opt->name; ++opt) {
+            debug_printf("    %s: %s\n", opt->name, opt->description);
+        }
+    }
+}
diff --git a/src/gallium/drivers/r300/r300_emit.c b/src/gallium/drivers/r300/r300_emit.c
index bd4d59e6f1..a1b36ba2ed 100644
--- a/src/gallium/drivers/r300/r300_emit.c
+++ b/src/gallium/drivers/r300/r300_emit.c
@@ -25,6 +25,7 @@
 #include "r300_emit.h"
 
 #include "r300_fs.h"
+#include "r300_state_derived.h"
 #include "r300_vs.h"
 
 void r300_emit_blend_state(struct r300_context* r300,
@@ -282,7 +283,7 @@ void r300_emit_fb_state(struct r300_context* r300,
     for (i = 0; i < fb->nr_cbufs; i++) {
         tex = (struct r300_texture*)fb->cbufs[i]->texture;
         assert(tex && tex->buffer && "cbuf is marked, but NULL!");
-        pixpitch = tex->stride / tex->tex.block.size;
+        pixpitch = r300_texture_get_stride(tex, 0) / tex->tex.block.size;
 
         OUT_CS_REG_SEQ(R300_RB3D_COLOROFFSET0 + (4 * i), 1);
         OUT_CS_RELOC(tex->buffer, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
@@ -299,7 +300,7 @@ void r300_emit_fb_state(struct r300_context* r300,
     if (fb->zsbuf) {
         tex = (struct r300_texture*)fb->zsbuf->texture;
         assert(tex && tex->buffer && "zsbuf is marked, but NULL!");
-        pixpitch = tex->stride / tex->tex.block.size;
+        pixpitch = r300_texture_get_stride(tex, 0) / tex->tex.block.size;
 
         OUT_CS_REG_SEQ(R300_ZB_DEPTHOFFSET, 1);
         OUT_CS_RELOC(tex->buffer, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
@@ -490,7 +491,7 @@ void r300_emit_vertex_buffer(struct r300_context* r300)
 {
     CS_LOCALS(r300);
 
-    debug_printf("r300: Preparing vertex buffer %p for render, "
+    DBG(r300, DBG_DRAW, "r300: Preparing vertex buffer %p for render, "
             "vertex size %d\n", r300->vbo,
             r300->vertex_info.vinfo.size);
     /* Set the pointer to our vertex buffer. The emitted values are this:
diff --git a/src/gallium/drivers/r300/r300_emit.h b/src/gallium/drivers/r300/r300_emit.h
index 350691d592..c4002b8e5d 100644
--- a/src/gallium/drivers/r300/r300_emit.h
+++ b/src/gallium/drivers/r300/r300_emit.h
@@ -56,6 +56,11 @@ void r500_emit_fragment_program_code(struct r300_context* r300,
 void r300_emit_fb_state(struct r300_context* r300,
                         struct pipe_framebuffer_state* fb);
 
+void r300_emit_query_begin(struct r300_context* r300,
+                           struct r300_query* query);
+void r300_emit_query_end(struct r300_context* r300,
+                         struct r300_query* query);
+
 void r300_emit_rs_state(struct r300_context* r300, struct r300_rs_state* rs);
 
 void r300_emit_rs_block_state(struct r300_context* r300,
diff --git a/src/gallium/drivers/r300/r300_fs.c b/src/gallium/drivers/r300/r300_fs.c
index 36463b9a2e..a0e848a59a 100644
--- a/src/gallium/drivers/r300/r300_fs.c
+++ b/src/gallium/drivers/r300/r300_fs.c
@@ -96,7 +96,7 @@ void r300_translate_fragment_shader(struct r300_context* r300,
 
     memset(&compiler, 0, sizeof(compiler));
     rc_init(&compiler.Base);
-    compiler.Base.Debug = 1;
+    compiler.Base.Debug = DBG_ON(r300, DBG_FP);
 
     compiler.code = &fs->code;
     compiler.is_r500 = r300_screen(r300->context.screen)->caps->is_r500;
diff --git a/src/gallium/drivers/r300/r300_query.c b/src/gallium/drivers/r300/r300_query.c
index 1d5185b417..2880d34877 100644
--- a/src/gallium/drivers/r300/r300_query.c
+++ b/src/gallium/drivers/r300/r300_query.c
@@ -22,6 +22,8 @@
 
 #include "r300_query.h"
 
+#include "r300_emit.h"
+
 static struct pipe_query* r300_create_query(struct pipe_context* pipe,
                                             unsigned query_type)
 {
diff --git a/src/gallium/drivers/r300/r300_render.c b/src/gallium/drivers/r300/r300_render.c
index cd458d019a..737396d8d9 100644
--- a/src/gallium/drivers/r300/r300_render.c
+++ b/src/gallium/drivers/r300/r300_render.c
@@ -26,15 +26,17 @@
 
 #include "r300_cs.h"
 #include "r300_context.h"
+#include "r300_emit.h"
 #include "r300_reg.h"
 #include "r300_state_derived.h"
 
 /* r300_render: Vertex and index buffer primitive emission. */
+#define R300_MAX_VBO_SIZE  (1024 * 1024)
 
 struct r300_render {
     /* Parent class */
     struct vbuf_render base;
-    
+
     /* Pipe context */
     struct r300_context* r300;
 
@@ -45,7 +47,10 @@ struct r300_render {
 
     /* VBO */
     struct pipe_buffer* vbo;
-    size_t vbo_alloc_size;
+    size_t vbo_size;
+    size_t vbo_offset;
+    size_t vbo_max_used;
+    void * vbo_ptr;
 };
 
 static INLINE struct r300_render*
@@ -74,19 +79,18 @@ static boolean r300_render_allocate_vertices(struct vbuf_render* render,
     struct pipe_screen* screen = r300->context.screen;
     size_t size = (size_t)vertex_size * (size_t)count;
 
-    if (r300render->vbo && (size > r300render->vbo_alloc_size)) {
-        pipe_buffer_reference(&r300render->vbo, NULL);
-    }
-    
-    if (!r300render->vbo) {
+    if (size + r300render->vbo_offset > r300render->vbo_size) 
+    {
         r300render->vbo = pipe_buffer_create(screen,
                                              64,
                                              PIPE_BUFFER_USAGE_VERTEX,
-                                             size);
+                                             R300_MAX_VBO_SIZE);
+        r300render->vbo_size = R300_MAX_VBO_SIZE;
     }
 
-    r300render->vbo_alloc_size = MAX2(size, r300render->vbo_alloc_size);
     r300render->vertex_size = vertex_size;
+    r300->vbo = r300render->vbo;
+    r300->vbo_offset = r300render->vbo_offset;
 
     return (r300render->vbo) ? TRUE : FALSE;
 }
@@ -96,8 +100,10 @@ static void* r300_render_map_vertices(struct vbuf_render* render)
     struct r300_render* r300render = r300_render(render);
     struct pipe_screen* screen = r300render->r300->context.screen;
 
-    return (unsigned char*)pipe_buffer_map(screen, r300render->vbo,
-                                           PIPE_BUFFER_USAGE_CPU_WRITE);
+    r300render->vbo_ptr = pipe_buffer_map(screen, r300render->vbo,
+                                          PIPE_BUFFER_USAGE_CPU_WRITE);
+
+    return (r300render->vbo_ptr + r300render->vbo_offset);
 }
 
 static void r300_render_unmap_vertices(struct vbuf_render* render,
@@ -106,15 +112,24 @@ static void r300_render_unmap_vertices(struct vbuf_render* render,
 {
     struct r300_render* r300render = r300_render(render);
     struct pipe_screen* screen = r300render->r300->context.screen;
+    CS_LOCALS(r300render->r300);
+    BEGIN_CS(2);
+    OUT_CS_REG(R300_VAP_VF_MAX_VTX_INDX, max);
+    END_CS;
 
+    r300render->vbo_max_used = MAX2(r300render->vbo_max_used, 
+                                    r300render->vertex_size * (max + 1));
     pipe_buffer_unmap(screen, r300render->vbo);
 }
 
 static void r300_render_release_vertices(struct vbuf_render* render)
 {
     struct r300_render* r300render = r300_render(render);
+    struct r300_context* r300 = r300render->r300;
 
-    pipe_buffer_reference(&r300render->vbo, NULL);
+    r300render->vbo_offset += r300render->vbo_max_used;
+    r300render->vbo_max_used = 0;
+    r300->vbo = NULL;
 }
 
 static boolean r300_render_set_primitive(struct vbuf_render* render,
@@ -162,14 +177,12 @@ static boolean r300_render_set_primitive(struct vbuf_render* render,
     return TRUE;
 }
 
-static void prepare_render(struct r300_render* render, unsigned count)
+static void r300_prepare_render(struct r300_render* render, unsigned count)
 {
     struct r300_context* r300 = render->r300;
 
     CS_LOCALS(r300);
 
-    r300->vbo = render->vbo;
-
     r300_emit_dirty_state(r300);
 }
 
@@ -182,9 +195,9 @@ static void r300_render_draw_arrays(struct vbuf_render* render,
 
     CS_LOCALS(r300);
 
-    prepare_render(r300render, count);
+    r300_prepare_render(r300render, count);
 
-    debug_printf("r300: Doing vbuf render, count %d\n", count);
+    DBG(r300, DBG_DRAW, "r300: Doing vbuf render, count %d\n", count);
 
     BEGIN_CS(2);
     OUT_CS_PKT3(R300_PACKET3_3D_DRAW_VBUF_2, 0);
@@ -207,7 +220,7 @@ static void r300_render_draw(struct vbuf_render* render,
 
     CS_LOCALS(r300);
 
-    prepare_render(r300render, count);
+    r300_prepare_render(r300render, count);
 
     /* Send our indices into an index buffer. */
     index_buffer = pipe_buffer_create(screen, 64, PIPE_BUFFER_USAGE_VERTEX,
@@ -216,23 +229,6 @@ static void r300_render_draw(struct vbuf_render* render,
         return;
     }
 
-/*
-    index_map = pipe_buffer_map(screen, index_buffer,
-                                PIPE_BUFFER_USAGE_CPU_WRITE);
-    memcpy(index_map, indices, count);
-    pipe_buffer_unmap(screen, index_buffer);
-
-    debug_printf("r300: Doing indexbuf render, count %d\n", count);
-
-    BEGIN_CS(8);
-    OUT_CS_PKT3(R300_PACKET3_3D_DRAW_INDX_2, 0);
-    OUT_CS(R300_VAP_VF_CNTL__PRIM_WALK_INDICES | (count << 16) |
-           r300render->hwprim);
-    OUT_CS_PKT3(R300_PACKET3_INDX_BUFFER, 2);
-    OUT_CS(R300_INDX_BUFFER_ONE_REG_WR | (R300_VAP_PORT_IDX0 >> 2));
-    OUT_CS_INDEX_RELOC(index_buffer, 0, count, RADEON_GEM_DOMAIN_GTT, 0, 0);
-    END_CS; */
-
     BEGIN_CS(2 + (count+1)/2);
     OUT_CS_PKT3(R300_PACKET3_3D_DRAW_INDX_2, (count+1)/2);
     OUT_CS(R300_VAP_VF_CNTL__PRIM_WALK_INDICES | (count << 16) |
@@ -271,6 +267,10 @@ static struct vbuf_render* r300_render_create(struct r300_context* r300)
     r300render->base.release_vertices = r300_render_release_vertices;
     r300render->base.destroy = r300_render_destroy;
 
+    r300render->vbo = NULL;
+    r300render->vbo_size = 0;
+    r300render->vbo_offset = 0;
+
     return &r300render->base;
 }
 
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index 15740f6125..3b5b1bbd37 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -93,8 +93,6 @@ static int r300_get_param(struct pipe_screen* pscreen, int param)
             } else {
                 return 0;
             }
-        case PIPE_CAP_S3TC:
-            return 1;
         case PIPE_CAP_ANISOTROPIC_FILTER:
             return 1;
         case PIPE_CAP_POINT_SPRITE:
@@ -323,13 +321,14 @@ r300_get_tex_transfer(struct pipe_screen *screen,
     if (trans) {
         pipe_texture_reference(&trans->transfer.texture, texture);
         trans->transfer.format = texture->format;
+        trans->transfer.x = x;
+        trans->transfer.y = y;
         trans->transfer.width = w;
         trans->transfer.height = h;
         trans->transfer.block = texture->block;
         trans->transfer.nblocksx = texture->nblocksx[level];
         trans->transfer.nblocksy = texture->nblocksy[level];
-        trans->transfer.stride = align(pf_get_stride(&trans->transfer.block,
-                                                     texture->width[level]), 32);
+        trans->transfer.stride = r300_texture_get_stride(tex, level);
         trans->transfer.usage = usage;
         trans->offset = offset;
     }
@@ -356,7 +355,7 @@ static void* r300_transfer_map(struct pipe_screen* screen,
     if (transfer->usage != PIPE_TRANSFER_READ) {
         flags |= PIPE_BUFFER_USAGE_CPU_WRITE;
     }
-    
+
     map = pipe_buffer_map(screen, tex->buffer, flags);
 
     if (!map) {
diff --git a/src/gallium/drivers/r300/r300_state.c b/src/gallium/drivers/r300/r300_state.c
index c16cadd040..88cb9af6fb 100644
--- a/src/gallium/drivers/r300/r300_state.c
+++ b/src/gallium/drivers/r300/r300_state.c
@@ -20,10 +20,11 @@
  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  * USE OR OTHER DEALINGS IN THE SOFTWARE. */
 
+#include "util/u_debug.h"
 #include "util/u_math.h"
 #include "util/u_pack_color.h"
 
-#include "util/u_debug.h"
+#include "tgsi/tgsi_parse.h"
 
 #include "pipe/p_config.h"
 #include "pipe/internal/p_winsys_screen.h"
@@ -429,6 +430,9 @@ static void r300_bind_rs_state(struct pipe_context* pipe, void* state)
 
     r300->rs_state = rs;
     r300->dirty_state |= R300_NEW_RASTERIZER;
+    r300->dirty_state |= R300_NEW_RS_BLOCK;
+    r300->dirty_state |= R300_NEW_SCISSOR;
+    r300->dirty_state |= R300_NEW_VIEWPORT;
 }
 
 /* Free rasterizer state. */
diff --git a/src/gallium/drivers/r300/r300_state_derived.c b/src/gallium/drivers/r300/r300_state_derived.c
index c01e61a9b1..5f6b225d34 100644
--- a/src/gallium/drivers/r300/r300_state_derived.c
+++ b/src/gallium/drivers/r300/r300_state_derived.c
@@ -23,6 +23,7 @@
 #include "r300_state_derived.h"
 
 #include "r300_fs.h"
+#include "r300_state_inlines.h"
 #include "r300_vs.h"
 
 /* r300_state_derived: Various bits of state which are dependent upon
@@ -195,13 +196,13 @@ static void r300_vertex_psc(struct r300_context* r300,
      * and not on attrib information. */
     if (r300screen->caps->has_tcl) {
         attrib_count = r300->vs->info.num_inputs;
-        debug_printf("r300: routing %d attribs in psc for vs\n",
+        DBG(r300, DBG_DRAW, "r300: routing %d attribs in psc for vs\n",
                 attrib_count);
     } else {
         attrib_count = vinfo->num_attribs;
-        debug_printf("r300: attrib count: %d\n", attrib_count);
+        DBG(r300, DBG_DRAW, "r300: attrib count: %d\n", attrib_count);
         for (i = 0; i < attrib_count; i++) {
-            debug_printf("r300: attrib: offset %d, interp %d, size %d,"
+            DBG(r300, DBG_DRAW, "r300: attrib: offset %d, interp %d, size %d,"
                    " tab %d\n", vinfo->attrib[i].src_index,
                    vinfo->attrib[i].interp_mode, vinfo->attrib[i].emit,
                    tab[i]);
@@ -299,18 +300,18 @@ static void r300_update_fs_tab(struct r300_context* r300)
     }
 
     /* Now that we know where everything is... */
-    debug_printf("r300: fp input count: %d\n", info->num_inputs);
+    DBG(r300, DBG_DRAW, "r300: fp input count: %d\n", info->num_inputs);
     for (i = 0; i < info->num_inputs; i++) {
         switch (tab[i]) {
             case INTERP_LINEAR:
-                debug_printf("r300: attrib: "
+                DBG(r300, DBG_DRAW, "r300: attrib: "
                         "stack offset %d, color,    tab %d\n",
                         i, cols_emitted);
                 tab[i] = cols_emitted;
                 cols_emitted++;
                 break;
             case INTERP_PERSPECTIVE:
-                debug_printf("r300: attrib: "
+                DBG(r300, DBG_DRAW, "r300: attrib: "
                         "stack offset %d, texcoord, tab %d\n",
                         i, cols + texs);
                 tab[i] = cols + texs;
diff --git a/src/gallium/drivers/r300/r300_state_derived.h b/src/gallium/drivers/r300/r300_state_derived.h
index 63ae8eb8d0..71a4a47b00 100644
--- a/src/gallium/drivers/r300/r300_state_derived.h
+++ b/src/gallium/drivers/r300/r300_state_derived.h
@@ -23,11 +23,7 @@
 #ifndef R300_STATE_DERIVED_H
 #define R300_STATE_DERIVED_H
 
-#include "draw/draw_vertex.h"
-
-#include "r300_context.h"
-#include "r300_reg.h"
-#include "r300_state_inlines.h"
+struct r300_context;
 
 void r300_update_derived_state(struct r300_context* r300);
 
diff --git a/src/gallium/drivers/r300/r300_state_invariant.c b/src/gallium/drivers/r300/r300_state_invariant.c
index 7d822fec48..3865730d63 100644
--- a/src/gallium/drivers/r300/r300_state_invariant.c
+++ b/src/gallium/drivers/r300/r300_state_invariant.c
@@ -23,6 +23,12 @@
 
 #include "r300_state_invariant.h"
 
+
+struct pipe_viewport_state r300_viewport_identity = {
+    .scale = {1.0, 1.0, 1.0, 1.0},
+    .translate = {0.0, 0.0, 0.0, 0.0},
+};
+
 /* Calculate and emit invariant state. This is data that the 3D engine
  * will probably want at the beginning of every CS, but it's not currently
  * handled by any CSO setup, and in addition it doesn't really change much.
diff --git a/src/gallium/drivers/r300/r300_surface.c b/src/gallium/drivers/r300/r300_surface.c
index 96e6e4a77d..cc6288cb51 100644
--- a/src/gallium/drivers/r300/r300_surface.c
+++ b/src/gallium/drivers/r300/r300_surface.c
@@ -29,7 +29,7 @@ static void r300_surface_setup(struct r300_context* r300,
                                unsigned w, unsigned h)
 {
     struct r300_capabilities* caps = r300_screen(r300->context.screen)->caps;
-    unsigned pixpitch = dest->stride / dest->tex.block.size;
+    unsigned pixpitch = r300_texture_get_stride(dest, 0) / dest->tex.block.size;
     CS_LOCALS(r300);
 
     r300_emit_blend_state(r300, &blend_clear_state);
@@ -100,7 +100,7 @@ static void r300_surface_fill(struct pipe_context* pipe,
     struct r300_context* r300 = r300_context(pipe);
     struct r300_capabilities* caps = r300_screen(pipe->screen)->caps;
     struct r300_texture* tex = (struct r300_texture*)dest->texture;
-    unsigned pixpitch = tex->stride / tex->tex.block.size;
+    unsigned pixpitch = r300_texture_get_stride(tex, 0) / tex->tex.block.size;
     boolean invalid = FALSE;
     CS_LOCALS(r300);
 
@@ -233,7 +233,7 @@ static void r300_surface_copy(struct pipe_context* pipe,
     struct r300_capabilities* caps = r300_screen(pipe->screen)->caps;
     struct r300_texture* srctex = (struct r300_texture*)src->texture;
     struct r300_texture* desttex = (struct r300_texture*)dest->texture;
-    unsigned pixpitch = srctex->stride / srctex->tex.block.size;
+    unsigned pixpitch = r300_texture_get_stride(srctex, 0) / srctex->tex.block.size;
     boolean invalid = FALSE;
     float fsrcx = srcx, fsrcy = srcy, fdestx = destx, fdesty = desty;
     CS_LOCALS(r300);
diff --git a/src/gallium/drivers/r300/r300_texture.c b/src/gallium/drivers/r300/r300_texture.c
index 590052509c..7c041d17f7 100644
--- a/src/gallium/drivers/r300/r300_texture.c
+++ b/src/gallium/drivers/r300/r300_texture.c
@@ -25,7 +25,6 @@
 static void r300_setup_texture_state(struct r300_texture* tex,
                                      unsigned width,
                                      unsigned height,
-                                     unsigned pitch,
                                      unsigned levels)
 {
     struct r300_texture_state* state = &tex->state;
@@ -38,7 +37,7 @@ static void r300_setup_texture_state(struct r300_texture* tex,
     /* XXX */
     state->format1 = r300_translate_texformat(tex->tex.format);
 
-    state->format2 = pitch - 1;
+    state->format2 = r300_texture_get_stride(tex, 0);
 
     /* Assume (somewhat foolishly) that oversized textures will
      * not be permitted by the state tracker. */
@@ -49,14 +48,31 @@ static void r300_setup_texture_state(struct r300_texture* tex,
         state->format2 |= R500_TXHEIGHT_BIT11;
     }
 
-    debug_printf("r300: Set texture state (%dx%d, pitch %d, %d levels)\n",
-            width, height, pitch, levels);
+    debug_printf("r300: Set texture state (%dx%d, %d levels)\n",
+            width, height, levels);
+}
+
+/**
+ * Return the stride, in bytes, of the texture images of the given texture
+ * at the given level.
+ */
+unsigned r300_texture_get_stride(struct r300_texture* tex, unsigned level)
+{
+    if (tex->stride_override)
+        return tex->stride_override;
+
+    if (level > tex->tex.last_level) {
+        debug_printf("%s: level (%u) > last_level (%u)\n", __FUNCTION__, level, tex->tex.last_level);
+        return 0;
+    }
+
+    return align(pf_get_stride(&tex->tex.block, tex->tex.width[level]), 32);
 }
 
 static void r300_setup_miptree(struct r300_texture* tex)
 {
     struct pipe_texture* base = &tex->tex;
-    int stride, size, offset;
+    int stride, size;
     int i;
 
     for (i = 0; i <= base->last_level; i++) {
@@ -74,7 +90,7 @@ static void r300_setup_miptree(struct r300_texture* tex)
          * XXX
          * POT, uncompressed, unmippmapped textures can be aligned to 32,
          * instead of 64. */
-        stride = align(pf_get_stride(&base->block, base->width[i]), 32);
+        stride = r300_texture_get_stride(tex, i);
         size = stride * base->nblocksy[i] * base->depth[i];
 
         tex->offset[i] = align(tex->size, 32);
@@ -84,10 +100,6 @@ static void r300_setup_miptree(struct r300_texture* tex)
                 "(%dx%dx%d px, pitch %d bytes)\n",
                 i, base->width[i], base->height[i], base->depth[i],
                 stride);
-        /* Save stride of first level to the texture. */
-        if (i == 0) {
-            tex->stride = stride;
-        }
     }
 }
 
@@ -109,7 +121,7 @@ static struct pipe_texture*
     r300_setup_miptree(tex);
 
     r300_setup_texture_state(tex, template->width[0], template->height[0],
-            template->width[0], template->last_level);
+                             template->last_level);
 
     tex->buffer = screen->buffer_create(screen, 1024,
                                         PIPE_BUFFER_USAGE_PIXEL,
@@ -189,11 +201,10 @@ static struct pipe_texture*
     pipe_reference_init(&tex->tex.reference, 1);
     tex->tex.screen = screen;
 
-    tex->stride = *stride;
+    tex->stride_override = *stride;
 
     /* XXX */
-    r300_setup_texture_state(tex, tex->tex.width[0], tex->tex.height[0],
-            tex->stride, 0);
+    r300_setup_texture_state(tex, tex->tex.width[0], tex->tex.height[0], 0);
 
     pipe_buffer_reference(&tex->buffer, buffer);
 
@@ -221,7 +232,7 @@ boolean r300_get_texture_buffer(struct pipe_texture* texture,
     pipe_buffer_reference(buffer, tex->buffer);
 
     if (stride) {
-        *stride = tex->stride;
+        *stride = r300_texture_get_stride(tex, 0);
     }
 
     return TRUE;
diff --git a/src/gallium/drivers/r300/r300_texture.h b/src/gallium/drivers/r300/r300_texture.h
index 3b56f0307c..3109af5bac 100644
--- a/src/gallium/drivers/r300/r300_texture.h
+++ b/src/gallium/drivers/r300/r300_texture.h
@@ -30,8 +30,12 @@
 #include "r300_context.h"
 #include "r300_reg.h"
 
+struct r300_texture;
+
 void r300_init_screen_texture_functions(struct pipe_screen* screen);
 
+unsigned r300_texture_get_stride(struct r300_texture* tex, unsigned level);
+
 /* Note the signature of R300_EASY_TX_FORMAT(A, R, G, B, FORMAT)... */
 static INLINE uint32_t r300_translate_texformat(enum pipe_format format)
 {
diff --git a/src/gallium/drivers/r300/r300_tgsi_to_rc.c b/src/gallium/drivers/r300/r300_tgsi_to_rc.c
index d68a104106..0913ca1bd5 100644
--- a/src/gallium/drivers/r300/r300_tgsi_to_rc.c
+++ b/src/gallium/drivers/r300/r300_tgsi_to_rc.c
@@ -158,7 +158,6 @@ static unsigned translate_saturate(unsigned saturate)
     switch(saturate) {
         case TGSI_SAT_NONE: return SATURATE_OFF;
         case TGSI_SAT_ZERO_ONE: return SATURATE_ZERO_ONE;
-        case TGSI_SAT_MINUS_PLUS_ONE: return SATURATE_PLUS_MINUS_ONE;
     }
 
     fprintf(stderr, "Unknown saturate mode: %i\n", saturate);
diff --git a/src/gallium/drivers/r300/r300_vs.c b/src/gallium/drivers/r300/r300_vs.c
index 2cb903bba2..12a6e37be6 100644
--- a/src/gallium/drivers/r300/r300_vs.c
+++ b/src/gallium/drivers/r300/r300_vs.c
@@ -116,7 +116,7 @@ void r300_translate_vertex_shader(struct r300_context* r300,
     /* Setup the compiler */
     rc_init(&compiler.Base);
 
-    compiler.Base.Debug = 1;
+    compiler.Base.Debug = DBG_ON(r300, DBG_VP);
     compiler.code = &vs->code;
     compiler.UserData = vs;
 
diff --git a/src/gallium/drivers/softpipe/Makefile b/src/gallium/drivers/softpipe/Makefile
index 516e3992fd..6ab3753762 100644
--- a/src/gallium/drivers/softpipe/Makefile
+++ b/src/gallium/drivers/softpipe/Makefile
@@ -6,26 +6,17 @@ LIBNAME = softpipe
 C_SOURCES = \
 	sp_fs_exec.c \
 	sp_fs_sse.c \
-	sp_fs_llvm.c \
 	sp_clear.c \
 	sp_flush.c \
 	sp_query.c \
 	sp_context.c \
 	sp_draw_arrays.c \
-	sp_prim_setup.c \
 	sp_prim_vbuf.c \
 	sp_quad_pipe.c \
-	sp_quad_alpha_test.c \
-	sp_quad_blend.c \
-	sp_quad_colormask.c \
-	sp_quad_coverage.c \
+	sp_quad_stipple.c \
 	sp_quad_depth_test.c \
-	sp_quad_earlyz.c \
 	sp_quad_fs.c \
-	sp_quad_occlusion.c \
-	sp_quad_output.c \
-	sp_quad_stencil.c \
-	sp_quad_stipple.c \
+	sp_quad_blend.c \
 	sp_screen.c \
         sp_setup.c \
 	sp_state_blend.c \
@@ -38,6 +29,7 @@ C_SOURCES = \
 	sp_state_vertex.c \
 	sp_texture.c \
 	sp_tex_sample.c \
+	sp_tex_tile_cache.c \
 	sp_tile_cache.c \
 	sp_surface.c 
 
diff --git a/src/gallium/drivers/softpipe/SConscript b/src/gallium/drivers/softpipe/SConscript
index f8720638a7..950c3d9955 100644
--- a/src/gallium/drivers/softpipe/SConscript
+++ b/src/gallium/drivers/softpipe/SConscript
@@ -7,25 +7,16 @@ softpipe = env.ConvenienceLibrary(
 	source = [
 		'sp_fs_exec.c',
 		'sp_fs_sse.c',
-		'sp_fs_llvm.c',
 		'sp_clear.c',
 		'sp_context.c',
 		'sp_draw_arrays.c',
 		'sp_flush.c',
-		'sp_prim_setup.c',
 		'sp_prim_vbuf.c',
 		'sp_setup.c',
-		'sp_quad_alpha_test.c',
 		'sp_quad_blend.c',
 		'sp_quad_pipe.c',
-		'sp_quad_colormask.c',
-		'sp_quad_coverage.c',
 		'sp_quad_depth_test.c',
-		'sp_quad_earlyz.c',
 		'sp_quad_fs.c',
-		'sp_quad_occlusion.c',
-		'sp_quad_output.c',
-		'sp_quad_stencil.c',
 		'sp_quad_stipple.c',
 		'sp_query.c',
 		'sp_screen.c',
@@ -39,6 +30,7 @@ softpipe = env.ConvenienceLibrary(
 		'sp_state_vertex.c',
 		'sp_surface.c',
 		'sp_tex_sample.c',
+		'sp_tex_tile_cache.c',
 		'sp_texture.c',
 		'sp_tile_cache.c',
 	])
diff --git a/src/gallium/drivers/softpipe/sp_clear.c b/src/gallium/drivers/softpipe/sp_clear.c
index d3af18e162..8fac8e6e05 100644
--- a/src/gallium/drivers/softpipe/sp_clear.c
+++ b/src/gallium/drivers/softpipe/sp_clear.c
@@ -36,8 +36,6 @@
 #include "util/u_pack_color.h"
 #include "sp_clear.h"
 #include "sp_context.h"
-#include "sp_surface.h"
-#include "sp_state.h"
 #include "sp_tile_cache.h"
 
 
diff --git a/src/gallium/drivers/softpipe/sp_clear.h b/src/gallium/drivers/softpipe/sp_clear.h
index 2e450672f5..9be3b86fe9 100644
--- a/src/gallium/drivers/softpipe/sp_clear.h
+++ b/src/gallium/drivers/softpipe/sp_clear.h
@@ -32,7 +32,6 @@
 #ifndef SP_CLEAR_H
 #define SP_CLEAR_H
 
-#include "pipe/p_state.h"
 struct pipe_context;
 
 extern void
diff --git a/src/gallium/drivers/softpipe/sp_context.c b/src/gallium/drivers/softpipe/sp_context.c
index 86df320ea8..e1e31ab047 100644
--- a/src/gallium/drivers/softpipe/sp_context.c
+++ b/src/gallium/drivers/softpipe/sp_context.c
@@ -31,17 +31,18 @@
  */
 
 #include "draw/draw_context.h"
+#include "draw/draw_vbuf.h"
 #include "pipe/p_defines.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
 #include "sp_clear.h"
 #include "sp_context.h"
 #include "sp_flush.h"
-#include "sp_prim_setup.h"
 #include "sp_prim_vbuf.h"
 #include "sp_state.h"
 #include "sp_surface.h"
 #include "sp_tile_cache.h"
+#include "sp_tex_tile_cache.h"
 #include "sp_texture.h"
 #include "sp_winsys.h"
 #include "sp_query.h"
@@ -72,18 +73,16 @@ softpipe_unmap_transfers(struct softpipe_context *sp)
 {
    uint i;
 
-   for (i = 0; i < sp->framebuffer.nr_cbufs; i++)
-      sp_flush_tile_cache(sp, sp->cbuf_cache[i]);
-   sp_flush_tile_cache(sp, sp->zsbuf_cache);
-
    for (i = 0; i < sp->framebuffer.nr_cbufs; i++) {
       sp_tile_cache_unmap_transfers(sp->cbuf_cache[i]);
    }
+
    sp_tile_cache_unmap_transfers(sp->zsbuf_cache);
 }
 
 
-static void softpipe_destroy( struct pipe_context *pipe )
+static void
+softpipe_destroy( struct pipe_context *pipe )
 {
    struct softpipe_context *softpipe = softpipe_context( pipe );
    uint i;
@@ -91,26 +90,16 @@ static void softpipe_destroy( struct pipe_context *pipe )
    if (softpipe->draw)
       draw_destroy( softpipe->draw );
 
-   for (i = 0; i < SP_NUM_QUAD_THREADS; i++) {
-      softpipe->quad[i].polygon_stipple->destroy( softpipe->quad[i].polygon_stipple );
-      softpipe->quad[i].earlyz->destroy( softpipe->quad[i].earlyz );
-      softpipe->quad[i].shade->destroy( softpipe->quad[i].shade );
-      softpipe->quad[i].alpha_test->destroy( softpipe->quad[i].alpha_test );
-      softpipe->quad[i].depth_test->destroy( softpipe->quad[i].depth_test );
-      softpipe->quad[i].stencil_test->destroy( softpipe->quad[i].stencil_test );
-      softpipe->quad[i].occlusion->destroy( softpipe->quad[i].occlusion );
-      softpipe->quad[i].coverage->destroy( softpipe->quad[i].coverage );
-      softpipe->quad[i].blend->destroy( softpipe->quad[i].blend );
-      softpipe->quad[i].colormask->destroy( softpipe->quad[i].colormask );
-      softpipe->quad[i].output->destroy( softpipe->quad[i].output );
-   }
+      softpipe->quad.shade->destroy( softpipe->quad.shade );
+      softpipe->quad.depth_test->destroy( softpipe->quad.depth_test );
+      softpipe->quad.blend->destroy( softpipe->quad.blend );
 
    for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++)
       sp_destroy_tile_cache(softpipe->cbuf_cache[i]);
    sp_destroy_tile_cache(softpipe->zsbuf_cache);
 
    for (i = 0; i < PIPE_MAX_SAMPLERS; i++)
-      sp_destroy_tile_cache(softpipe->tex_cache[i]);
+      sp_destroy_tex_tile_cache(softpipe->tex_cache[i]);
 
    for (i = 0; i < Elements(softpipe->constants); i++) {
       if (softpipe->constants[i].buffer) {
@@ -121,6 +110,15 @@ static void softpipe_destroy( struct pipe_context *pipe )
    FREE( softpipe );
 }
 
+
+/**
+ * if (the texture is being used as a framebuffer surface)
+ *    return PIPE_REFERENCED_FOR_WRITE
+ * else if (the texture is a bound texture source)
+ *    return PIPE_REFERENCED_FOR_READ  XXX not done yet
+ * else
+ *    return PIPE_UNREFERENCED
+ */
 static unsigned int
 softpipe_is_texture_referenced( struct pipe_context *pipe,
 				struct pipe_texture *texture,
@@ -129,15 +127,17 @@ softpipe_is_texture_referenced( struct pipe_context *pipe,
    struct softpipe_context *softpipe = softpipe_context( pipe );
    unsigned i;
 
-   if(softpipe->dirty_render_cache) {
+   if (softpipe->dirty_render_cache) {
       for (i = 0; i < softpipe->framebuffer.nr_cbufs; i++) {
-         if(softpipe->framebuffer.cbufs[i] && 
-            softpipe->framebuffer.cbufs[i]->texture == texture)
+         if (softpipe->framebuffer.cbufs[i] && 
+             softpipe->framebuffer.cbufs[i]->texture == texture) {
             return PIPE_REFERENCED_FOR_WRITE;
+         }
       }
-      if(softpipe->framebuffer.zsbuf && 
-         softpipe->framebuffer.zsbuf->texture == texture)
+      if (softpipe->framebuffer.zsbuf && 
+          softpipe->framebuffer.zsbuf->texture == texture) {
          return PIPE_REFERENCED_FOR_WRITE;
+      }
    }
    
    /* FIXME: we also need to do the same for the texture cache */
@@ -145,6 +145,7 @@ softpipe_is_texture_referenced( struct pipe_context *pipe,
    return PIPE_UNREFERENCED;
 }
 
+
 static unsigned int
 softpipe_is_buffer_referenced( struct pipe_context *pipe,
 			       struct pipe_buffer *buf)
@@ -152,6 +153,7 @@ softpipe_is_buffer_referenced( struct pipe_context *pipe,
    return PIPE_UNREFERENCED;
 }
 
+
 struct pipe_context *
 softpipe_create( struct pipe_screen *screen )
 {
@@ -222,7 +224,6 @@ softpipe_create( struct pipe_screen *screen )
    softpipe->pipe.is_buffer_referenced = softpipe_is_buffer_referenced;
 
    softpipe_init_query_funcs( softpipe );
-   softpipe_init_texture_funcs( softpipe );
 
    /*
     * Alloc caches for accessing drawing surfaces and textures.
@@ -233,41 +234,14 @@ softpipe_create( struct pipe_screen *screen )
    softpipe->zsbuf_cache = sp_create_tile_cache( screen );
 
    for (i = 0; i < PIPE_MAX_SAMPLERS; i++)
-      softpipe->tex_cache[i] = sp_create_tile_cache( screen );
+      softpipe->tex_cache[i] = sp_create_tex_tile_cache( screen );
 
 
    /* setup quad rendering stages */
-   for (i = 0; i < SP_NUM_QUAD_THREADS; i++) {
-      softpipe->quad[i].polygon_stipple = sp_quad_polygon_stipple_stage(softpipe);
-      softpipe->quad[i].earlyz = sp_quad_earlyz_stage(softpipe);
-      softpipe->quad[i].shade = sp_quad_shade_stage(softpipe);
-      softpipe->quad[i].alpha_test = sp_quad_alpha_test_stage(softpipe);
-      softpipe->quad[i].depth_test = sp_quad_depth_test_stage(softpipe);
-      softpipe->quad[i].stencil_test = sp_quad_stencil_test_stage(softpipe);
-      softpipe->quad[i].occlusion = sp_quad_occlusion_stage(softpipe);
-      softpipe->quad[i].coverage = sp_quad_coverage_stage(softpipe);
-      softpipe->quad[i].blend = sp_quad_blend_stage(softpipe);
-      softpipe->quad[i].colormask = sp_quad_colormask_stage(softpipe);
-      softpipe->quad[i].output = sp_quad_output_stage(softpipe);
-   }
-
-   /* vertex shader samplers */
-   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
-      softpipe->tgsi.vert_samplers[i].base.get_samples = sp_get_samples_vertex;
-      softpipe->tgsi.vert_samplers[i].unit = i;
-      softpipe->tgsi.vert_samplers[i].sp = softpipe;
-      softpipe->tgsi.vert_samplers[i].cache = softpipe->tex_cache[i];
-      softpipe->tgsi.vert_samplers_list[i] = &softpipe->tgsi.vert_samplers[i];
-   }
+      softpipe->quad.shade = sp_quad_shade_stage(softpipe);
+      softpipe->quad.depth_test = sp_quad_depth_test_stage(softpipe);
+      softpipe->quad.blend = sp_quad_blend_stage(softpipe);
 
-   /* fragment shader samplers */
-   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
-      softpipe->tgsi.frag_samplers[i].base.get_samples = sp_get_samples_fragment;
-      softpipe->tgsi.frag_samplers[i].unit = i;
-      softpipe->tgsi.frag_samplers[i].sp = softpipe;
-      softpipe->tgsi.frag_samplers[i].cache = softpipe->tex_cache[i];
-      softpipe->tgsi.frag_samplers_list[i] = &softpipe->tgsi.frag_samplers[i];
-   }
 
    /*
     * Create drawing context and plug our rendering stage into it.
@@ -281,30 +255,28 @@ softpipe_create( struct pipe_screen *screen )
                          (struct tgsi_sampler **)
                             softpipe->tgsi.vert_samplers_list);
 
-   softpipe->setup = sp_draw_render_stage(softpipe);
-   if (!softpipe->setup)
-      goto fail;
-
    if (debug_get_bool_option( "SP_NO_RAST", FALSE ))
       softpipe->no_rast = TRUE;
 
-   if (debug_get_bool_option( "SP_NO_VBUF", FALSE )) {
-      /* Deprecated path -- vbuf is the intended interface to the draw module:
-       */
-      draw_set_rasterize_stage(softpipe->draw, softpipe->setup);
-   }
-   else {
-      sp_init_vbuf(softpipe);
-   }
+   softpipe->vbuf_backend = sp_create_vbuf_backend(softpipe);
+   if (!softpipe->vbuf_backend)
+      goto fail;
+
+   softpipe->vbuf = draw_vbuf_stage(softpipe->draw, softpipe->vbuf_backend);
+   if (!softpipe->vbuf)
+      goto fail;
+
+   draw_set_rasterize_stage(softpipe->draw, softpipe->vbuf);
+   draw_set_render(softpipe->draw, softpipe->vbuf_backend);
+
+
 
    /* plug in AA line/point stages */
    draw_install_aaline_stage(softpipe->draw, &softpipe->pipe);
    draw_install_aapoint_stage(softpipe->draw, &softpipe->pipe);
 
-#if USE_DRAW_STAGE_PSTIPPLE
    /* Do polygon stipple w/ texture map + frag prog? */
    draw_install_pstipple_stage(softpipe->draw, &softpipe->pipe);
-#endif
 
    sp_init_surface_functions(softpipe);
 
diff --git a/src/gallium/drivers/softpipe/sp_context.h b/src/gallium/drivers/softpipe/sp_context.h
index 7888c2f644..43a195c8ef 100644
--- a/src/gallium/drivers/softpipe/sp_context.h
+++ b/src/gallium/drivers/softpipe/sp_context.h
@@ -36,24 +36,13 @@
 #include "draw/draw_vertex.h"
 
 #include "sp_quad_pipe.h"
-#include "sp_tex_sample.h"
 
 
-/**
- * This is a temporary variable for testing draw-stage polygon stipple.
- * If zero, do stipple in sp_quad_stipple.c
- */
-#define USE_DRAW_STAGE_PSTIPPLE 1
-
-/* Number of threads working on individual quads.
- * Setting to 1 disables this feature.
- */
-#define SP_NUM_QUAD_THREADS 1
-
 struct softpipe_vbuf_render;
 struct draw_context;
 struct draw_stage;
 struct softpipe_tile_cache;
+struct softpipe_tex_tile_cache;
 struct sp_fragment_shader;
 struct sp_vertex_shader;
 
@@ -62,12 +51,12 @@ struct softpipe_context {
    struct pipe_context pipe;  /**< base class */
 
    /** Constant state objects */
-   const struct pipe_blend_state *blend;
-   const struct pipe_sampler_state *sampler[PIPE_MAX_SAMPLERS];
-   const struct pipe_depth_stencil_alpha_state *depth_stencil;
-   const struct pipe_rasterizer_state *rasterizer;
-   const struct sp_fragment_shader *fs;
-   const struct sp_vertex_shader *vs;
+   struct pipe_blend_state *blend;
+   struct pipe_sampler_state *sampler[PIPE_MAX_SAMPLERS];
+   struct pipe_depth_stencil_alpha_state *depth_stencil;
+   struct pipe_rasterizer_state *rasterizer;
+   struct sp_fragment_shader *fs;
+   struct sp_vertex_shader *vs;
 
    /** Other rendering state */
    struct pipe_blend_color blend_color;
@@ -107,7 +96,16 @@ struct softpipe_context {
    /** Which vertex shader output slot contains point size */
    int psize_slot;
 
-   unsigned reduced_api_prim;  /**< PIPE_PRIM_POINTS, _LINES or _TRIANGLES */
+   /* The reduced version of the primitive supplied by the state
+    * tracker.
+    */
+   unsigned reduced_api_prim;
+
+   /* The reduced primitive after unfilled triangles, wide-line
+    * decomposition, etc, are taken into account.  This is the
+    * primitive actually rasterized.
+    */
+   unsigned reduced_prim;
 
    /** Derived from scissor and surface bounds: */
    struct pipe_scissor_state cliprect;
@@ -116,41 +114,33 @@ struct softpipe_context {
 
    /** Software quad rendering pipeline */
    struct {
-      struct quad_stage *polygon_stipple;
-      struct quad_stage *earlyz;
       struct quad_stage *shade;
-      struct quad_stage *alpha_test;
-      struct quad_stage *stencil_test;
       struct quad_stage *depth_test;
-      struct quad_stage *occlusion;
-      struct quad_stage *coverage;
       struct quad_stage *blend;
-      struct quad_stage *colormask;
-      struct quad_stage *output;
 
       struct quad_stage *first; /**< points to one of the above stages */
-   } quad[SP_NUM_QUAD_THREADS];
+   } quad;
 
    /** TGSI exec things */
    struct {
-      struct sp_shader_sampler vert_samplers[PIPE_MAX_SAMPLERS];
-      struct sp_shader_sampler *vert_samplers_list[PIPE_MAX_SAMPLERS];
-      struct sp_shader_sampler frag_samplers[PIPE_MAX_SAMPLERS];
-      struct sp_shader_sampler *frag_samplers_list[PIPE_MAX_SAMPLERS];
+      struct sp_sampler_varient *vert_samplers_list[PIPE_MAX_SAMPLERS];
+      struct sp_sampler_varient *frag_samplers_list[PIPE_MAX_SAMPLERS];
    } tgsi;
 
    /** The primitive drawing context */
    struct draw_context *draw;
-   struct draw_stage *setup;
+
+   /** Draw module backend */
+   struct vbuf_render *vbuf_backend;
    struct draw_stage *vbuf;
-   struct softpipe_vbuf_render *vbuf_render;
 
    boolean dirty_render_cache;
    
    struct softpipe_tile_cache *cbuf_cache[PIPE_MAX_COLOR_BUFS];
    struct softpipe_tile_cache *zsbuf_cache;
-
-   struct softpipe_tile_cache *tex_cache[PIPE_MAX_SAMPLERS];
+   
+   unsigned tex_timestamp;
+   struct softpipe_tex_tile_cache *tex_cache[PIPE_MAX_SAMPLERS];
 
    unsigned use_sse : 1;
    unsigned dump_fs : 1;
@@ -164,5 +154,9 @@ softpipe_context( struct pipe_context *pipe )
    return (struct softpipe_context *)pipe;
 }
 
+void
+softpipe_reset_sampler_varients(struct softpipe_context *softpipe);
+
+
 #endif /* SP_CONTEXT_H */
 
diff --git a/src/gallium/drivers/softpipe/sp_flush.c b/src/gallium/drivers/softpipe/sp_flush.c
index 4a14d49686..e38b767cf2 100644
--- a/src/gallium/drivers/softpipe/sp_flush.c
+++ b/src/gallium/drivers/softpipe/sp_flush.c
@@ -37,6 +37,7 @@
 #include "sp_surface.h"
 #include "sp_state.h"
 #include "sp_tile_cache.h"
+#include "sp_tex_tile_cache.h"
 #include "sp_winsys.h"
 
 
@@ -52,17 +53,19 @@ softpipe_flush( struct pipe_context *pipe,
 
    if (flags & PIPE_FLUSH_TEXTURE_CACHE) {
       for (i = 0; i < softpipe->num_textures; i++) {
-         sp_flush_tile_cache(softpipe, softpipe->tex_cache[i]);
+         sp_flush_tex_tile_cache(softpipe->tex_cache[i]);
       }
    }
 
-   if (flags & PIPE_FLUSH_RENDER_CACHE) {
+   if (flags & PIPE_FLUSH_SWAPBUFFERS) {
+      /* If this is a swapbuffers, just flush color buffers.
+       *
+       * The zbuffer changes are not discarded, but held in the cache
+       * in the hope that a later clear will wipe them out.
+       */
       for (i = 0; i < softpipe->framebuffer.nr_cbufs; i++)
          if (softpipe->cbuf_cache[i])
-            sp_flush_tile_cache(softpipe, softpipe->cbuf_cache[i]);
-
-      if (softpipe->zsbuf_cache)
-         sp_flush_tile_cache(softpipe, softpipe->zsbuf_cache);
+            sp_flush_tile_cache(softpipe->cbuf_cache[i]);
 
       /* Need this call for hardware buffers before swapbuffers.
        *
@@ -71,7 +74,15 @@ softpipe_flush( struct pipe_context *pipe,
        * to unmap surfaces when flushing.
        */
       softpipe_unmap_transfers(softpipe);
-      
+   }
+   else if (flags & PIPE_FLUSH_RENDER_CACHE) {
+      for (i = 0; i < softpipe->framebuffer.nr_cbufs; i++)
+         if (softpipe->cbuf_cache[i])
+            sp_flush_tile_cache(softpipe->cbuf_cache[i]);
+
+      if (softpipe->zsbuf_cache)
+         sp_flush_tile_cache(softpipe->zsbuf_cache);
+     
       softpipe->dirty_render_cache = FALSE;
    }
 
diff --git a/src/gallium/drivers/softpipe/sp_fs_exec.c b/src/gallium/drivers/softpipe/sp_fs_exec.c
index 9ee86fe787..c469ac6340 100644
--- a/src/gallium/drivers/softpipe/sp_fs_exec.c
+++ b/src/gallium/drivers/softpipe/sp_fs_exec.c
@@ -59,15 +59,34 @@ sp_exec_fragment_shader(const struct sp_fragment_shader *base)
 }
 
 
+static void
+exec_prepare( const struct sp_fragment_shader *base,
+	      struct tgsi_exec_machine *machine,
+	      struct tgsi_sampler **samplers )
+{
+   /*
+    * Bind tokens/shader to the interpreter's machine state.
+    * Avoid redundant binding.
+    */
+   if (machine->Tokens != base->shader.tokens) {
+      tgsi_exec_machine_bind_shader( machine,
+                                     base->shader.tokens,
+                                     PIPE_MAX_SAMPLERS,
+                                     samplers );
+   }
+}
+
+
+
 /**
  * Compute quad X,Y,Z,W for the four fragments in a quad.
  *
  * This should really be part of the compiled shader.
  */
-void
-sp_setup_pos_vector(const struct tgsi_interp_coef *coef,
-		    float x, float y,
-		    struct tgsi_exec_vector *quadpos)
+static void
+setup_pos_vector(const struct tgsi_interp_coef *coef,
+                 float x, float y,
+                 struct tgsi_exec_vector *quadpos)
 {
    uint chan;
    /* do X */
@@ -95,24 +114,6 @@ sp_setup_pos_vector(const struct tgsi_interp_coef *coef,
 }
 
 
-static void
-exec_prepare( const struct sp_fragment_shader *base,
-	      struct tgsi_exec_machine *machine,
-	      struct tgsi_sampler **samplers )
-{
-   /*
-    * Bind tokens/shader to the interpreter's machine state.
-    * Avoid redundant binding.
-    */
-   if (machine->Tokens != base->shader.tokens) {
-      tgsi_exec_machine_bind_shader( machine,
-                                     base->shader.tokens,
-                                     PIPE_MAX_SAMPLERS,
-                                     samplers );
-   }
-}
-
-
 /* TODO: hide the machine struct in here somewhere, remove from this
  * interface:
  */
@@ -122,11 +123,43 @@ exec_run( const struct sp_fragment_shader *base,
 	  struct quad_header *quad )
 {
    /* Compute X, Y, Z, W vals for this quad */
-   sp_setup_pos_vector(quad->posCoef, 
-		       (float)quad->input.x0, (float)quad->input.y0, 
-		       &machine->QuadPos);
+   setup_pos_vector(quad->posCoef, 
+                    (float)quad->input.x0, (float)quad->input.y0, 
+                    &machine->QuadPos);
    
-   return tgsi_exec_machine_run( machine );
+   quad->inout.mask &= tgsi_exec_machine_run( machine );
+   if (quad->inout.mask == 0)
+      return FALSE;
+
+   /* store outputs */
+   {
+      const ubyte *sem_name = base->info.output_semantic_name;
+      const ubyte *sem_index = base->info.output_semantic_index;
+      const uint n = base->info.num_outputs;
+      uint i;
+      for (i = 0; i < n; i++) {
+         switch (sem_name[i]) {
+         case TGSI_SEMANTIC_COLOR:
+            {
+               uint cbuf = sem_index[i];
+               memcpy(quad->output.color[cbuf],
+                      &machine->Outputs[i].xyzw[0].f[0],
+                      sizeof(quad->output.color[0]) );
+            }
+            break;
+         case TGSI_SEMANTIC_POSITION:
+            {
+               uint j;
+               for (j = 0; j < 4; j++) {
+                  quad->output.depth[j] = machine->Outputs[0].xyzw[2].f[j];
+               }
+            }
+            break;
+         }
+      }
+   }
+
+   return TRUE;
 }
 
 
diff --git a/src/gallium/drivers/softpipe/sp_fs_llvm.c b/src/gallium/drivers/softpipe/sp_fs_llvm.c
deleted file mode 100644
index 95c0d982d1..0000000000
--- a/src/gallium/drivers/softpipe/sp_fs_llvm.c
+++ /dev/null
@@ -1,205 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-/**
- * Execute fragment shader using LLVM code generation.
- * Authors:
- *   Zack Rusin
- */
-
-#include "sp_context.h"
-#include "sp_state.h"
-#include "sp_fs.h"
-
-#include "pipe/p_state.h"
-#include "pipe/p_defines.h"
-#include "util/u_memory.h"
-#include "tgsi/tgsi_sse2.h"
-
-#if 0
-
-/**
- * Subclass of sp_fragment_shader
- */
-struct sp_llvm_fragment_shader
-{
-   struct sp_fragment_shader base;
-   struct gallivm_prog *llvm_prog;
-};
-
-
-static void
-shade_quad_llvm(struct quad_stage *qs,
-                struct quad_header *quad)
-{
-   struct quad_shade_stage *qss = quad_shade_stage(qs);
-   struct softpipe_context *softpipe = qs->softpipe;
-   float dests[4][16][4] ALIGN16_ATTRIB;
-   float inputs[4][16][4] ALIGN16_ATTRIB;
-   const float fx = (float) quad->x0;
-   const float fy = (float) quad->y0;
-   struct gallivm_prog *llvm = qss->llvm_prog;
-
-   inputs[0][0][0] = fx;
-   inputs[1][0][0] = fx + 1.0f;
-   inputs[2][0][0] = fx;
-   inputs[3][0][0] = fx + 1.0f;
-
-   inputs[0][0][1] = fy;
-   inputs[1][0][1] = fy;
-   inputs[2][0][1] = fy + 1.0f;
-   inputs[3][0][1] = fy + 1.0f;
-
-
-   gallivm_prog_inputs_interpolate(llvm, inputs, quad->coef);
-
-#if DLLVM
-   debug_printf("MASK = %d\n", quad->mask);
-   for (int i = 0; i < 4; ++i) {
-      for (int j = 0; j < 2; ++j) {
-         debug_printf("IN(%d,%d) [%f %f %f %f]\n", i, j, 
-                inputs[i][j][0], inputs[i][j][1], inputs[i][j][2], inputs[i][j][3]);
-      }
-   }
-#endif
-
-   quad->mask &=
-      gallivm_fragment_shader_exec(llvm, fx, fy, dests, inputs,
-                                   softpipe->mapped_constants[PIPE_SHADER_FRAGMENT],
-                                   qss->samplers);
-#if DLLVM
-   debug_printf("OUT LLVM = 1[%f %f %f %f], 2[%f %f %f %f]\n",
-          dests[0][0][0], dests[0][0][1], dests[0][0][2], dests[0][0][3], 
-          dests[0][1][0], dests[0][1][1], dests[0][1][2], dests[0][1][3]);
-#endif
-
-   /* store result color */
-   if (qss->colorOutSlot >= 0) {
-      unsigned i;
-      /* XXX need to handle multiple color outputs someday */
-      allvmrt(qss->stage.softpipe->fs->info.output_semantic_name[qss->colorOutSlot]
-             == TGSI_SEMANTIC_COLOR);
-      for (i = 0; i < QUAD_SIZE; ++i) {
-         quad->outputs.color[0][0][i] = dests[i][qss->colorOutSlot][0];
-         quad->outputs.color[0][1][i] = dests[i][qss->colorOutSlot][1];
-         quad->outputs.color[0][2][i] = dests[i][qss->colorOutSlot][2];
-         quad->outputs.color[0][3][i] = dests[i][qss->colorOutSlot][3];
-      }
-   }
-#if DLLVM
-   for (int i = 0; i < QUAD_SIZE; ++i) {
-      debug_printf("QLLVM%d(%d) [%f, %f, %f, %f]\n", i, qss->colorOutSlot,
-             quad->outputs.color[0][0][i],
-             quad->outputs.color[0][1][i],
-             quad->outputs.color[0][2][i],
-             quad->outputs.color[0][3][i]);
-   }
-#endif
-
-   /* store result Z */
-   if (qss->depthOutSlot >= 0) {
-      /* output[slot] is new Z */
-      uint i;
-      for (i = 0; i < 4; i++) {
-         quad->outputs.depth[i] = dests[i][0][2];
-      }
-   }
-   else {
-      /* copy input Z (which was interpolated by the executor) to output Z */
-      uint i;
-      for (i = 0; i < 4; i++) {
-         quad->outputs.depth[i] = inputs[i][0][2];
-      }
-   }
-#if DLLVM
-   debug_printf("D [%f, %f, %f, %f] mask = %d\n",
-             quad->outputs.depth[0],
-             quad->outputs.depth[1],
-             quad->outputs.depth[2],
-             quad->outputs.depth[3], quad->mask);
-#endif
-
-   /* shader may cull fragments */
-   if( quad->mask ) {
-      qs->next->run( qs->next, quad );
-   }
-}
-
-
-unsigned 
-run_llvm_fs( const struct sp_fragment_shader *base,
-	     struct foo *machine )
-{
-}
-
-
-void 
-delete_llvm_fs( struct sp_fragment_shader *base )
-{
-   FREE(base);
-}
-
-
-struct sp_fragment_shader *
-softpipe_create_fs_llvm(struct softpipe_context *softpipe,
-                        const struct pipe_shader_state *templ)
-{
-   struct sp_llvm_fragment_shader *shader = NULL;
-
-   /* LLVM fragment shaders currently disabled:
-    */
-   state = CALLOC_STRUCT(sp_llvm_shader_state);
-   if (!state)
-      return NULL;
-
-   state->llvm_prog = 0;
-
-   if (!gallivm_global_cpu_engine()) {
-      gallivm_cpu_engine_create(state->llvm_prog);
-   }
-   else
-      gallivm_cpu_jit_compile(gallivm_global_cpu_engine(), state->llvm_prog);
-   
-   if (shader) {
-      shader->base.run = run_llvm_fs;
-      shader->base.delete = delete_llvm_fs;
-   }
-
-   return shader;
-}
-
-
-#else
-
-struct sp_fragment_shader *
-softpipe_create_fs_llvm(struct softpipe_context *softpipe,
-		       const struct pipe_shader_state *templ)
-{
-   return NULL;
-}
-
-#endif
diff --git a/src/gallium/drivers/softpipe/sp_fs_sse.c b/src/gallium/drivers/softpipe/sp_fs_sse.c
index f4fa0905d7..9d3e4670ee 100644
--- a/src/gallium/drivers/softpipe/sp_fs_sse.c
+++ b/src/gallium/drivers/softpipe/sp_fs_sse.c
@@ -76,6 +76,43 @@ fs_sse_prepare( const struct sp_fragment_shader *base,
 }
 
 
+
+/**
+ * Compute quad X,Y,Z,W for the four fragments in a quad.
+ *
+ * This should really be part of the compiled shader.
+ */
+static void
+setup_pos_vector(const struct tgsi_interp_coef *coef,
+		    float x, float y,
+		    struct tgsi_exec_vector *quadpos)
+{
+   uint chan;
+   /* do X */
+   quadpos->xyzw[0].f[0] = x;
+   quadpos->xyzw[0].f[1] = x + 1;
+   quadpos->xyzw[0].f[2] = x;
+   quadpos->xyzw[0].f[3] = x + 1;
+
+   /* do Y */
+   quadpos->xyzw[1].f[0] = y;
+   quadpos->xyzw[1].f[1] = y;
+   quadpos->xyzw[1].f[2] = y + 1;
+   quadpos->xyzw[1].f[3] = y + 1;
+
+   /* do Z and W for all fragments in the quad */
+   for (chan = 2; chan < 4; chan++) {
+      const float dadx = coef->dadx[chan];
+      const float dady = coef->dady[chan];
+      const float a0 = coef->a0[chan] + dadx * x + dady * y;
+      quadpos->xyzw[chan].f[0] = a0;
+      quadpos->xyzw[chan].f[1] = a0 + dadx;
+      quadpos->xyzw[chan].f[2] = a0 + dady;
+      quadpos->xyzw[chan].f[3] = a0 + dadx + dady;
+   }
+}
+
+
 /* TODO: codegenerate the whole run function, skip this wrapper.
  * TODO: break dependency on tgsi_exec_machine struct
  * TODO: push Position calculation into the generated shader
@@ -89,9 +126,9 @@ fs_sse_run( const struct sp_fragment_shader *base,
    struct sp_sse_fragment_shader *shader = sp_sse_fragment_shader(base);
 
    /* Compute X, Y, Z, W vals for this quad -- place in temp[0] for now */
-   sp_setup_pos_vector(quad->posCoef, 
-		       (float)quad->input.x0, (float)quad->input.y0, 
-		       machine->Temps);
+   setup_pos_vector(quad->posCoef, 
+                    (float)quad->input.x0, (float)quad->input.y0, 
+                    machine->Temps);
 
    /* init kill mask */
    tgsi_set_kill_mask(machine, 0x0);
@@ -104,7 +141,39 @@ fs_sse_run( const struct sp_fragment_shader *base,
 		 //	 , &machine->QuadPos
       );
 
-   return ~(machine->Temps[TGSI_EXEC_TEMP_KILMASK_I].xyzw[TGSI_EXEC_TEMP_KILMASK_C].u[0]);
+   quad->inout.mask &= ~(machine->Temps[TGSI_EXEC_TEMP_KILMASK_I].xyzw[TGSI_EXEC_TEMP_KILMASK_C].u[0]);
+   if (quad->inout.mask == 0)
+      return FALSE;
+
+   /* store outputs */
+   {
+      const ubyte *sem_name = base->info.output_semantic_name;
+      const ubyte *sem_index = base->info.output_semantic_index;
+      const uint n = base->info.num_outputs;
+      uint i;
+      for (i = 0; i < n; i++) {
+         switch (sem_name[i]) {
+         case TGSI_SEMANTIC_COLOR:
+            {
+               uint cbuf = sem_index[i];
+               memcpy(quad->output.color[cbuf],
+                      &machine->Outputs[i].xyzw[0].f[0],
+                      sizeof(quad->output.color[0]) );
+            }
+            break;
+         case TGSI_SEMANTIC_POSITION:
+            {
+               uint j;
+               for (j = 0; j < 4; j++) {
+                  quad->output.depth[j] = machine->Outputs[0].xyzw[2].f[j];
+               }
+            }
+            break;
+         }
+      }
+   }
+
+   return TRUE;
 }
 
 
diff --git a/src/gallium/drivers/softpipe/sp_prim_setup.c b/src/gallium/drivers/softpipe/sp_prim_setup.c
deleted file mode 100644
index 038ff04d4f..0000000000
--- a/src/gallium/drivers/softpipe/sp_prim_setup.c
+++ /dev/null
@@ -1,190 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-/**
- * \brief A draw stage that drives our triangle setup routines from
- * within the draw pipeline.  One of two ways to drive setup, the
- * other being in sp_prim_vbuf.c.
- *
- * \author  Keith Whitwell <keith@tungstengraphics.com>
- * \author  Brian Paul
- */
-
-
-#include "sp_context.h"
-#include "sp_setup.h"
-#include "sp_state.h"
-#include "sp_prim_setup.h"
-#include "draw/draw_pipe.h"
-#include "draw/draw_vertex.h"
-#include "util/u_memory.h"
-
-/**
- * Triangle setup info (derived from draw_stage).
- * Also used for line drawing (taking some liberties).
- */
-struct setup_stage {
-   struct draw_stage stage; /**< This must be first (base class) */
-
-   struct setup_context *setup;
-};
-
-
-
-/**
- * Basically a cast wrapper.
- */
-static INLINE struct setup_stage *setup_stage( struct draw_stage *stage )
-{
-   return (struct setup_stage *)stage;
-}
-
-
-typedef const float (*cptrf4)[4];
-
-static void
-do_tri(struct draw_stage *stage, struct prim_header *prim)
-{
-   struct setup_stage *setup = setup_stage( stage );
-   
-   setup_tri( setup->setup,
-              (cptrf4)prim->v[0]->data,
-              (cptrf4)prim->v[1]->data,
-              (cptrf4)prim->v[2]->data );
-}
-
-static void
-do_line(struct draw_stage *stage, struct prim_header *prim)
-{
-   struct setup_stage *setup = setup_stage( stage );
-
-   setup_line( setup->setup,
-               (cptrf4)prim->v[0]->data,
-               (cptrf4)prim->v[1]->data );
-}
-
-static void
-do_point(struct draw_stage *stage, struct prim_header *prim)
-{
-   struct setup_stage *setup = setup_stage( stage );
-
-   setup_point( setup->setup,
-                (cptrf4)prim->v[0]->data );
-}
-
-
-
-
-static void setup_begin( struct draw_stage *stage )
-{
-   struct setup_stage *setup = setup_stage(stage);
-
-   setup_prepare( setup->setup );
-
-   stage->point = do_point;
-   stage->line = do_line;
-   stage->tri = do_tri;
-}
-
-
-static void setup_first_point( struct draw_stage *stage,
-			       struct prim_header *header )
-{
-   setup_begin(stage);
-   stage->point( stage, header );
-}
-
-static void setup_first_line( struct draw_stage *stage,
-			       struct prim_header *header )
-{
-   setup_begin(stage);
-   stage->line( stage, header );
-}
-
-
-static void setup_first_tri( struct draw_stage *stage,
-			       struct prim_header *header )
-{
-   setup_begin(stage);
-   stage->tri( stage, header );
-}
-
-
-
-static void setup_flush( struct draw_stage *stage,
-			 unsigned flags )
-{
-   stage->point = setup_first_point;
-   stage->line = setup_first_line;
-   stage->tri = setup_first_tri;
-}
-
-
-static void reset_stipple_counter( struct draw_stage *stage )
-{
-}
-
-
-static void render_destroy( struct draw_stage *stage )
-{
-   struct setup_stage *ssetup = setup_stage(stage);
-   setup_destroy_context(ssetup->setup);
-   FREE( stage );
-}
-
-
-/**
- * Create a new primitive setup/render stage.
- */
-struct draw_stage *sp_draw_render_stage( struct softpipe_context *softpipe )
-{
-   struct setup_stage *sstage = CALLOC_STRUCT(setup_stage);
-
-   sstage->setup = setup_create_context(softpipe);
-   sstage->stage.draw = softpipe->draw;
-   sstage->stage.point = setup_first_point;
-   sstage->stage.line = setup_first_line;
-   sstage->stage.tri = setup_first_tri;
-   sstage->stage.flush = setup_flush;
-   sstage->stage.reset_stipple_counter = reset_stipple_counter;
-   sstage->stage.destroy = render_destroy;
-
-   return (struct draw_stage *)sstage;
-}
-
-struct setup_context *
-sp_draw_setup_context( struct draw_stage *stage )
-{
-   struct setup_stage *ssetup = setup_stage(stage);
-   return ssetup->setup;
-}
-
-void
-sp_draw_flush( struct draw_stage *stage )
-{
-   stage->flush( stage, 0 );
-}
diff --git a/src/gallium/drivers/softpipe/sp_prim_setup.h b/src/gallium/drivers/softpipe/sp_prim_setup.h
deleted file mode 100644
index 49bdd98ed8..0000000000
--- a/src/gallium/drivers/softpipe/sp_prim_setup.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-
-#ifndef SP_PRIM_SETUP_H
-#define SP_PRIM_SETUP_H
-
-
-/**
- * vbuf is a special stage to gather the stream of triangles, lines, points
- * together and reconstruct vertex buffers for hardware upload.
- *
- * First attempt, work in progress.
- * 
- * TODO:
- *    - separate out vertex buffer building and primitive emit, ie >1 draw per vb.
- *    - tell vbuf stage how to build hw vertices directly
- *    - pass vbuf stage a buffer pointer for direct emit to agp/vram.
- *
- *
- *
- * Vertices are just an array of floats, with all the attributes
- * packed.  We currently assume a layout like:
- *
- * attr[0][0..3] - window position
- * attr[1..n][0..3] - remaining attributes.
- *
- * Attributes are assumed to be 4 floats wide but are packed so that
- * all the enabled attributes run contiguously.
- */
-
-
-struct draw_stage;
-struct softpipe_context;
-
-
-typedef void (*vbuf_draw_func)( struct pipe_context *pipe,
-                                unsigned prim,
-                                const ushort *elements,
-                                unsigned nr_elements,
-                                const void *vertex_buffer,
-                                unsigned nr_vertices );
-
-
-extern struct draw_stage *
-sp_draw_render_stage( struct softpipe_context *softpipe );
-
-extern struct setup_context *
-sp_draw_setup_context( struct draw_stage * );
-
-extern void
-sp_draw_flush( struct draw_stage * );
-
-
-extern struct draw_stage *
-sp_draw_vbuf_stage( struct draw_context *draw_context,
-                    struct pipe_context *pipe,
-                    vbuf_draw_func draw );
-
-
-#endif /* SP_PRIM_SETUP_H */
diff --git a/src/gallium/drivers/softpipe/sp_prim_vbuf.c b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
index 42021789ea..e603c20fc4 100644
--- a/src/gallium/drivers/softpipe/sp_prim_vbuf.c
+++ b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
@@ -37,13 +37,13 @@
 
 
 #include "sp_context.h"
+#include "sp_setup.h"
 #include "sp_state.h"
 #include "sp_prim_vbuf.h"
-#include "sp_prim_setup.h"
-#include "sp_setup.h"
 #include "draw/draw_context.h"
 #include "draw/draw_vbuf.h"
 #include "util/u_memory.h"
+#include "util/u_prim.h"
 
 
 #define SP_MAX_VBUF_INDEXES 1024
@@ -58,6 +58,8 @@ struct softpipe_vbuf_render
 {
    struct vbuf_render base;
    struct softpipe_context *softpipe;
+   struct setup_context *setup;
+
    uint prim;
    uint vertex_size;
    uint nr_vertices;
@@ -74,6 +76,11 @@ softpipe_vbuf_render(struct vbuf_render *vbr)
 }
 
 
+
+
+
+
+
 static const struct vertex_info *
 sp_vbuf_get_vertex_info(struct vbuf_render *vbr)
 {
@@ -104,36 +111,6 @@ sp_vbuf_allocate_vertices(struct vbuf_render *vbr,
 static void
 sp_vbuf_release_vertices(struct vbuf_render *vbr)
 {
-#if 0
-   {
-      struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr);
-      const struct vertex_info *info = 
-         softpipe_get_vbuf_vertex_info(cvbr->softpipe);
-      const float *vtx = (const float *) cvbr->vertex_buffer;
-      uint i, j;
-      debug_printf("%s (vtx_size = %u,  vtx_used = %u)\n",
-             __FUNCTION__, cvbr->vertex_size, cvbr->nr_vertices);
-      for (i = 0; i < cvbr->nr_vertices; i++) {
-         for (j = 0; j < info->num_attribs; j++) {
-            uint k;
-            switch (info->attrib[j].emit) {
-            case EMIT_4F:  k = 4;   break;
-            case EMIT_3F:  k = 3;   break;
-            case EMIT_2F:  k = 2;   break;
-            case EMIT_1F:  k = 1;   break;
-            default: assert(0);
-            }
-            debug_printf("Vert %u attr %u: ", i, j);
-            while (k-- > 0) {
-               debug_printf("%g ", vtx[0]);
-               vtx++;
-            }
-            debug_printf("\n");
-         }
-      }
-   }
-#endif
-
    /* keep the old allocation for next time */
 }
 
@@ -159,14 +136,11 @@ static boolean
 sp_vbuf_set_primitive(struct vbuf_render *vbr, unsigned prim)
 {
    struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr);
-
-   /* XXX: break this dependency - make setup_context live under
-    * softpipe, rename the old "setup" draw stage to something else.
-    */
-   struct setup_context *setup_ctx = sp_draw_setup_context(cvbr->softpipe->setup);
+   struct setup_context *setup_ctx = cvbr->setup;
    
    setup_prepare( setup_ctx );
 
+   cvbr->softpipe->reduced_prim = u_reduced_prim(prim);
    cvbr->prim = prim;
    return TRUE;
 
@@ -191,14 +165,9 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
    struct softpipe_context *softpipe = cvbr->softpipe;
    const unsigned stride = softpipe->vertex_info_vbuf.size * sizeof(float);
    const void *vertex_buffer = cvbr->vertex_buffer;
+   struct setup_context *setup_ctx = cvbr->setup;
    unsigned i;
 
-   /* XXX: break this dependency - make setup_context live under
-    * softpipe, rename the old "setup" draw stage to something else.
-    */
-   struct draw_stage *setup = softpipe->setup;
-   struct setup_context *setup_ctx = sp_draw_setup_context(setup);
-
    switch (cvbr->prim) {
    case PIPE_PRIM_POINTS:
       for (i = 0; i < nr; i++) {
@@ -237,14 +206,16 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
       break;
 
    case PIPE_PRIM_TRIANGLES:
-      for (i = 2; i < nr; i += 3) {
-         if (softpipe->rasterizer->flatshade_first) {
+      if (softpipe->rasterizer->flatshade_first) {
+         for (i = 2; i < nr; i += 3) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[i-1], stride),
                        get_vert(vertex_buffer, indices[i-0], stride),
                        get_vert(vertex_buffer, indices[i-2], stride) );
          }
-         else {
+      }
+      else {
+         for (i = 2; i < nr; i += 3) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[i-2], stride),
                        get_vert(vertex_buffer, indices[i-1], stride),
@@ -254,14 +225,16 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
       break;
 
    case PIPE_PRIM_TRIANGLE_STRIP:
-      for (i = 2; i < nr; i += 1) {
-         if (softpipe->rasterizer->flatshade_first) {
+      if (softpipe->rasterizer->flatshade_first) {
+         for (i = 2; i < nr; i += 1) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[i+(i&1)-1], stride),
                        get_vert(vertex_buffer, indices[i-(i&1)], stride),
                        get_vert(vertex_buffer, indices[i-2], stride) );
          }
-         else {
+      }
+      else {
+         for (i = 2; i < nr; i += 1) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[i+(i&1)-2], stride),
                        get_vert(vertex_buffer, indices[i-(i&1)-1], stride),
@@ -271,14 +244,16 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
       break;
 
    case PIPE_PRIM_TRIANGLE_FAN:
-      for (i = 2; i < nr; i += 1) {
-         if (softpipe->rasterizer->flatshade_first) {
+      if (softpipe->rasterizer->flatshade_first) {
+         for (i = 2; i < nr; i += 1) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[i-0], stride),
                        get_vert(vertex_buffer, indices[0], stride),
                        get_vert(vertex_buffer, indices[i-1], stride) );
          }
-         else {
+      }
+      else {
+         for (i = 2; i < nr; i += 1) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[0], stride),
                        get_vert(vertex_buffer, indices[i-1], stride),
@@ -288,8 +263,8 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
       break;
 
    case PIPE_PRIM_QUADS:
-      for (i = 3; i < nr; i += 4) {
-         if (softpipe->rasterizer->flatshade_first) {
+      if (softpipe->rasterizer->flatshade_first) {
+         for (i = 3; i < nr; i += 4) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[i-2], stride),
                        get_vert(vertex_buffer, indices[i-1], stride),
@@ -299,7 +274,9 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
                        get_vert(vertex_buffer, indices[i-0], stride),
                        get_vert(vertex_buffer, indices[i-3], stride) );
          }
-         else {
+      }
+      else {
+         for (i = 3; i < nr; i += 4) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[i-3], stride),
                        get_vert(vertex_buffer, indices[i-2], stride),
@@ -314,8 +291,8 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
       break;
 
    case PIPE_PRIM_QUAD_STRIP:
-      for (i = 3; i < nr; i += 2) {
-         if (softpipe->rasterizer->flatshade_first) {
+      if (softpipe->rasterizer->flatshade_first) {
+         for (i = 3; i < nr; i += 2) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[i-0], stride),
                        get_vert(vertex_buffer, indices[i-1], stride),
@@ -325,7 +302,9 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
                        get_vert(vertex_buffer, indices[i-0], stride),
                        get_vert(vertex_buffer, indices[i-3], stride) );
          }
-         else {
+      }
+      else {
+         for (i = 3; i < nr; i += 2) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[i-3], stride),
                        get_vert(vertex_buffer, indices[i-2], stride),
@@ -355,11 +334,6 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
    default:
       assert(0);
    }
-
-   /* XXX: why are we calling this???  If we had to call something, it
-    * would be a function in sp_setup.c:
-    */
-   sp_draw_flush( setup );
 }
 
 
@@ -372,17 +346,12 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
 {
    struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr);
    struct softpipe_context *softpipe = cvbr->softpipe;
+   struct setup_context *setup_ctx = cvbr->setup;
    const unsigned stride = softpipe->vertex_info_vbuf.size * sizeof(float);
    const void *vertex_buffer =
       (void *) get_vert(cvbr->vertex_buffer, start, stride);
    unsigned i;
 
-   /* XXX: break this dependency - make setup_context live under
-    * softpipe, rename the old "setup" draw stage to something else.
-    */
-   struct draw_stage *setup = softpipe->setup;
-   struct setup_context *setup_ctx = sp_draw_setup_context(setup);
-
    switch (cvbr->prim) {
    case PIPE_PRIM_POINTS:
       for (i = 0; i < nr; i++) {
@@ -421,14 +390,16 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
       break;
 
    case PIPE_PRIM_TRIANGLES:
-      for (i = 2; i < nr; i += 3) {
-         if (softpipe->rasterizer->flatshade_first) {
+      if (softpipe->rasterizer->flatshade_first) {
+         for (i = 2; i < nr; i += 3) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, i-1, stride),
                        get_vert(vertex_buffer, i-0, stride),
                        get_vert(vertex_buffer, i-2, stride) );
          }
-         else {
+      }
+      else {
+         for (i = 2; i < nr; i += 3) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, i-2, stride),
                        get_vert(vertex_buffer, i-1, stride),
@@ -438,14 +409,16 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
       break;
 
    case PIPE_PRIM_TRIANGLE_STRIP:
-      for (i = 2; i < nr; i++) {
-         if (softpipe->rasterizer->flatshade_first) {
+      if (softpipe->rasterizer->flatshade_first) {
+         for (i = 2; i < nr; i++) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, i+(i&1)-1, stride),
                        get_vert(vertex_buffer, i-(i&1), stride),
                        get_vert(vertex_buffer, i-2, stride) );
          }
-         else {
+      }
+      else {
+         for (i = 2; i < nr; i++) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, i+(i&1)-2, stride),
                        get_vert(vertex_buffer, i-(i&1)-1, stride),
@@ -455,14 +428,16 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
       break;
 
    case PIPE_PRIM_TRIANGLE_FAN:
-      for (i = 2; i < nr; i += 1) {
-         if (softpipe->rasterizer->flatshade_first) {
+      if (softpipe->rasterizer->flatshade_first) {
+         for (i = 2; i < nr; i += 1) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, i-0, stride),
                        get_vert(vertex_buffer, 0, stride),
                        get_vert(vertex_buffer, i-1, stride) );
          }
-         else {
+      }
+      else {
+         for (i = 2; i < nr; i += 1) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, 0, stride),
                        get_vert(vertex_buffer, i-1, stride),
@@ -472,8 +447,8 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
       break;
 
    case PIPE_PRIM_QUADS:
-      for (i = 3; i < nr; i += 4) {
-         if (softpipe->rasterizer->flatshade_first) {
+      if (softpipe->rasterizer->flatshade_first) {
+         for (i = 3; i < nr; i += 4) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, i-2, stride),
                        get_vert(vertex_buffer, i-1, stride),
@@ -483,7 +458,9 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
                        get_vert(vertex_buffer, i-0, stride),
                        get_vert(vertex_buffer, i-3, stride) );
          }
-         else {
+      }
+      else {
+         for (i = 3; i < nr; i += 4) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, i-3, stride),
                        get_vert(vertex_buffer, i-2, stride),
@@ -497,8 +474,8 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
       break;
 
    case PIPE_PRIM_QUAD_STRIP:
-      for (i = 3; i < nr; i += 2) {
-         if (softpipe->rasterizer->flatshade_first) {
+      if (softpipe->rasterizer->flatshade_first) {
+         for (i = 3; i < nr; i += 2) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, i-0, stride),
                        get_vert(vertex_buffer, i-1, stride),
@@ -508,7 +485,9 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
                        get_vert(vertex_buffer, i-0, stride),
                        get_vert(vertex_buffer, i-3, stride) );
          }
-         else {
+      }
+      else {
+         for (i = 3; i < nr; i += 2) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, i-3, stride),
                        get_vert(vertex_buffer, i-2, stride),
@@ -546,40 +525,38 @@ static void
 sp_vbuf_destroy(struct vbuf_render *vbr)
 {
    struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr);
-   cvbr->softpipe->vbuf_render = NULL;
+   setup_destroy_context(cvbr->setup);
    FREE(cvbr);
 }
 
 
 /**
- * Initialize the post-transform vertex buffer information for the given
- * context.
+ * Create the post-transform vertex handler for the given context.
  */
-void
-sp_init_vbuf(struct softpipe_context *sp)
+struct vbuf_render *
+sp_create_vbuf_backend(struct softpipe_context *sp)
 {
-   assert(sp->draw);
+   struct softpipe_vbuf_render *cvbr = CALLOC_STRUCT(softpipe_vbuf_render);
 
-   sp->vbuf_render = CALLOC_STRUCT(softpipe_vbuf_render);
+   assert(sp->draw);
 
-   sp->vbuf_render->base.max_indices = SP_MAX_VBUF_INDEXES;
-   sp->vbuf_render->base.max_vertex_buffer_bytes = SP_MAX_VBUF_SIZE;
 
-   sp->vbuf_render->base.get_vertex_info = sp_vbuf_get_vertex_info;
-   sp->vbuf_render->base.allocate_vertices = sp_vbuf_allocate_vertices;
-   sp->vbuf_render->base.map_vertices = sp_vbuf_map_vertices;
-   sp->vbuf_render->base.unmap_vertices = sp_vbuf_unmap_vertices;
-   sp->vbuf_render->base.set_primitive = sp_vbuf_set_primitive;
-   sp->vbuf_render->base.draw = sp_vbuf_draw;
-   sp->vbuf_render->base.draw_arrays = sp_vbuf_draw_arrays;
-   sp->vbuf_render->base.release_vertices = sp_vbuf_release_vertices;
-   sp->vbuf_render->base.destroy = sp_vbuf_destroy;
+   cvbr->base.max_indices = SP_MAX_VBUF_INDEXES;
+   cvbr->base.max_vertex_buffer_bytes = SP_MAX_VBUF_SIZE;
 
-   sp->vbuf_render->softpipe = sp;
+   cvbr->base.get_vertex_info = sp_vbuf_get_vertex_info;
+   cvbr->base.allocate_vertices = sp_vbuf_allocate_vertices;
+   cvbr->base.map_vertices = sp_vbuf_map_vertices;
+   cvbr->base.unmap_vertices = sp_vbuf_unmap_vertices;
+   cvbr->base.set_primitive = sp_vbuf_set_primitive;
+   cvbr->base.draw = sp_vbuf_draw;
+   cvbr->base.draw_arrays = sp_vbuf_draw_arrays;
+   cvbr->base.release_vertices = sp_vbuf_release_vertices;
+   cvbr->base.destroy = sp_vbuf_destroy;
 
-   sp->vbuf = draw_vbuf_stage(sp->draw, &sp->vbuf_render->base);
+   cvbr->softpipe = sp;
 
-   draw_set_rasterize_stage(sp->draw, sp->vbuf);
+   cvbr->setup = setup_create_context(cvbr->softpipe);
 
-   draw_set_render(sp->draw, &sp->vbuf_render->base);
+   return &cvbr->base;
 }
diff --git a/src/gallium/drivers/softpipe/sp_prim_vbuf.h b/src/gallium/drivers/softpipe/sp_prim_vbuf.h
index 1de9cc2a89..ad01cc2f28 100644
--- a/src/gallium/drivers/softpipe/sp_prim_vbuf.h
+++ b/src/gallium/drivers/softpipe/sp_prim_vbuf.h
@@ -31,8 +31,8 @@
 
 struct softpipe_context;
 
-extern void
-sp_init_vbuf(struct softpipe_context *softpipe);
+extern struct vbuf_render *
+sp_create_vbuf_backend(struct softpipe_context *softpipe);
 
 
 #endif /* SP_VBUF_H */
diff --git a/src/gallium/drivers/softpipe/sp_quad.h b/src/gallium/drivers/softpipe/sp_quad.h
index bd6c6cb912..a3236bd116 100644
--- a/src/gallium/drivers/softpipe/sp_quad.h
+++ b/src/gallium/drivers/softpipe/sp_quad.h
@@ -97,10 +97,10 @@ struct quad_header {
    struct quad_header_inout inout;
    struct quad_header_output output;
 
-   const struct tgsi_interp_coef *coef;
+   /* Redundant/duplicated:
+    */
    const struct tgsi_interp_coef *posCoef;
-
-   unsigned nr_attrs;
+   const struct tgsi_interp_coef *coef;
 };
 
 #endif /* SP_QUAD_H */
diff --git a/src/gallium/drivers/softpipe/sp_quad_alpha_test.c b/src/gallium/drivers/softpipe/sp_quad_alpha_test.c
deleted file mode 100644
index 0845bae0e6..0000000000
--- a/src/gallium/drivers/softpipe/sp_quad_alpha_test.c
+++ /dev/null
@@ -1,108 +0,0 @@
-
-/**
- * quad alpha test
- */
-
-#include "sp_context.h"
-#include "sp_quad.h"
-#include "sp_quad_pipe.h"
-#include "pipe/p_defines.h"
-#include "util/u_memory.h"
-
-
-static void
-alpha_test_quad(struct quad_stage *qs, struct quad_header *quad)
-{
-   struct softpipe_context *softpipe = qs->softpipe;
-   const float ref = softpipe->depth_stencil->alpha.ref_value;
-   unsigned passMask = 0x0, j;
-   const uint cbuf = 0; /* only output[0].alpha is tested */
-   const float *aaaa = quad->output.color[cbuf][3];
-
-   switch (softpipe->depth_stencil->alpha.func) {
-   case PIPE_FUNC_NEVER:
-      break;
-   case PIPE_FUNC_LESS:
-      /*
-       * If mask were an array [4] we could do this SIMD-style:
-       * passMask = (quad->outputs.color[0][3] <= vec4(ref));
-       */
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (aaaa[j] < ref) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_EQUAL:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (aaaa[j] == ref) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_LEQUAL:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (aaaa[j] <= ref) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_GREATER:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (aaaa[j] > ref) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_NOTEQUAL:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (aaaa[j] != ref) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_GEQUAL:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (aaaa[j] >= ref) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_ALWAYS:
-      passMask = MASK_ALL;
-      break;
-   default:
-      assert(0);
-   }
-
-   quad->inout.mask &= passMask;
-
-   if (quad->inout.mask)
-      qs->next->run(qs->next, quad);
-}
-
-
-static void alpha_test_begin(struct quad_stage *qs)
-{
-   qs->next->begin(qs->next);
-}
-
-
-static void alpha_test_destroy(struct quad_stage *qs)
-{
-   FREE( qs );
-}
-
-
-struct quad_stage *
-sp_quad_alpha_test_stage( struct softpipe_context *softpipe )
-{
-   struct quad_stage *stage = CALLOC_STRUCT(quad_stage);
-
-   stage->softpipe = softpipe;
-   stage->begin = alpha_test_begin;
-   stage->run = alpha_test_quad;
-   stage->destroy = alpha_test_destroy;
-
-   return stage;
-}
diff --git a/src/gallium/drivers/softpipe/sp_quad_blend.c b/src/gallium/drivers/softpipe/sp_quad_blend.c
index b1e18805c7..e243c63fa2 100644
--- a/src/gallium/drivers/softpipe/sp_quad_blend.c
+++ b/src/gallium/drivers/softpipe/sp_quad_blend.c
@@ -117,644 +117,865 @@ do { \
 
 
 static void
-logicop_quad(struct quad_stage *qs, struct quad_header *quad)
+logicop_quad(struct quad_stage *qs, 
+             float (*quadColor)[4],
+             float (*dest)[4])
 {
    struct softpipe_context *softpipe = qs->softpipe;
-   uint cbuf;
+   ubyte src[4][4], dst[4][4], res[4][4];
+   uint *src4 = (uint *) src;
+   uint *dst4 = (uint *) dst;
+   uint *res4 = (uint *) res;
+   uint j;
+
+
+   /* convert to ubyte */
+   for (j = 0; j < 4; j++) { /* loop over R,G,B,A channels */
+      dst[j][0] = float_to_ubyte(dest[j][0]); /* P0 */
+      dst[j][1] = float_to_ubyte(dest[j][1]); /* P1 */
+      dst[j][2] = float_to_ubyte(dest[j][2]); /* P2 */
+      dst[j][3] = float_to_ubyte(dest[j][3]); /* P3 */
+
+      src[j][0] = float_to_ubyte(quadColor[j][0]); /* P0 */
+      src[j][1] = float_to_ubyte(quadColor[j][1]); /* P1 */
+      src[j][2] = float_to_ubyte(quadColor[j][2]); /* P2 */
+      src[j][3] = float_to_ubyte(quadColor[j][3]); /* P3 */
+   }
 
-   /* loop over colorbuffer outputs */
-   for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) {
-      float dest[4][QUAD_SIZE];
-      ubyte src[4][4], dst[4][4], res[4][4];
-      uint *src4 = (uint *) src;
-      uint *dst4 = (uint *) dst;
-      uint *res4 = (uint *) res;
-      struct softpipe_cached_tile *
-         tile = sp_get_cached_tile(softpipe,
-                                   softpipe->cbuf_cache[cbuf],
-                                   quad->input.x0, quad->input.y0);
-      float (*quadColor)[4] = quad->output.color[cbuf];
-      uint i, j;
+   switch (softpipe->blend->logicop_func) {
+   case PIPE_LOGICOP_CLEAR:
+      for (j = 0; j < 4; j++)
+         res4[j] = 0;
+      break;
+   case PIPE_LOGICOP_NOR:
+      for (j = 0; j < 4; j++)
+         res4[j] = ~(src4[j] | dst4[j]);
+      break;
+   case PIPE_LOGICOP_AND_INVERTED:
+      for (j = 0; j < 4; j++)
+         res4[j] = ~src4[j] & dst4[j];
+      break;
+   case PIPE_LOGICOP_COPY_INVERTED:
+      for (j = 0; j < 4; j++)
+         res4[j] = ~src4[j];
+      break;
+   case PIPE_LOGICOP_AND_REVERSE:
+      for (j = 0; j < 4; j++)
+         res4[j] = src4[j] & ~dst4[j];
+      break;
+   case PIPE_LOGICOP_INVERT:
+      for (j = 0; j < 4; j++)
+         res4[j] = ~dst4[j];
+      break;
+   case PIPE_LOGICOP_XOR:
+      for (j = 0; j < 4; j++)
+         res4[j] = dst4[j] ^ src4[j];
+      break;
+   case PIPE_LOGICOP_NAND:
+      for (j = 0; j < 4; j++)
+         res4[j] = ~(src4[j] & dst4[j]);
+      break;
+   case PIPE_LOGICOP_AND:
+      for (j = 0; j < 4; j++)
+         res4[j] = src4[j] & dst4[j];
+      break;
+   case PIPE_LOGICOP_EQUIV:
+      for (j = 0; j < 4; j++)
+         res4[j] = ~(src4[j] ^ dst4[j]);
+      break;
+   case PIPE_LOGICOP_NOOP:
+      for (j = 0; j < 4; j++)
+         res4[j] = dst4[j];
+      break;
+   case PIPE_LOGICOP_OR_INVERTED:
+      for (j = 0; j < 4; j++)
+         res4[j] = ~src4[j] | dst4[j];
+      break;
+   case PIPE_LOGICOP_COPY:
+      for (j = 0; j < 4; j++)
+         res4[j] = src4[j];
+      break;
+   case PIPE_LOGICOP_OR_REVERSE:
+      for (j = 0; j < 4; j++)
+         res4[j] = src4[j] | ~dst4[j];
+      break;
+   case PIPE_LOGICOP_OR:
+      for (j = 0; j < 4; j++)
+         res4[j] = src4[j] | dst4[j];
+      break;
+   case PIPE_LOGICOP_SET:
+      for (j = 0; j < 4; j++)
+         res4[j] = ~0;
+      break;
+   default:
+      assert(0);
+   }
 
-      /* get/swizzle dest colors */
-      for (j = 0; j < QUAD_SIZE; j++) {
-         int x = (quad->input.x0 & (TILE_SIZE-1)) + (j & 1);
-         int y = (quad->input.y0 & (TILE_SIZE-1)) + (j >> 1);
-         for (i = 0; i < 4; i++) {
-            dest[i][j] = tile->data.color[y][x][i];
-         }
-      }
+   for (j = 0; j < 4; j++) {
+      quadColor[j][0] = ubyte_to_float(res[j][0]);
+      quadColor[j][1] = ubyte_to_float(res[j][1]);
+      quadColor[j][2] = ubyte_to_float(res[j][2]);
+      quadColor[j][3] = ubyte_to_float(res[j][3]);
+   }
+}
 
-      /* convert to ubyte */
-      for (j = 0; j < 4; j++) { /* loop over R,G,B,A channels */
-         dst[j][0] = float_to_ubyte(dest[j][0]); /* P0 */
-         dst[j][1] = float_to_ubyte(dest[j][1]); /* P1 */
-         dst[j][2] = float_to_ubyte(dest[j][2]); /* P2 */
-         dst[j][3] = float_to_ubyte(dest[j][3]); /* P3 */
-
-         src[j][0] = float_to_ubyte(quadColor[j][0]); /* P0 */
-         src[j][1] = float_to_ubyte(quadColor[j][1]); /* P1 */
-         src[j][2] = float_to_ubyte(quadColor[j][2]); /* P2 */
-         src[j][3] = float_to_ubyte(quadColor[j][3]); /* P3 */
-      }
 
-      switch (softpipe->blend->logicop_func) {
-      case PIPE_LOGICOP_CLEAR:
-         for (j = 0; j < 4; j++)
-            res4[j] = 0;
-         break;
-      case PIPE_LOGICOP_NOR:
-         for (j = 0; j < 4; j++)
-            res4[j] = ~(src4[j] | dst4[j]);
-         break;
-      case PIPE_LOGICOP_AND_INVERTED:
-         for (j = 0; j < 4; j++)
-            res4[j] = ~src4[j] & dst4[j];
-         break;
-      case PIPE_LOGICOP_COPY_INVERTED:
-         for (j = 0; j < 4; j++)
-            res4[j] = ~src4[j];
-         break;
-      case PIPE_LOGICOP_AND_REVERSE:
-         for (j = 0; j < 4; j++)
-            res4[j] = src4[j] & ~dst4[j];
-         break;
-      case PIPE_LOGICOP_INVERT:
-         for (j = 0; j < 4; j++)
-            res4[j] = ~dst4[j];
-         break;
-      case PIPE_LOGICOP_XOR:
-         for (j = 0; j < 4; j++)
-            res4[j] = dst4[j] ^ src4[j];
-         break;
-      case PIPE_LOGICOP_NAND:
-         for (j = 0; j < 4; j++)
-            res4[j] = ~(src4[j] & dst4[j]);
-         break;
-      case PIPE_LOGICOP_AND:
-         for (j = 0; j < 4; j++)
-            res4[j] = src4[j] & dst4[j];
-         break;
-      case PIPE_LOGICOP_EQUIV:
-         for (j = 0; j < 4; j++)
-            res4[j] = ~(src4[j] ^ dst4[j]);
-         break;
-      case PIPE_LOGICOP_NOOP:
-         for (j = 0; j < 4; j++)
-            res4[j] = dst4[j];
-         break;
-      case PIPE_LOGICOP_OR_INVERTED:
-         for (j = 0; j < 4; j++)
-            res4[j] = ~src4[j] | dst4[j];
-         break;
-      case PIPE_LOGICOP_COPY:
-         for (j = 0; j < 4; j++)
-            res4[j] = src4[j];
-         break;
-      case PIPE_LOGICOP_OR_REVERSE:
-         for (j = 0; j < 4; j++)
-            res4[j] = src4[j] | ~dst4[j];
-         break;
-      case PIPE_LOGICOP_OR:
-         for (j = 0; j < 4; j++)
-            res4[j] = src4[j] | dst4[j];
-         break;
-      case PIPE_LOGICOP_SET:
-         for (j = 0; j < 4; j++)
-            res4[j] = ~0;
-         break;
-      default:
-         assert(0);
-      }
 
-      for (j = 0; j < 4; j++) {
-         quadColor[j][0] = ubyte_to_float(res[j][0]);
-         quadColor[j][1] = ubyte_to_float(res[j][1]);
-         quadColor[j][2] = ubyte_to_float(res[j][2]);
-         quadColor[j][3] = ubyte_to_float(res[j][3]);
-      }
+static void
+blend_quad(struct quad_stage *qs, 
+           float (*quadColor)[4],
+           float (*dest)[4])
+{
+   static const float zero[4] = { 0, 0, 0, 0 };
+   static const float one[4] = { 1, 1, 1, 1 };
+   struct softpipe_context *softpipe = qs->softpipe;
+   float source[4][QUAD_SIZE];
+
+   /*
+    * Compute src/first term RGB
+    */
+   switch (softpipe->blend->rgb_src_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      VEC4_COPY(source[0], quadColor[0]); /* R */
+      VEC4_COPY(source[1], quadColor[1]); /* G */
+      VEC4_COPY(source[2], quadColor[2]); /* B */
+      break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      VEC4_MUL(source[0], quadColor[0], quadColor[0]); /* R */
+      VEC4_MUL(source[1], quadColor[1], quadColor[1]); /* G */
+      VEC4_MUL(source[2], quadColor[2], quadColor[2]); /* B */
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+   {
+      const float *alpha = quadColor[3];
+      VEC4_MUL(source[0], quadColor[0], alpha); /* R */
+      VEC4_MUL(source[1], quadColor[1], alpha); /* G */
+      VEC4_MUL(source[2], quadColor[2], alpha); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      VEC4_MUL(source[0], quadColor[0], dest[0]); /* R */
+      VEC4_MUL(source[1], quadColor[1], dest[1]); /* G */
+      VEC4_MUL(source[2], quadColor[2], dest[2]); /* B */
+      break;
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+   {
+      const float *alpha = dest[3];
+      VEC4_MUL(source[0], quadColor[0], alpha); /* R */
+      VEC4_MUL(source[1], quadColor[1], alpha); /* G */
+      VEC4_MUL(source[2], quadColor[2], alpha); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+   {
+      const float *alpha = quadColor[3];
+      float diff[4], temp[4];
+      VEC4_SUB(diff, one, dest[3]);
+      VEC4_MIN(temp, alpha, diff);
+      VEC4_MUL(source[0], quadColor[0], temp); /* R */
+      VEC4_MUL(source[1], quadColor[1], temp); /* G */
+      VEC4_MUL(source[2], quadColor[2], temp); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+   {
+      float comp[4];
+      VEC4_SCALAR(comp, softpipe->blend_color.color[0]); /* R */
+      VEC4_MUL(source[0], quadColor[0], comp); /* R */
+      VEC4_SCALAR(comp, softpipe->blend_color.color[1]); /* G */
+      VEC4_MUL(source[1], quadColor[1], comp); /* G */
+      VEC4_SCALAR(comp, softpipe->blend_color.color[2]); /* B */
+      VEC4_MUL(source[2], quadColor[2], comp); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+   {
+      float alpha[4];
+      VEC4_SCALAR(alpha, softpipe->blend_color.color[3]);
+      VEC4_MUL(source[0], quadColor[0], alpha); /* R */
+      VEC4_MUL(source[1], quadColor[1], alpha); /* G */
+      VEC4_MUL(source[2], quadColor[2], alpha); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+      assert(0); /* to do */
+      break;
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+      assert(0); /* to do */
+      break;
+   case PIPE_BLENDFACTOR_ZERO:
+      VEC4_COPY(source[0], zero); /* R */
+      VEC4_COPY(source[1], zero); /* G */
+      VEC4_COPY(source[2], zero); /* B */
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+   {
+      float inv_comp[4];
+      VEC4_SUB(inv_comp, one, quadColor[0]); /* R */
+      VEC4_MUL(source[0], quadColor[0], inv_comp); /* R */
+      VEC4_SUB(inv_comp, one, quadColor[1]); /* G */
+      VEC4_MUL(source[1], quadColor[1], inv_comp); /* G */
+      VEC4_SUB(inv_comp, one, quadColor[2]); /* B */
+      VEC4_MUL(source[2], quadColor[2], inv_comp); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+   {
+      float inv_alpha[4];
+      VEC4_SUB(inv_alpha, one, quadColor[3]);
+      VEC4_MUL(source[0], quadColor[0], inv_alpha); /* R */
+      VEC4_MUL(source[1], quadColor[1], inv_alpha); /* G */
+      VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+   {
+      float inv_alpha[4];
+      VEC4_SUB(inv_alpha, one, dest[3]);
+      VEC4_MUL(source[0], quadColor[0], inv_alpha); /* R */
+      VEC4_MUL(source[1], quadColor[1], inv_alpha); /* G */
+      VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+   {
+      float inv_comp[4];
+      VEC4_SUB(inv_comp, one, dest[0]); /* R */
+      VEC4_MUL(source[0], quadColor[0], inv_comp); /* R */
+      VEC4_SUB(inv_comp, one, dest[1]); /* G */
+      VEC4_MUL(source[1], quadColor[1], inv_comp); /* G */
+      VEC4_SUB(inv_comp, one, dest[2]); /* B */
+      VEC4_MUL(source[2], quadColor[2], inv_comp); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+   {
+      float inv_comp[4];
+      /* R */
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[0]);
+      VEC4_MUL(source[0], quadColor[0], inv_comp);
+      /* G */
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[1]);
+      VEC4_MUL(source[1], quadColor[1], inv_comp);
+      /* B */
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[2]);
+      VEC4_MUL(source[2], quadColor[2], inv_comp);
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+   {
+      float inv_alpha[4];
+      VEC4_SCALAR(inv_alpha, 1.0f - softpipe->blend_color.color[3]);
+      VEC4_MUL(source[0], quadColor[0], inv_alpha); /* R */
+      VEC4_MUL(source[1], quadColor[1], inv_alpha); /* G */
+      VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+      assert(0); /* to do */
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+      assert(0); /* to do */
+      break;
+   default:
+      assert(0);
+   }
+
+   /*
+    * Compute src/first term A
+    */
+   switch (softpipe->blend->alpha_src_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      VEC4_COPY(source[3], quadColor[3]); /* A */
+      break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+   {
+      const float *alpha = quadColor[3];
+      VEC4_MUL(source[3], quadColor[3], alpha); /* A */
+   }
+   break;
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      VEC4_MUL(source[3], quadColor[3], dest[3]); /* A */
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      /* multiply alpha by 1.0 */
+      VEC4_COPY(source[3], quadColor[3]); /* A */
+      break;
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+   {
+      float comp[4];
+      VEC4_SCALAR(comp, softpipe->blend_color.color[3]); /* A */
+      VEC4_MUL(source[3], quadColor[3], comp); /* A */
+   }
+   break;
+   case PIPE_BLENDFACTOR_ZERO:
+      VEC4_COPY(source[3], zero); /* A */
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+   {
+      float inv_alpha[4];
+      VEC4_SUB(inv_alpha, one, quadColor[3]);
+      VEC4_MUL(source[3], quadColor[3], inv_alpha); /* A */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+   {
+      float inv_alpha[4];
+      VEC4_SUB(inv_alpha, one, dest[3]);
+      VEC4_MUL(source[3], quadColor[3], inv_alpha); /* A */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+   {
+      float inv_comp[4];
+      /* A */
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[3]);
+      VEC4_MUL(source[3], quadColor[3], inv_comp);
+   }
+   break;
+   default:
+      assert(0);
    }
 
-   /* pass quad to next stage */
-   qs->next->run(qs->next, quad);
-}
 
+   /*
+    * Compute dest/second term RGB
+    */
+   switch (softpipe->blend->rgb_dst_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      /* dest = dest * 1   NO-OP, leave dest as-is */
+      break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      VEC4_MUL(dest[0], dest[0], quadColor[0]); /* R */
+      VEC4_MUL(dest[1], dest[1], quadColor[1]); /* G */
+      VEC4_MUL(dest[2], dest[2], quadColor[2]); /* B */
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      VEC4_MUL(dest[0], dest[0], quadColor[3]); /* R * A */
+      VEC4_MUL(dest[1], dest[1], quadColor[3]); /* G * A */
+      VEC4_MUL(dest[2], dest[2], quadColor[3]); /* B * A */
+      break;
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      VEC4_MUL(dest[0], dest[0], dest[3]); /* R * A */
+      VEC4_MUL(dest[1], dest[1], dest[3]); /* G * A */
+      VEC4_MUL(dest[2], dest[2], dest[3]); /* B * A */
+      break;
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      VEC4_MUL(dest[0], dest[0], dest[0]); /* R */
+      VEC4_MUL(dest[1], dest[1], dest[1]); /* G */
+      VEC4_MUL(dest[2], dest[2], dest[2]); /* B */
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      assert(0); /* illegal */
+      break;
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+   {
+      float comp[4];
+      VEC4_SCALAR(comp, softpipe->blend_color.color[0]); /* R */
+      VEC4_MUL(dest[0], dest[0], comp); /* R */
+      VEC4_SCALAR(comp, softpipe->blend_color.color[1]); /* G */
+      VEC4_MUL(dest[1], dest[1], comp); /* G */
+      VEC4_SCALAR(comp, softpipe->blend_color.color[2]); /* B */
+      VEC4_MUL(dest[2], dest[2], comp); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+   {
+      float comp[4];
+      VEC4_SCALAR(comp, softpipe->blend_color.color[3]); /* A */
+      VEC4_MUL(dest[0], dest[0], comp); /* R */
+      VEC4_MUL(dest[1], dest[1], comp); /* G */
+      VEC4_MUL(dest[2], dest[2], comp); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_ZERO:
+      VEC4_COPY(dest[0], zero); /* R */
+      VEC4_COPY(dest[1], zero); /* G */
+      VEC4_COPY(dest[2], zero); /* B */
+      break;
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+      /* XXX what are these? */
+      assert(0);
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+   {
+      float inv_comp[4];
+      VEC4_SUB(inv_comp, one, quadColor[0]); /* R */
+      VEC4_MUL(dest[0], inv_comp, dest[0]); /* R */
+      VEC4_SUB(inv_comp, one, quadColor[1]); /* G */
+      VEC4_MUL(dest[1], inv_comp, dest[1]); /* G */
+      VEC4_SUB(inv_comp, one, quadColor[2]); /* B */
+      VEC4_MUL(dest[2], inv_comp, dest[2]); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+   {
+      float one_minus_alpha[QUAD_SIZE];
+      VEC4_SUB(one_minus_alpha, one, quadColor[3]);
+      VEC4_MUL(dest[0], dest[0], one_minus_alpha); /* R */
+      VEC4_MUL(dest[1], dest[1], one_minus_alpha); /* G */
+      VEC4_MUL(dest[2], dest[2], one_minus_alpha); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+   {
+      float inv_comp[4];
+      VEC4_SUB(inv_comp, one, dest[3]); /* A */
+      VEC4_MUL(dest[0], inv_comp, dest[0]); /* R */
+      VEC4_MUL(dest[1], inv_comp, dest[1]); /* G */
+      VEC4_MUL(dest[2], inv_comp, dest[2]); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+   {
+      float inv_comp[4];
+      VEC4_SUB(inv_comp, one, dest[0]); /* R */
+      VEC4_MUL(dest[0], dest[0], inv_comp); /* R */
+      VEC4_SUB(inv_comp, one, dest[1]); /* G */
+      VEC4_MUL(dest[1], dest[1], inv_comp); /* G */
+      VEC4_SUB(inv_comp, one, dest[2]); /* B */
+      VEC4_MUL(dest[2], dest[2], inv_comp); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+   {
+      float inv_comp[4];
+      /* R */
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[0]);
+      VEC4_MUL(dest[0], dest[0], inv_comp);
+      /* G */
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[1]);
+      VEC4_MUL(dest[1], dest[1], inv_comp);
+      /* B */
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[2]);
+      VEC4_MUL(dest[2], dest[2], inv_comp);
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+   {
+      float inv_comp[4];
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[3]);
+      VEC4_MUL(dest[0], dest[0], inv_comp);
+      VEC4_MUL(dest[1], dest[1], inv_comp);
+      VEC4_MUL(dest[2], dest[2], inv_comp);
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+      /* XXX what are these? */
+      assert(0);
+      break;
+   default:
+      assert(0);
+   }
 
+   /*
+    * Compute dest/second term A
+    */
+   switch (softpipe->blend->alpha_dst_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      /* dest = dest * 1   NO-OP, leave dest as-is */
+      break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      VEC4_MUL(dest[3], dest[3], quadColor[3]); /* A * A */
+      break;
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      VEC4_MUL(dest[3], dest[3], dest[3]); /* A */
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      assert(0); /* illegal */
+      break;
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+   {
+      float comp[4];
+      VEC4_SCALAR(comp, softpipe->blend_color.color[3]); /* A */
+      VEC4_MUL(dest[3], dest[3], comp); /* A */
+   }
+   break;
+   case PIPE_BLENDFACTOR_ZERO:
+      VEC4_COPY(dest[3], zero); /* A */
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+   {
+      float one_minus_alpha[QUAD_SIZE];
+      VEC4_SUB(one_minus_alpha, one, quadColor[3]);
+      VEC4_MUL(dest[3], dest[3], one_minus_alpha); /* A */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+   {
+      float inv_comp[4];
+      VEC4_SUB(inv_comp, one, dest[3]); /* A */
+      VEC4_MUL(dest[3], inv_comp, dest[3]); /* A */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+   {
+      float inv_comp[4];
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[3]);
+      VEC4_MUL(dest[3], dest[3], inv_comp);
+   }
+   break;
+   default:
+      assert(0);
+   }
 
+   /*
+    * Combine RGB terms
+    */
+   switch (softpipe->blend->rgb_func) {
+   case PIPE_BLEND_ADD:
+      VEC4_ADD_SAT(quadColor[0], source[0], dest[0]); /* R */
+      VEC4_ADD_SAT(quadColor[1], source[1], dest[1]); /* G */
+      VEC4_ADD_SAT(quadColor[2], source[2], dest[2]); /* B */
+      break;
+   case PIPE_BLEND_SUBTRACT:
+      VEC4_SUB_SAT(quadColor[0], source[0], dest[0]); /* R */
+      VEC4_SUB_SAT(quadColor[1], source[1], dest[1]); /* G */
+      VEC4_SUB_SAT(quadColor[2], source[2], dest[2]); /* B */
+      break;
+   case PIPE_BLEND_REVERSE_SUBTRACT:
+      VEC4_SUB_SAT(quadColor[0], dest[0], source[0]); /* R */
+      VEC4_SUB_SAT(quadColor[1], dest[1], source[1]); /* G */
+      VEC4_SUB_SAT(quadColor[2], dest[2], source[2]); /* B */
+      break;
+   case PIPE_BLEND_MIN:
+      VEC4_MIN(quadColor[0], source[0], dest[0]); /* R */
+      VEC4_MIN(quadColor[1], source[1], dest[1]); /* G */
+      VEC4_MIN(quadColor[2], source[2], dest[2]); /* B */
+      break;
+   case PIPE_BLEND_MAX:
+      VEC4_MAX(quadColor[0], source[0], dest[0]); /* R */
+      VEC4_MAX(quadColor[1], source[1], dest[1]); /* G */
+      VEC4_MAX(quadColor[2], source[2], dest[2]); /* B */
+      break;
+   default:
+      assert(0);
+   }
+
+   /*
+    * Combine A terms
+    */
+   switch (softpipe->blend->alpha_func) {
+   case PIPE_BLEND_ADD:
+      VEC4_ADD_SAT(quadColor[3], source[3], dest[3]); /* A */
+      break;
+   case PIPE_BLEND_SUBTRACT:
+      VEC4_SUB_SAT(quadColor[3], source[3], dest[3]); /* A */
+      break;
+   case PIPE_BLEND_REVERSE_SUBTRACT:
+      VEC4_SUB_SAT(quadColor[3], dest[3], source[3]); /* A */
+      break;
+   case PIPE_BLEND_MIN:
+      VEC4_MIN(quadColor[3], source[3], dest[3]); /* A */
+      break;
+   case PIPE_BLEND_MAX:
+      VEC4_MAX(quadColor[3], source[3], dest[3]); /* A */
+      break;
+   default:
+      assert(0);
+   }
+}
 
 static void
-blend_quad(struct quad_stage *qs, struct quad_header *quad)
+colormask_quad(struct quad_stage *qs,
+               float (*quadColor)[4],
+               float (*dest)[4])
 {
-   static const float zero[4] = { 0, 0, 0, 0 };
-   static const float one[4] = { 1, 1, 1, 1 };
+   struct softpipe_context *softpipe = qs->softpipe;
 
+   /* R */
+   if (!(softpipe->blend->colormask & PIPE_MASK_R))
+      COPY_4V(quadColor[0], dest[0]);
+
+   /* G */
+   if (!(softpipe->blend->colormask & PIPE_MASK_G))
+      COPY_4V(quadColor[1], dest[1]);
+
+   /* B */
+   if (!(softpipe->blend->colormask & PIPE_MASK_B))
+      COPY_4V(quadColor[2], dest[2]);
+
+   /* A */
+   if (!(softpipe->blend->colormask & PIPE_MASK_A))
+      COPY_4V(quadColor[3], dest[3]);
+}
+
+
+static void
+blend_fallback(struct quad_stage *qs, 
+               struct quad_header *quads[],
+               unsigned nr)
+{
    struct softpipe_context *softpipe = qs->softpipe;
-   uint cbuf;
+   const struct pipe_blend_state *blend = softpipe->blend;
+   unsigned cbuf;
+
+   for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) 
+   {
+      float dest[4][QUAD_SIZE];
+      struct softpipe_cached_tile *tile
+         = sp_get_cached_tile(softpipe->cbuf_cache[cbuf],
+                              quads[0]->input.x0, 
+                              quads[0]->input.y0);
+      uint q, i, j;
+
+      for (q = 0; q < nr; q++) {
+         struct quad_header *quad = quads[q];
+         float (*quadColor)[4] = quad->output.color[cbuf];
+         const int itx = (quad->input.x0 & (TILE_SIZE-1));
+         const int ity = (quad->input.y0 & (TILE_SIZE-1));
+
+         /* get/swizzle dest colors 
+          */
+         for (j = 0; j < QUAD_SIZE; j++) {
+            int x = itx + (j & 1);
+            int y = ity + (j >> 1);
+            for (i = 0; i < 4; i++) {
+               dest[i][j] = tile->data.color[y][x][i];
+            }
+         }
 
-   if (softpipe->blend->logicop_enable) {
-      logicop_quad(qs, quad);
-      return;
+
+         if (blend->logicop_enable) {
+            logicop_quad( qs, quadColor, dest );
+         }
+         else if (blend->blend_enable) {
+            blend_quad( qs, quadColor, dest );
+         }
+
+         if (blend->colormask != 0xf)
+            colormask_quad( qs, quadColor, dest );
+   
+         /* Output color values
+          */
+         for (j = 0; j < QUAD_SIZE; j++) {
+            if (quad->inout.mask & (1 << j)) {
+               int x = itx + (j & 1);
+               int y = ity + (j >> 1);
+               for (i = 0; i < 4; i++) { /* loop over color chans */
+                  tile->data.color[y][x][i] = quadColor[i][j];
+               }
+            }
+         }
+      }
    }
+}
 
-   /* loop over colorbuffer outputs */
-   for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) {
-      float source[4][QUAD_SIZE], dest[4][QUAD_SIZE];
-      struct softpipe_cached_tile *tile
-         = sp_get_cached_tile(softpipe,
-                              softpipe->cbuf_cache[cbuf],
-                              quad->input.x0, quad->input.y0);
-      float (*quadColor)[4] = quad->output.color[cbuf];
-      uint i, j;
 
+static void
+blend_single_add_src_alpha_inv_src_alpha(struct quad_stage *qs, 
+                                         struct quad_header *quads[],
+                                         unsigned nr)
+{
+   static const float one[4] = { 1, 1, 1, 1 };
+   float one_minus_alpha[QUAD_SIZE];
+   float dest[4][QUAD_SIZE];
+   float source[4][QUAD_SIZE];
+   uint i, j, q;
+
+   struct softpipe_cached_tile *tile
+      = sp_get_cached_tile(qs->softpipe->cbuf_cache[0],
+                           quads[0]->input.x0, 
+                           quads[0]->input.y0);
+
+   for (q = 0; q < nr; q++) {
+      struct quad_header *quad = quads[q];
+      float (*quadColor)[4] = quad->output.color[0];
+      const float *alpha = quadColor[3];
+      const int itx = (quad->input.x0 & (TILE_SIZE-1));
+      const int ity = (quad->input.y0 & (TILE_SIZE-1));
+      
       /* get/swizzle dest colors */
       for (j = 0; j < QUAD_SIZE; j++) {
-         int x = (quad->input.x0 & (TILE_SIZE-1)) + (j & 1);
-         int y = (quad->input.y0 & (TILE_SIZE-1)) + (j >> 1);
+         int x = itx + (j & 1);
+         int y = ity + (j >> 1);
          for (i = 0; i < 4; i++) {
             dest[i][j] = tile->data.color[y][x][i];
          }
       }
 
-      /*
-       * Compute src/first term RGB
-       */
-      switch (softpipe->blend->rgb_src_factor) {
-      case PIPE_BLENDFACTOR_ONE:
-         VEC4_COPY(source[0], quadColor[0]); /* R */
-         VEC4_COPY(source[1], quadColor[1]); /* G */
-         VEC4_COPY(source[2], quadColor[2]); /* B */
-         break;
-      case PIPE_BLENDFACTOR_SRC_COLOR:
-         VEC4_MUL(source[0], quadColor[0], quadColor[0]); /* R */
-         VEC4_MUL(source[1], quadColor[1], quadColor[1]); /* G */
-         VEC4_MUL(source[2], quadColor[2], quadColor[2]); /* B */
-         break;
-      case PIPE_BLENDFACTOR_SRC_ALPHA:
-         {
-            const float *alpha = quadColor[3];
-            VEC4_MUL(source[0], quadColor[0], alpha); /* R */
-            VEC4_MUL(source[1], quadColor[1], alpha); /* G */
-            VEC4_MUL(source[2], quadColor[2], alpha); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_DST_COLOR:
-         VEC4_MUL(source[0], quadColor[0], dest[0]); /* R */
-         VEC4_MUL(source[1], quadColor[1], dest[1]); /* G */
-         VEC4_MUL(source[2], quadColor[2], dest[2]); /* B */
-         break;
-      case PIPE_BLENDFACTOR_DST_ALPHA:
-         {
-            const float *alpha = dest[3];
-            VEC4_MUL(source[0], quadColor[0], alpha); /* R */
-            VEC4_MUL(source[1], quadColor[1], alpha); /* G */
-            VEC4_MUL(source[2], quadColor[2], alpha); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
-         {
-            const float *alpha = quadColor[3];
-            float diff[4], temp[4];
-            VEC4_SUB(diff, one, dest[3]);
-            VEC4_MIN(temp, alpha, diff);
-            VEC4_MUL(source[0], quadColor[0], temp); /* R */
-            VEC4_MUL(source[1], quadColor[1], temp); /* G */
-            VEC4_MUL(source[2], quadColor[2], temp); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_CONST_COLOR:
-         {
-            float comp[4];
-            VEC4_SCALAR(comp, softpipe->blend_color.color[0]); /* R */
-            VEC4_MUL(source[0], quadColor[0], comp); /* R */
-            VEC4_SCALAR(comp, softpipe->blend_color.color[1]); /* G */
-            VEC4_MUL(source[1], quadColor[1], comp); /* G */
-            VEC4_SCALAR(comp, softpipe->blend_color.color[2]); /* B */
-            VEC4_MUL(source[2], quadColor[2], comp); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_CONST_ALPHA:
-         {
-            float alpha[4];
-            VEC4_SCALAR(alpha, softpipe->blend_color.color[3]);
-            VEC4_MUL(source[0], quadColor[0], alpha); /* R */
-            VEC4_MUL(source[1], quadColor[1], alpha); /* G */
-            VEC4_MUL(source[2], quadColor[2], alpha); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_SRC1_COLOR:
-         assert(0); /* to do */
-         break;
-      case PIPE_BLENDFACTOR_SRC1_ALPHA:
-         assert(0); /* to do */
-         break;
-      case PIPE_BLENDFACTOR_ZERO:
-         VEC4_COPY(source[0], zero); /* R */
-         VEC4_COPY(source[1], zero); /* G */
-         VEC4_COPY(source[2], zero); /* B */
-         break;
-      case PIPE_BLENDFACTOR_INV_SRC_COLOR:
-         {
-            float inv_comp[4];
-            VEC4_SUB(inv_comp, one, quadColor[0]); /* R */
-            VEC4_MUL(source[0], quadColor[0], inv_comp); /* R */
-            VEC4_SUB(inv_comp, one, quadColor[1]); /* G */
-            VEC4_MUL(source[1], quadColor[1], inv_comp); /* G */
-            VEC4_SUB(inv_comp, one, quadColor[2]); /* B */
-            VEC4_MUL(source[2], quadColor[2], inv_comp); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-         {
-            float inv_alpha[4];
-            VEC4_SUB(inv_alpha, one, quadColor[3]);
-            VEC4_MUL(source[0], quadColor[0], inv_alpha); /* R */
-            VEC4_MUL(source[1], quadColor[1], inv_alpha); /* G */
-            VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_DST_ALPHA:
-         {
-            float inv_alpha[4];
-            VEC4_SUB(inv_alpha, one, dest[3]);
-            VEC4_MUL(source[0], quadColor[0], inv_alpha); /* R */
-            VEC4_MUL(source[1], quadColor[1], inv_alpha); /* G */
-            VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_DST_COLOR:
-         {
-            float inv_comp[4];
-            VEC4_SUB(inv_comp, one, dest[0]); /* R */
-            VEC4_MUL(source[0], quadColor[0], inv_comp); /* R */
-            VEC4_SUB(inv_comp, one, dest[1]); /* G */
-            VEC4_MUL(source[1], quadColor[1], inv_comp); /* G */
-            VEC4_SUB(inv_comp, one, dest[2]); /* B */
-            VEC4_MUL(source[2], quadColor[2], inv_comp); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-         {
-            float inv_comp[4];
-            /* R */
-            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[0]);
-            VEC4_MUL(source[0], quadColor[0], inv_comp);
-            /* G */
-            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[1]);
-            VEC4_MUL(source[1], quadColor[1], inv_comp);
-            /* B */
-            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[2]);
-            VEC4_MUL(source[2], quadColor[2], inv_comp);
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-         {
-            float inv_alpha[4];
-            VEC4_SCALAR(inv_alpha, 1.0f - softpipe->blend_color.color[3]);
-            VEC4_MUL(source[0], quadColor[0], inv_alpha); /* R */
-            VEC4_MUL(source[1], quadColor[1], inv_alpha); /* G */
-            VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */
+      VEC4_MUL(source[0], quadColor[0], alpha); /* R */
+      VEC4_MUL(source[1], quadColor[1], alpha); /* G */
+      VEC4_MUL(source[2], quadColor[2], alpha); /* B */
+      VEC4_MUL(source[3], quadColor[3], alpha); /* A */
+
+      VEC4_SUB(one_minus_alpha, one, alpha);
+      VEC4_MUL(dest[0], dest[0], one_minus_alpha); /* R */
+      VEC4_MUL(dest[1], dest[1], one_minus_alpha); /* G */
+      VEC4_MUL(dest[2], dest[2], one_minus_alpha); /* B */
+      VEC4_MUL(dest[3], dest[3], one_minus_alpha); /* B */
+
+      VEC4_ADD_SAT(quadColor[0], source[0], dest[0]); /* R */
+      VEC4_ADD_SAT(quadColor[1], source[1], dest[1]); /* G */
+      VEC4_ADD_SAT(quadColor[2], source[2], dest[2]); /* B */
+      VEC4_ADD_SAT(quadColor[3], source[3], dest[3]); /* A */
+
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (quad->inout.mask & (1 << j)) {
+            int x = itx + (j & 1);
+            int y = ity + (j >> 1);
+            for (i = 0; i < 4; i++) { /* loop over color chans */
+               tile->data.color[y][x][i] = quadColor[i][j];
+            }
          }
-         break;
-      case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
-         assert(0); /* to do */
-         break;
-      case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
-         assert(0); /* to do */
-         break;
-      default:
-         assert(0);
       }
+   }
+}
 
-      /*
-       * Compute src/first term A
-       */
-      switch (softpipe->blend->alpha_src_factor) {
-      case PIPE_BLENDFACTOR_ONE:
-         VEC4_COPY(source[3], quadColor[3]); /* A */
-         break;
-      case PIPE_BLENDFACTOR_SRC_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_SRC_ALPHA:
-         {
-            const float *alpha = quadColor[3];
-            VEC4_MUL(source[3], quadColor[3], alpha); /* A */
-         }
-         break;
-      case PIPE_BLENDFACTOR_DST_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_DST_ALPHA:
-         VEC4_MUL(source[3], quadColor[3], dest[3]); /* A */
-         break;
-      case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
-         /* multiply alpha by 1.0 */
-         VEC4_COPY(source[3], quadColor[3]); /* A */
-         break;
-      case PIPE_BLENDFACTOR_CONST_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_CONST_ALPHA:
-         {
-            float comp[4];
-            VEC4_SCALAR(comp, softpipe->blend_color.color[3]); /* A */
-            VEC4_MUL(source[3], quadColor[3], comp); /* A */
-         }
-         break;
-      case PIPE_BLENDFACTOR_ZERO:
-         VEC4_COPY(source[3], zero); /* A */
-         break;
-      case PIPE_BLENDFACTOR_INV_SRC_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-         {
-            float inv_alpha[4];
-            VEC4_SUB(inv_alpha, one, quadColor[3]);
-            VEC4_MUL(source[3], quadColor[3], inv_alpha); /* A */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_DST_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_INV_DST_ALPHA:
-         {
-            float inv_alpha[4];
-            VEC4_SUB(inv_alpha, one, dest[3]);
-            VEC4_MUL(source[3], quadColor[3], inv_alpha); /* A */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-         {
-            float inv_comp[4];
-            /* A */
-            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[3]);
-            VEC4_MUL(source[3], quadColor[3], inv_comp);
+static void
+blend_single_add_one_one(struct quad_stage *qs, 
+                         struct quad_header *quads[],
+                         unsigned nr)
+{
+   float dest[4][QUAD_SIZE];
+   uint i, j, q;
+
+   struct softpipe_cached_tile *tile
+      = sp_get_cached_tile(qs->softpipe->cbuf_cache[0],
+                           quads[0]->input.x0, 
+                           quads[0]->input.y0);
+
+   for (q = 0; q < nr; q++) {
+      struct quad_header *quad = quads[q];
+      float (*quadColor)[4] = quad->output.color[0];
+      const int itx = (quad->input.x0 & (TILE_SIZE-1));
+      const int ity = (quad->input.y0 & (TILE_SIZE-1));
+      
+      /* get/swizzle dest colors */
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = itx + (j & 1);
+         int y = ity + (j >> 1);
+         for (i = 0; i < 4; i++) {
+            dest[i][j] = tile->data.color[y][x][i];
          }
-         break;
-      default:
-         assert(0);
       }
+     
+      VEC4_ADD_SAT(quadColor[0], quadColor[0], dest[0]); /* R */
+      VEC4_ADD_SAT(quadColor[1], quadColor[1], dest[1]); /* G */
+      VEC4_ADD_SAT(quadColor[2], quadColor[2], dest[2]); /* B */
+      VEC4_ADD_SAT(quadColor[3], quadColor[3], dest[3]); /* A */
 
-
-      /*
-       * Compute dest/second term RGB
-       */
-      switch (softpipe->blend->rgb_dst_factor) {
-      case PIPE_BLENDFACTOR_ONE:
-         /* dest = dest * 1   NO-OP, leave dest as-is */
-         break;
-      case PIPE_BLENDFACTOR_SRC_COLOR:
-         VEC4_MUL(dest[0], dest[0], quadColor[0]); /* R */
-         VEC4_MUL(dest[1], dest[1], quadColor[1]); /* G */
-         VEC4_MUL(dest[2], dest[2], quadColor[2]); /* B */
-         break;
-      case PIPE_BLENDFACTOR_SRC_ALPHA:
-         VEC4_MUL(dest[0], dest[0], quadColor[3]); /* R * A */
-         VEC4_MUL(dest[1], dest[1], quadColor[3]); /* G * A */
-         VEC4_MUL(dest[2], dest[2], quadColor[3]); /* B * A */
-         break;
-      case PIPE_BLENDFACTOR_DST_ALPHA:
-         VEC4_MUL(dest[0], dest[0], dest[3]); /* R * A */
-         VEC4_MUL(dest[1], dest[1], dest[3]); /* G * A */
-         VEC4_MUL(dest[2], dest[2], dest[3]); /* B * A */
-         break;
-      case PIPE_BLENDFACTOR_DST_COLOR:
-         VEC4_MUL(dest[0], dest[0], dest[0]); /* R */
-         VEC4_MUL(dest[1], dest[1], dest[1]); /* G */
-         VEC4_MUL(dest[2], dest[2], dest[2]); /* B */
-         break;
-      case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
-         assert(0); /* illegal */
-         break;
-      case PIPE_BLENDFACTOR_CONST_COLOR:
-         {
-            float comp[4];
-            VEC4_SCALAR(comp, softpipe->blend_color.color[0]); /* R */
-            VEC4_MUL(dest[0], dest[0], comp); /* R */
-            VEC4_SCALAR(comp, softpipe->blend_color.color[1]); /* G */
-            VEC4_MUL(dest[1], dest[1], comp); /* G */
-            VEC4_SCALAR(comp, softpipe->blend_color.color[2]); /* B */
-            VEC4_MUL(dest[2], dest[2], comp); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_CONST_ALPHA:
-         {
-            float comp[4];
-            VEC4_SCALAR(comp, softpipe->blend_color.color[3]); /* A */
-            VEC4_MUL(dest[0], dest[0], comp); /* R */
-            VEC4_MUL(dest[1], dest[1], comp); /* G */
-            VEC4_MUL(dest[2], dest[2], comp); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_ZERO:
-         VEC4_COPY(dest[0], zero); /* R */
-         VEC4_COPY(dest[1], zero); /* G */
-         VEC4_COPY(dest[2], zero); /* B */
-         break;
-      case PIPE_BLENDFACTOR_SRC1_COLOR:
-      case PIPE_BLENDFACTOR_SRC1_ALPHA:
-         /* XXX what are these? */
-         assert(0);
-         break;
-      case PIPE_BLENDFACTOR_INV_SRC_COLOR:
-         {
-            float inv_comp[4];
-            VEC4_SUB(inv_comp, one, quadColor[0]); /* R */
-            VEC4_MUL(dest[0], inv_comp, dest[0]); /* R */
-            VEC4_SUB(inv_comp, one, quadColor[1]); /* G */
-            VEC4_MUL(dest[1], inv_comp, dest[1]); /* G */
-            VEC4_SUB(inv_comp, one, quadColor[2]); /* B */
-            VEC4_MUL(dest[2], inv_comp, dest[2]); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-         {
-            float one_minus_alpha[QUAD_SIZE];
-            VEC4_SUB(one_minus_alpha, one, quadColor[3]);
-            VEC4_MUL(dest[0], dest[0], one_minus_alpha); /* R */
-            VEC4_MUL(dest[1], dest[1], one_minus_alpha); /* G */
-            VEC4_MUL(dest[2], dest[2], one_minus_alpha); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_DST_ALPHA:
-         {
-            float inv_comp[4];
-            VEC4_SUB(inv_comp, one, dest[3]); /* A */
-            VEC4_MUL(dest[0], inv_comp, dest[0]); /* R */
-            VEC4_MUL(dest[1], inv_comp, dest[1]); /* G */
-            VEC4_MUL(dest[2], inv_comp, dest[2]); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_DST_COLOR:
-         {
-            float inv_comp[4];
-            VEC4_SUB(inv_comp, one, dest[0]); /* R */
-            VEC4_MUL(dest[0], dest[0], inv_comp); /* R */
-            VEC4_SUB(inv_comp, one, dest[1]); /* G */
-            VEC4_MUL(dest[1], dest[1], inv_comp); /* G */
-            VEC4_SUB(inv_comp, one, dest[2]); /* B */
-            VEC4_MUL(dest[2], dest[2], inv_comp); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-         {
-            float inv_comp[4];
-            /* R */
-            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[0]);
-            VEC4_MUL(dest[0], dest[0], inv_comp);
-            /* G */
-            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[1]);
-            VEC4_MUL(dest[1], dest[1], inv_comp);
-            /* B */
-            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[2]);
-            VEC4_MUL(dest[2], dest[2], inv_comp);
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-         {
-            float inv_comp[4];
-            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[3]);
-            VEC4_MUL(dest[0], dest[0], inv_comp);
-            VEC4_MUL(dest[1], dest[1], inv_comp);
-            VEC4_MUL(dest[2], dest[2], inv_comp);
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (quad->inout.mask & (1 << j)) {
+            int x = itx + (j & 1);
+            int y = ity + (j >> 1);
+            for (i = 0; i < 4; i++) { /* loop over color chans */
+               tile->data.color[y][x][i] = quadColor[i][j];
+            }
          }
-         break;
-      case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
-      case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
-         /* XXX what are these? */
-         assert(0);
-         break;
-      default:
-         assert(0);
       }
+   }
+}
 
-      /*
-       * Compute dest/second term A
-       */
-      switch (softpipe->blend->alpha_dst_factor) {
-      case PIPE_BLENDFACTOR_ONE:
-         /* dest = dest * 1   NO-OP, leave dest as-is */
-         break;
-      case PIPE_BLENDFACTOR_SRC_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_SRC_ALPHA:
-         VEC4_MUL(dest[3], dest[3], quadColor[3]); /* A * A */
-         break;
-      case PIPE_BLENDFACTOR_DST_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_DST_ALPHA:
-         VEC4_MUL(dest[3], dest[3], dest[3]); /* A */
-         break;
-      case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
-         assert(0); /* illegal */
-         break;
-      case PIPE_BLENDFACTOR_CONST_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_CONST_ALPHA:
-         {
-            float comp[4];
-            VEC4_SCALAR(comp, softpipe->blend_color.color[3]); /* A */
-            VEC4_MUL(dest[3], dest[3], comp); /* A */
-         }
-         break;
-      case PIPE_BLENDFACTOR_ZERO:
-         VEC4_COPY(dest[3], zero); /* A */
-         break;
-      case PIPE_BLENDFACTOR_INV_SRC_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-         {
-            float one_minus_alpha[QUAD_SIZE];
-            VEC4_SUB(one_minus_alpha, one, quadColor[3]);
-            VEC4_MUL(dest[3], dest[3], one_minus_alpha); /* A */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_DST_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_INV_DST_ALPHA:
-         {
-            float inv_comp[4];
-            VEC4_SUB(inv_comp, one, dest[3]); /* A */
-            VEC4_MUL(dest[3], inv_comp, dest[3]); /* A */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-         {
-            float inv_comp[4];
-            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[3]);
-            VEC4_MUL(dest[3], dest[3], inv_comp);
+
+static void
+single_output_color(struct quad_stage *qs, 
+                    struct quad_header *quads[],
+                    unsigned nr)
+{
+   uint i, j, q;
+
+   struct softpipe_cached_tile *tile
+      = sp_get_cached_tile(qs->softpipe->cbuf_cache[0],
+                           quads[0]->input.x0, 
+                           quads[0]->input.y0);
+
+   for (q = 0; q < nr; q++) {
+      struct quad_header *quad = quads[q];
+      float (*quadColor)[4] = quad->output.color[0];
+      const int itx = (quad->input.x0 & (TILE_SIZE-1));
+      const int ity = (quad->input.y0 & (TILE_SIZE-1));
+      
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (quad->inout.mask & (1 << j)) {
+            int x = itx + (j & 1);
+            int y = ity + (j >> 1);
+            for (i = 0; i < 4; i++) { /* loop over color chans */
+               tile->data.color[y][x][i] = quadColor[i][j];
+            }
          }
-         break;
-      default:
-         assert(0);
       }
+   }
+}
+
+static void
+blend_noop(struct quad_stage *qs, 
+           struct quad_header *quads[],
+           unsigned nr)
+{
+}
 
-      /*
-       * Combine RGB terms
-       */
-      switch (softpipe->blend->rgb_func) {
-      case PIPE_BLEND_ADD:
-         VEC4_ADD_SAT(quadColor[0], source[0], dest[0]); /* R */
-         VEC4_ADD_SAT(quadColor[1], source[1], dest[1]); /* G */
-         VEC4_ADD_SAT(quadColor[2], source[2], dest[2]); /* B */
-         break;
-      case PIPE_BLEND_SUBTRACT:
-         VEC4_SUB_SAT(quadColor[0], source[0], dest[0]); /* R */
-         VEC4_SUB_SAT(quadColor[1], source[1], dest[1]); /* G */
-         VEC4_SUB_SAT(quadColor[2], source[2], dest[2]); /* B */
-         break;
-      case PIPE_BLEND_REVERSE_SUBTRACT:
-         VEC4_SUB_SAT(quadColor[0], dest[0], source[0]); /* R */
-         VEC4_SUB_SAT(quadColor[1], dest[1], source[1]); /* G */
-         VEC4_SUB_SAT(quadColor[2], dest[2], source[2]); /* B */
-         break;
-      case PIPE_BLEND_MIN:
-         VEC4_MIN(quadColor[0], source[0], dest[0]); /* R */
-         VEC4_MIN(quadColor[1], source[1], dest[1]); /* G */
-         VEC4_MIN(quadColor[2], source[2], dest[2]); /* B */
-         break;
-      case PIPE_BLEND_MAX:
-         VEC4_MAX(quadColor[0], source[0], dest[0]); /* R */
-         VEC4_MAX(quadColor[1], source[1], dest[1]); /* G */
-         VEC4_MAX(quadColor[2], source[2], dest[2]); /* B */
-         break;
-      default:
-         assert(0);
-      }
 
-      /*
-       * Combine A terms
-       */
-      switch (softpipe->blend->alpha_func) {
-      case PIPE_BLEND_ADD:
-         VEC4_ADD_SAT(quadColor[3], source[3], dest[3]); /* A */
-         break;
-      case PIPE_BLEND_SUBTRACT:
-         VEC4_SUB_SAT(quadColor[3], source[3], dest[3]); /* A */
-         break;
-      case PIPE_BLEND_REVERSE_SUBTRACT:
-         VEC4_SUB_SAT(quadColor[3], dest[3], source[3]); /* A */
-         break;
-      case PIPE_BLEND_MIN:
-         VEC4_MIN(quadColor[3], source[3], dest[3]); /* A */
-         break;
-      case PIPE_BLEND_MAX:
-         VEC4_MAX(quadColor[3], source[3], dest[3]); /* A */
-         break;
-      default:
-         assert(0);
+static void
+choose_blend_quad(struct quad_stage *qs, 
+                  struct quad_header *quads[],
+                  unsigned nr)
+{
+   struct softpipe_context *softpipe = qs->softpipe;
+   const struct pipe_blend_state *blend = softpipe->blend;
+
+   qs->run = blend_fallback;
+   
+   if (softpipe->framebuffer.nr_cbufs == 0) {
+      qs->run = blend_noop;
+   }
+   else if (!softpipe->blend->logicop_enable &&
+            softpipe->blend->colormask == 0xf) 
+   {
+      if (!blend->blend_enable) {
+         qs->run = single_output_color;
       }
+      else if (blend->rgb_src_factor == blend->alpha_src_factor &&
+               blend->rgb_dst_factor == blend->alpha_dst_factor &&
+               blend->rgb_func == blend->alpha_func &&
+               softpipe->framebuffer.nr_cbufs == 1)
+      {
+         if (blend->alpha_func == PIPE_BLEND_ADD) {
+            if (blend->rgb_src_factor == PIPE_BLENDFACTOR_ONE &&
+                blend->rgb_dst_factor == PIPE_BLENDFACTOR_ONE) {
+               qs->run = blend_single_add_one_one;
+            }
+            else if (blend->rgb_src_factor == PIPE_BLENDFACTOR_SRC_ALPHA &&
+                blend->rgb_dst_factor == PIPE_BLENDFACTOR_INV_SRC_ALPHA)
+               qs->run = blend_single_add_src_alpha_inv_src_alpha;
 
-   } /* cbuf loop */
+         }
+      }
+   }
 
-   /* pass blended quad to next stage */
-   qs->next->run(qs->next, quad);
+   qs->run(qs, quads, nr);
 }
 
 
 static void blend_begin(struct quad_stage *qs)
 {
-   qs->next->begin(qs->next);
+   qs->run = choose_blend_quad;
 }
 
 
@@ -770,7 +991,7 @@ struct quad_stage *sp_quad_blend_stage( struct softpipe_context *softpipe )
 
    stage->softpipe = softpipe;
    stage->begin = blend_begin;
-   stage->run = blend_quad;
+   stage->run = choose_blend_quad;
    stage->destroy = blend_destroy;
 
    return stage;
diff --git a/src/gallium/drivers/softpipe/sp_quad_bufloop.c b/src/gallium/drivers/softpipe/sp_quad_bufloop.c
deleted file mode 100644
index 953d8516b9..0000000000
--- a/src/gallium/drivers/softpipe/sp_quad_bufloop.c
+++ /dev/null
@@ -1,74 +0,0 @@
-
-#include "util/u_memory.h"
-#include "sp_context.h"
-#include "sp_quad.h"
-#include "sp_surface.h"
-#include "sp_quad_pipe.h"
-
-
-/**
- * Loop over colorbuffers, passing quad to next stage each time.
- */
-static void
-cbuf_loop_quad(struct quad_stage *qs, struct quad_header *quad)
-{
-   struct softpipe_context *softpipe = qs->softpipe;
-   float tmp[PIPE_MAX_COLOR_BUFS][4][QUAD_SIZE];
-   unsigned i;
-
-   assert(sizeof(quad->outputs.color) == sizeof(tmp));
-   assert(softpipe->framebuffer.nr_cbufs <= PIPE_MAX_COLOR_BUFS);
-
-   /* make copy of original colors since they can get modified
-    * by blending and masking.
-    * XXX we won't have to do this if the fragment program actually emits
-    * N separate colors and we're drawing to N color buffers (MRT).
-    * But if we emitted one color and glDrawBuffer(GL_FRONT_AND_BACK) is
-    * in effect, we need to save/restore colors like this.
-    */
-   memcpy(tmp, quad->outputs.color, sizeof(tmp));
-
-   for (i = 0; i < softpipe->framebuffer.nr_cbufs; i++) {
-      /* set current cbuffer */
-#if 0 /* obsolete & going away */
-      softpipe->current_cbuf = i;
-#endif
-
-      /* pass blended quad to next stage */
-      qs->next->run(qs->next, quad);
-
-      /* restore quad's colors for next buffer */
-      memcpy(quad->outputs.color, tmp, sizeof(tmp));
-   }
-}
-
-
-static void cbuf_loop_begin(struct quad_stage *qs)
-{
-   qs->next->begin(qs->next);
-}
-
-
-static void cbuf_loop_destroy(struct quad_stage *qs)
-{
-   FREE( qs );
-}
-
-
-/**
- * Create the colorbuffer loop stage.
- * This is used to implement multiple render targets and GL_FRONT_AND_BACK
- * rendering.
- */
-struct quad_stage *sp_quad_bufloop_stage( struct softpipe_context *softpipe )
-{
-   struct quad_stage *stage = CALLOC_STRUCT(quad_stage);
-
-   stage->softpipe = softpipe;
-   stage->begin = cbuf_loop_begin;
-   stage->run = cbuf_loop_quad;
-   stage->destroy = cbuf_loop_destroy;
-
-   return stage;
-}
-
diff --git a/src/gallium/drivers/softpipe/sp_quad_colormask.c b/src/gallium/drivers/softpipe/sp_quad_colormask.c
deleted file mode 100644
index dc90e5d5e9..0000000000
--- a/src/gallium/drivers/softpipe/sp_quad_colormask.c
+++ /dev/null
@@ -1,116 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-/**
- * \brief  quad colormask stage
- * \author Brian Paul
- */
-
-#include "pipe/p_defines.h"
-#include "util/u_math.h"
-#include "util/u_memory.h"
-#include "sp_context.h"
-#include "sp_quad.h"
-#include "sp_surface.h"
-#include "sp_quad_pipe.h"
-#include "sp_tile_cache.h"
-
-
-
-/**
- * XXX colormask could be rolled into blending...
- */
-static void
-colormask_quad(struct quad_stage *qs, struct quad_header *quad)
-{
-   struct softpipe_context *softpipe = qs->softpipe;
-   uint cbuf;
-
-   /* loop over colorbuffer outputs */
-   for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) {
-      float dest[4][QUAD_SIZE];
-      struct softpipe_cached_tile *tile
-         = sp_get_cached_tile(softpipe,
-                              softpipe->cbuf_cache[cbuf],
-                              quad->input.x0, quad->input.y0);
-      float (*quadColor)[4] = quad->output.color[cbuf];
-      uint i, j;
-
-      /* get/swizzle dest colors */
-      for (j = 0; j < QUAD_SIZE; j++) {
-         int x = (quad->input.x0 & (TILE_SIZE-1)) + (j & 1);
-         int y = (quad->input.y0 & (TILE_SIZE-1)) + (j >> 1);
-         for (i = 0; i < 4; i++) {
-            dest[i][j] = tile->data.color[y][x][i];
-         }
-      }
-
-      /* R */
-      if (!(softpipe->blend->colormask & PIPE_MASK_R))
-          COPY_4V(quadColor[0], dest[0]);
-
-      /* G */
-      if (!(softpipe->blend->colormask & PIPE_MASK_G))
-          COPY_4V(quadColor[1], dest[1]);
-
-      /* B */
-      if (!(softpipe->blend->colormask & PIPE_MASK_B))
-          COPY_4V(quadColor[2], dest[2]);
-
-      /* A */
-      if (!(softpipe->blend->colormask & PIPE_MASK_A))
-          COPY_4V(quadColor[3], dest[3]);
-   }
-
-   /* pass quad to next stage */
-   qs->next->run(qs->next, quad);
-}
-
-
-static void colormask_begin(struct quad_stage *qs)
-{
-   qs->next->begin(qs->next);
-}
-
-
-static void colormask_destroy(struct quad_stage *qs)
-{
-   FREE( qs );
-}
-
-
-struct quad_stage *sp_quad_colormask_stage( struct softpipe_context *softpipe )
-{
-   struct quad_stage *stage = CALLOC_STRUCT(quad_stage);
-
-   stage->softpipe = softpipe;
-   stage->begin = colormask_begin;
-   stage->run = colormask_quad;
-   stage->destroy = colormask_destroy;
-
-   return stage;
-}
diff --git a/src/gallium/drivers/softpipe/sp_quad_coverage.c b/src/gallium/drivers/softpipe/sp_quad_coverage.c
deleted file mode 100644
index 4aeee85870..0000000000
--- a/src/gallium/drivers/softpipe/sp_quad_coverage.c
+++ /dev/null
@@ -1,94 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-
-/**
- * \brief  Apply AA coverage to quad alpha valus
- * \author  Brian Paul
- */
-
-
-#include "pipe/p_defines.h"
-#include "util/u_memory.h"
-#include "sp_context.h"
-#include "sp_quad.h"
-#include "sp_quad_pipe.h"
-
-
-/**
- * Multiply quad's alpha values by the fragment coverage.
- */
-static void
-coverage_quad(struct quad_stage *qs, struct quad_header *quad)
-{
-   struct softpipe_context *softpipe = qs->softpipe;
-   const uint prim = quad->input.prim;
-
-   if ((softpipe->rasterizer->poly_smooth && prim == QUAD_PRIM_TRI) ||
-       (softpipe->rasterizer->line_smooth && prim == QUAD_PRIM_LINE) ||
-       (softpipe->rasterizer->point_smooth && prim == QUAD_PRIM_POINT)) {
-      uint cbuf;
-
-      /* loop over colorbuffer outputs */
-      for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) {
-         float (*quadColor)[4] = quad->output.color[cbuf];
-         unsigned j;
-         for (j = 0; j < QUAD_SIZE; j++) {
-            assert(quad->input.coverage[j] >= 0.0);
-            assert(quad->input.coverage[j] <= 1.0);
-         quadColor[3][j] *= quad->input.coverage[j];
-         }
-      }
-   }
-
-   qs->next->run(qs->next, quad);
-}
-
-
-static void coverage_begin(struct quad_stage *qs)
-{
-   qs->next->begin(qs->next);
-}
-
-
-static void coverage_destroy(struct quad_stage *qs)
-{
-   FREE( qs );
-}
-
-
-struct quad_stage *sp_quad_coverage_stage( struct softpipe_context *softpipe )
-{
-   struct quad_stage *stage = CALLOC_STRUCT(quad_stage);
-
-   stage->softpipe = softpipe;
-   stage->begin = coverage_begin;
-   stage->run = coverage_quad;
-   stage->destroy = coverage_destroy;
-
-   return stage;
-}
diff --git a/src/gallium/drivers/softpipe/sp_quad_depth_test.c b/src/gallium/drivers/softpipe/sp_quad_depth_test.c
index d463930bae..0ca86c4e1c 100644
--- a/src/gallium/drivers/softpipe/sp_quad_depth_test.c
+++ b/src/gallium/drivers/softpipe/sp_quad_depth_test.c
@@ -31,61 +31,109 @@
 
 #include "pipe/p_defines.h"
 #include "util/u_memory.h"
+#include "tgsi/tgsi_scan.h"
 #include "sp_context.h"
 #include "sp_quad.h"
 #include "sp_surface.h"
 #include "sp_quad_pipe.h"
 #include "sp_tile_cache.h"
+#include "sp_state.h"           /* for sp_fragment_shader */
 
 
-/**
- * Do depth testing for a quad.
- * Not static since it's used by the stencil code.
- */
+struct depth_data {
+   struct pipe_surface *ps;
+   enum pipe_format format;
+   unsigned bzzzz[QUAD_SIZE];  /**< Z values fetched from depth buffer */
+   unsigned qzzzz[QUAD_SIZE];  /**< Z values from the quad */
+   ubyte stencilVals[QUAD_SIZE];
+   struct softpipe_cached_tile *tile;
+};
 
-/*
- * To increase efficiency, we should probably have multiple versions
- * of this function that are specifically for Z16, Z32 and FP Z buffers.
- * Try to effectively do that with codegen...
- */
 
-void
-sp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad)
+
+static void
+get_depth_stencil_values( struct depth_data *data,
+                          const struct quad_header *quad )
 {
-   struct softpipe_context *softpipe = qs->softpipe;
-   struct pipe_surface *ps = softpipe->framebuffer.zsbuf;
-   const enum pipe_format format = ps->format;
-   unsigned bzzzz[QUAD_SIZE];  /**< Z values fetched from depth buffer */
-   unsigned qzzzz[QUAD_SIZE];  /**< Z values from the quad */
-   unsigned zmask = 0;
    unsigned j;
-   struct softpipe_cached_tile *tile
-      = sp_get_cached_tile(softpipe, softpipe->zsbuf_cache, quad->input.x0, quad->input.y0);
+   const struct softpipe_cached_tile *tile = data->tile;
+
+   switch (data->format) {
+   case PIPE_FORMAT_Z16_UNORM:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         data->bzzzz[j] = tile->data.depth16[y][x];
+      }
+      break;
+   case PIPE_FORMAT_Z32_UNORM:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         data->bzzzz[j] = tile->data.depth32[y][x];
+      }
+      break;
+   case PIPE_FORMAT_X8Z24_UNORM:
+   case PIPE_FORMAT_S8Z24_UNORM:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         data->bzzzz[j] = tile->data.depth32[y][x] & 0xffffff;
+         data->stencilVals[j] = tile->data.depth32[y][x] >> 24;
+      }
+   break;
+   case PIPE_FORMAT_Z24X8_UNORM:
+   case PIPE_FORMAT_Z24S8_UNORM:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         data->bzzzz[j] = tile->data.depth32[y][x] >> 8;
+         data->stencilVals[j] = tile->data.depth32[y][x] & 0xff;
+      }
+      break;
+   default:
+      assert(0);
+   }
+}
 
-   assert(ps); /* shouldn't get here if there's no zbuffer */
+/* If the shader has not been run, interpolate the depth values
+ * ourselves.
+ */
+static void
+interpolate_quad_depth( struct quad_header *quad )
+{
+   const float fx = (float) quad->input.x0;
+   const float fy = (float) quad->input.y0;
+   const float dzdx = quad->posCoef->dadx[2];
+   const float dzdy = quad->posCoef->dady[2];
+   const float z0 = quad->posCoef->a0[2] + dzdx * fx + dzdy * fy;
 
-   /*
-    * Convert quad's float depth values to int depth values (qzzzz).
+   quad->output.depth[0] = z0;
+   quad->output.depth[1] = z0 + dzdx;
+   quad->output.depth[2] = z0 + dzdy;
+   quad->output.depth[3] = z0 + dzdx + dzdy;
+}
+
+
+static void
+convert_quad_depth( struct depth_data *data, 
+                    const struct quad_header *quad )
+{
+   unsigned j;
+
+   /* Convert quad's float depth values to int depth values (qzzzz).
     * If the Z buffer stores integer values, we _have_ to do the depth
     * compares with integers (not floats).  Otherwise, the float->int->float
     * conversion of Z values (which isn't an identity function) will cause
     * Z-fighting errors.
-    *
-    * Also, get the zbuffer values (bzzzz) from the cached tile.
     */
-   switch (format) {
+   switch (data->format) {
    case PIPE_FORMAT_Z16_UNORM:
       {
          float scale = 65535.0;
 
          for (j = 0; j < QUAD_SIZE; j++) {
-            qzzzz[j] = (unsigned) (quad->output.depth[j] * scale);
-         }
-
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->input.x0 % TILE_SIZE + (j & 1);
-            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-            bzzzz[j] = tile->data.depth16[y][x];
+            data->qzzzz[j] = (unsigned) (quad->output.depth[j] * scale);
          }
       }
       break;
@@ -94,47 +142,247 @@ sp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad)
          double scale = (double) (uint) ~0UL;
 
          for (j = 0; j < QUAD_SIZE; j++) {
-            qzzzz[j] = (unsigned) (quad->output.depth[j] * scale);
-         }
-
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->input.x0 % TILE_SIZE + (j & 1);
-            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-            bzzzz[j] = tile->data.depth32[y][x];
+            data->qzzzz[j] = (unsigned) (quad->output.depth[j] * scale);
          }
       }
       break;
    case PIPE_FORMAT_X8Z24_UNORM:
-      /* fall-through */
    case PIPE_FORMAT_S8Z24_UNORM:
       {
          float scale = (float) ((1 << 24) - 1);
 
          for (j = 0; j < QUAD_SIZE; j++) {
-            qzzzz[j] = (unsigned) (quad->output.depth[j] * scale);
-         }
-
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->input.x0 % TILE_SIZE + (j & 1);
-            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-            bzzzz[j] = tile->data.depth32[y][x] & 0xffffff;
+            data->qzzzz[j] = (unsigned) (quad->output.depth[j] * scale);
          }
       }
       break;
    case PIPE_FORMAT_Z24X8_UNORM:
-      /* fall-through */
    case PIPE_FORMAT_Z24S8_UNORM:
       {
          float scale = (float) ((1 << 24) - 1);
 
          for (j = 0; j < QUAD_SIZE; j++) {
-            qzzzz[j] = (unsigned) (quad->output.depth[j] * scale);
+            data->qzzzz[j] = (unsigned) (quad->output.depth[j] * scale);
          }
+      }
+      break;
+   default:
+      assert(0);
+   }
+}
 
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->input.x0 % TILE_SIZE + (j & 1);
-            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-            bzzzz[j] = tile->data.depth32[y][x] >> 8;
+
+
+static void
+write_depth_stencil_values( struct depth_data *data,
+                            struct quad_header *quad )
+{
+   struct softpipe_cached_tile *tile = data->tile;
+   unsigned j;
+
+   /* put updated Z values back into cached tile */
+   switch (data->format) {
+   case PIPE_FORMAT_Z16_UNORM:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         tile->data.depth16[y][x] = (ushort) data->bzzzz[j];
+      }
+      break;
+   case PIPE_FORMAT_X8Z24_UNORM:
+   case PIPE_FORMAT_Z32_UNORM:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         tile->data.depth32[y][x] = data->bzzzz[j];
+      }
+      break;
+   case PIPE_FORMAT_S8Z24_UNORM:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         tile->data.depth32[y][x] = (data->stencilVals[j] << 24) | data->bzzzz[j];
+      }
+      break;
+   case PIPE_FORMAT_Z24S8_UNORM:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         tile->data.depth32[y][x] = (data->bzzzz[j] << 8) | data->stencilVals[j];
+      }
+      break;
+   case PIPE_FORMAT_Z24X8_UNORM:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         tile->data.depth32[y][x] = data->bzzzz[j] << 8;
+      }
+      break;
+   default:
+      assert(0);
+   }
+}
+
+
+
+
+/** Only 8-bit stencil supported */
+#define STENCIL_MAX 0xff
+
+
+/**
+ * Do the basic stencil test (compare stencil buffer values against the
+ * reference value.
+ *
+ * \param data->stencilVals  the stencil values from the stencil buffer
+ * \param func  the stencil func (PIPE_FUNC_x)
+ * \param ref  the stencil reference value
+ * \param valMask  the stencil value mask indicating which bits of the stencil
+ *                 values and ref value are to be used.
+ * \return mask indicating which pixels passed the stencil test
+ */
+static unsigned
+do_stencil_test(struct depth_data *data,
+                unsigned func,
+                unsigned ref, unsigned valMask)
+{
+   unsigned passMask = 0x0;
+   unsigned j;
+
+   ref &= valMask;
+
+   switch (func) {
+   case PIPE_FUNC_NEVER:
+      /* passMask = 0x0 */
+      break;
+   case PIPE_FUNC_LESS:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (ref < (data->stencilVals[j] & valMask)) {
+            passMask |= (1 << j);
+         }
+      }
+      break;
+   case PIPE_FUNC_EQUAL:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (ref == (data->stencilVals[j] & valMask)) {
+            passMask |= (1 << j);
+         }
+      }
+      break;
+   case PIPE_FUNC_LEQUAL:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (ref <= (data->stencilVals[j] & valMask)) {
+            passMask |= (1 << j);
+         }
+      }
+      break;
+   case PIPE_FUNC_GREATER:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (ref > (data->stencilVals[j] & valMask)) {
+            passMask |= (1 << j);
+         }
+      }
+      break;
+   case PIPE_FUNC_NOTEQUAL:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (ref != (data->stencilVals[j] & valMask)) {
+            passMask |= (1 << j);
+         }
+      }
+      break;
+   case PIPE_FUNC_GEQUAL:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (ref >= (data->stencilVals[j] & valMask)) {
+            passMask |= (1 << j);
+         }
+      }
+      break;
+   case PIPE_FUNC_ALWAYS:
+      passMask = MASK_ALL;
+      break;
+   default:
+      assert(0);
+   }
+
+   return passMask;
+}
+
+
+/**
+ * Apply the stencil operator to stencil values.
+ *
+ * \param data->stencilVals  the stencil buffer values (read and written)
+ * \param mask  indicates which pixels to update
+ * \param op  the stencil operator (PIPE_STENCIL_OP_x)
+ * \param ref  the stencil reference value
+ * \param wrtMask  writemask controlling which bits are changed in the
+ *                 stencil values
+ */
+static void
+apply_stencil_op(struct depth_data *data,
+                 unsigned mask, unsigned op, ubyte ref, ubyte wrtMask)
+{
+   unsigned j;
+   ubyte newstencil[QUAD_SIZE];
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      newstencil[j] = data->stencilVals[j];
+   }
+
+   switch (op) {
+   case PIPE_STENCIL_OP_KEEP:
+      /* no-op */
+      break;
+   case PIPE_STENCIL_OP_ZERO:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (mask & (1 << j)) {
+            newstencil[j] = 0;
+         }
+      }
+      break;
+   case PIPE_STENCIL_OP_REPLACE:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (mask & (1 << j)) {
+            newstencil[j] = ref;
+         }
+      }
+      break;
+   case PIPE_STENCIL_OP_INCR:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (mask & (1 << j)) {
+            if (data->stencilVals[j] < STENCIL_MAX) {
+               newstencil[j] = data->stencilVals[j] + 1;
+            }
+         }
+      }
+      break;
+   case PIPE_STENCIL_OP_DECR:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (mask & (1 << j)) {
+            if (data->stencilVals[j] > 0) {
+               newstencil[j] = data->stencilVals[j] - 1;
+            }
+         }
+      }
+      break;
+   case PIPE_STENCIL_OP_INCR_WRAP:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (mask & (1 << j)) {
+            newstencil[j] = data->stencilVals[j] + 1;
+         }
+      }
+      break;
+   case PIPE_STENCIL_OP_DECR_WRAP:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (mask & (1 << j)) {
+            newstencil[j] = data->stencilVals[j] - 1;
+         }
+      }
+      break;
+   case PIPE_STENCIL_OP_INVERT:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (mask & (1 << j)) {
+            newstencil[j] = ~data->stencilVals[j];
          }
       }
       break;
@@ -142,6 +390,39 @@ sp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad)
       assert(0);
    }
 
+   /*
+    * update the stencil values
+    */
+   if (wrtMask != STENCIL_MAX) {
+      /* apply bit-wise stencil buffer writemask */
+      for (j = 0; j < QUAD_SIZE; j++) {
+         data->stencilVals[j] = (wrtMask & newstencil[j]) | (~wrtMask & data->stencilVals[j]);
+      }
+   }
+   else {
+      for (j = 0; j < QUAD_SIZE; j++) {
+         data->stencilVals[j] = newstencil[j];
+      }
+   }
+}
+
+   
+
+/*
+ * To increase efficiency, we should probably have multiple versions
+ * of this function that are specifically for Z16, Z32 and FP Z buffers.
+ * Try to effectively do that with codegen...
+ */
+
+static boolean
+depth_test_quad(struct quad_stage *qs, 
+                struct depth_data *data,
+                struct quad_header *quad)
+{
+   struct softpipe_context *softpipe = qs->softpipe;
+   unsigned zmask = 0;
+   unsigned j;
+
    switch (softpipe->depth_stencil->depth.func) {
    case PIPE_FUNC_NEVER:
       /* zmask = 0 */
@@ -151,37 +432,37 @@ sp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad)
        * Like this:  quad->mask &= (quad->outputs.depth < zzzz);
        */
       for (j = 0; j < QUAD_SIZE; j++) {
-	 if (qzzzz[j] < bzzzz[j]) 
+	 if (data->qzzzz[j] < data->bzzzz[j]) 
 	    zmask |= 1 << j;
       }
       break;
    case PIPE_FUNC_EQUAL:
       for (j = 0; j < QUAD_SIZE; j++) {
-	 if (qzzzz[j] == bzzzz[j]) 
+	 if (data->qzzzz[j] == data->bzzzz[j]) 
 	    zmask |= 1 << j;
       }
       break;
    case PIPE_FUNC_LEQUAL:
       for (j = 0; j < QUAD_SIZE; j++) {
-	 if (qzzzz[j] <= bzzzz[j]) 
+	 if (data->qzzzz[j] <= data->bzzzz[j]) 
 	    zmask |= (1 << j);
       }
       break;
    case PIPE_FUNC_GREATER:
       for (j = 0; j < QUAD_SIZE; j++) {
-	 if (qzzzz[j] > bzzzz[j]) 
+	 if (data->qzzzz[j] > data->bzzzz[j]) 
 	    zmask |= (1 << j);
       }
       break;
    case PIPE_FUNC_NOTEQUAL:
       for (j = 0; j < QUAD_SIZE; j++) {
-	 if (qzzzz[j] != bzzzz[j]) 
+	 if (data->qzzzz[j] != data->bzzzz[j]) 
 	    zmask |= (1 << j);
       }
       break;
    case PIPE_FUNC_GEQUAL:
       for (j = 0; j < QUAD_SIZE; j++) {
-	 if (qzzzz[j] >= bzzzz[j]) 
+	 if (data->qzzzz[j] >= data->bzzzz[j]) 
 	    zmask |= (1 << j);
       }
       break;
@@ -193,80 +474,480 @@ sp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad)
    }
 
    quad->inout.mask &= zmask;
+   if (quad->inout.mask == 0)
+      return FALSE;
 
+   /* Update our internal copy only if writemask set.  Even if
+    * depth.writemask is FALSE, may still need to write out buffer
+    * data due to stencil changes.
+    */
    if (softpipe->depth_stencil->depth.writemask) {
-      
-      /* This is also efficient with sse / spe instructions: 
-       */
       for (j = 0; j < QUAD_SIZE; j++) {
-	 if (quad->inout.mask & (1 << j)) {
-	    bzzzz[j] = qzzzz[j];
-	 }
+         if (quad->inout.mask & (1 << j)) {
+            data->bzzzz[j] = data->qzzzz[j];
+         }
       }
+   }
 
-      /* put updated Z values back into cached tile */
-      switch (format) {
-      case PIPE_FORMAT_Z16_UNORM:
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->input.x0 % TILE_SIZE + (j & 1);
-            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-            tile->data.depth16[y][x] = (ushort) bzzzz[j];
+   return TRUE;
+}
+
+
+
+/**
+ * Do stencil (and depth) testing.  Stenciling depends on the outcome of
+ * depth testing.
+ */
+static void
+depth_stencil_test_quad(struct quad_stage *qs, 
+                        struct depth_data *data,
+                        struct quad_header *quad)
+{
+   struct softpipe_context *softpipe = qs->softpipe;
+   unsigned func, zFailOp, zPassOp, failOp;
+   ubyte ref, wrtMask, valMask;
+   uint face = quad->input.facing;
+
+   if (!softpipe->depth_stencil->stencil[1].enabled) {
+      /* single-sided stencil test, use front (face=0) state */
+      face = 0;
+   }
+
+   /* choose front or back face function, operator, etc */
+   /* XXX we could do these initializations once per primitive */
+   func    = softpipe->depth_stencil->stencil[face].func;
+   failOp  = softpipe->depth_stencil->stencil[face].fail_op;
+   zFailOp = softpipe->depth_stencil->stencil[face].zfail_op;
+   zPassOp = softpipe->depth_stencil->stencil[face].zpass_op;
+   ref     = softpipe->depth_stencil->stencil[face].ref_value;
+   wrtMask = softpipe->depth_stencil->stencil[face].writemask;
+   valMask = softpipe->depth_stencil->stencil[face].valuemask;
+
+
+   /* do the stencil test first */
+   {
+      unsigned passMask, failMask;
+      passMask = do_stencil_test(data, func, ref, valMask);
+      failMask = quad->inout.mask & ~passMask;
+      quad->inout.mask &= passMask;
+
+      if (failOp != PIPE_STENCIL_OP_KEEP) {
+         apply_stencil_op(data, failMask, failOp, ref, wrtMask);
+      }
+   }
+
+   if (quad->inout.mask) {
+      /* now the pixels that passed the stencil test are depth tested */
+      if (softpipe->depth_stencil->depth.enabled) {
+         const unsigned origMask = quad->inout.mask;
+
+         depth_test_quad(qs, data, quad);  /* quad->mask is updated */
+
+         /* update stencil buffer values according to z pass/fail result */
+         if (zFailOp != PIPE_STENCIL_OP_KEEP) {
+            const unsigned zFailMask = origMask & ~quad->inout.mask;
+            apply_stencil_op(data, zFailMask, zFailOp, ref, wrtMask);
          }
-         break;
-      case PIPE_FORMAT_X8Z24_UNORM:
-         /* fall-through */
-         /* (yes, this falls through to a different case than above) */
-      case PIPE_FORMAT_Z32_UNORM:
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->input.x0 % TILE_SIZE + (j & 1);
-            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-            tile->data.depth32[y][x] = bzzzz[j];
+
+         if (zPassOp != PIPE_STENCIL_OP_KEEP) {
+            const unsigned zPassMask = origMask & quad->inout.mask;
+            apply_stencil_op(data, zPassMask, zPassOp, ref, wrtMask);
          }
-         break;
-      case PIPE_FORMAT_S8Z24_UNORM:
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->input.x0 % TILE_SIZE + (j & 1);
-            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-            uint s8z24 = tile->data.depth32[y][x];
-            s8z24 = (s8z24 & 0xff000000) | bzzzz[j];
-            tile->data.depth32[y][x] = s8z24;
+      }
+      else {
+         /* no depth test, apply Zpass operator to stencil buffer values */
+         apply_stencil_op(data, quad->inout.mask, zPassOp, ref, wrtMask);
+      }
+   }
+}
+
+
+#define ALPHATEST( FUNC, COMP )                                         \
+   static int                                                          \
+   alpha_test_quads_##FUNC( struct quad_stage *qs,                      \
+                           struct quad_header *quads[],                 \
+                           unsigned nr )                                \
+   {                                                                    \
+      const float ref = qs->softpipe->depth_stencil->alpha.ref_value;   \
+      const uint cbuf = 0; /* only output[0].alpha is tested */         \
+      unsigned pass_nr = 0;                                             \
+      unsigned i;                                                       \
+                                                                        \
+      for (i = 0; i < nr; i++) {                                        \
+         const float *aaaa = quads[i]->output.color[cbuf][3];           \
+         unsigned passMask = 0;                                         \
+                                                                        \
+         if (aaaa[0] COMP ref) passMask |= (1 << 0);                    \
+         if (aaaa[1] COMP ref) passMask |= (1 << 1);                    \
+         if (aaaa[2] COMP ref) passMask |= (1 << 2);                    \
+         if (aaaa[3] COMP ref) passMask |= (1 << 3);                    \
+                                                                        \
+         quads[i]->inout.mask &= passMask;                              \
+                                                                        \
+         if (quads[i]->inout.mask)                                      \
+            quads[pass_nr++] = quads[i];                                \
+      }                                                                 \
+                                                                        \
+      return pass_nr;                                                   \
+   }
+
+
+ALPHATEST( LESS,     < )
+ALPHATEST( EQUAL,    == )
+ALPHATEST( LEQUAL,   <= )
+ALPHATEST( GREATER,  > )
+ALPHATEST( NOTEQUAL, != )
+ALPHATEST( GEQUAL,   >= )
+
+
+/* XXX: Incorporate into shader using KILP.
+ */
+static int
+alpha_test_quads(struct quad_stage *qs, 
+                 struct quad_header *quads[], 
+                 unsigned nr)
+{
+   switch (qs->softpipe->depth_stencil->alpha.func) {
+   case PIPE_FUNC_LESS:
+      return alpha_test_quads_LESS( qs, quads, nr );
+   case PIPE_FUNC_EQUAL:
+      return alpha_test_quads_EQUAL( qs, quads, nr );
+      break;
+   case PIPE_FUNC_LEQUAL:
+      return alpha_test_quads_LEQUAL( qs, quads, nr );
+   case PIPE_FUNC_GREATER:
+      return alpha_test_quads_GREATER( qs, quads, nr );
+   case PIPE_FUNC_NOTEQUAL:
+      return alpha_test_quads_NOTEQUAL( qs, quads, nr );
+   case PIPE_FUNC_GEQUAL:
+      return alpha_test_quads_GEQUAL( qs, quads, nr );
+   case PIPE_FUNC_ALWAYS:
+      return nr;
+   case PIPE_FUNC_NEVER:
+   default:
+      return 0;
+   }
+}
+
+static unsigned mask_count[16] = 
+{
+   0,                           /* 0x0 */
+   1,                           /* 0x1 */
+   1,                           /* 0x2 */
+   2,                           /* 0x3 */
+   1,                           /* 0x4 */
+   2,                           /* 0x5 */
+   2,                           /* 0x6 */
+   3,                           /* 0x7 */
+   1,                           /* 0x8 */
+   2,                           /* 0x9 */
+   2,                           /* 0xa */
+   3,                           /* 0xb */
+   2,                           /* 0xc */
+   3,                           /* 0xd */
+   3,                           /* 0xe */
+   4,                           /* 0xf */
+};
+
+
+
+static void
+depth_test_quads_fallback(struct quad_stage *qs, 
+                          struct quad_header *quads[],
+                          unsigned nr)
+{
+   unsigned i, pass = 0;
+   const struct sp_fragment_shader *fs = qs->softpipe->fs;
+   boolean interp_depth = !fs->info.writes_z;
+   struct depth_data data;
+
+
+   if (qs->softpipe->depth_stencil->alpha.enabled) {
+      nr = alpha_test_quads(qs, quads, nr);
+   }
+
+   if (qs->softpipe->framebuffer.zsbuf && 
+       (qs->softpipe->depth_stencil->depth.enabled ||
+        qs->softpipe->depth_stencil->stencil[0].enabled)) {
+
+      data.ps = qs->softpipe->framebuffer.zsbuf;
+      data.format = data.ps->format;
+      data.tile = sp_get_cached_tile(qs->softpipe->zsbuf_cache, 
+                                     quads[0]->input.x0, 
+                                     quads[0]->input.y0);
+
+      for (i = 0; i < nr; i++) {
+         get_depth_stencil_values(&data, quads[i]);
+
+         if (qs->softpipe->depth_stencil->depth.enabled) {
+            if (interp_depth)
+               interpolate_quad_depth(quads[i]);
+
+            convert_quad_depth(&data, quads[i]);
          }
-         break;
-      case PIPE_FORMAT_Z24S8_UNORM:
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->input.x0 % TILE_SIZE + (j & 1);
-            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-            uint z24s8 = tile->data.depth32[y][x];
-            z24s8 = (z24s8 & 0xff) | (bzzzz[j] << 8);
-            tile->data.depth32[y][x] = z24s8;
+
+         if (qs->softpipe->depth_stencil->stencil[0].enabled) {
+            depth_stencil_test_quad(qs, &data, quads[i]);
+            write_depth_stencil_values(&data, quads[i]);
          }
-         break;
-      case PIPE_FORMAT_Z24X8_UNORM:
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->input.x0 % TILE_SIZE + (j & 1);
-            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-            tile->data.depth32[y][x] = bzzzz[j] << 8;
+         else {
+            if (!depth_test_quad(qs, &data, quads[i]))
+               continue;
+
+            if (qs->softpipe->depth_stencil->depth.writemask)
+               write_depth_stencil_values(&data, quads[i]);
          }
-         break;
-      default:
-         assert(0);
+
+
+         quads[pass++] = quads[i];
+      }
+
+      nr = pass;
+   }
+
+   if (qs->softpipe->active_query_count) {
+      for (i = 0; i < nr; i++) 
+         qs->softpipe->occlusion_count += mask_count[quads[i]->inout.mask];
+   }
+
+   if (nr)
+      qs->next->run(qs->next, quads, nr);
+}
+
+/* XXX: this function assumes setup function actually emits linear
+ * spans of quads.  It seems a lot more natural to do (early)
+ * depth-testing on spans rather than quads.
+ */
+static void
+depth_interp_z16_less_write(struct quad_stage *qs, 
+                            struct quad_header *quads[],
+                            unsigned nr)
+{
+   unsigned i, pass = 0;
+   const unsigned ix = quads[0]->input.x0;
+   const unsigned iy = quads[0]->input.y0;
+   const float fx = (float) ix;
+   const float fy = (float) iy;
+   const float dzdx = quads[0]->posCoef->dadx[2];
+   const float dzdy = quads[0]->posCoef->dady[2];
+   const float z0 = quads[0]->posCoef->a0[2] + dzdx * fx + dzdy * fy;
+   struct softpipe_cached_tile *tile;
+   ushort (*depth16)[TILE_SIZE];
+   ushort idepth[4], depth_step;
+   const float scale = 65535.0;
+
+   idepth[0] = (ushort)((z0) * scale);
+   idepth[1] = (ushort)((z0 + dzdx) * scale);
+   idepth[2] = (ushort)((z0 + dzdy) * scale);
+   idepth[3] = (ushort)((z0 + dzdx + dzdy) * scale);
+
+   depth_step = (ushort)(dzdx * 2 * scale);
+
+   tile = sp_get_cached_tile(qs->softpipe->zsbuf_cache, ix, iy);
+
+   depth16 = (ushort (*)[TILE_SIZE])
+      &tile->data.depth16[iy % TILE_SIZE][ix % TILE_SIZE];
+
+   for (i = 0; i < nr; i++) {
+      unsigned outmask = quads[i]->inout.mask;
+      unsigned mask = 0;
+      
+      if ((outmask & 1) && idepth[0] < depth16[0][0]) {
+         depth16[0][0] = idepth[0];
+         mask |= (1 << 0);
+      }
+
+      if ((outmask & 2) && idepth[1] < depth16[0][1]) {
+         depth16[0][1] = idepth[1];
+         mask |= (1 << 1);
+      }
+
+      if ((outmask & 4) && idepth[2] < depth16[1][0]) {
+         depth16[1][0] = idepth[2];
+         mask |= (1 << 2);
+      }
+
+      if ((outmask & 8) && idepth[3] < depth16[1][1]) {
+         depth16[1][1] = idepth[3];
+         mask |= (1 << 3);
       }
+
+      idepth[0] += depth_step;
+      idepth[1] += depth_step;
+      idepth[2] += depth_step;
+      idepth[3] += depth_step;
+
+      depth16 = (ushort (*)[TILE_SIZE]) &depth16[0][2];
+
+      quads[i]->inout.mask = mask;
+      if (quads[i]->inout.mask)
+         quads[pass++] = quads[i];
    }
+
+   if (pass)
+      qs->next->run(qs->next, quads, pass);
+
 }
 
 
 static void
-depth_test_quad(struct quad_stage *qs, struct quad_header *quad)
+depth_interp_z16_lequal_write(struct quad_stage *qs, 
+                            struct quad_header *quads[],
+                            unsigned nr)
 {
-   sp_depth_test_quad(qs, quad);
+   unsigned i, pass = 0;
+   const unsigned ix = quads[0]->input.x0;
+   const unsigned iy = quads[0]->input.y0;
+   const float fx = (float) ix;
+   const float fy = (float) iy;
+   const float dzdx = quads[0]->posCoef->dadx[2];
+   const float dzdy = quads[0]->posCoef->dady[2];
+   const float z0 = quads[0]->posCoef->a0[2] + dzdx * fx + dzdy * fy;
+   struct softpipe_cached_tile *tile;
+   ushort (*depth16)[TILE_SIZE];
+   ushort idepth[4], depth_step;
+   const float scale = 65535.0;
+
+   idepth[0] = (ushort)((z0) * scale);
+   idepth[1] = (ushort)((z0 + dzdx) * scale);
+   idepth[2] = (ushort)((z0 + dzdy) * scale);
+   idepth[3] = (ushort)((z0 + dzdx + dzdy) * scale);
+
+   depth_step = (ushort)(dzdx * 2 * scale);
+
+   tile = sp_get_cached_tile(qs->softpipe->zsbuf_cache, ix, iy);
+
+   depth16 = (ushort (*)[TILE_SIZE])
+      &tile->data.depth16[iy % TILE_SIZE][ix % TILE_SIZE];
+
+   for (i = 0; i < nr; i++) {
+      unsigned outmask = quads[i]->inout.mask;
+      unsigned mask = 0;
+      
+      if ((outmask & 1) && idepth[0] <= depth16[0][0]) {
+         depth16[0][0] = idepth[0];
+         mask |= (1 << 0);
+      }
+
+      if ((outmask & 2) && idepth[1] <= depth16[0][1]) {
+         depth16[0][1] = idepth[1];
+         mask |= (1 << 1);
+      }
+
+      if ((outmask & 4) && idepth[2] <= depth16[1][0]) {
+         depth16[1][0] = idepth[2];
+         mask |= (1 << 2);
+      }
+
+      if ((outmask & 8) && idepth[3] <= depth16[1][1]) {
+         depth16[1][1] = idepth[3];
+         mask |= (1 << 3);
+      }
+
+      idepth[0] += depth_step;
+      idepth[1] += depth_step;
+      idepth[2] += depth_step;
+      idepth[3] += depth_step;
+
+      depth16 = (ushort (*)[TILE_SIZE]) &depth16[0][2];
+
+      quads[i]->inout.mask = mask;
+      if (quads[i]->inout.mask)
+         quads[pass++] = quads[i];
+   }
+
+   if (pass)
+      qs->next->run(qs->next, quads, pass);
 
-   if (quad->inout.mask)
-      qs->next->run(qs->next, quad);
 }
 
 
+
+
+
+static void
+depth_noop(struct quad_stage *qs, 
+           struct quad_header *quads[],
+           unsigned nr)
+{
+   qs->next->run(qs->next, quads, nr);
+}
+
+
+
+static void
+choose_depth_test(struct quad_stage *qs, 
+                  struct quad_header *quads[],
+                  unsigned nr)
+{
+   boolean interp_depth = !qs->softpipe->fs->info.writes_z;
+
+   boolean alpha = qs->softpipe->depth_stencil->alpha.enabled;
+
+   boolean depth = (qs->softpipe->framebuffer.zsbuf && 
+                    qs->softpipe->depth_stencil->depth.enabled);
+
+   unsigned depthfunc = qs->softpipe->depth_stencil->depth.func;
+
+   boolean stencil = qs->softpipe->depth_stencil->stencil[0].enabled;
+
+   boolean depthwrite = qs->softpipe->depth_stencil->depth.writemask;
+
+   boolean occlusion = qs->softpipe->active_query_count;
+
+
+   if (!alpha &&
+       !depth &&
+       !stencil) {
+      qs->run = depth_noop;
+   }
+   else if (!alpha && 
+            interp_depth && 
+            depth && 
+            depthwrite && 
+            !occlusion &&
+            !stencil) 
+   {
+      switch (depthfunc) {
+      case PIPE_FUNC_LESS:
+         switch (qs->softpipe->framebuffer.zsbuf->format) {
+         case PIPE_FORMAT_Z16_UNORM:
+            qs->run = depth_interp_z16_less_write;
+            break;
+         default:
+            qs->run = depth_test_quads_fallback;
+            break;
+         }
+         break;
+      case PIPE_FUNC_LEQUAL:
+         switch (qs->softpipe->framebuffer.zsbuf->format) {
+         case PIPE_FORMAT_Z16_UNORM:
+            qs->run = depth_interp_z16_lequal_write;
+            break;
+         default:
+            qs->run = depth_test_quads_fallback;
+            break;
+         }
+         break;
+      default:
+         qs->run = depth_test_quads_fallback;
+      }
+   }
+   else {
+      qs->run = depth_test_quads_fallback;
+   }
+
+
+   qs->run( qs, quads, nr );
+}
+
+
+
+
+
 static void depth_test_begin(struct quad_stage *qs)
 {
+   qs->run = choose_depth_test;
    qs->next->begin(qs->next);
 }
 
@@ -283,7 +964,7 @@ struct quad_stage *sp_quad_depth_test_stage( struct softpipe_context *softpipe )
 
    stage->softpipe = softpipe;
    stage->begin = depth_test_begin;
-   stage->run = depth_test_quad;
+   stage->run = choose_depth_test;
    stage->destroy = depth_test_destroy;
 
    return stage;
diff --git a/src/gallium/drivers/softpipe/sp_quad_earlyz.c b/src/gallium/drivers/softpipe/sp_quad_earlyz.c
deleted file mode 100644
index 496fd39ed1..0000000000
--- a/src/gallium/drivers/softpipe/sp_quad_earlyz.c
+++ /dev/null
@@ -1,88 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-/**
- * \brief  Quad early-z testing
- */
-
-#include "pipe/p_defines.h"
-#include "util/u_memory.h"
-#include "sp_quad.h"
-#include "sp_quad_pipe.h"
-
-
-/**
- * All this stage does is compute the quad's Z values (which is normally
- * done by the shading stage).
- * The next stage will do the actual depth test.
- */
-static void
-earlyz_quad(
-   struct quad_stage    *qs,
-   struct quad_header   *quad )
-{
-   const float fx = (float) quad->input.x0;
-   const float fy = (float) quad->input.y0;
-   const float dzdx = quad->posCoef->dadx[2];
-   const float dzdy = quad->posCoef->dady[2];
-   const float z0 = quad->posCoef->a0[2] + dzdx * fx + dzdy * fy;
-
-   quad->output.depth[0] = z0;
-   quad->output.depth[1] = z0 + dzdx;
-   quad->output.depth[2] = z0 + dzdy;
-   quad->output.depth[3] = z0 + dzdx + dzdy;
-
-   qs->next->run( qs->next, quad );
-}
-
-static void
-earlyz_begin(
-   struct quad_stage *qs )
-{
-   qs->next->begin( qs->next );
-}
-
-static void
-earlyz_destroy(
-   struct quad_stage *qs )
-{
-   FREE( qs );
-}
-
-struct quad_stage *
-sp_quad_earlyz_stage(
-   struct softpipe_context *softpipe )
-{
-   struct quad_stage *stage = CALLOC_STRUCT( quad_stage );
-
-   stage->softpipe = softpipe;
-   stage->begin = earlyz_begin;
-   stage->run = earlyz_quad;
-   stage->destroy = earlyz_destroy;
-
-   return stage;
-}
diff --git a/src/gallium/drivers/softpipe/sp_quad_fs.c b/src/gallium/drivers/softpipe/sp_quad_fs.c
index 28f8d1a60e..1e7533d0f9 100644
--- a/src/gallium/drivers/softpipe/sp_quad_fs.c
+++ b/src/gallium/drivers/softpipe/sp_quad_fs.c
@@ -68,72 +68,69 @@ quad_shade_stage(struct quad_stage *qs)
 /**
  * Execute fragment shader for the four fragments in the quad.
  */
-static void
+static INLINE boolean
 shade_quad(struct quad_stage *qs, struct quad_header *quad)
 {
    struct quad_shade_stage *qss = quad_shade_stage( qs );
    struct softpipe_context *softpipe = qs->softpipe;
    struct tgsi_exec_machine *machine = qss->machine;
-   boolean z_written;
-   
-   /* Consts do not require 16 byte alignment. */
-   machine->Consts = softpipe->mapped_constants[PIPE_SHADER_FRAGMENT];
-
-   machine->InterpCoefs = quad->coef;
 
    /* run shader */
-   quad->inout.mask &= softpipe->fs->run( softpipe->fs, machine, quad );
-
-   /* store outputs */
-   z_written = FALSE;
-   {
-      const ubyte *sem_name = softpipe->fs->info.output_semantic_name;
-      const ubyte *sem_index = softpipe->fs->info.output_semantic_index;
-      const uint n = qss->stage.softpipe->fs->info.num_outputs;
-      uint i;
-      for (i = 0; i < n; i++) {
-         switch (sem_name[i]) {
-         case TGSI_SEMANTIC_COLOR:
-            {
-               uint cbuf = sem_index[i];
-               memcpy(quad->output.color[cbuf],
-                      &machine->Outputs[i].xyzw[0].f[0],
-                      sizeof(quad->output.color[0]) );
-            }
-            break;
-         case TGSI_SEMANTIC_POSITION:
-            {
-               uint j;
-               for (j = 0; j < 4; j++) {
-                  quad->output.depth[j] = machine->Outputs[0].xyzw[2].f[j];
-               }
-               z_written = TRUE;
-            }
-            break;
-         }
+   return softpipe->fs->run( softpipe->fs, machine, quad );
+}
+
+
+
+static void
+coverage_quad(struct quad_stage *qs, struct quad_header *quad)
+{
+   struct softpipe_context *softpipe = qs->softpipe;
+   uint cbuf;
+
+   /* loop over colorbuffer outputs */
+   for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) {
+      float (*quadColor)[4] = quad->output.color[cbuf];
+      unsigned j;
+      for (j = 0; j < QUAD_SIZE; j++) {
+         assert(quad->input.coverage[j] >= 0.0);
+         assert(quad->input.coverage[j] <= 1.0);
+         quadColor[3][j] *= quad->input.coverage[j];
       }
    }
+}
+
 
-   if (!z_written) {
-      /* compute Z values now, as in the quad earlyz stage */
-      /* XXX we should really only do this if the earlyz stage is not used */
-      const float fx = (float) quad->input.x0;
-      const float fy = (float) quad->input.y0;
-      const float dzdx = quad->posCoef->dadx[2];
-      const float dzdy = quad->posCoef->dady[2];
-      const float z0 = quad->posCoef->a0[2] + dzdx * fx + dzdy * fy;
-
-      quad->output.depth[0] = z0;
-      quad->output.depth[1] = z0 + dzdx;
-      quad->output.depth[2] = z0 + dzdy;
-      quad->output.depth[3] = z0 + dzdx + dzdy;
-   }
 
-   /* shader may cull fragments */
-   if (quad->inout.mask) {
-      qs->next->run( qs->next, quad );
+static void
+shade_quads(struct quad_stage *qs, 
+                 struct quad_header *quads[],
+                 unsigned nr)
+{
+   struct quad_shade_stage *qss = quad_shade_stage( qs );
+   struct softpipe_context *softpipe = qs->softpipe;
+   struct tgsi_exec_machine *machine = qss->machine;
+
+   unsigned i, pass = 0;
+   
+   machine->Consts = softpipe->mapped_constants[PIPE_SHADER_FRAGMENT];
+   machine->InterpCoefs = quads[0]->coef;
+
+   for (i = 0; i < nr; i++) {
+      if (!shade_quad(qs, quads[i]))
+         continue;
+
+      if (/*do_coverage*/ 0)
+         coverage_quad( qs, quads[i] );
+
+      quads[pass++] = quads[i];
    }
+   
+   if (pass)
+      qs->next->run(qs->next, quads, pass);
 }
+   
+
+
 
 
 /**
@@ -174,7 +171,7 @@ sp_quad_shade_stage( struct softpipe_context *softpipe )
 
    qss->stage.softpipe = softpipe;
    qss->stage.begin = shade_begin;
-   qss->stage.run = shade_quad;
+   qss->stage.run = shade_quads;
    qss->stage.destroy = shade_destroy;
 
    qss->machine = tgsi_exec_machine_create();
diff --git a/src/gallium/drivers/softpipe/sp_quad_occlusion.c b/src/gallium/drivers/softpipe/sp_quad_occlusion.c
deleted file mode 100644
index dfa7ff3b1d..0000000000
--- a/src/gallium/drivers/softpipe/sp_quad_occlusion.c
+++ /dev/null
@@ -1,85 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-
-/**
- * \brief  Quad occlusion counter stage
- * \author  Brian Paul
- */
-
-
-#include "pipe/p_defines.h"
-#include "util/u_memory.h"
-#include "sp_context.h"
-#include "sp_quad.h"
-#include "sp_surface.h"
-#include "sp_quad_pipe.h"
-
-static unsigned count_bits( unsigned val )
-{
-   unsigned i;
-
-   for (i = 0; val ; val >>= 1)
-      i += (val & 1);
-
-   return i;
-}
-
-static void
-occlusion_count_quad(struct quad_stage *qs, struct quad_header *quad)
-{
-   struct softpipe_context *softpipe = qs->softpipe;
-
-   softpipe->occlusion_count += count_bits(quad->inout.mask);
-
-   qs->next->run(qs->next, quad);
-}
-
-
-static void occlusion_begin(struct quad_stage *qs)
-{
-   qs->next->begin(qs->next);
-}
-
-
-static void occlusion_destroy(struct quad_stage *qs)
-{
-   FREE( qs );
-}
-
-
-struct quad_stage *sp_quad_occlusion_stage( struct softpipe_context *softpipe )
-{
-   struct quad_stage *stage = CALLOC_STRUCT(quad_stage);
-
-   stage->softpipe = softpipe;
-   stage->begin = occlusion_begin;
-   stage->run = occlusion_count_quad;
-   stage->destroy = occlusion_destroy;
-
-   return stage;
-}
diff --git a/src/gallium/drivers/softpipe/sp_quad_output.c b/src/gallium/drivers/softpipe/sp_quad_output.c
deleted file mode 100644
index 92d5f9f3c1..0000000000
--- a/src/gallium/drivers/softpipe/sp_quad_output.c
+++ /dev/null
@@ -1,103 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#include "util/u_memory.h"
-#include "sp_context.h"
-#include "sp_quad.h"
-#include "sp_surface.h"
-#include "sp_quad_pipe.h"
-#include "sp_tile_cache.h"
-
-
-/**
- * Last step of quad processing: write quad colors to the framebuffer,
- * taking mask into account.
- */
-static void
-output_quad(struct quad_stage *qs, struct quad_header *quad)
-{
-   /* in-tile pos: */
-   const int itx = quad->input.x0 % TILE_SIZE;
-   const int ity = quad->input.y0 % TILE_SIZE;
-
-   struct softpipe_context *softpipe = qs->softpipe;
-   uint cbuf;
-
-   /* loop over colorbuffer outputs */
-   for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) {
-      struct softpipe_cached_tile *tile
-         = sp_get_cached_tile(softpipe,
-                              softpipe->cbuf_cache[cbuf],
-                              quad->input.x0, quad->input.y0);
-      float (*quadColor)[4] = quad->output.color[cbuf];
-      int i, j;
-
-      /* get/swizzle dest colors */
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (quad->inout.mask & (1 << j)) {
-            int x = itx + (j & 1);
-            int y = ity + (j >> 1);
-            for (i = 0; i < 4; i++) { /* loop over color chans */
-               tile->data.color[y][x][i] = quadColor[i][j];
-            }
-            if (0) {
-               debug_printf("sp write pixel %d,%d: %g, %g, %g\n",
-                            quad->input.x0 + x,
-                            quad->input.y0 + y,
-                            quadColor[0][j],
-                            quadColor[1][j],
-                            quadColor[2][j]);
-            }
-         }
-      }
-   }
-}
-
-
-static void output_begin(struct quad_stage *qs)
-{
-   assert(qs->next == NULL);
-}
-
-
-static void output_destroy(struct quad_stage *qs)
-{
-   FREE( qs );
-}
-
-
-struct quad_stage *sp_quad_output_stage( struct softpipe_context *softpipe )
-{
-   struct quad_stage *stage = CALLOC_STRUCT(quad_stage);
-
-   stage->softpipe = softpipe;
-   stage->begin = output_begin;
-   stage->run = output_quad;
-   stage->destroy = output_destroy;
-
-   return stage;
-}
diff --git a/src/gallium/drivers/softpipe/sp_quad_pipe.c b/src/gallium/drivers/softpipe/sp_quad_pipe.c
index b5f69b7426..1b5bab4eca 100644
--- a/src/gallium/drivers/softpipe/sp_quad_pipe.c
+++ b/src/gallium/drivers/softpipe/sp_quad_pipe.c
@@ -31,88 +31,33 @@
 #include "pipe/p_shader_tokens.h"
 
 static void
-sp_push_quad_first(
-   struct softpipe_context *sp,
-   struct quad_stage *quad,
-   uint i )
+sp_push_quad_first( struct softpipe_context *sp,
+                    struct quad_stage *quad )
 {
-   quad->next = sp->quad[i].first;
-   sp->quad[i].first = quad;
+   quad->next = sp->quad.first;
+   sp->quad.first = quad;
 }
 
-static void
-sp_build_depth_stencil(
-   struct softpipe_context *sp,
-   uint i )
-{
-   if (sp->depth_stencil->stencil[0].enabled ||
-       sp->depth_stencil->stencil[1].enabled) {
-      sp_push_quad_first( sp, sp->quad[i].stencil_test, i );
-   }
-   else if (sp->depth_stencil->depth.enabled &&
-            sp->framebuffer.zsbuf) {
-      sp_push_quad_first( sp, sp->quad[i].depth_test, i );
-   }
-}
 
 void
 sp_build_quad_pipeline(struct softpipe_context *sp)
 {
-   uint i;
-
    boolean early_depth_test =
-               sp->depth_stencil->depth.enabled &&
-               sp->framebuffer.zsbuf &&
-               !sp->depth_stencil->alpha.enabled &&
-               !sp->fs->info.uses_kill &&
-               !sp->fs->info.writes_z;
-
-   /* build up the pipeline in reverse order... */
-   for (i = 0; i < SP_NUM_QUAD_THREADS; i++) {
-      sp->quad[i].first = sp->quad[i].output;
-
-      if (sp->blend->colormask != 0xf) {
-         sp_push_quad_first( sp, sp->quad[i].colormask, i );
-      }
+      sp->depth_stencil->depth.enabled &&
+      sp->framebuffer.zsbuf &&
+      !sp->depth_stencil->alpha.enabled &&
+      !sp->fs->info.uses_kill &&
+      !sp->fs->info.writes_z;
 
-      if (sp->blend->blend_enable ||
-          sp->blend->logicop_enable) {
-         sp_push_quad_first( sp, sp->quad[i].blend, i );
-      }
+   sp->quad.first = sp->quad.blend;
 
-      if (sp->active_query_count) {
-         sp_push_quad_first( sp, sp->quad[i].occlusion, i );
-      }
-
-      if (sp->rasterizer->poly_smooth ||
-          sp->rasterizer->line_smooth ||
-          sp->rasterizer->point_smooth) {
-         sp_push_quad_first( sp, sp->quad[i].coverage, i );
-      }
-
-      if (!early_depth_test) {
-         sp_build_depth_stencil( sp, i );
-      }
-
-      if (sp->depth_stencil->alpha.enabled) {
-         sp_push_quad_first( sp, sp->quad[i].alpha_test, i );
-      }
-
-      /* XXX always enable shader? */
-      if (1) {
-         sp_push_quad_first( sp, sp->quad[i].shade, i );
-      }
-
-      if (early_depth_test) {
-         sp_build_depth_stencil( sp, i );
-         sp_push_quad_first( sp, sp->quad[i].earlyz, i );
-      }
-
-#if !USE_DRAW_STAGE_PSTIPPLE
-      if (sp->rasterizer->poly_stipple_enable) {
-         sp_push_quad_first( sp, sp->quad[i].polygon_stipple, i );
-      }
-#endif
+   if (early_depth_test) {
+      sp_push_quad_first( sp, sp->quad.shade );
+      sp_push_quad_first( sp, sp->quad.depth_test );
+   }
+   else {
+      sp_push_quad_first( sp, sp->quad.depth_test );
+      sp_push_quad_first( sp, sp->quad.shade );
    }
 }
 
diff --git a/src/gallium/drivers/softpipe/sp_quad_pipe.h b/src/gallium/drivers/softpipe/sp_quad_pipe.h
index 0e40586ffc..c0aa134831 100644
--- a/src/gallium/drivers/softpipe/sp_quad_pipe.h
+++ b/src/gallium/drivers/softpipe/sp_quad_pipe.h
@@ -49,7 +49,7 @@ struct quad_stage {
    void (*begin)(struct quad_stage *qs);
 
    /** the stage action */
-   void (*run)(struct quad_stage *qs, struct quad_header *quad);
+   void (*run)(struct quad_stage *qs, struct quad_header *quad[], unsigned nr);
 
    void (*destroy)(struct quad_stage *qs);
 };
@@ -69,6 +69,4 @@ struct quad_stage *sp_quad_output_stage( struct softpipe_context *softpipe );
 
 void sp_build_quad_pipeline(struct softpipe_context *sp);
 
-void sp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad);
-
 #endif /* SP_QUAD_PIPE_H */
diff --git a/src/gallium/drivers/softpipe/sp_quad_stencil.c b/src/gallium/drivers/softpipe/sp_quad_stencil.c
deleted file mode 100644
index 5e9d447737..0000000000
--- a/src/gallium/drivers/softpipe/sp_quad_stencil.c
+++ /dev/null
@@ -1,352 +0,0 @@
-
-/**
- * \brief Quad stencil testing
- */
-
-
-#include "sp_context.h"
-#include "sp_quad.h"
-#include "sp_surface.h"
-#include "sp_tile_cache.h"
-#include "sp_quad_pipe.h"
-#include "pipe/p_defines.h"
-#include "util/u_memory.h"
-
-
-/** Only 8-bit stencil supported */
-#define STENCIL_MAX 0xff
-
-
-/**
- * Do the basic stencil test (compare stencil buffer values against the
- * reference value.
- *
- * \param stencilVals  the stencil values from the stencil buffer
- * \param func  the stencil func (PIPE_FUNC_x)
- * \param ref  the stencil reference value
- * \param valMask  the stencil value mask indicating which bits of the stencil
- *                 values and ref value are to be used.
- * \return mask indicating which pixels passed the stencil test
- */
-static unsigned
-do_stencil_test(const ubyte stencilVals[QUAD_SIZE], unsigned func,
-                unsigned ref, unsigned valMask)
-{
-   unsigned passMask = 0x0;
-   unsigned j;
-
-   ref &= valMask;
-
-   switch (func) {
-   case PIPE_FUNC_NEVER:
-      /* passMask = 0x0 */
-      break;
-   case PIPE_FUNC_LESS:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (ref < (stencilVals[j] & valMask)) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_EQUAL:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (ref == (stencilVals[j] & valMask)) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_LEQUAL:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (ref <= (stencilVals[j] & valMask)) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_GREATER:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (ref > (stencilVals[j] & valMask)) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_NOTEQUAL:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (ref != (stencilVals[j] & valMask)) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_GEQUAL:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (ref >= (stencilVals[j] & valMask)) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_ALWAYS:
-      passMask = MASK_ALL;
-      break;
-   default:
-      assert(0);
-   }
-
-   return passMask;
-}
-
-
-/**
- * Apply the stencil operator to stencil values.
- *
- * \param stencilVals  the stencil buffer values (read and written)
- * \param mask  indicates which pixels to update
- * \param op  the stencil operator (PIPE_STENCIL_OP_x)
- * \param ref  the stencil reference value
- * \param wrtMask  writemask controlling which bits are changed in the
- *                 stencil values
- */
-static void
-apply_stencil_op(ubyte stencilVals[QUAD_SIZE],
-                 unsigned mask, unsigned op, ubyte ref, ubyte wrtMask)
-{
-   unsigned j;
-   ubyte newstencil[QUAD_SIZE];
-
-   for (j = 0; j < QUAD_SIZE; j++) {
-      newstencil[j] = stencilVals[j];
-   }
-
-   switch (op) {
-   case PIPE_STENCIL_OP_KEEP:
-      /* no-op */
-      break;
-   case PIPE_STENCIL_OP_ZERO:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (mask & (1 << j)) {
-            newstencil[j] = 0;
-         }
-      }
-      break;
-   case PIPE_STENCIL_OP_REPLACE:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (mask & (1 << j)) {
-            newstencil[j] = ref;
-         }
-      }
-      break;
-   case PIPE_STENCIL_OP_INCR:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (mask & (1 << j)) {
-            if (stencilVals[j] < STENCIL_MAX) {
-               newstencil[j] = stencilVals[j] + 1;
-            }
-         }
-      }
-      break;
-   case PIPE_STENCIL_OP_DECR:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (mask & (1 << j)) {
-            if (stencilVals[j] > 0) {
-               newstencil[j] = stencilVals[j] - 1;
-            }
-         }
-      }
-      break;
-   case PIPE_STENCIL_OP_INCR_WRAP:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (mask & (1 << j)) {
-            newstencil[j] = stencilVals[j] + 1;
-         }
-      }
-      break;
-   case PIPE_STENCIL_OP_DECR_WRAP:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (mask & (1 << j)) {
-            newstencil[j] = stencilVals[j] - 1;
-         }
-      }
-      break;
-   case PIPE_STENCIL_OP_INVERT:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (mask & (1 << j)) {
-            newstencil[j] = ~stencilVals[j];
-         }
-      }
-      break;
-   default:
-      assert(0);
-   }
-
-   /*
-    * update the stencil values
-    */
-   if (wrtMask != STENCIL_MAX) {
-      /* apply bit-wise stencil buffer writemask */
-      for (j = 0; j < QUAD_SIZE; j++) {
-         stencilVals[j] = (wrtMask & newstencil[j]) | (~wrtMask & stencilVals[j]);
-      }
-   }
-   else {
-      for (j = 0; j < QUAD_SIZE; j++) {
-         stencilVals[j] = newstencil[j];
-      }
-   }
-}
-
-
-/**
- * Do stencil (and depth) testing.  Stenciling depends on the outcome of
- * depth testing.
- */
-static void
-stencil_test_quad(struct quad_stage *qs, struct quad_header *quad)
-{
-   struct softpipe_context *softpipe = qs->softpipe;
-   struct pipe_surface *ps = softpipe->framebuffer.zsbuf;
-   unsigned func, zFailOp, zPassOp, failOp;
-   ubyte ref, wrtMask, valMask;
-   ubyte stencilVals[QUAD_SIZE];
-   struct softpipe_cached_tile *tile
-      = sp_get_cached_tile(softpipe, softpipe->zsbuf_cache, quad->input.x0, quad->input.y0);
-   uint j;
-   uint face = quad->input.facing;
-
-   if (!softpipe->depth_stencil->stencil[1].enabled) {
-      /* single-sided stencil test, use front (face=0) state */
-      face = 0;
-   }
-
-   /* choose front or back face function, operator, etc */
-   /* XXX we could do these initializations once per primitive */
-   func    = softpipe->depth_stencil->stencil[face].func;
-   failOp  = softpipe->depth_stencil->stencil[face].fail_op;
-   zFailOp = softpipe->depth_stencil->stencil[face].zfail_op;
-   zPassOp = softpipe->depth_stencil->stencil[face].zpass_op;
-   ref     = softpipe->depth_stencil->stencil[face].ref_value;
-   wrtMask = softpipe->depth_stencil->stencil[face].writemask;
-   valMask = softpipe->depth_stencil->stencil[face].valuemask;
-
-   assert(ps); /* shouldn't get here if there's no stencil buffer */
-
-   /* get stencil values from cached tile */
-   switch (ps->format) {
-   case PIPE_FORMAT_S8Z24_UNORM:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         int x = quad->input.x0 % TILE_SIZE + (j & 1);
-         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-         stencilVals[j] = tile->data.depth32[y][x] >> 24;
-      }
-      break;
-   case PIPE_FORMAT_Z24S8_UNORM:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         int x = quad->input.x0 % TILE_SIZE + (j & 1);
-         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-         stencilVals[j] = tile->data.depth32[y][x] & 0xff;
-      }
-      break;
-   case PIPE_FORMAT_S8_UNORM:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         int x = quad->input.x0 % TILE_SIZE + (j & 1);
-         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-         stencilVals[j] = tile->data.stencil8[y][x];
-      }
-      break;
-   default:
-      assert(0);
-   }
-
-   /* do the stencil test first */
-   {
-      unsigned passMask, failMask;
-      passMask = do_stencil_test(stencilVals, func, ref, valMask);
-      failMask = quad->inout.mask & ~passMask;
-      quad->inout.mask &= passMask;
-
-      if (failOp != PIPE_STENCIL_OP_KEEP) {
-         apply_stencil_op(stencilVals, failMask, failOp, ref, wrtMask);
-      }
-   }
-
-   if (quad->inout.mask) {
-      /* now the pixels that passed the stencil test are depth tested */
-      if (softpipe->depth_stencil->depth.enabled) {
-         const unsigned origMask = quad->inout.mask;
-
-         sp_depth_test_quad(qs, quad);  /* quad->mask is updated */
-
-         /* update stencil buffer values according to z pass/fail result */
-         if (zFailOp != PIPE_STENCIL_OP_KEEP) {
-            const unsigned failMask = origMask & ~quad->inout.mask;
-            apply_stencil_op(stencilVals, failMask, zFailOp, ref, wrtMask);
-         }
-
-         if (zPassOp != PIPE_STENCIL_OP_KEEP) {
-            const unsigned passMask = origMask & quad->inout.mask;
-            apply_stencil_op(stencilVals, passMask, zPassOp, ref, wrtMask);
-         }
-      }
-      else {
-         /* no depth test, apply Zpass operator to stencil buffer values */
-         apply_stencil_op(stencilVals, quad->inout.mask, zPassOp, ref, wrtMask);
-      }
-
-   }
-
-   /* put new stencil values into cached tile */
-   switch (ps->format) {
-   case PIPE_FORMAT_S8Z24_UNORM:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         int x = quad->input.x0 % TILE_SIZE + (j & 1);
-         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-         uint s8z24 = tile->data.depth32[y][x];
-         s8z24 = (stencilVals[j] << 24) | (s8z24 & 0xffffff);
-         tile->data.depth32[y][x] = s8z24;
-      }
-      break;
-   case PIPE_FORMAT_Z24S8_UNORM:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         int x = quad->input.x0 % TILE_SIZE + (j & 1);
-         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-         uint z24s8 = tile->data.depth32[y][x];
-         z24s8 = (z24s8 & 0xffffff00) | stencilVals[j];
-         tile->data.depth32[y][x] = z24s8;
-      }
-      break;
-   case PIPE_FORMAT_S8_UNORM:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         int x = quad->input.x0 % TILE_SIZE + (j & 1);
-         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-         tile->data.stencil8[y][x] = stencilVals[j];
-      }
-      break;
-   default:
-      assert(0);
-   }
-
-   if (quad->inout.mask)
-      qs->next->run(qs->next, quad);
-}
-
-
-static void stencil_begin(struct quad_stage *qs)
-{
-   qs->next->begin(qs->next);
-}
-
-
-static void stencil_destroy(struct quad_stage *qs)
-{
-   FREE( qs );
-}
-
-
-struct quad_stage *sp_quad_stencil_test_stage( struct softpipe_context *softpipe )
-{
-   struct quad_stage *stage = CALLOC_STRUCT(quad_stage);
-
-   stage->softpipe = softpipe;
-   stage->begin = stencil_begin;
-   stage->run = stencil_test_quad;
-   stage->destroy = stencil_destroy;
-
-   return stage;
-}
diff --git a/src/gallium/drivers/softpipe/sp_quad_stipple.c b/src/gallium/drivers/softpipe/sp_quad_stipple.c
index 07162db7b6..a0527a596a 100644
--- a/src/gallium/drivers/softpipe/sp_quad_stipple.c
+++ b/src/gallium/drivers/softpipe/sp_quad_stipple.c
@@ -14,14 +14,20 @@
  * Apply polygon stipple to quads produced by triangle rasterization
  */
 static void
-stipple_quad(struct quad_stage *qs, struct quad_header *quad)
+stipple_quad(struct quad_stage *qs, struct quad_header *quads[], unsigned nr)
 {
    static const uint bit31 = 1 << 31;
    static const uint bit30 = 1 << 30;
+   unsigned pass = nr;
+
+   struct softpipe_context *softpipe = qs->softpipe;
+   unsigned q;
+
+   pass = 0;
+
+   for (q = 0; q < nr; q++)  {
+      struct quad_header *quad = quads[q];
 
-   if (quad->input.prim == QUAD_PRIM_TRI) {
-      struct softpipe_context *softpipe = qs->softpipe;
-      /* need to invert Y to index into OpenGL's stipple pattern */
       const int col0 = quad->input.x0 % 32;
       const int y0 = quad->input.y0;
       const int y1 = y0 + 1;
@@ -41,13 +47,11 @@ stipple_quad(struct quad_stage *qs, struct quad_header *quad)
       if ((stipple1 & (bit30 >> col0)) == 0)
          quad->inout.mask &= ~MASK_BOTTOM_RIGHT;
 
-      if (!quad->inout.mask) {
-         /* all fragments failed stipple test, end of quad pipeline */
-         return;
-      }
+      if (quad->inout.mask)
+         quads[pass++] = quad;
    }
 
-   qs->next->run(qs->next, quad);
+   qs->next->run(qs->next, quads, pass);
 }
 
 
diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c
index 6178c4ac7e..6cf45cded2 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -40,7 +40,7 @@
 static const char *
 softpipe_get_vendor(struct pipe_screen *screen)
 {
-   return "Tungsten Graphics, Inc.";
+   return "VMware, Inc.";
 }
 
 
@@ -65,8 +65,6 @@ softpipe_get_param(struct pipe_screen *screen, int param)
       return 1;
    case PIPE_CAP_GLSL:
       return 1;
-   case PIPE_CAP_S3TC:
-      return 0;
    case PIPE_CAP_ANISOTROPIC_FILTER:
       return 0;
    case PIPE_CAP_POINT_SPRITE:
@@ -141,6 +139,7 @@ softpipe_is_format_supported( struct pipe_screen *screen,
    case PIPE_FORMAT_DXT1_RGBA:
    case PIPE_FORMAT_DXT3_RGBA:
    case PIPE_FORMAT_DXT5_RGBA:
+   case PIPE_FORMAT_Z32_FLOAT:
       return FALSE;
    default:
       return TRUE;
diff --git a/src/gallium/drivers/softpipe/sp_setup.c b/src/gallium/drivers/softpipe/sp_setup.c
index de3ae3c369..ade125662a 100644
--- a/src/gallium/drivers/softpipe/sp_setup.c
+++ b/src/gallium/drivers/softpipe/sp_setup.c
@@ -33,7 +33,6 @@
  */
 
 #include "sp_context.h"
-#include "sp_prim_setup.h"
 #include "sp_quad.h"
 #include "sp_quad_pipe.h"
 #include "sp_setup.h"
@@ -61,87 +60,9 @@ struct edge {
    int lines;		/**< number of lines on this edge */
 };
 
-#if SP_NUM_QUAD_THREADS > 1
 
-/* Set to 1 if you want other threads to be instantly
- * notified of pending jobs.
- */
-#define INSTANT_NOTEMPTY_NOTIFY 0
-
-struct thread_info
-{
-   struct setup_context *setup;
-   uint id;
-   pipe_thread handle;
-};
-
-struct quad_job;
-
-typedef void (* quad_job_routine)( struct setup_context *setup, uint thread, struct quad_job *job );
-
-struct quad_job
-{
-   struct quad_header_input input;
-   struct quad_header_inout inout;
-   quad_job_routine routine;
-};
-
-#define NUM_QUAD_JOBS 64
-
-struct quad_job_que
-{
-   struct quad_job jobs[NUM_QUAD_JOBS];
-   uint first;
-   uint last;
-   pipe_mutex que_mutex;
-   pipe_condvar que_notfull_condvar;
-   pipe_condvar que_notempty_condvar;
-   uint jobs_added;
-   uint jobs_done;
-   pipe_condvar que_done_condvar;
-};
-
-static void
-add_quad_job( struct quad_job_que *que, struct quad_header *quad, quad_job_routine routine )
-{
-#if INSTANT_NOTEMPTY_NOTIFY
-   boolean empty;
-#endif
-
-   /* Wait for empty slot, see if the que is empty.
-    */
-   pipe_mutex_lock( que->que_mutex );
-   while ((que->last + 1) % NUM_QUAD_JOBS == que->first) {
-#if !INSTANT_NOTEMPTY_NOTIFY
-      pipe_condvar_broadcast( que->que_notempty_condvar );
-#endif
-      pipe_condvar_wait( que->que_notfull_condvar, que->que_mutex );
-   }
-#if INSTANT_NOTEMPTY_NOTIFY
-   empty = que->last == que->first;
-#endif
-   que->jobs_added++;
-   pipe_mutex_unlock( que->que_mutex );
+#define MAX_QUADS 16
 
-   /* Submit new job.
-    */
-   que->jobs[que->last].input = quad->input;
-   que->jobs[que->last].inout = quad->inout;
-   que->jobs[que->last].routine = routine;
-   que->last = (que->last + 1) % NUM_QUAD_JOBS;
-
-#if INSTANT_NOTEMPTY_NOTIFY
-   /* If the que was empty, notify consumers there's a job to be done.
-    */
-   if (empty) {
-      pipe_mutex_lock( que->que_mutex );
-      pipe_condvar_broadcast( que->que_notempty_condvar );
-      pipe_mutex_unlock( que->que_mutex );
-   }
-#endif
-}
-
-#endif
 
 /**
  * Triangle setup info (derived from draw_stage).
@@ -164,22 +85,19 @@ struct setup_context {
    struct edge emaj;
 
    float oneoverarea;
+   int facing;
+
+   struct quad_header quad[MAX_QUADS];
+   struct quad_header *quad_ptrs[MAX_QUADS];
+   unsigned count;
 
    struct tgsi_interp_coef coef[PIPE_MAX_SHADER_INPUTS];
    struct tgsi_interp_coef posCoef;  /* For Z, W */
-   struct quad_header quad;
-
-#if SP_NUM_QUAD_THREADS > 1
-   struct quad_job_que que;
-   struct thread_info threads[SP_NUM_QUAD_THREADS];
-#endif
 
    struct {
       int left[2];   /**< [0] = row0, [1] = row1 */
       int right[2];
       int y;
-      unsigned y_flags;
-      unsigned mask;     /**< mask of MASK_BOTTOM/TOP_LEFT/RIGHT bits */
    } span;
 
 #if DEBUG_FRAGS
@@ -190,67 +108,6 @@ struct setup_context {
    unsigned winding;		/* which winding to cull */
 };
 
-#if SP_NUM_QUAD_THREADS > 1
-
-static PIPE_THREAD_ROUTINE( quad_thread, param )
-{
-   struct thread_info *info = (struct thread_info *) param;
-   struct quad_job_que *que = &info->setup->que;
-
-   for (;;) {
-      struct quad_job job;
-      boolean full;
-
-      /* Wait for an available job.
-       */
-      pipe_mutex_lock( que->que_mutex );
-      while (que->last == que->first)
-         pipe_condvar_wait( que->que_notempty_condvar, que->que_mutex );
-
-      /* See if the que is full.
-       */
-      full = (que->last + 1) % NUM_QUAD_JOBS == que->first;
-
-      /* Take a job and remove it from que.
-       */
-      job = que->jobs[que->first];
-      que->first = (que->first + 1) % NUM_QUAD_JOBS;
-
-      /* Notify the producer if the que is not full.
-       */
-      if (full)
-         pipe_condvar_signal( que->que_notfull_condvar );
-      pipe_mutex_unlock( que->que_mutex );
-
-      job.routine( info->setup, info->id, &job );
-
-      /* Notify the producer if that's the last finished job.
-       */
-      pipe_mutex_lock( que->que_mutex );
-      que->jobs_done++;
-      if (que->jobs_added == que->jobs_done)
-         pipe_condvar_signal( que->que_done_condvar );
-      pipe_mutex_unlock( que->que_mutex );
-   }
-
-   return NULL;
-}
-
-#define WAIT_FOR_COMPLETION(setup) \
-   do {\
-      pipe_mutex_lock( setup->que.que_mutex );\
-      if (!INSTANT_NOTEMPTY_NOTIFY)\
-         pipe_condvar_broadcast( setup->que.que_notempty_condvar );\
-      while (setup->que.jobs_added != setup->que.jobs_done)\
-         pipe_condvar_wait( setup->que.que_done_condvar, setup->que.que_mutex );\
-      pipe_mutex_unlock( setup->que.que_mutex );\
-   } while (0)
-
-#else
-
-#define WAIT_FOR_COMPLETION(setup) ((void) 0)
-
-#endif
 
 
 
@@ -313,98 +170,18 @@ quad_clip( struct setup_context *setup, struct quad_header *quad )
  * Emit a quad (pass to next stage) with clipping.
  */
 static INLINE void
-clip_emit_quad( struct setup_context *setup, struct quad_header *quad, uint thread )
+clip_emit_quad( struct setup_context *setup, struct quad_header *quad )
 {
    quad_clip( setup, quad );
+
    if (quad->inout.mask) {
       struct softpipe_context *sp = setup->softpipe;
 
-      sp->quad[thread].first->run( sp->quad[thread].first, quad );
+      sp->quad.first->run( sp->quad.first, &quad, 1 );
    }
 }
 
-#if SP_NUM_QUAD_THREADS > 1
-
-static void
-clip_emit_quad_job( struct setup_context *setup, uint thread, struct quad_job *job )
-{
-   struct quad_header quad;
-
-   quad.input = job->input;
-   quad.inout = job->inout;
-   quad.coef = setup->quad.coef;
-   quad.posCoef = setup->quad.posCoef;
-   quad.nr_attrs = setup->quad.nr_attrs;
-   clip_emit_quad( setup, &quad, thread );
-}
-
-#define CLIP_EMIT_QUAD(setup) add_quad_job( &setup->que, &setup->quad, clip_emit_quad_job )
-
-#else
-
-#define CLIP_EMIT_QUAD(setup) clip_emit_quad( setup, &setup->quad, 0 )
-
-#endif
-
-/**
- * Emit a quad (pass to next stage).  No clipping is done.
- */
-static INLINE void
-emit_quad( struct setup_context *setup, struct quad_header *quad, uint thread )
-{
-   struct softpipe_context *sp = setup->softpipe;
-#if DEBUG_FRAGS
-   uint mask = quad->inout.mask;
-#endif
-
-#if DEBUG_FRAGS
-   if (mask & 1) setup->numFragsEmitted++;
-   if (mask & 2) setup->numFragsEmitted++;
-   if (mask & 4) setup->numFragsEmitted++;
-   if (mask & 8) setup->numFragsEmitted++;
-#endif
-   sp->quad[thread].first->run( sp->quad[thread].first, quad );
-#if DEBUG_FRAGS
-   mask = quad->inout.mask;
-   if (mask & 1) setup->numFragsWritten++;
-   if (mask & 2) setup->numFragsWritten++;
-   if (mask & 4) setup->numFragsWritten++;
-   if (mask & 8) setup->numFragsWritten++;
-#endif
-}
-
-#if SP_NUM_QUAD_THREADS > 1
 
-static void
-emit_quad_job( struct setup_context *setup, uint thread, struct quad_job *job )
-{
-   struct quad_header quad;
-
-   quad.input = job->input;
-   quad.inout = job->inout;
-   quad.coef = setup->quad.coef;
-   quad.posCoef = setup->quad.posCoef;
-   quad.nr_attrs = setup->quad.nr_attrs;
-   emit_quad( setup, &quad, thread );
-}
-
-#define EMIT_QUAD(setup,x,y,mask) do {\
-      setup->quad.input.x0 = x;\
-      setup->quad.input.y0 = y;\
-      setup->quad.inout.mask = mask;\
-      add_quad_job( &setup->que, &setup->quad, emit_quad_job );\
-   } while (0)
-
-#else
-
-#define EMIT_QUAD(setup,x,y,mask) do {\
-      setup->quad.input.x0 = x;\
-      setup->quad.input.y0 = y;\
-      setup->quad.inout.mask = mask;\
-      emit_quad( setup, &setup->quad, 0 );\
-   } while (0)
-
-#endif
 
 /**
  * Given an X or Y coordinate, return the block/quad coordinate that it
@@ -412,7 +189,12 @@ emit_quad_job( struct setup_context *setup, uint thread, struct quad_job *job )
  */
 static INLINE int block( int x )
 {
-   return x & ~1;
+   return x & ~(2-1);
+}
+
+static INLINE int block_x( int x )
+{
+   return x & ~(16-1);
 }
 
 
@@ -421,72 +203,63 @@ static INLINE int block( int x )
  */
 static void flush_spans( struct setup_context *setup )
 {
+   const int step = 16;
    const int xleft0 = setup->span.left[0];
    const int xleft1 = setup->span.left[1];
    const int xright0 = setup->span.right[0];
    const int xright1 = setup->span.right[1];
-   int minleft, maxright;
+   struct quad_stage *pipe = setup->softpipe->quad.first;
+
+
+   int minleft = block_x(MIN2(xleft0, xleft1));
+   int maxright = MAX2(xright0, xright1);
    int x;
 
-   switch (setup->span.y_flags) {
-   case 0x3:
-      /* both odd and even lines written (both quad rows) */
-      minleft = block(MIN2(xleft0, xleft1));
-      maxright = block(MAX2(xright0, xright1));
-      for (x = minleft; x <= maxright; x += 2) {
-         /* determine which of the four pixels is inside the span bounds */
-         uint mask = 0x0;
-         if (x >= xleft0 && x < xright0)
-            mask |= MASK_TOP_LEFT;
-         if (x >= xleft1 && x < xright1)
-            mask |= MASK_BOTTOM_LEFT;
-         if (x+1 >= xleft0 && x+1 < xright0)
-            mask |= MASK_TOP_RIGHT;
-         if (x+1 >= xleft1 && x+1 < xright1)
-            mask |= MASK_BOTTOM_RIGHT;
-         if (mask)
-            EMIT_QUAD( setup, x, setup->span.y, mask );
-      }
-      break;
-
-   case 0x1:
-      /* only even line written (quad top row) */
-      minleft = block(xleft0);
-      maxright = block(xright0);
-      for (x = minleft; x <= maxright; x += 2) {
-         uint mask = 0x0;
-         if (x >= xleft0 && x < xright0)
-            mask |= MASK_TOP_LEFT;
-         if (x+1 >= xleft0 && x+1 < xright0)
-            mask |= MASK_TOP_RIGHT;
-         if (mask)
-            EMIT_QUAD( setup, x, setup->span.y, mask );
-      }
-      break;
-
-   case 0x2:
-      /* only odd line written (quad bottom row) */
-      minleft = block(xleft1);
-      maxright = block(xright1);
-      for (x = minleft; x <= maxright; x += 2) {
-         uint mask = 0x0;
-         if (x >= xleft1 && x < xright1)
-            mask |= MASK_BOTTOM_LEFT;
-         if (x+1 >= xleft1 && x+1 < xright1)
-            mask |= MASK_BOTTOM_RIGHT;
-         if (mask)
-            EMIT_QUAD( setup, x, setup->span.y, mask );
-      }
-      break;
+   for (x = minleft; x < maxright; x += step) {
+      unsigned skip_left0 = CLAMP(xleft0 - x, 0, step);
+      unsigned skip_left1 = CLAMP(xleft1 - x, 0, step);
+      unsigned skip_right0 = CLAMP(x + step - xright0, 0, step);
+      unsigned skip_right1 = CLAMP(x + step - xright1, 0, step);
+      unsigned lx = x;
+      unsigned q = 0;
 
-   default:
-      return;
+      unsigned skipmask_left0 = (1U << skip_left0) - 1U;
+      unsigned skipmask_left1 = (1U << skip_left1) - 1U;
+
+      /* These calculations fail when step == 32 and skip_right == 0.
+       */
+      unsigned skipmask_right0 = ~0U << (unsigned)(step - skip_right0);
+      unsigned skipmask_right1 = ~0U << (unsigned)(step - skip_right1);
+
+      unsigned mask0 = ~skipmask_left0 & ~skipmask_right0;
+      unsigned mask1 = ~skipmask_left1 & ~skipmask_right1;
+
+      if (mask0 | mask1) {
+         do {
+            unsigned quadmask = (mask0 & 3) | ((mask1 & 3) << 2);
+            if (quadmask) {
+               setup->quad[q].input.x0 = lx;
+               setup->quad[q].input.y0 = setup->span.y;
+               setup->quad[q].input.facing = setup->facing;
+               setup->quad[q].inout.mask = quadmask;
+               setup->quad_ptrs[q] = &setup->quad[q];
+               q++;
+            }
+            mask0 >>= 2;
+            mask1 >>= 2;
+            lx += 2;
+         } while (mask0 | mask1);
+
+         pipe->run( pipe, setup->quad_ptrs, q );
+      }
    }
 
+
    setup->span.y = 0;
-   setup->span.y_flags = 0;
    setup->span.right[0] = 0;
    setup->span.right[1] = 0;
+   setup->span.left[0] = 1000000;     /* greater than right[0] */
+   setup->span.left[1] = 1000000;     /* greater than right[1] */
 }
 
 
@@ -496,7 +269,7 @@ static void print_vertex(const struct setup_context *setup,
 {
    int i;
    debug_printf("   Vertex: (%p)\n", v);
-   for (i = 0; i < setup->quad.nr_attrs; i++) {
+   for (i = 0; i < setup->quad[0].nr_attrs; i++) {
       debug_printf("     %d: %f %f %f %f\n",  i,
               v[i][0], v[i][1], v[i][2], v[i][3]);
       if (util_is_inf_or_nan(v[i][0])) {
@@ -601,7 +374,9 @@ static boolean setup_sort_vertices( struct setup_context *setup,
     *  - the GLSL gl_FrontFacing fragment attribute (bool)
     *  - two-sided stencil test
     */
-   setup->quad.input.facing = (det > 0.0) ^ (setup->softpipe->rasterizer->front_winding == PIPE_WINDING_CW);
+   setup->facing = 
+      ((det > 0.0) ^ 
+       (setup->softpipe->rasterizer->front_winding == PIPE_WINDING_CW));
 
    return TRUE;
 }
@@ -788,7 +563,7 @@ static void setup_tri_coefficients( struct setup_context *setup )
       }
 
       if (spfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FACE) {
-         setup->coef[fragSlot].a0[0] = 1.0f - setup->quad.input.facing;
+         setup->coef[fragSlot].a0[0] = 1.0f - setup->facing;
          setup->coef[fragSlot].dadx[0] = 0.0;
          setup->coef[fragSlot].dady[0] = 0.0;
       }
@@ -844,11 +619,10 @@ static void subtriangle( struct setup_context *setup,
 
    /* clip top/bottom */
    start_y = sy;
-   finish_y = sy + lines;
-
    if (start_y < miny)
       start_y = miny;
 
+   finish_y = sy + lines;
    if (finish_y > maxy)
       finish_y = maxy;
 
@@ -885,7 +659,6 @@ static void subtriangle( struct setup_context *setup,
 
          setup->span.left[_y&1] = left;
          setup->span.right[_y&1] = right;
-         setup->span.y_flags |= 1<<(_y&1);
       }
    }
 
@@ -958,10 +731,9 @@ void setup_tri( struct setup_context *setup,
    setup_tri_coefficients( setup );
    setup_tri_edges( setup );
 
-   setup->quad.input.prim = QUAD_PRIM_TRI;
+   assert(setup->softpipe->reduced_prim == PIPE_PRIM_TRIANGLES);
 
    setup->span.y = 0;
-   setup->span.y_flags = 0;
    setup->span.right[0] = 0;
    setup->span.right[1] = 0;
    /*   setup->span.z_mode = tri_z_mode( setup->ctx ); */
@@ -983,8 +755,6 @@ void setup_tri( struct setup_context *setup,
 
    flush_spans( setup );
 
-   WAIT_FOR_COMPLETION(setup);
-
 #if DEBUG_FRAGS
    printf("Tri: %u frags emitted, %u written\n",
           setup->numFragsEmitted,
@@ -1101,7 +871,7 @@ setup_line_coefficients(struct setup_context *setup,
       }
 
       if (spfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FACE) {
-         setup->coef[fragSlot].a0[0] = 1.0f - setup->quad.input.facing;
+         setup->coef[fragSlot].a0[0] = 1.0f - setup->facing;
          setup->coef[fragSlot].dadx[0] = 0.0;
          setup->coef[fragSlot].dady[0] = 0.0;
       }
@@ -1122,20 +892,20 @@ plot(struct setup_context *setup, int x, int y)
    const int quadY = y - iy;
    const int mask = (1 << ix) << (2 * iy);
 
-   if (quadX != setup->quad.input.x0 ||
-       quadY != setup->quad.input.y0)
+   if (quadX != setup->quad[0].input.x0 ||
+       quadY != setup->quad[0].input.y0)
    {
       /* flush prev quad, start new quad */
 
-      if (setup->quad.input.x0 != -1)
-         CLIP_EMIT_QUAD(setup);
+      if (setup->quad[0].input.x0 != -1)
+         clip_emit_quad( setup, &setup->quad[0] );
 
-      setup->quad.input.x0 = quadX;
-      setup->quad.input.y0 = quadY;
-      setup->quad.inout.mask = 0x0;
+      setup->quad[0].input.x0 = quadX;
+      setup->quad[0].input.y0 = quadY;
+      setup->quad[0].inout.mask = 0x0;
    }
 
-   setup->quad.inout.mask |= mask;
+   setup->quad[0].inout.mask |= mask;
 }
 
 
@@ -1195,17 +965,18 @@ setup_line(struct setup_context *setup,
 
    assert(dx >= 0);
    assert(dy >= 0);
+   assert(setup->softpipe->reduced_prim == PIPE_PRIM_LINES);
+
+   setup->quad[0].input.x0 = setup->quad[0].input.y0 = -1;
+   setup->quad[0].inout.mask = 0x0;
 
-   setup->quad.input.x0 = setup->quad.input.y0 = -1;
-   setup->quad.inout.mask = 0x0;
-   setup->quad.input.prim = QUAD_PRIM_LINE;
    /* XXX temporary: set coverage to 1.0 so the line appears
     * if AA mode happens to be enabled.
     */
-   setup->quad.input.coverage[0] =
-   setup->quad.input.coverage[1] =
-   setup->quad.input.coverage[2] =
-   setup->quad.input.coverage[3] = 1.0;
+   setup->quad[0].input.coverage[0] =
+   setup->quad[0].input.coverage[1] =
+   setup->quad[0].input.coverage[2] =
+   setup->quad[0].input.coverage[3] = 1.0;
 
    if (dx > dy) {
       /*** X-major line ***/
@@ -1249,11 +1020,9 @@ setup_line(struct setup_context *setup,
    }
 
    /* draw final quad */
-   if (setup->quad.inout.mask) {
-      CLIP_EMIT_QUAD(setup);
+   if (setup->quad[0].inout.mask) {
+      clip_emit_quad( setup, &setup->quad[0] );
    }
-
-   WAIT_FOR_COMPLETION(setup);
 }
 
 
@@ -1300,6 +1069,8 @@ setup_point( struct setup_context *setup,
    if (softpipe->no_rast)
       return;
 
+   assert(setup->softpipe->reduced_prim == PIPE_PRIM_POINTS);
+
    /* For points, all interpolants are constant-valued.
     * However, for point sprites, we'll need to setup texcoords appropriately.
     * XXX: which coefficients are the texcoords???
@@ -1346,22 +1117,21 @@ setup_point( struct setup_context *setup,
       }
 
       if (spfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FACE) {
-         setup->coef[fragSlot].a0[0] = 1.0f - setup->quad.input.facing;
+         setup->coef[fragSlot].a0[0] = 1.0f - setup->facing;
          setup->coef[fragSlot].dadx[0] = 0.0;
          setup->coef[fragSlot].dady[0] = 0.0;
       }
    }
 
-   setup->quad.input.prim = QUAD_PRIM_POINT;
 
    if (halfSize <= 0.5 && !round) {
       /* special case for 1-pixel points */
       const int ix = ((int) x) & 1;
       const int iy = ((int) y) & 1;
-      setup->quad.input.x0 = (int) x - ix;
-      setup->quad.input.y0 = (int) y - iy;
-      setup->quad.inout.mask = (1 << ix) << (2 * iy);
-      CLIP_EMIT_QUAD(setup);
+      setup->quad[0].input.x0 = (int) x - ix;
+      setup->quad[0].input.y0 = (int) y - iy;
+      setup->quad[0].inout.mask = (1 << ix) << (2 * iy);
+      clip_emit_quad( setup, &setup->quad[0] );
    }
    else {
       if (round) {
@@ -1381,15 +1151,15 @@ setup_point( struct setup_context *setup,
             for (ix = ixmin; ix <= ixmax; ix += 2) {
                float dx, dy, dist2, cover;
 
-               setup->quad.inout.mask = 0x0;
+               setup->quad[0].inout.mask = 0x0;
 
                dx = (ix + 0.5f) - x;
                dy = (iy + 0.5f) - y;
                dist2 = dx * dx + dy * dy;
                if (dist2 <= rmax2) {
                   cover = 1.0F - (dist2 - rmin2) * cscale;
-                  setup->quad.input.coverage[QUAD_TOP_LEFT] = MIN2(cover, 1.0f);
-                  setup->quad.inout.mask |= MASK_TOP_LEFT;
+                  setup->quad[0].input.coverage[QUAD_TOP_LEFT] = MIN2(cover, 1.0f);
+                  setup->quad[0].inout.mask |= MASK_TOP_LEFT;
                }
 
                dx = (ix + 1.5f) - x;
@@ -1397,8 +1167,8 @@ setup_point( struct setup_context *setup,
                dist2 = dx * dx + dy * dy;
                if (dist2 <= rmax2) {
                   cover = 1.0F - (dist2 - rmin2) * cscale;
-                  setup->quad.input.coverage[QUAD_TOP_RIGHT] = MIN2(cover, 1.0f);
-                  setup->quad.inout.mask |= MASK_TOP_RIGHT;
+                  setup->quad[0].input.coverage[QUAD_TOP_RIGHT] = MIN2(cover, 1.0f);
+                  setup->quad[0].inout.mask |= MASK_TOP_RIGHT;
                }
 
                dx = (ix + 0.5f) - x;
@@ -1406,8 +1176,8 @@ setup_point( struct setup_context *setup,
                dist2 = dx * dx + dy * dy;
                if (dist2 <= rmax2) {
                   cover = 1.0F - (dist2 - rmin2) * cscale;
-                  setup->quad.input.coverage[QUAD_BOTTOM_LEFT] = MIN2(cover, 1.0f);
-                  setup->quad.inout.mask |= MASK_BOTTOM_LEFT;
+                  setup->quad[0].input.coverage[QUAD_BOTTOM_LEFT] = MIN2(cover, 1.0f);
+                  setup->quad[0].inout.mask |= MASK_BOTTOM_LEFT;
                }
 
                dx = (ix + 1.5f) - x;
@@ -1415,14 +1185,14 @@ setup_point( struct setup_context *setup,
                dist2 = dx * dx + dy * dy;
                if (dist2 <= rmax2) {
                   cover = 1.0F - (dist2 - rmin2) * cscale;
-                  setup->quad.input.coverage[QUAD_BOTTOM_RIGHT] = MIN2(cover, 1.0f);
-                  setup->quad.inout.mask |= MASK_BOTTOM_RIGHT;
+                  setup->quad[0].input.coverage[QUAD_BOTTOM_RIGHT] = MIN2(cover, 1.0f);
+                  setup->quad[0].inout.mask |= MASK_BOTTOM_RIGHT;
                }
 
-               if (setup->quad.inout.mask) {
-                  setup->quad.input.x0 = ix;
-                  setup->quad.input.y0 = iy;
-                  CLIP_EMIT_QUAD(setup);
+               if (setup->quad[0].inout.mask) {
+                  setup->quad[0].input.x0 = ix;
+                  setup->quad[0].input.y0 = iy;
+                  clip_emit_quad( setup, &setup->quad[0] );
                }
             }
          }
@@ -1466,33 +1236,25 @@ setup_point( struct setup_context *setup,
                   mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT);
                }
 
-               setup->quad.inout.mask = mask;
-               setup->quad.input.x0 = ix;
-               setup->quad.input.y0 = iy;
-               CLIP_EMIT_QUAD(setup);
+               setup->quad[0].inout.mask = mask;
+               setup->quad[0].input.x0 = ix;
+               setup->quad[0].input.y0 = iy;
+               clip_emit_quad( setup, &setup->quad[0] );
             }
          }
       }
    }
-
-   WAIT_FOR_COMPLETION(setup);
 }
 
 void setup_prepare( struct setup_context *setup )
 {
    struct softpipe_context *sp = setup->softpipe;
-   unsigned i;
 
    if (sp->dirty) {
       softpipe_update_derived(sp);
    }
 
-   /* Note: nr_attrs is only used for debugging (vertex printing) */
-   setup->quad.nr_attrs = draw_num_vs_outputs(sp->draw);
-
-   for (i = 0; i < SP_NUM_QUAD_THREADS; i++) {
-      sp->quad[i].first->begin( sp->quad[i].first );
-   }
+   sp->quad.first->begin( sp->quad.first );
 
    if (sp->reduced_api_prim == PIPE_PRIM_TRIANGLES &&
        sp->rasterizer->fill_cw == PIPE_POLYGON_MODE_FILL &&
@@ -1520,30 +1282,17 @@ void setup_destroy_context( struct setup_context *setup )
 struct setup_context *setup_create_context( struct softpipe_context *softpipe )
 {
    struct setup_context *setup = CALLOC_STRUCT(setup_context);
-#if SP_NUM_QUAD_THREADS > 1
-   uint i;
-#endif
+   unsigned i;
 
    setup->softpipe = softpipe;
 
-   setup->quad.coef = setup->coef;
-   setup->quad.posCoef = &setup->posCoef;
-
-#if SP_NUM_QUAD_THREADS > 1
-   setup->que.first = 0;
-   setup->que.last = 0;
-   pipe_mutex_init( setup->que.que_mutex );
-   pipe_condvar_init( setup->que.que_notfull_condvar );
-   pipe_condvar_init( setup->que.que_notempty_condvar );
-   setup->que.jobs_added = 0;
-   setup->que.jobs_done = 0;
-   pipe_condvar_init( setup->que.que_done_condvar );
-   for (i = 0; i < SP_NUM_QUAD_THREADS; i++) {
-      setup->threads[i].setup = setup;
-      setup->threads[i].id = i;
-      setup->threads[i].handle = pipe_thread_create( quad_thread, &setup->threads[i] );
+   for (i = 0; i < MAX_QUADS; i++) {
+      setup->quad[i].coef = setup->coef;
+      setup->quad[i].posCoef = &setup->posCoef;
    }
-#endif
+
+   setup->span.left[0] = 1000000;     /* greater than right[0] */
+   setup->span.left[1] = 1000000;     /* greater than right[1] */
 
    return setup;
 }
diff --git a/src/gallium/drivers/softpipe/sp_state.h b/src/gallium/drivers/softpipe/sp_state.h
index 9776e978e3..77ee3c1136 100644
--- a/src/gallium/drivers/softpipe/sp_state.h
+++ b/src/gallium/drivers/softpipe/sp_state.h
@@ -87,6 +87,7 @@ struct sp_fragment_shader {
 struct sp_vertex_shader {
    struct pipe_shader_state shader;
    struct draw_vertex_shader *draw_data;
+   int max_sampler;             /* -1 if no samplers */
 };
 
 
diff --git a/src/gallium/drivers/softpipe/sp_state_blend.c b/src/gallium/drivers/softpipe/sp_state_blend.c
index 384fe559af..efed082f82 100644
--- a/src/gallium/drivers/softpipe/sp_state_blend.c
+++ b/src/gallium/drivers/softpipe/sp_state_blend.c
@@ -45,7 +45,7 @@ void softpipe_bind_blend_state( struct pipe_context *pipe,
 {
    struct softpipe_context *softpipe = softpipe_context(pipe);
 
-   softpipe->blend = (const struct pipe_blend_state *)blend;
+   softpipe->blend = (struct pipe_blend_state *)blend;
 
    softpipe->dirty |= SP_NEW_BLEND;
 }
@@ -86,7 +86,7 @@ softpipe_bind_depth_stencil_state(struct pipe_context *pipe,
 {
    struct softpipe_context *softpipe = softpipe_context(pipe);
 
-   softpipe->depth_stencil = (const struct pipe_depth_stencil_alpha_state *)depth_stencil;
+   softpipe->depth_stencil = (struct pipe_depth_stencil_alpha_state *)depth_stencil;
 
    softpipe->dirty |= SP_NEW_DEPTH_STENCIL_ALPHA;
 }
diff --git a/src/gallium/drivers/softpipe/sp_state_derived.c b/src/gallium/drivers/softpipe/sp_state_derived.c
index 75551000c9..1faeca1c2a 100644
--- a/src/gallium/drivers/softpipe/sp_state_derived.c
+++ b/src/gallium/drivers/softpipe/sp_state_derived.c
@@ -32,7 +32,10 @@
 #include "draw/draw_vertex.h"
 #include "draw/draw_private.h"
 #include "sp_context.h"
+#include "sp_screen.h"
 #include "sp_state.h"
+#include "sp_texture.h"
+#include "sp_tex_tile_cache.h"
 
 
 /**
@@ -65,24 +68,19 @@ softpipe_get_vertex_info(struct softpipe_context *softpipe)
       const struct sp_fragment_shader *spfs = softpipe->fs;
       const enum interp_mode colorInterp
          = softpipe->rasterizer->flatshade ? INTERP_CONSTANT : INTERP_LINEAR;
+      struct vertex_info *vinfo_vbuf = &softpipe->vertex_info_vbuf;
+      const uint num = draw_num_vs_outputs(softpipe->draw);
       uint i;
 
-      if (softpipe->vbuf) {
-         /* if using the post-transform vertex buffer, tell draw_vbuf to
-          * simply emit the whole post-xform vertex as-is:
-          */
-         struct vertex_info *vinfo_vbuf = &softpipe->vertex_info_vbuf;
-         const uint num = draw_num_vs_outputs(softpipe->draw);
-         uint i;
-
-         /* No longer any need to try and emit draw vertex_header info.
-          */
-         vinfo_vbuf->num_attribs = 0;
-         for (i = 0; i < num; i++) {
-            draw_emit_vertex_attr(vinfo_vbuf, EMIT_4F, INTERP_PERSPECTIVE, i);
-         }
-         draw_compute_vertex_size(vinfo_vbuf);
+      /* Tell draw_vbuf to simply emit the whole post-xform vertex
+       * as-is.  No longer any need to try and emit draw vertex_header
+       * info.
+       */
+      vinfo_vbuf->num_attribs = 0;
+      for (i = 0; i < num; i++) {
+	 draw_emit_vertex_attr(vinfo_vbuf, EMIT_4F, INTERP_PERSPECTIVE, i);
       }
+      draw_compute_vertex_size(vinfo_vbuf);
 
       /*
        * Loop over fragment shader inputs, searching for the matching output
@@ -91,6 +89,23 @@ softpipe_get_vertex_info(struct softpipe_context *softpipe)
       vinfo->num_attribs = 0;
       for (i = 0; i < spfs->info.num_inputs; i++) {
          int src;
+         enum interp_mode interp;
+
+         switch (spfs->info.input_interpolate[i]) {
+         case TGSI_INTERPOLATE_CONSTANT:
+            interp = INTERP_CONSTANT;
+            break;
+         case TGSI_INTERPOLATE_LINEAR:
+            interp = INTERP_LINEAR;
+            break;
+         case TGSI_INTERPOLATE_PERSPECTIVE:
+            interp = INTERP_PERSPECTIVE;
+            break;
+         default:
+            assert(0);
+            interp = INTERP_LINEAR;
+         }
+
          switch (spfs->info.input_semantic_name[i]) {
          case TGSI_SEMANTIC_POSITION:
             src = draw_find_vs_output(softpipe->draw,
@@ -106,7 +121,7 @@ softpipe_get_vertex_info(struct softpipe_context *softpipe)
 
          case TGSI_SEMANTIC_FOG:
             src = draw_find_vs_output(softpipe->draw, TGSI_SEMANTIC_FOG, 0);
-            draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src);
+            draw_emit_vertex_attr(vinfo, EMIT_4F, interp, src);
             break;
 
          case TGSI_SEMANTIC_GENERIC:
@@ -114,7 +129,7 @@ softpipe_get_vertex_info(struct softpipe_context *softpipe)
             /* this includes texcoords and varying vars */
             src = draw_find_vs_output(softpipe->draw, TGSI_SEMANTIC_GENERIC,
                                       spfs->info.input_semantic_index[i]);
-            draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src);
+            draw_emit_vertex_attr(vinfo, EMIT_4F, interp, src);
             break;
 
          default:
@@ -164,11 +179,19 @@ softpipe_get_vbuf_vertex_info(struct softpipe_context *softpipe)
 static void
 compute_cliprect(struct softpipe_context *sp)
 {
+   /* SP_NEW_FRAMEBUFFER
+    */
    uint surfWidth = sp->framebuffer.width;
    uint surfHeight = sp->framebuffer.height;
 
+   /* SP_NEW_RASTERIZER
+    */
    if (sp->rasterizer->scissor) {
-      /* clip to scissor rect */
+
+      /* SP_NEW_SCISSOR
+       *
+       * clip to scissor rect:
+       */
       sp->cliprect.minx = MAX2(sp->scissor.minx, 0);
       sp->cliprect.miny = MAX2(sp->scissor.miny, 0);
       sp->cliprect.maxx = MIN2(sp->scissor.maxx, surfWidth);
@@ -184,27 +207,63 @@ compute_cliprect(struct softpipe_context *sp)
 }
 
 
+static void
+update_tgsi_samplers( struct softpipe_context *softpipe )
+{
+   unsigned i;
+
+   softpipe_reset_sampler_varients( softpipe );
+
+   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
+      struct softpipe_tex_tile_cache *tc = softpipe->tex_cache[i];
+      if (tc->texture) {
+         struct softpipe_texture *spt = softpipe_texture(tc->texture);
+         if (spt->timestamp != tc->timestamp) {
+	    sp_tex_tile_cache_validate_texture( tc );
+            /*
+            _debug_printf("INV %d %d\n", tc->timestamp, spt->timestamp);
+            */
+            tc->timestamp = spt->timestamp;
+         }
+      }
+   }
+}
+
+
 /* Hopefully this will remain quite simple, otherwise need to pull in
  * something like the state tracker mechanism.
  */
 void softpipe_update_derived( struct softpipe_context *softpipe )
 {
+   struct softpipe_screen *sp_screen = softpipe_screen(softpipe->pipe.screen);
+
+   /* Check for updated textures.
+    */
+   if (softpipe->tex_timestamp != sp_screen->timestamp) {
+      softpipe->tex_timestamp = sp_screen->timestamp;
+      softpipe->dirty |= SP_NEW_TEXTURE;
+   }
+      
+   if (softpipe->dirty & (SP_NEW_SAMPLER |
+                          SP_NEW_TEXTURE |
+                          SP_NEW_FS | 
+                          SP_NEW_VS))
+      update_tgsi_samplers( softpipe );
+
    if (softpipe->dirty & (SP_NEW_RASTERIZER |
                           SP_NEW_FS |
                           SP_NEW_VS))
       invalidate_vertex_layout( softpipe );
 
    if (softpipe->dirty & (SP_NEW_SCISSOR |
-                          SP_NEW_DEPTH_STENCIL_ALPHA |
+                          SP_NEW_RASTERIZER |
                           SP_NEW_FRAMEBUFFER))
       compute_cliprect(softpipe);
 
    if (softpipe->dirty & (SP_NEW_BLEND |
                           SP_NEW_DEPTH_STENCIL_ALPHA |
                           SP_NEW_FRAMEBUFFER |
-                          SP_NEW_RASTERIZER |
-                          SP_NEW_FS | 
-			  SP_NEW_QUERY))
+                          SP_NEW_FS))
       sp_build_quad_pipeline(softpipe);
 
    softpipe->dirty = 0;
diff --git a/src/gallium/drivers/softpipe/sp_state_fs.c b/src/gallium/drivers/softpipe/sp_state_fs.c
index 4330c20393..256faa94b8 100644
--- a/src/gallium/drivers/softpipe/sp_state_fs.c
+++ b/src/gallium/drivers/softpipe/sp_state_fs.c
@@ -31,9 +31,8 @@
 
 #include "pipe/p_defines.h"
 #include "util/u_memory.h"
-#include "pipe/internal/p_winsys_screen.h"
-#include "pipe/p_shader_tokens.h"
 #include "draw/draw_context.h"
+#include "draw/draw_vs.h"
 #include "tgsi/tgsi_dump.h"
 #include "tgsi/tgsi_scan.h"
 #include "tgsi/tgsi_parse.h"
@@ -51,12 +50,9 @@ softpipe_create_fs_state(struct pipe_context *pipe,
       tgsi_dump(templ->tokens, 0);
 
    /* codegen */
-   state = softpipe_create_fs_llvm( softpipe, templ );
+   state = softpipe_create_fs_sse( softpipe, templ );
    if (!state) {
-      state = softpipe_create_fs_sse( softpipe, templ );
-      if (!state) {
-         state = softpipe_create_fs_exec( softpipe, templ );
-      }
+      state = softpipe_create_fs_exec( softpipe, templ );
    }
 
    assert(state);
@@ -111,6 +107,8 @@ softpipe_create_vs_state(struct pipe_context *pipe,
    if (state->draw_data == NULL) 
       goto fail;
 
+   state->max_sampler = state->draw_data->info.file_max[TGSI_FILE_SAMPLER];
+
    return state;
 
 fail:
@@ -128,7 +126,7 @@ softpipe_bind_vs_state(struct pipe_context *pipe, void *vs)
 {
    struct softpipe_context *softpipe = softpipe_context(pipe);
 
-   softpipe->vs = (const struct sp_vertex_shader *)vs;
+   softpipe->vs = (struct sp_vertex_shader *) vs;
 
    draw_bind_vertex_shader(softpipe->draw,
                            (softpipe->vs ? softpipe->vs->draw_data : NULL));
@@ -142,8 +140,7 @@ softpipe_delete_vs_state(struct pipe_context *pipe, void *vs)
 {
    struct softpipe_context *softpipe = softpipe_context(pipe);
 
-   struct sp_vertex_shader *state =
-      (struct sp_vertex_shader *)vs;
+   struct sp_vertex_shader *state = (struct sp_vertex_shader *) vs;
 
    draw_delete_vertex_shader(softpipe->draw, state->draw_data);
    FREE( state );
diff --git a/src/gallium/drivers/softpipe/sp_state_sampler.c b/src/gallium/drivers/softpipe/sp_state_sampler.c
index cb517b02e4..db0b8ab76b 100644
--- a/src/gallium/drivers/softpipe/sp_state_sampler.c
+++ b/src/gallium/drivers/softpipe/sp_state_sampler.c
@@ -32,21 +32,37 @@
 #include "util/u_memory.h"
 
 #include "draw/draw_context.h"
+#include "draw/draw_context.h"
 
 #include "sp_context.h"
-#include "sp_context.h"
 #include "sp_state.h"
 #include "sp_texture.h"
-#include "sp_tile_cache.h"
-#include "draw/draw_context.h"
+#include "sp_tex_sample.h"
+#include "sp_tex_tile_cache.h"
 
 
+struct sp_sampler {
+   struct pipe_sampler_state base;
+   struct sp_sampler_varient *varients;
+   struct sp_sampler_varient *current;
+};
+
+static struct sp_sampler *sp_sampler( struct pipe_sampler_state *sampler )
+{
+   return (struct sp_sampler *)sampler;
+}
+
 
 void *
 softpipe_create_sampler_state(struct pipe_context *pipe,
                               const struct pipe_sampler_state *sampler)
 {
-   return mem_dup(sampler, sizeof(*sampler));
+   struct sp_sampler *sp_sampler = CALLOC_STRUCT(sp_sampler);
+
+   sp_sampler->base = *sampler;
+   sp_sampler->varients = NULL;
+
+   return (void *)sp_sampler;
 }
 
 
@@ -97,7 +113,7 @@ softpipe_set_sampler_textures(struct pipe_context *pipe,
       struct pipe_texture *tex = i < num ? texture[i] : NULL;
 
       pipe_texture_reference(&softpipe->texture[i], tex);
-      sp_tile_cache_set_texture(pipe, softpipe->tex_cache[i], tex);
+      sp_tex_tile_cache_set_texture(softpipe->tex_cache[i], tex);
    }
 
    softpipe->num_textures = num;
@@ -106,10 +122,111 @@ softpipe_set_sampler_textures(struct pipe_context *pipe,
 }
 
 
+/**
+ * Find/create an sp_sampler_varient object for sampling the given texture,
+ * sampler and tex unit.
+ *
+ * Note that the tex unit is significant.  We can't re-use a sampler
+ * varient for multiple texture units because the sampler varient contains
+ * the texture object pointer.  If the texture object pointer were stored
+ * somewhere outside the sampler varient, we could re-use samplers for
+ * multiple texture units.
+ */
+static struct sp_sampler_varient *
+get_sampler_varient( unsigned unit,
+                     struct sp_sampler *sampler,
+                     struct pipe_texture *texture,
+                     unsigned processor )
+{
+   struct softpipe_texture *sp_texture = softpipe_texture(texture);
+   struct sp_sampler_varient *v = NULL;
+   union sp_sampler_key key;
+
+   /* if this fails, widen the key.unit field and update this assertion */
+   assert(PIPE_MAX_SAMPLERS <= 16);
+
+   key.bits.target = sp_texture->base.target;
+   key.bits.is_pot = sp_texture->pot;
+   key.bits.processor = processor;
+   key.bits.unit = unit;
+   key.bits.pad = 0;
+
+   if (sampler->current && 
+       key.value == sampler->current->key.value) {
+      v = sampler->current;
+   }
+
+   if (v == NULL) {
+      for (v = sampler->varients; v; v = v->next)
+         if (v->key.value == key.value)
+            break;
+
+      if (v == NULL) {
+         v = sp_create_sampler_varient( &sampler->base, key );
+         v->next = sampler->varients;
+         sampler->varients = v;
+      }
+   }
+   
+   sampler->current = v;
+   return v;
+}
+
+
+
+
+void
+softpipe_reset_sampler_varients(struct softpipe_context *softpipe)
+{
+   int i;
+
+   /* It's a bit hard to build these samplers ahead of time -- don't
+    * really know which samplers are going to be used for vertex and
+    * fragment programs.
+    */
+   for (i = 0; i <= softpipe->vs->max_sampler; i++) {
+      if (softpipe->sampler[i]) {
+         softpipe->tgsi.vert_samplers_list[i] = 
+            get_sampler_varient( i,
+                                 sp_sampler(softpipe->sampler[i]),
+                                 softpipe->texture[i],
+                                 TGSI_PROCESSOR_VERTEX );
+
+         sp_sampler_varient_bind_texture( softpipe->tgsi.vert_samplers_list[i], 
+                                          softpipe->tex_cache[i],
+                                          softpipe->texture[i] );
+      }
+   }
+
+   for (i = 0; i <= softpipe->fs->info.file_max[TGSI_FILE_SAMPLER]; i++) {
+      if (softpipe->sampler[i]) {
+         softpipe->tgsi.frag_samplers_list[i] =
+            get_sampler_varient( i,
+                                 sp_sampler(softpipe->sampler[i]),
+                                 softpipe->texture[i],
+                                 TGSI_PROCESSOR_FRAGMENT );
+
+         sp_sampler_varient_bind_texture( softpipe->tgsi.frag_samplers_list[i], 
+                                          softpipe->tex_cache[i],
+                                          softpipe->texture[i] );
+      }
+   }
+}
+
+
+
 void
 softpipe_delete_sampler_state(struct pipe_context *pipe,
                               void *sampler)
 {
+   struct sp_sampler *sp_sampler = (struct sp_sampler *)sampler;
+   struct sp_sampler_varient *v, *tmp;
+
+   for (v = sp_sampler->varients; v; v = tmp) {
+      tmp = v->next;
+      sp_sampler_varient_destroy(v);
+   }
+
    FREE( sampler );
 }
 
diff --git a/src/gallium/drivers/softpipe/sp_state_surface.c b/src/gallium/drivers/softpipe/sp_state_surface.c
index 7c06d864a7..c8f55c3cec 100644
--- a/src/gallium/drivers/softpipe/sp_state_surface.c
+++ b/src/gallium/drivers/softpipe/sp_state_surface.c
@@ -53,7 +53,7 @@ softpipe_set_framebuffer_state(struct pipe_context *pipe,
       /* check if changing cbuf */
       if (sp->framebuffer.cbufs[i] != fb->cbufs[i]) {
          /* flush old */
-         sp_flush_tile_cache(sp, sp->cbuf_cache[i]);
+         sp_flush_tile_cache(sp->cbuf_cache[i]);
 
          /* assign new */
          sp->framebuffer.cbufs[i] = fb->cbufs[i];
@@ -68,58 +68,28 @@ softpipe_set_framebuffer_state(struct pipe_context *pipe,
    /* zbuf changing? */
    if (sp->framebuffer.zsbuf != fb->zsbuf) {
       /* flush old */
-      sp_flush_tile_cache(sp, sp->zsbuf_cache);
+      sp_flush_tile_cache(sp->zsbuf_cache);
 
       /* assign new */
       sp->framebuffer.zsbuf = fb->zsbuf;
 
       /* update cache */
       sp_tile_cache_set_surface(sp->zsbuf_cache, fb->zsbuf);
-   }
-
-#if 0
-   /* XXX combined depth/stencil here */
-
-   /* sbuf changing? */
-   if (sp->framebuffer.sbuf != fb->sbuf) {
-      /* flush old */
-      sp_flush_tile_cache(sp, sp->sbuf_cache_sep);
-
-      /* assign new */
-      sp->framebuffer.sbuf = fb->sbuf;
-
-      /* update cache */
-      if (fb->sbuf != fb->zbuf) {
-         /* separate stencil buf */
-         sp->sbuf_cache = sp->sbuf_cache_sep;
-         sp_tile_cache_set_surface(sp->sbuf_cache, fb->sbuf);
-      }
-      else {
-         /* combined depth/stencil */
-         sp->sbuf_cache = sp->zbuf_cache;
-         sp_tile_cache_set_surface(sp->sbuf_cache, fb->sbuf);
-      }
-   }
-#endif
 
-   /* Tell draw module how deep the Z/depth buffer is */
-   {
-      int depth_bits;
-      double mrd;
+      /* Tell draw module how deep the Z/depth buffer is */
       if (sp->framebuffer.zsbuf) {
+         int depth_bits;
+         double mrd;
          depth_bits = pf_get_component_bits(sp->framebuffer.zsbuf->format,
                                             PIPE_FORMAT_COMP_Z);
+         if (depth_bits > 16) {
+            mrd = 0.0000001;
+         }
+         else {
+            mrd = 0.00002;
+         }
+         draw_set_mrd(sp->draw, mrd);
       }
-      else {
-         depth_bits = 0;
-      }
-      if (depth_bits > 16) {
-         mrd = 0.0000001;
-      }
-      else {
-         mrd = 0.00002;
-      }
-      draw_set_mrd(sp->draw, mrd);
    }
 
    sp->framebuffer.width = fb->width;
diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c b/src/gallium/drivers/softpipe/sp_tex_sample.c
index f99a30277d..c22ee86b66 100644
--- a/src/gallium/drivers/softpipe/sp_tex_sample.c
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.c
@@ -31,29 +31,33 @@
  *
  * Authors:
  *   Brian Paul
+ *   Keith Whitwell
  */
 
-#include "sp_context.h"
-#include "sp_quad.h"
-#include "sp_surface.h"
-#include "sp_texture.h"
-#include "sp_tex_sample.h"
-#include "sp_tile_cache.h"
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
+#include "pipe/p_shader_tokens.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
+#include "sp_quad.h"   /* only for #define QUAD_* tokens */
+#include "sp_tex_sample.h"
+#include "sp_tex_tile_cache.h"
 
 
 
 /*
- * Note, the FRAC macro has to work perfectly.  Otherwise you'll sometimes
- * see 1-pixel bands of improperly weighted linear-filtered textures.
+ * Return fractional part of 'f'.  Used for computing interpolation weights.
+ * Need to be careful with negative values.
+ * Note, if this function isn't perfect you'll sometimes see 1-pixel bands
+ * of improperly weighted linear-filtered textures.
  * The tests/texwrap.c demo is a good test.
- * Also note, FRAC(x) doesn't truly return the fractional part of x for x < 0.
- * Instead, if x < 0 then FRAC(x) = 1 - true_frac(x).
  */
-#define FRAC(f)  ((f) - util_ifloor(f))
+static INLINE float
+frac(float f)
+{
+   return f - util_ifloor(f);
+}
+
 
 
 /**
@@ -100,10 +104,16 @@ lerp_3d(float a, float b, float c,
 
 
 /**
- * If A is a signed integer, A % B doesn't give the right value for A < 0
- * (in terms of texture repeat).  Just casting to unsigned fixes that.
+ * Compute coord % size for repeat wrap modes.
+ * Note that if coord is a signed integer, coord % size doesn't give
+ * the right value for coord < 0 (in terms of texture repeat).  Just
+ * casting to unsigned fixes that.
  */
-#define REMAINDER(A, B) ((unsigned) (A) % (unsigned) (B))
+static INLINE int
+repeat(int coord, unsigned size)
+{
+   return (int) ((unsigned) coord % size);
+}
 
 
 /**
@@ -115,133 +125,153 @@ lerp_3d(float a, float b, float c,
  * \param icoord  returns the integer texcoords
  * \return  integer texture index
  */
-static INLINE void
-nearest_texcoord_4(unsigned wrapMode, const float s[4], unsigned size,
-                   int icoord[4])
+static void
+wrap_nearest_repeat(const float s[4], unsigned size, int icoord[4])
 {
    uint ch;
-   switch (wrapMode) {
-   case PIPE_TEX_WRAP_REPEAT:
-      /* s limited to [0,1) */
-      /* i limited to [0,size-1] */
-      for (ch = 0; ch < 4; ch++) {
-         int i = util_ifloor(s[ch] * size);
-         icoord[ch] = REMAINDER(i, size);
-      }
-      return;
-   case PIPE_TEX_WRAP_CLAMP:
+   /* s limited to [0,1) */
+   /* i limited to [0,size-1] */
+   for (ch = 0; ch < 4; ch++) {
+      int i = util_ifloor(s[ch] * size);
+      icoord[ch] = repeat(i, size);
+   }
+}
+
+
+static void
+wrap_nearest_clamp(const float s[4], unsigned size, int icoord[4])
+{
+   uint ch;
+   /* s limited to [0,1] */
+   /* i limited to [0,size-1] */
+   for (ch = 0; ch < 4; ch++) {
+      if (s[ch] <= 0.0F)
+         icoord[ch] = 0;
+      else if (s[ch] >= 1.0F)
+         icoord[ch] = size - 1;
+      else
+         icoord[ch] = util_ifloor(s[ch] * size);
+   }
+}
+
+
+static void
+wrap_nearest_clamp_to_edge(const float s[4], unsigned size, int icoord[4])
+{
+   uint ch;
+   /* s limited to [min,max] */
+   /* i limited to [0, size-1] */
+   const float min = 1.0F / (2.0F * size);
+   const float max = 1.0F - min;
+   for (ch = 0; ch < 4; ch++) {
+      if (s[ch] < min)
+         icoord[ch] = 0;
+      else if (s[ch] > max)
+         icoord[ch] = size - 1;
+      else
+         icoord[ch] = util_ifloor(s[ch] * size);
+   }
+}
+
+
+static void
+wrap_nearest_clamp_to_border(const float s[4], unsigned size, int icoord[4])
+{
+   uint ch;
+   /* s limited to [min,max] */
+   /* i limited to [-1, size] */
+   const float min = -1.0F / (2.0F * size);
+   const float max = 1.0F - min;
+   for (ch = 0; ch < 4; ch++) {
+      if (s[ch] <= min)
+         icoord[ch] = -1;
+      else if (s[ch] >= max)
+         icoord[ch] = size;
+      else
+         icoord[ch] = util_ifloor(s[ch] * size);
+   }
+}
+
+
+static void
+wrap_nearest_mirror_repeat(const float s[4], unsigned size, int icoord[4])
+{
+   uint ch;
+   const float min = 1.0F / (2.0F * size);
+   const float max = 1.0F - min;
+   for (ch = 0; ch < 4; ch++) {
+      const int flr = util_ifloor(s[ch]);
+      float u;
+      if (flr & 1)
+         u = 1.0F - (s[ch] - (float) flr);
+      else
+         u = s[ch] - (float) flr;
+      if (u < min)
+         icoord[ch] = 0;
+      else if (u > max)
+         icoord[ch] = size - 1;
+      else
+         icoord[ch] = util_ifloor(u * size);
+   }
+}
+
+
+static void
+wrap_nearest_mirror_clamp(const float s[4], unsigned size, int icoord[4])
+{
+   uint ch;
+   for (ch = 0; ch < 4; ch++) {
       /* s limited to [0,1] */
       /* i limited to [0,size-1] */
-      for (ch = 0; ch < 4; ch++) {
-         if (s[ch] <= 0.0F)
-            icoord[ch] = 0;
-         else if (s[ch] >= 1.0F)
-            icoord[ch] = size - 1;
-         else
-            icoord[ch] = util_ifloor(s[ch] * size);
-      }
-      return;
-   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-      {
-         /* s limited to [min,max] */
-         /* i limited to [0, size-1] */
-         const float min = 1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            if (s[ch] < min)
-               icoord[ch] = 0;
-            else if (s[ch] > max)
-               icoord[ch] = size - 1;
-            else
-               icoord[ch] = util_ifloor(s[ch] * size);
-         }
-      }
-      return;
-   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
-      {
-         /* s limited to [min,max] */
-         /* i limited to [-1, size] */
-         const float min = -1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            if (s[ch] <= min)
-               icoord[ch] = -1;
-            else if (s[ch] >= max)
-               icoord[ch] = size;
-            else
-               icoord[ch] = util_ifloor(s[ch] * size);
-         }
-      }
-      return;
-   case PIPE_TEX_WRAP_MIRROR_REPEAT:
-      {
-         const float min = 1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            const int flr = util_ifloor(s[ch]);
-            float u;
-            if (flr & 1)
-               u = 1.0F - (s[ch] - (float) flr);
-            else
-               u = s[ch] - (float) flr;
-            if (u < min)
-               icoord[ch] = 0;
-            else if (u > max)
-               icoord[ch] = size - 1;
-            else
-               icoord[ch] = util_ifloor(u * size);
-         }
-      }
-      return;
-   case PIPE_TEX_WRAP_MIRROR_CLAMP:
-      for (ch = 0; ch < 4; ch++) {
-         /* s limited to [0,1] */
-         /* i limited to [0,size-1] */
-         const float u = fabsf(s[ch]);
-         if (u <= 0.0F)
-            icoord[ch] = 0;
-         else if (u >= 1.0F)
-            icoord[ch] = size - 1;
-         else
-            icoord[ch] = util_ifloor(u * size);
-      }
-      return;
-   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
-      {
-         /* s limited to [min,max] */
-         /* i limited to [0, size-1] */
-         const float min = 1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            const float u = fabsf(s[ch]);
-            if (u < min)
-               icoord[ch] = 0;
-            else if (u > max)
-               icoord[ch] = size - 1;
-            else
-               icoord[ch] = util_ifloor(u * size);
-         }
-      }
-      return;
-   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
-      {
-         /* s limited to [min,max] */
-         /* i limited to [0, size-1] */
-         const float min = -1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            const float u = fabsf(s[ch]);
-            if (u < min)
-               icoord[ch] = -1;
-            else if (u > max)
-               icoord[ch] = size;
-            else
-               icoord[ch] = util_ifloor(u * size);
-         }
-      }
-      return;
-   default:
-      assert(0);
+      const float u = fabsf(s[ch]);
+      if (u <= 0.0F)
+         icoord[ch] = 0;
+      else if (u >= 1.0F)
+         icoord[ch] = size - 1;
+      else
+         icoord[ch] = util_ifloor(u * size);
+   }
+}
+
+
+static void
+wrap_nearest_mirror_clamp_to_edge(const float s[4], unsigned size,
+                                  int icoord[4])
+{
+   uint ch;
+   /* s limited to [min,max] */
+   /* i limited to [0, size-1] */
+   const float min = 1.0F / (2.0F * size);
+   const float max = 1.0F - min;
+   for (ch = 0; ch < 4; ch++) {
+      const float u = fabsf(s[ch]);
+      if (u < min)
+         icoord[ch] = 0;
+      else if (u > max)
+         icoord[ch] = size - 1;
+      else
+         icoord[ch] = util_ifloor(u * size);
+   }
+}
+
+
+static void
+wrap_nearest_mirror_clamp_to_border(const float s[4], unsigned size,
+                                    int icoord[4])
+{
+   uint ch;
+   /* s limited to [min,max] */
+   /* i limited to [0, size-1] */
+   const float min = -1.0F / (2.0F * size);
+   const float max = 1.0F - min;
+   for (ch = 0; ch < 4; ch++) {
+      const float u = fabsf(s[ch]);
+      if (u < min)
+         icoord[ch] = -1;
+      else if (u > max)
+         icoord[ch] = size;
+      else
+         icoord[ch] = util_ifloor(u * size);
    }
 }
 
@@ -256,125 +286,156 @@ nearest_texcoord_4(unsigned wrapMode, const float s[4], unsigned size,
  * \param w  returns blend factor/weight between texture indexes
  * \param icoord  returns the computed integer texture coords
  */
-static INLINE void
-linear_texcoord_4(unsigned wrapMode, const float s[4], unsigned size,
+static void
+wrap_linear_repeat(const float s[4], unsigned size,
+                   int icoord0[4], int icoord1[4], float w[4])
+{
+   uint ch;
+   for (ch = 0; ch < 4; ch++) {
+      float u = s[ch] * size - 0.5F;
+      icoord0[ch] = repeat(util_ifloor(u), size);
+      icoord1[ch] = repeat(icoord0[ch] + 1, size);
+      w[ch] = frac(u);
+   }
+}
+
+
+static void
+wrap_linear_clamp(const float s[4], unsigned size,
                   int icoord0[4], int icoord1[4], float w[4])
 {
    uint ch;
+   for (ch = 0; ch < 4; ch++) {
+      float u = CLAMP(s[ch], 0.0F, 1.0F);
+      u = u * size - 0.5f;
+      icoord0[ch] = util_ifloor(u);
+      icoord1[ch] = icoord0[ch] + 1;
+      w[ch] = frac(u);
+   }
+}
 
-   switch (wrapMode) {
-   case PIPE_TEX_WRAP_REPEAT:
-      for (ch = 0; ch < 4; ch++) {
-         float u = s[ch] * size - 0.5F;
-         icoord0[ch] = REMAINDER(util_ifloor(u), size);
-         icoord1[ch] = REMAINDER(icoord0[ch] + 1, size);
-         w[ch] = FRAC(u);
-      }
-      break;;
-   case PIPE_TEX_WRAP_CLAMP:
-      for (ch = 0; ch < 4; ch++) {
-         float u = CLAMP(s[ch], 0.0F, 1.0F);
-         u = u * size - 0.5f;
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         w[ch] = FRAC(u);
-      }
-      break;;
-   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-      for (ch = 0; ch < 4; ch++) {
-         float u = CLAMP(s[ch], 0.0F, 1.0F);
-         u = u * size - 0.5f;
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         if (icoord0[ch] < 0)
-            icoord0[ch] = 0;
-         if (icoord1[ch] >= (int) size)
-            icoord1[ch] = size - 1;
-         w[ch] = FRAC(u);
-      }
-      break;;
-   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
-      {
-         const float min = -1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            float u = CLAMP(s[ch], min, max);
-            u = u * size - 0.5f;
-            icoord0[ch] = util_ifloor(u);
-            icoord1[ch] = icoord0[ch] + 1;
-            w[ch] = FRAC(u);
-         }
-      }
-      break;;
-   case PIPE_TEX_WRAP_MIRROR_REPEAT:
-      for (ch = 0; ch < 4; ch++) {
-         const int flr = util_ifloor(s[ch]);
-         float u;
-         if (flr & 1)
-            u = 1.0F - (s[ch] - (float) flr);
-         else
-            u = s[ch] - (float) flr;
-         u = u * size - 0.5F;
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         if (icoord0[ch] < 0)
-            icoord0[ch] = 0;
-         if (icoord1[ch] >= (int) size)
-            icoord1[ch] = size - 1;
-         w[ch] = FRAC(u);
-      }
-      break;;
-   case PIPE_TEX_WRAP_MIRROR_CLAMP:
-      for (ch = 0; ch < 4; ch++) {
-         float u = fabsf(s[ch]);
-         if (u >= 1.0F)
-            u = (float) size;
-         else
-            u *= size;
-         u -= 0.5F;
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         w[ch] = FRAC(u);
-      }
-      break;;
-   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
-      for (ch = 0; ch < 4; ch++) {
-         float u = fabsf(s[ch]);
-         if (u >= 1.0F)
-            u = (float) size;
-         else
-            u *= size;
-         u -= 0.5F;
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         if (icoord0[ch] < 0)
-            icoord0[ch] = 0;
-         if (icoord1[ch] >= (int) size)
-            icoord1[ch] = size - 1;
-         w[ch] = FRAC(u);
-      }
-      break;;
-   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
-      {
-         const float min = -1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            float u = fabsf(s[ch]);
-            if (u <= min)
-               u = min * size;
-            else if (u >= max)
-               u = max * size;
-            else
-               u *= size;
-            u -= 0.5F;
-            icoord0[ch] = util_ifloor(u);
-            icoord1[ch] = icoord0[ch] + 1;
-            w[ch] = FRAC(u);
-         }
-      }
-      break;;
-   default:
-      assert(0);
+
+static void
+wrap_linear_clamp_to_edge(const float s[4], unsigned size,
+                          int icoord0[4], int icoord1[4], float w[4])
+{
+   uint ch;
+   for (ch = 0; ch < 4; ch++) {
+      float u = CLAMP(s[ch], 0.0F, 1.0F);
+      u = u * size - 0.5f;
+      icoord0[ch] = util_ifloor(u);
+      icoord1[ch] = icoord0[ch] + 1;
+      if (icoord0[ch] < 0)
+         icoord0[ch] = 0;
+      if (icoord1[ch] >= (int) size)
+         icoord1[ch] = size - 1;
+      w[ch] = frac(u);
+   }
+}
+
+
+static void
+wrap_linear_clamp_to_border(const float s[4], unsigned size,
+                            int icoord0[4], int icoord1[4], float w[4])
+{
+   const float min = -1.0F / (2.0F * size);
+   const float max = 1.0F - min;
+   uint ch;
+   for (ch = 0; ch < 4; ch++) {
+      float u = CLAMP(s[ch], min, max);
+      u = u * size - 0.5f;
+      icoord0[ch] = util_ifloor(u);
+      icoord1[ch] = icoord0[ch] + 1;
+      w[ch] = frac(u);
+   }
+}
+
+
+static void
+wrap_linear_mirror_repeat(const float s[4], unsigned size,
+                          int icoord0[4], int icoord1[4], float w[4])
+{
+   uint ch;
+   for (ch = 0; ch < 4; ch++) {
+      const int flr = util_ifloor(s[ch]);
+      float u;
+      if (flr & 1)
+         u = 1.0F - (s[ch] - (float) flr);
+      else
+         u = s[ch] - (float) flr;
+      u = u * size - 0.5F;
+      icoord0[ch] = util_ifloor(u);
+      icoord1[ch] = icoord0[ch] + 1;
+      if (icoord0[ch] < 0)
+         icoord0[ch] = 0;
+      if (icoord1[ch] >= (int) size)
+         icoord1[ch] = size - 1;
+      w[ch] = frac(u);
+   }
+}
+
+
+static void
+wrap_linear_mirror_clamp(const float s[4], unsigned size,
+                         int icoord0[4], int icoord1[4], float w[4])
+{
+   uint ch;
+   for (ch = 0; ch < 4; ch++) {
+      float u = fabsf(s[ch]);
+      if (u >= 1.0F)
+         u = (float) size;
+      else
+         u *= size;
+      u -= 0.5F;
+      icoord0[ch] = util_ifloor(u);
+      icoord1[ch] = icoord0[ch] + 1;
+      w[ch] = frac(u);
+   }
+}
+
+
+static void
+wrap_linear_mirror_clamp_to_edge(const float s[4], unsigned size,
+                                 int icoord0[4], int icoord1[4], float w[4])
+{
+   uint ch;
+   for (ch = 0; ch < 4; ch++) {
+      float u = fabsf(s[ch]);
+      if (u >= 1.0F)
+         u = (float) size;
+      else
+         u *= size;
+      u -= 0.5F;
+      icoord0[ch] = util_ifloor(u);
+      icoord1[ch] = icoord0[ch] + 1;
+      if (icoord0[ch] < 0)
+         icoord0[ch] = 0;
+      if (icoord1[ch] >= (int) size)
+         icoord1[ch] = size - 1;
+      w[ch] = frac(u);
+   }
+}
+
+
+static void
+wrap_linear_mirror_clamp_to_border(const float s[4], unsigned size,
+                                   int icoord0[4], int icoord1[4], float w[4])
+{
+   const float min = -1.0F / (2.0F * size);
+   const float max = 1.0F - min;
+   uint ch;
+   for (ch = 0; ch < 4; ch++) {
+      float u = fabsf(s[ch]);
+      if (u <= min)
+         u = min * size;
+      else if (u >= max)
+         u = max * size;
+      else
+         u *= size;
+      u -= 0.5F;
+      icoord0[ch] = util_ifloor(u);
+      icoord1[ch] = icoord0[ch] + 1;
+      w[ch] = frac(u);
    }
 }
 
@@ -383,27 +444,27 @@ linear_texcoord_4(unsigned wrapMode, const float s[4], unsigned size,
  * For RECT textures / unnormalized texcoords
  * Only a subset of wrap modes supported.
  */
-static INLINE void
-nearest_texcoord_unnorm_4(unsigned wrapMode, const float s[4], unsigned size,
-                          int icoord[4])
+static void
+wrap_nearest_unorm_clamp(const float s[4], unsigned size, int icoord[4])
 {
    uint ch;
-   switch (wrapMode) {
-   case PIPE_TEX_WRAP_CLAMP:
-      for (ch = 0; ch < 4; ch++) {
-         int i = util_ifloor(s[ch]);
-         icoord[ch]= CLAMP(i, 0, (int) size-1);
-      }
-      return;
-   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-      /* fall-through */
-   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
-      for (ch = 0; ch < 4; ch++) {
-         icoord[ch]= util_ifloor( CLAMP(s[ch], 0.5F, (float) size - 0.5F) );
-      }
-      return;
-   default:
-      assert(0);
+   for (ch = 0; ch < 4; ch++) {
+      int i = util_ifloor(s[ch]);
+      icoord[ch]= CLAMP(i, 0, (int) size-1);
+   }
+}
+
+
+/**
+ * Handles clamp_to_edge and clamp_to_border:
+ */
+static void
+wrap_nearest_unorm_clamp_to_border(const float s[4], unsigned size,
+                                   int icoord[4])
+{
+   uint ch;
+   for (ch = 0; ch < 4; ch++) {
+      icoord[ch]= util_ifloor( CLAMP(s[ch], 0.5F, (float) size - 0.5F) );
    }
 }
 
@@ -412,358 +473,971 @@ nearest_texcoord_unnorm_4(unsigned wrapMode, const float s[4], unsigned size,
  * For RECT textures / unnormalized texcoords.
  * Only a subset of wrap modes supported.
  */
-static INLINE void
-linear_texcoord_unnorm_4(unsigned wrapMode, const float s[4], unsigned size,
-                         int icoord0[4], int icoord1[4], float w[4])
+static void
+wrap_linear_unorm_clamp(const float s[4], unsigned size,
+                        int icoord0[4], int icoord1[4], float w[4])
 {
    uint ch;
-   switch (wrapMode) {
-   case PIPE_TEX_WRAP_CLAMP:
-      for (ch = 0; ch < 4; ch++) {
-         /* Not exactly what the spec says, but it matches NVIDIA output */
-         float u = CLAMP(s[ch] - 0.5F, 0.0f, (float) size - 1.0f);
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         w[ch] = FRAC(u);
-      }
-      return;
-   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-      /* fall-through */
-   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
-      for (ch = 0; ch < 4; ch++) {
-         float u = CLAMP(s[ch], 0.5F, (float) size - 0.5F);
-         u -= 0.5F;
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         if (icoord1[ch] > (int) size - 1)
-            icoord1[ch] = size - 1;
-         w[ch] = FRAC(u);
-      }
-      break;
-   default:
-      assert(0);
+   for (ch = 0; ch < 4; ch++) {
+      /* Not exactly what the spec says, but it matches NVIDIA output */
+      float u = CLAMP(s[ch] - 0.5F, 0.0f, (float) size - 1.0f);
+      icoord0[ch] = util_ifloor(u);
+      icoord1[ch] = icoord0[ch] + 1;
+      w[ch] = frac(u);
    }
 }
 
 
-static unsigned
-choose_cube_face(float rx, float ry, float rz, float *newS, float *newT)
+static void
+wrap_linear_unorm_clamp_to_border(const float s[4], unsigned size,
+                                  int icoord0[4], int icoord1[4], float w[4])
 {
-   /*
-      major axis
-      direction     target                             sc     tc    ma
-      ----------    -------------------------------    ---    ---   ---
-       +rx          TEXTURE_CUBE_MAP_POSITIVE_X_EXT    -rz    -ry   rx
-       -rx          TEXTURE_CUBE_MAP_NEGATIVE_X_EXT    +rz    -ry   rx
-       +ry          TEXTURE_CUBE_MAP_POSITIVE_Y_EXT    +rx    +rz   ry
-       -ry          TEXTURE_CUBE_MAP_NEGATIVE_Y_EXT    +rx    -rz   ry
-       +rz          TEXTURE_CUBE_MAP_POSITIVE_Z_EXT    +rx    -ry   rz
-       -rz          TEXTURE_CUBE_MAP_NEGATIVE_Z_EXT    -rx    -ry   rz
-   */
-   const float arx = fabsf(rx), ary = fabsf(ry), arz = fabsf(rz);
-   unsigned face;
-   float sc, tc, ma;
-
-   if (arx >= ary && arx >= arz) {
-      if (rx >= 0.0F) {
-         face = PIPE_TEX_FACE_POS_X;
-         sc = -rz;
-         tc = -ry;
-         ma = arx;
+   uint ch;
+   for (ch = 0; ch < 4; ch++) {
+      float u = CLAMP(s[ch], 0.5F, (float) size - 0.5F);
+      u -= 0.5F;
+      icoord0[ch] = util_ifloor(u);
+      icoord1[ch] = icoord0[ch] + 1;
+      if (icoord1[ch] > (int) size - 1)
+         icoord1[ch] = size - 1;
+      w[ch] = frac(u);
+   }
+}
+
+
+
+/**
+ * Examine the quad's texture coordinates to compute the partial
+ * derivatives w.r.t X and Y, then compute lambda (level of detail).
+ */
+static float
+compute_lambda_1d(const struct sp_sampler_varient *samp,
+                  const float s[QUAD_SIZE],
+                  const float t[QUAD_SIZE],
+                  const float p[QUAD_SIZE],
+                  float lodbias)
+{
+   const struct pipe_texture *texture = samp->texture;
+   const struct pipe_sampler_state *sampler = samp->sampler;
+   float dsdx = fabsf(s[QUAD_BOTTOM_RIGHT] - s[QUAD_BOTTOM_LEFT]);
+   float dsdy = fabsf(s[QUAD_TOP_LEFT]     - s[QUAD_BOTTOM_LEFT]);
+   float rho = MAX2(dsdx, dsdy) * texture->width[0];
+   float lambda;
+
+   lambda = util_fast_log2(rho);
+   lambda += lodbias + sampler->lod_bias;
+   lambda = CLAMP(lambda, sampler->min_lod, sampler->max_lod);
+
+   return lambda;
+}
+
+
+static float
+compute_lambda_2d(const struct sp_sampler_varient *samp,
+                  const float s[QUAD_SIZE],
+                  const float t[QUAD_SIZE],
+                  const float p[QUAD_SIZE],
+                  float lodbias)
+{
+   const struct pipe_texture *texture = samp->texture;
+   const struct pipe_sampler_state *sampler = samp->sampler;
+   float dsdx = fabsf(s[QUAD_BOTTOM_RIGHT] - s[QUAD_BOTTOM_LEFT]);
+   float dsdy = fabsf(s[QUAD_TOP_LEFT]     - s[QUAD_BOTTOM_LEFT]);
+   float dtdx = fabsf(t[QUAD_BOTTOM_RIGHT] - t[QUAD_BOTTOM_LEFT]);
+   float dtdy = fabsf(t[QUAD_TOP_LEFT]     - t[QUAD_BOTTOM_LEFT]);
+   float maxx = MAX2(dsdx, dsdy) * texture->width[0];
+   float maxy = MAX2(dtdx, dtdy) * texture->height[0];
+   float rho  = MAX2(maxx, maxy);
+   float lambda;
+
+   lambda = util_fast_log2(rho);
+   lambda += lodbias + sampler->lod_bias;
+   lambda = CLAMP(lambda, sampler->min_lod, sampler->max_lod);
+
+   return lambda;
+}
+
+
+static float
+compute_lambda_3d(const struct sp_sampler_varient *samp,
+                  const float s[QUAD_SIZE],
+                  const float t[QUAD_SIZE],
+                  const float p[QUAD_SIZE],
+                  float lodbias)
+{
+   const struct pipe_texture *texture = samp->texture;
+   const struct pipe_sampler_state *sampler = samp->sampler;
+   float dsdx = fabsf(s[QUAD_BOTTOM_RIGHT] - s[QUAD_BOTTOM_LEFT]);
+   float dsdy = fabsf(s[QUAD_TOP_LEFT]     - s[QUAD_BOTTOM_LEFT]);
+   float dtdx = fabsf(t[QUAD_BOTTOM_RIGHT] - t[QUAD_BOTTOM_LEFT]);
+   float dtdy = fabsf(t[QUAD_TOP_LEFT]     - t[QUAD_BOTTOM_LEFT]);
+   float dpdx = fabsf(p[QUAD_BOTTOM_RIGHT] - p[QUAD_BOTTOM_LEFT]);
+   float dpdy = fabsf(p[QUAD_TOP_LEFT]     - p[QUAD_BOTTOM_LEFT]);
+   float maxx = MAX2(dsdx, dsdy) * texture->width[0];
+   float maxy = MAX2(dtdx, dtdy) * texture->height[0];
+   float maxz = MAX2(dpdx, dpdy) * texture->depth[0];
+   float rho, lambda;
+
+   rho = MAX2(maxx, maxy);
+   rho = MAX2(rho, maxz);
+
+   lambda = util_fast_log2(rho);
+   lambda += lodbias + sampler->lod_bias;
+   lambda = CLAMP(lambda, sampler->min_lod, sampler->max_lod);
+
+   return lambda;
+}
+
+
+/**
+ * Compute lambda for a vertex texture sampler.
+ * Since there aren't derivatives to use, just return the LOD bias.
+ */
+static float
+compute_lambda_vert(const struct sp_sampler_varient *samp,
+                    const float s[QUAD_SIZE],
+                    const float t[QUAD_SIZE],
+                    const float p[QUAD_SIZE],
+                    float lodbias)
+{
+   return lodbias;
+}
+
+
+
+/**
+ * Get a texel from a texture, using the texture tile cache.
+ *
+ * \param addr  the template tex address containing cube, z, face info.
+ * \param x  the x coord of texel within 2D image
+ * \param y  the y coord of texel within 2D image
+ * \param rgba  the quad to put the texel/color into
+ *
+ * XXX maybe move this into sp_tex_tile_cache.c and merge with the
+ * sp_get_cached_tile_tex() function.  Also, get 4 texels instead of 1...
+ */
+
+
+
+
+static INLINE const float *
+get_texel_2d_no_border(const struct sp_sampler_varient *samp,
+		       union tex_tile_address addr, int x, int y)
+{
+   const struct softpipe_tex_cached_tile *tile;
+
+   addr.bits.x = x / TILE_SIZE;
+   addr.bits.y = y / TILE_SIZE;
+   y %= TILE_SIZE;
+   x %= TILE_SIZE;
+
+   tile = sp_get_cached_tile_tex(samp->cache, addr);
+
+   return &tile->data.color[y][x][0];
+}
+
+
+static INLINE const float *
+get_texel_2d(const struct sp_sampler_varient *samp,
+	     union tex_tile_address addr, int x, int y)
+{
+   const struct pipe_texture *texture = samp->texture;
+   unsigned level = addr.bits.level;
+
+   if (x < 0 || x >= (int) texture->width[level] ||
+       y < 0 || y >= (int) texture->height[level]) {
+      return samp->sampler->border_color;
+   }
+   else {
+      return get_texel_2d_no_border( samp, addr, x, y );
+   }
+}
+
+
+/* Gather a quad of adjacent texels within a tile:
+ */
+static INLINE void
+get_texel_quad_2d_no_border_single_tile(const struct sp_sampler_varient *samp,
+					union tex_tile_address addr, 
+					unsigned x, unsigned y, 
+					const float *out[4])
+{
+   const struct softpipe_tex_cached_tile *tile;
+
+   addr.bits.x = x / TILE_SIZE;
+   addr.bits.y = y / TILE_SIZE;
+   y %= TILE_SIZE;
+   x %= TILE_SIZE;
+
+   tile = sp_get_cached_tile_tex(samp->cache, addr);
+      
+   out[0] = &tile->data.color[y  ][x  ][0];
+   out[1] = &tile->data.color[y  ][x+1][0];
+   out[2] = &tile->data.color[y+1][x  ][0];
+   out[3] = &tile->data.color[y+1][x+1][0];
+}
+
+
+/* Gather a quad of potentially non-adjacent texels:
+ */
+static INLINE void
+get_texel_quad_2d_no_border(const struct sp_sampler_varient *samp,
+			    union tex_tile_address addr,
+			    int x0, int y0, 
+			    int x1, int y1,
+			    const float *out[4])
+{
+   out[0] = get_texel_2d_no_border( samp, addr, x0, y0 );
+   out[1] = get_texel_2d_no_border( samp, addr, x1, y0 );
+   out[2] = get_texel_2d_no_border( samp, addr, x0, y1 );
+   out[3] = get_texel_2d_no_border( samp, addr, x1, y1 );
+}
+
+/* Can involve a lot of unnecessary checks for border color:
+ */
+static INLINE void
+get_texel_quad_2d(const struct sp_sampler_varient *samp,
+		  union tex_tile_address addr,
+		  int x0, int y0, 
+		  int x1, int y1,
+		  const float *out[4])
+{
+   out[0] = get_texel_2d( samp, addr, x0, y0 );
+   out[1] = get_texel_2d( samp, addr, x1, y0 );
+   out[3] = get_texel_2d( samp, addr, x1, y1 );
+   out[2] = get_texel_2d( samp, addr, x0, y1 );
+}
+
+
+
+/* 3d varients:
+ */
+static INLINE const float *
+get_texel_3d_no_border(const struct sp_sampler_varient *samp,
+                       union tex_tile_address addr, int x, int y, int z)
+{
+   const struct softpipe_tex_cached_tile *tile;
+
+   addr.bits.x = x / TILE_SIZE;
+   addr.bits.y = y / TILE_SIZE;
+   addr.bits.z = z;
+   y %= TILE_SIZE;
+   x %= TILE_SIZE;
+
+   tile = sp_get_cached_tile_tex(samp->cache, addr);
+
+   return &tile->data.color[y][x][0];
+}
+
+
+static INLINE const float *
+get_texel_3d(const struct sp_sampler_varient *samp,
+	     union tex_tile_address addr, int x, int y, int z)
+{
+   const struct pipe_texture *texture = samp->texture;
+   unsigned level = addr.bits.level;
+
+   if (x < 0 || x >= (int) texture->width[level] ||
+       y < 0 || y >= (int) texture->height[level] ||
+       z < 0 || z >= (int) texture->depth[level]) {
+      return samp->sampler->border_color;
+   }
+   else {
+      return get_texel_3d_no_border( samp, addr, x, y, z );
+   }
+}
+
+
+/**
+ * Given the logbase2 of a mipmap's base level size and a mipmap level,
+ * return the size (in texels) of that mipmap level.
+ * For example, if level[0].width = 256 then base_pot will be 8.
+ * If level = 2, then we'll return 64 (the width at level=2).
+ * Return 1 if level > base_pot.
+ */
+static INLINE unsigned
+pot_level_size(unsigned base_pot, unsigned level)
+{
+   return (base_pot >= level) ? (1 << (base_pot - level)) : 1;
+}
+
+
+/* Some image-filter fastpaths:
+ */
+static INLINE void
+img_filter_2d_linear_repeat_POT(struct tgsi_sampler *tgsi_sampler,
+                                const float s[QUAD_SIZE],
+                                const float t[QUAD_SIZE],
+                                const float p[QUAD_SIZE],
+                                float lodbias,
+                                float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   unsigned  j;
+   unsigned level = samp->level;
+   unsigned xpot = pot_level_size(samp->xpot, level);
+   unsigned ypot = pot_level_size(samp->ypot, level);
+   unsigned xmax = (xpot - 1) & (TILE_SIZE - 1); /* MIN2(TILE_SIZE, xpot) - 1; */
+   unsigned ymax = (ypot - 1) & (TILE_SIZE - 1); /* MIN2(TILE_SIZE, ypot) - 1; */
+   union tex_tile_address addr;
+
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      int c;
+
+      float u = s[j] * xpot - 0.5F;
+      float v = t[j] * ypot - 0.5F;
+
+      int uflr = util_ifloor(u);
+      int vflr = util_ifloor(v);
+
+      float xw = u - (float)uflr;
+      float yw = v - (float)vflr;
+
+      int x0 = uflr & (xpot - 1);
+      int y0 = vflr & (ypot - 1);
+
+      const float *tx[4];      
+      
+      /* Can we fetch all four at once:
+       */
+      if (x0 < xmax && y0 < ymax) {
+         get_texel_quad_2d_no_border_single_tile(samp, addr, x0, y0, tx);
       }
       else {
-         face = PIPE_TEX_FACE_NEG_X;
-         sc = rz;
-         tc = -ry;
-         ma = arx;
+         unsigned x1 = (x0 + 1) & (xpot - 1);
+         unsigned y1 = (y0 + 1) & (ypot - 1);
+         get_texel_quad_2d_no_border(samp, addr, x0, y0, x1, y1, tx);
+      }
+
+      /* interpolate R, G, B, A */
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = lerp_2d(xw, yw, 
+                              tx[0][c], tx[1][c], 
+                              tx[2][c], tx[3][c]);
       }
    }
-   else if (ary >= arx && ary >= arz) {
-      if (ry >= 0.0F) {
-         face = PIPE_TEX_FACE_POS_Y;
-         sc = rx;
-         tc = rz;
-         ma = ary;
+}
+
+
+static INLINE void
+img_filter_2d_nearest_repeat_POT(struct tgsi_sampler *tgsi_sampler,
+                                 const float s[QUAD_SIZE],
+                                 const float t[QUAD_SIZE],
+                                 const float p[QUAD_SIZE],
+                                 float lodbias,
+                                 float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   unsigned  j;
+   unsigned level = samp->level;
+   unsigned xpot = pot_level_size(samp->xpot, level);
+   unsigned ypot = pot_level_size(samp->ypot, level);
+   union tex_tile_address addr;
+
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      int c;
+
+      float u = s[j] * xpot;
+      float v = t[j] * ypot;
+
+      int uflr = util_ifloor(u);
+      int vflr = util_ifloor(v);
+
+      int x0 = uflr & (xpot - 1);
+      int y0 = vflr & (ypot - 1);
+
+      const float *out = get_texel_2d_no_border(samp, addr, x0, y0);
+
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = out[c];
       }
-      else {
-         face = PIPE_TEX_FACE_NEG_Y;
-         sc = rx;
-         tc = -rz;
-         ma = ary;
+   }
+}
+
+
+static INLINE void
+img_filter_2d_nearest_clamp_POT(struct tgsi_sampler *tgsi_sampler,
+                                const float s[QUAD_SIZE],
+                                const float t[QUAD_SIZE],
+                                const float p[QUAD_SIZE],
+                                float lodbias,
+                                float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   unsigned  j;
+   unsigned level = samp->level;
+   unsigned xpot = pot_level_size(samp->xpot, level);
+   unsigned ypot = pot_level_size(samp->ypot, level);
+   union tex_tile_address addr;
+
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      int c;
+
+      float u = s[j] * xpot;
+      float v = t[j] * ypot;
+
+      int x0, y0;
+      const float *out;
+
+      x0 = util_ifloor(u);
+      if (x0 < 0) 
+         x0 = 0;
+      else if (x0 > xpot - 1)
+         x0 = xpot - 1;
+
+      y0 = util_ifloor(v);
+      if (y0 < 0) 
+         y0 = 0;
+      else if (y0 > ypot - 1)
+         y0 = ypot - 1;
+      
+      out = get_texel_2d_no_border(samp, addr, x0, y0);
+
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = out[c];
       }
    }
-   else {
-      if (rz > 0.0F) {
-         face = PIPE_TEX_FACE_POS_Z;
-         sc = rx;
-         tc = -ry;
-         ma = arz;
+}
+
+
+static void
+img_filter_1d_nearest(struct tgsi_sampler *tgsi_sampler,
+                        const float s[QUAD_SIZE],
+                        const float t[QUAD_SIZE],
+                        const float p[QUAD_SIZE],
+                        float lodbias,
+                        float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   unsigned level0, j;
+   int width;
+   int x[4];
+   union tex_tile_address addr;
+
+   level0 = samp->level;
+   width = texture->width[level0];
+
+   assert(width > 0);
+
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   samp->nearest_texcoord_s(s, width, x);
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      const float *out = get_texel_2d(samp, addr, x[j], 0);
+      int c;
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = out[c];
       }
-      else {
-         face = PIPE_TEX_FACE_NEG_Z;
-         sc = -rx;
-         tc = -ry;
-         ma = arz;
+   }
+}
+
+
+static void
+img_filter_2d_nearest(struct tgsi_sampler *tgsi_sampler,
+                      const float s[QUAD_SIZE],
+                      const float t[QUAD_SIZE],
+                      const float p[QUAD_SIZE],
+                      float lodbias,
+                      float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   unsigned level0, j;
+   int width, height;
+   int x[4], y[4];
+   union tex_tile_address addr;
+
+
+   level0 = samp->level;
+   width = texture->width[level0];
+   height = texture->height[level0];
+
+   assert(width > 0);
+   assert(height > 0);
+ 
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   samp->nearest_texcoord_s(s, width, x);
+   samp->nearest_texcoord_t(t, height, y);
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      const float *out = get_texel_2d(samp, addr, x[j], y[j]);
+      int c;
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = out[c];
       }
    }
+}
 
-   *newS = ( sc / ma + 1.0F ) * 0.5F;
-   *newT = ( tc / ma + 1.0F ) * 0.5F;
 
-   return face;
+static INLINE union tex_tile_address
+face(union tex_tile_address addr, unsigned face )
+{
+   addr.bits.face = face;
+   return addr;
 }
 
 
-/**
- * Examine the quad's texture coordinates to compute the partial
- * derivatives w.r.t X and Y, then compute lambda (level of detail).
- *
- * This is only done for fragment shaders, not vertex shaders.
- */
-static float
-compute_lambda(const struct pipe_texture *tex,
-               const struct pipe_sampler_state *sampler,
-               const float s[QUAD_SIZE],
-               const float t[QUAD_SIZE],
-               const float p[QUAD_SIZE],
-               float lodbias)
+static void
+img_filter_cube_nearest(struct tgsi_sampler *tgsi_sampler,
+                        const float s[QUAD_SIZE],
+                        const float t[QUAD_SIZE],
+                        const float p[QUAD_SIZE],
+                        float lodbias,
+                        float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
-   float rho, lambda;
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   const unsigned *faces = samp->faces; /* zero when not cube-mapping */
+   unsigned level0, j;
+   int width, height;
+   int x[4], y[4];
+   union tex_tile_address addr;
 
-   assert(sampler->normalized_coords);
+   level0 = samp->level;
+   width = texture->width[level0];
+   height = texture->height[level0];
 
-   assert(s);
-   {
-      float dsdx = s[QUAD_BOTTOM_RIGHT] - s[QUAD_BOTTOM_LEFT];
-      float dsdy = s[QUAD_TOP_LEFT]     - s[QUAD_BOTTOM_LEFT];
-      dsdx = fabsf(dsdx);
-      dsdy = fabsf(dsdy);
-      rho = MAX2(dsdx, dsdy) * tex->width[0];
-   }
-   if (t) {
-      float dtdx = t[QUAD_BOTTOM_RIGHT] - t[QUAD_BOTTOM_LEFT];
-      float dtdy = t[QUAD_TOP_LEFT]     - t[QUAD_BOTTOM_LEFT];
-      float max;
-      dtdx = fabsf(dtdx);
-      dtdy = fabsf(dtdy);
-      max = MAX2(dtdx, dtdy) * tex->height[0];
-      rho = MAX2(rho, max);
+   assert(width > 0);
+   assert(height > 0);
+ 
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   samp->nearest_texcoord_s(s, width, x);
+   samp->nearest_texcoord_t(t, height, y);
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      const float *out = get_texel_2d(samp, face(addr, faces[j]), x[j], y[j]);
+      int c;
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = out[c];
+      }
    }
-   if (p) {
-      float dpdx = p[QUAD_BOTTOM_RIGHT] - p[QUAD_BOTTOM_LEFT];
-      float dpdy = p[QUAD_TOP_LEFT]     - p[QUAD_BOTTOM_LEFT];
-      float max;
-      dpdx = fabsf(dpdx);
-      dpdy = fabsf(dpdy);
-      max = MAX2(dpdx, dpdy) * tex->depth[0];
-      rho = MAX2(rho, max);
+}
+
+
+static void
+img_filter_3d_nearest(struct tgsi_sampler *tgsi_sampler,
+                      const float s[QUAD_SIZE],
+                      const float t[QUAD_SIZE],
+                      const float p[QUAD_SIZE],
+                      float lodbias,
+                      float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   unsigned level0, j;
+   int width, height, depth;
+   int x[4], y[4], z[4];
+   union tex_tile_address addr;
+
+   level0 = samp->level;
+   width = texture->width[level0];
+   height = texture->height[level0];
+   depth = texture->depth[level0];
+
+   assert(width > 0);
+   assert(height > 0);
+   assert(depth > 0);
+
+   samp->nearest_texcoord_s(s, width,  x);
+   samp->nearest_texcoord_t(t, height, y);
+   samp->nearest_texcoord_p(p, depth,  z);
+
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      const float *out = get_texel_3d(samp, addr, x[j], y[j], z[j]);
+      int c;
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = out[c];
+      }      
    }
+}
 
-   lambda = util_fast_log2(rho);
-   lambda += lodbias + sampler->lod_bias;
-   lambda = CLAMP(lambda, sampler->min_lod, sampler->max_lod);
 
-   return lambda;
+static void
+img_filter_1d_linear(struct tgsi_sampler *tgsi_sampler,
+                     const float s[QUAD_SIZE],
+                     const float t[QUAD_SIZE],
+                     const float p[QUAD_SIZE],
+                     float lodbias,
+                     float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   unsigned level0, j;
+   int width;
+   int x0[4], x1[4];
+   float xw[4]; /* weights */
+   union tex_tile_address addr;
+
+   level0 = samp->level;
+   width = texture->width[level0];
+
+   assert(width > 0);
+
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   samp->linear_texcoord_s(s, width, x0, x1, xw);
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      const float *tx0 = get_texel_2d(samp, addr, x0[j], 0);
+      const float *tx1 = get_texel_2d(samp, addr, x1[j], 0);
+      int c;
+
+      /* interpolate R, G, B, A */
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = lerp(xw[j], tx0[c], tx1[c]);
+      }
+   }
 }
 
 
-/**
- * Do several things here:
- * 1. Compute lambda from the texcoords, if needed
- * 2. Determine if we're minifying or magnifying
- * 3. If minifying, choose mipmap levels
- * 4. Return image filter to use within mipmap images
- * \param level0  Returns first mipmap level to sample from
- * \param level1  Returns second mipmap level to sample from
- * \param levelBlend  Returns blend factor between levels, in [0,1]
- * \param imgFilter  Returns either the min or mag filter, depending on lambda
- */
 static void
-choose_mipmap_levels(const struct pipe_texture *texture,
-                     const struct pipe_sampler_state *sampler,
+img_filter_2d_linear(struct tgsi_sampler *tgsi_sampler,
                      const float s[QUAD_SIZE],
                      const float t[QUAD_SIZE],
                      const float p[QUAD_SIZE],
-                     boolean computeLambda,
                      float lodbias,
-                     unsigned *level0, unsigned *level1, float *levelBlend,
-                     unsigned *imgFilter)
-{
-   if (sampler->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) {
-      /* no mipmap selection needed */
-      *level0 = *level1 = CLAMP((int) sampler->min_lod,
-                                0, (int) texture->last_level);
-
-      if (sampler->min_img_filter != sampler->mag_img_filter) {
-         /* non-mipmapped texture, but still need to determine if doing
-          * minification or magnification.
-          */
-         float lambda = compute_lambda(texture, sampler, s, t, p, lodbias);
-         if (lambda <= 0.0) {
-            *imgFilter = sampler->mag_img_filter;
-         }
-         else {
-            *imgFilter = sampler->min_img_filter;
-         }
+                     float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   unsigned level0, j;
+   int width, height;
+   int x0[4], y0[4], x1[4], y1[4];
+   float xw[4], yw[4]; /* weights */
+   union tex_tile_address addr;
+
+   level0 = samp->level;
+   width = texture->width[level0];
+   height = texture->height[level0];
+
+   assert(width > 0);
+   assert(height > 0);
+
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   samp->linear_texcoord_s(s, width,  x0, x1, xw);
+   samp->linear_texcoord_t(t, height, y0, y1, yw);
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      const float *tx0 = get_texel_2d(samp, addr, x0[j], y0[j]);
+      const float *tx1 = get_texel_2d(samp, addr, x1[j], y0[j]);
+      const float *tx2 = get_texel_2d(samp, addr, x0[j], y1[j]);
+      const float *tx3 = get_texel_2d(samp, addr, x1[j], y1[j]);
+      int c;
+
+      /* interpolate R, G, B, A */
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = lerp_2d(xw[j], yw[j],
+                              tx0[c], tx1[c],
+                              tx2[c], tx3[c]);
       }
-      else {
-         *imgFilter = sampler->mag_img_filter;
+   }
+}
+
+
+static void
+img_filter_cube_linear(struct tgsi_sampler *tgsi_sampler,
+                       const float s[QUAD_SIZE],
+                       const float t[QUAD_SIZE],
+                       const float p[QUAD_SIZE],
+                       float lodbias,
+                       float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   const unsigned *faces = samp->faces; /* zero when not cube-mapping */
+   unsigned level0, j;
+   int width, height;
+   int x0[4], y0[4], x1[4], y1[4];
+   float xw[4], yw[4]; /* weights */
+   union tex_tile_address addr;
+
+   level0 = samp->level;
+   width = texture->width[level0];
+   height = texture->height[level0];
+
+   assert(width > 0);
+   assert(height > 0);
+
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   samp->linear_texcoord_s(s, width,  x0, x1, xw);
+   samp->linear_texcoord_t(t, height, y0, y1, yw);
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      union tex_tile_address addrj = face(addr, faces[j]);
+      const float *tx0 = get_texel_2d(samp, addrj, x0[j], y0[j]);
+      const float *tx1 = get_texel_2d(samp, addrj, x1[j], y0[j]);
+      const float *tx2 = get_texel_2d(samp, addrj, x0[j], y1[j]);
+      const float *tx3 = get_texel_2d(samp, addrj, x1[j], y1[j]);
+      int c;
+
+      /* interpolate R, G, B, A */
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = lerp_2d(xw[j], yw[j],
+                              tx0[c], tx1[c],
+                              tx2[c], tx3[c]);
       }
    }
-   else {
-      float lambda;
+}
 
-      if (computeLambda)
-         /* fragment shader */
-         lambda = compute_lambda(texture, sampler, s, t, p, lodbias);
-      else
-         /* vertex shader */
-         lambda = lodbias; /* not really a bias, but absolute LOD */
 
-      if (lambda <= 0.0) { /* XXX threshold depends on the filter */
-         /* magnifying */
-         *imgFilter = sampler->mag_img_filter;
-         *level0 = *level1 = 0;
+static void
+img_filter_3d_linear(struct tgsi_sampler *tgsi_sampler,
+                     const float s[QUAD_SIZE],
+                     const float t[QUAD_SIZE],
+                     const float p[QUAD_SIZE],
+                     float lodbias,
+                     float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   unsigned level0, j;
+   int width, height, depth;
+   int x0[4], x1[4], y0[4], y1[4], z0[4], z1[4];
+   float xw[4], yw[4], zw[4]; /* interpolation weights */
+   union tex_tile_address addr;
+
+   level0 = samp->level;
+   width = texture->width[level0];
+   height = texture->height[level0];
+   depth = texture->depth[level0];
+
+   addr.value = 0;
+   addr.bits.level = level0;
+
+   assert(width > 0);
+   assert(height > 0);
+   assert(depth > 0);
+
+   samp->linear_texcoord_s(s, width,  x0, x1, xw);
+   samp->linear_texcoord_t(t, height, y0, y1, yw);
+   samp->linear_texcoord_p(p, depth,  z0, z1, zw);
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      int c;
+
+      const float *tx00 = get_texel_3d(samp, addr, x0[j], y0[j], z0[j]);
+      const float *tx01 = get_texel_3d(samp, addr, x1[j], y0[j], z0[j]);
+      const float *tx02 = get_texel_3d(samp, addr, x0[j], y1[j], z0[j]);
+      const float *tx03 = get_texel_3d(samp, addr, x1[j], y1[j], z0[j]);
+      
+      const float *tx10 = get_texel_3d(samp, addr, x0[j], y0[j], z1[j]);
+      const float *tx11 = get_texel_3d(samp, addr, x1[j], y0[j], z1[j]);
+      const float *tx12 = get_texel_3d(samp, addr, x0[j], y1[j], z1[j]);
+      const float *tx13 = get_texel_3d(samp, addr, x1[j], y1[j], z1[j]);
+      
+      /* interpolate R, G, B, A */
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = lerp_3d(xw[j], yw[j], zw[j],
+                              tx00[c], tx01[c],
+                              tx02[c], tx03[c],
+                              tx10[c], tx11[c],
+                              tx12[c], tx13[c]);
       }
-      else {
-         /* minifying */
-         *imgFilter = sampler->min_img_filter;
-
-         /* choose mipmap level(s) and compute the blend factor between them */
-         if (sampler->min_mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
-            /* Nearest mipmap level */
-            const int lvl = (int) (lambda + 0.5);
-            *level0 =
-            *level1 = CLAMP(lvl, 0, (int) texture->last_level);
-         }
-         else {
-            /* Linear interpolation between mipmap levels */
-            const int lvl = (int) lambda;
-            *level0 = CLAMP(lvl,     0, (int) texture->last_level);
-            *level1 = CLAMP(lvl + 1, 0, (int) texture->last_level);
-            *levelBlend = FRAC(lambda);  /* blending weight between levels */
+   }
+}
+
+
+static void
+mip_filter_linear(struct tgsi_sampler *tgsi_sampler,
+                  const float s[QUAD_SIZE],
+                  const float t[QUAD_SIZE],
+                  const float p[QUAD_SIZE],
+                  float lodbias,
+                  float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   int level0;
+   float lambda;
+
+   lambda = samp->compute_lambda(samp, s, t, p, lodbias);
+   level0 = (int)lambda;
+
+   if (lambda < 0.0) { 
+      samp->level = 0;
+      samp->mag_img_filter( tgsi_sampler, s, t, p, 0, rgba );
+   }
+   else if (level0 >= texture->last_level) {
+      samp->level = texture->last_level;
+      samp->min_img_filter( tgsi_sampler, s, t, p, 0, rgba );
+   }
+   else {
+      float levelBlend = lambda - level0;
+      float rgba0[4][4];
+      float rgba1[4][4];
+      int c,j;
+
+      samp->level = level0;
+      samp->min_img_filter( tgsi_sampler, s, t, p, 0, rgba0 );
+
+      samp->level = level0+1;
+      samp->min_img_filter( tgsi_sampler, s, t, p, 0, rgba1 );
+
+      for (j = 0; j < QUAD_SIZE; j++) {
+         for (c = 0; c < 4; c++) {
+            rgba[c][j] = lerp(levelBlend, rgba0[c][j], rgba1[c][j]);
          }
       }
    }
 }
 
 
-/**
- * Get a texel from a texture, using the texture tile cache.
- *
- * \param face  the cube face in 0..5
- * \param level  the mipmap level
- * \param x  the x coord of texel within 2D image
- * \param y  the y coord of texel within 2D image
- * \param z  which slice of a 3D texture
- * \param rgba  the quad to put the texel/color into
- * \param j  which element of the rgba quad to write to
- *
- * XXX maybe move this into sp_tile_cache.c and merge with the
- * sp_get_cached_tile_tex() function.  Also, get 4 texels instead of 1...
- */
 static void
-get_texel(const struct tgsi_sampler *tgsi_sampler,
-          unsigned face, unsigned level, int x, int y, int z,
-          float rgba[NUM_CHANNELS][QUAD_SIZE], unsigned j)
+mip_filter_nearest(struct tgsi_sampler *tgsi_sampler,
+                   const float s[QUAD_SIZE],
+                   const float t[QUAD_SIZE],
+                   const float p[QUAD_SIZE],
+                   float lodbias,
+                   float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
-   const struct sp_shader_sampler *samp = sp_shader_sampler(tgsi_sampler);
-   struct softpipe_context *sp = samp->sp;
-   const uint unit = samp->unit;
-   const struct pipe_texture *texture = sp->texture[unit];
-   const struct pipe_sampler_state *sampler = sp->sampler[unit];
+   struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   float lambda;
 
-   if (x < 0 || x >= (int) texture->width[level] ||
-       y < 0 || y >= (int) texture->height[level] ||
-       z < 0 || z >= (int) texture->depth[level]) {
-      rgba[0][j] = sampler->border_color[0];
-      rgba[1][j] = sampler->border_color[1];
-      rgba[2][j] = sampler->border_color[2];
-      rgba[3][j] = sampler->border_color[3];
+   lambda = samp->compute_lambda(samp, s, t, p, lodbias);
+
+   if (lambda < 0.0) { 
+      samp->level = 0;
+      samp->mag_img_filter( tgsi_sampler, s, t, p, 0, rgba );
    }
    else {
-      const int tx = x % TILE_SIZE;
-      const int ty = y % TILE_SIZE;
-      const struct softpipe_cached_tile *tile
-         = sp_get_cached_tile_tex(sp, samp->cache,
-                                  x, y, z, face, level);
-      rgba[0][j] = tile->data.color[ty][tx][0];
-      rgba[1][j] = tile->data.color[ty][tx][1];
-      rgba[2][j] = tile->data.color[ty][tx][2];
-      rgba[3][j] = tile->data.color[ty][tx][3];
-      if (0)
-      {
-         debug_printf("Get texel %f %f %f %f from %s\n",
-                      rgba[0][j], rgba[1][j], rgba[2][j], rgba[3][j],
-                      pf_name(texture->format));
-      }
+      samp->level = (int)(lambda + 0.5) ;
+      samp->level = MIN2(samp->level, (int)texture->last_level);
+      samp->min_img_filter( tgsi_sampler, s, t, p, 0, rgba );
+   }
+
+#if 0
+   printf("RGBA %g %g %g %g, %g %g %g %g, %g %g %g %g, %g %g %g %g\n",
+          rgba[0][0], rgba[1][0], rgba[2][0], rgba[3][0],
+          rgba[0][1], rgba[1][1], rgba[2][1], rgba[3][1],
+          rgba[0][2], rgba[1][2], rgba[2][2], rgba[3][2],
+          rgba[0][3], rgba[1][3], rgba[2][3], rgba[3][3]);
+#endif
+}
+
+
+static void
+mip_filter_none(struct tgsi_sampler *tgsi_sampler,
+                const float s[QUAD_SIZE],
+                const float t[QUAD_SIZE],
+                const float p[QUAD_SIZE],
+                float lodbias,
+                float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   float lambda = samp->compute_lambda(samp, s, t, p, lodbias);
+
+   if (lambda < 0.0) { 
+      samp->mag_img_filter( tgsi_sampler, s, t, p, 0, rgba );
+   }
+   else {
+      samp->min_img_filter( tgsi_sampler, s, t, p, 0, rgba );
    }
 }
 
 
+
 /**
- * Compare texcoord 'p' (aka R) against texture value 'rgba[0]'
- * When we sampled the depth texture, the depth value was put into all
- * RGBA channels.  We look at the red channel here.
- * \param rgba  quad of (depth) texel values
- * \param p  texture 'P' components for four pixels in quad
- * \param j  which pixel in the quad to test [0..3]
+ * Specialized version of mip_filter_linear with hard-wired calls to
+ * 2d lambda calculation and 2d_linear_repeat_POT img filters.
  */
-static INLINE void
-shadow_compare(const struct pipe_sampler_state *sampler,
-               float rgba[NUM_CHANNELS][QUAD_SIZE],
-               const float p[QUAD_SIZE],
-               uint j)
+static void
+mip_filter_linear_2d_linear_repeat_POT(
+   struct tgsi_sampler *tgsi_sampler,
+   const float s[QUAD_SIZE],
+   const float t[QUAD_SIZE],
+   const float p[QUAD_SIZE],
+   float lodbias,
+   float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
-   int k;
-   switch (sampler->compare_func) {
-   case PIPE_FUNC_LESS:
-      k = p[j] < rgba[0][j];
-      break;
-   case PIPE_FUNC_LEQUAL:
-      k = p[j] <= rgba[0][j];
-      break;
-   case PIPE_FUNC_GREATER:
-      k = p[j] > rgba[0][j];
-      break;
-   case PIPE_FUNC_GEQUAL:
-      k = p[j] >= rgba[0][j];
-      break;
-   case PIPE_FUNC_EQUAL:
-      k = p[j] == rgba[0][j];
-      break;
-   case PIPE_FUNC_NOTEQUAL:
-      k = p[j] != rgba[0][j];
-      break;
-   case PIPE_FUNC_ALWAYS:
-      k = 1;
-      break;
-   case PIPE_FUNC_NEVER:
-      k = 0;
-      break;
-   default:
-      k = 0;
-      assert(0);
-      break;
+   struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   int level0;
+   float lambda;
+
+   lambda = compute_lambda_2d(samp, s, t, p, lodbias);
+   level0 = (int)lambda;
+
+   /* Catches both negative and large values of level0:
+    */
+   if ((unsigned)level0 >= texture->last_level) { 
+      if (level0 < 0)
+         samp->level = 0;
+      else
+         samp->level = texture->last_level;
+
+      img_filter_2d_linear_repeat_POT( tgsi_sampler, s, t, p, 0, rgba );
    }
+   else {
+      float levelBlend = lambda - level0;
+      float rgba0[4][4];
+      float rgba1[4][4];
+      int c,j;
 
-   /* XXX returning result for default GL_DEPTH_TEXTURE_MODE = GL_LUMINANCE */
-   rgba[0][j] = rgba[1][j] = rgba[2][j] = (float) k;
-   rgba[3][j] = 1.0F;
+      samp->level = level0;
+      img_filter_2d_linear_repeat_POT( tgsi_sampler, s, t, p, 0, rgba0 );
+
+      samp->level = level0+1;
+      img_filter_2d_linear_repeat_POT( tgsi_sampler, s, t, p, 0, rgba1 );
+
+      for (j = 0; j < QUAD_SIZE; j++) {
+         for (c = 0; c < 4; c++) {
+            rgba[c][j] = lerp(levelBlend, rgba0[c][j], rgba1[c][j]);
+         }
+      }
+   }
 }
 
 
+
 /**
- * As above, but do four z/texture comparisons.
+ * Do shadow/depth comparisons.
  */
-static INLINE void
-shadow_compare4(const struct pipe_sampler_state *sampler,
-                float rgba[NUM_CHANNELS][QUAD_SIZE],
-                const float p[QUAD_SIZE])
+static void
+sample_compare(struct tgsi_sampler *tgsi_sampler,
+               const float s[QUAD_SIZE],
+               const float t[QUAD_SIZE],
+               const float p[QUAD_SIZE],
+               float lodbias,
+               float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
+   struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_sampler_state *sampler = samp->sampler;
    int j, k0, k1, k2, k3;
    float val;
 
+   samp->mip_filter( tgsi_sampler, s, t, p, lodbias, rgba );
+
+   /**
+    * Compare texcoord 'p' (aka R) against texture value 'rgba[0]'
+    * When we sampled the depth texture, the depth value was put into all
+    * RGBA channels.  We look at the red channel here.
+    */
+
    /* compare four texcoords vs. four texture samples */
    switch (sampler->compare_func) {
    case PIPE_FUNC_LESS:
@@ -826,470 +1500,392 @@ shadow_compare4(const struct pipe_sampler_state *sampler,
 
 
 /**
- * Common code for sampling 1D/2D/cube textures.
- * Could probably extend for 3D...
+ * Compute which cube face is referenced by each texcoord and put that
+ * info into the sampler faces[] array.  Then sample the cube faces
  */
 static void
-sp_get_samples_2d_common(const struct tgsi_sampler *tgsi_sampler,
-                         const float s[QUAD_SIZE],
-                         const float t[QUAD_SIZE],
-                         const float p[QUAD_SIZE],
-                         boolean computeLambda,
-                         float lodbias,
-                         float rgba[NUM_CHANNELS][QUAD_SIZE],
-                         const unsigned faces[4])
-{
-   const struct sp_shader_sampler *samp = sp_shader_sampler(tgsi_sampler);
-   const struct softpipe_context *sp = samp->sp;
-   const uint unit = samp->unit;
-   const struct pipe_texture *texture = sp->texture[unit];
-   const struct pipe_sampler_state *sampler = sp->sampler[unit];
-   unsigned level0, level1, j, imgFilter;
-   int width, height;
-   float levelBlend;
-
-   choose_mipmap_levels(texture, sampler, s, t, p, computeLambda, lodbias,
-                        &level0, &level1, &levelBlend, &imgFilter);
-
-   assert(sampler->normalized_coords);
-
-   width = texture->width[level0];
-   height = texture->height[level0];
-
-   assert(width > 0);
-
-   switch (imgFilter) {
-   case PIPE_TEX_FILTER_NEAREST:
-      {
-         int x[4], y[4];
-         nearest_texcoord_4(sampler->wrap_s, s, width, x);
-         nearest_texcoord_4(sampler->wrap_t, t, height, y);
-
-         for (j = 0; j < QUAD_SIZE; j++) {
-            get_texel(tgsi_sampler, faces[j], level0, x[j], y[j], 0, rgba, j);
-            if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
-               shadow_compare(sampler, rgba, p, j);
-            }
+sample_cube(struct tgsi_sampler *tgsi_sampler,
+            const float s[QUAD_SIZE],
+            const float t[QUAD_SIZE],
+            const float p[QUAD_SIZE],
+            float lodbias,
+            float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   unsigned j;
+   float ssss[4], tttt[4];
 
-            if (level0 != level1) {
-               /* get texels from second mipmap level and blend */
-               float rgba2[4][4];
-               unsigned c;
-               x[j] /= 2;
-               y[j] /= 2;
-               get_texel(tgsi_sampler, faces[j], level1, x[j], y[j], 0,
-                         rgba2, j);
-               if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE){
-                  shadow_compare(sampler, rgba2, p, j);
-               }
-
-               for (c = 0; c < NUM_CHANNELS; c++) {
-                  rgba[c][j] = lerp(levelBlend, rgba[c][j], rgba2[c][j]);
-               }
-            }
+   /*
+     major axis
+     direction     target                             sc     tc    ma
+     ----------    -------------------------------    ---    ---   ---
+     +rx          TEXTURE_CUBE_MAP_POSITIVE_X_EXT    -rz    -ry   rx
+     -rx          TEXTURE_CUBE_MAP_NEGATIVE_X_EXT    +rz    -ry   rx
+     +ry          TEXTURE_CUBE_MAP_POSITIVE_Y_EXT    +rx    +rz   ry
+     -ry          TEXTURE_CUBE_MAP_NEGATIVE_Y_EXT    +rx    -rz   ry
+     +rz          TEXTURE_CUBE_MAP_POSITIVE_Z_EXT    +rx    -ry   rz
+     -rz          TEXTURE_CUBE_MAP_NEGATIVE_Z_EXT    -rx    -ry   rz
+   */
+   for (j = 0; j < QUAD_SIZE; j++) {
+      float rx = s[j];
+      float ry = t[j];
+      float rz = p[j];
+      const float arx = fabsf(rx), ary = fabsf(ry), arz = fabsf(rz);
+      unsigned face;
+      float sc, tc, ma;
+
+      if (arx >= ary && arx >= arz) {
+         if (rx >= 0.0F) {
+            face = PIPE_TEX_FACE_POS_X;
+            sc = -rz;
+            tc = -ry;
+            ma = arx;
+         }
+         else {
+            face = PIPE_TEX_FACE_NEG_X;
+            sc = rz;
+            tc = -ry;
+            ma = arx;
          }
       }
-      break;
-   case PIPE_TEX_FILTER_LINEAR:
-   case PIPE_TEX_FILTER_ANISO:
+      else if (ary >= arx && ary >= arz) {
+         if (ry >= 0.0F) {
+            face = PIPE_TEX_FACE_POS_Y;
+            sc = rx;
+            tc = rz;
+            ma = ary;
+         }
+         else {
+            face = PIPE_TEX_FACE_NEG_Y;
+            sc = rx;
+            tc = -rz;
+            ma = ary;
+         }
+      }
+      else {
+         if (rz > 0.0F) {
+            face = PIPE_TEX_FACE_POS_Z;
+            sc = rx;
+            tc = -ry;
+            ma = arz;
+         }
+         else {
+            face = PIPE_TEX_FACE_NEG_Z;
+            sc = -rx;
+            tc = -ry;
+            ma = arz;
+         }
+      }
+
       {
-         int x0[4], y0[4], x1[4], y1[4];
-         float xw[4], yw[4]; /* weights */
-
-         linear_texcoord_4(sampler->wrap_s, s, width, x0, x1, xw);
-         linear_texcoord_4(sampler->wrap_t, t, height, y0, y1, yw);
-
-         for (j = 0; j < QUAD_SIZE; j++) {
-            float tx[4][4]; /* texels */
-            int c;
-            get_texel(tgsi_sampler, faces[j], level0, x0[j], y0[j], 0, tx, 0);
-            get_texel(tgsi_sampler, faces[j], level0, x1[j], y0[j], 0, tx, 1);
-            get_texel(tgsi_sampler, faces[j], level0, x0[j], y1[j], 0, tx, 2);
-            get_texel(tgsi_sampler, faces[j], level0, x1[j], y1[j], 0, tx, 3);
-            if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
-               shadow_compare4(sampler, tx, p);
-            }
+	 const float ima = 1.0 / ma;
+	 ssss[j] = ( sc * ima + 1.0F ) * 0.5F;
+	 tttt[j] = ( tc * ima + 1.0F ) * 0.5F;
+	 samp->faces[j] = face;
+      }
+   }
 
-            /* interpolate R, G, B, A */
-            for (c = 0; c < 4; c++) {
-               rgba[c][j] = lerp_2d(xw[j], yw[j],
-                                    tx[c][0], tx[c][1],
-                                    tx[c][2], tx[c][3]);
-            }
+   /* In our little pipeline, the compare stage is next.  If compare
+    * is not active, this will point somewhere deeper into the
+    * pipeline, eg. to mip_filter or even img_filter.
+    */
+   samp->compare(tgsi_sampler, ssss, tttt, NULL, lodbias, rgba);
+}
 
-            if (level0 != level1) {
-               /* get texels from second mipmap level and blend */
-               float rgba2[4][4];
-               x0[j] /= 2;
-               y0[j] /= 2;
-               x1[j] /= 2;
-               y1[j] /= 2;
-               get_texel(tgsi_sampler, faces[j], level1, x0[j], y0[j], 0, tx, 0);
-               get_texel(tgsi_sampler, faces[j], level1, x1[j], y0[j], 0, tx, 1);
-               get_texel(tgsi_sampler, faces[j], level1, x0[j], y1[j], 0, tx, 2);
-               get_texel(tgsi_sampler, faces[j], level1, x1[j], y1[j], 0, tx, 3);
-               if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE){
-                  shadow_compare4(sampler, tx, p);
-               }
-
-               /* interpolate R, G, B, A */
-               for (c = 0; c < 4; c++) {
-                  rgba2[c][j] = lerp_2d(xw[j], yw[j],
-                                        tx[c][0], tx[c][1], tx[c][2], tx[c][3]);
-               }
-
-               for (c = 0; c < NUM_CHANNELS; c++) {
-                  rgba[c][j] = lerp(levelBlend, rgba[c][j], rgba2[c][j]);
-               }
-            }
-         }
-      }
-      break;
+
+
+static wrap_nearest_func
+get_nearest_unorm_wrap(unsigned mode)
+{
+   switch (mode) {
+   case PIPE_TEX_WRAP_CLAMP:
+      return wrap_nearest_unorm_clamp;
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+      return wrap_nearest_unorm_clamp_to_border;
    default:
       assert(0);
+      return wrap_nearest_unorm_clamp;
    }
 }
 
 
-static INLINE void
-sp_get_samples_1d(const struct tgsi_sampler *sampler,
-                  const float s[QUAD_SIZE],
-                  const float t[QUAD_SIZE],
-                  const float p[QUAD_SIZE],
-                  boolean computeLambda,
-                  float lodbias,
-                  float rgba[NUM_CHANNELS][QUAD_SIZE])
+static wrap_nearest_func
+get_nearest_wrap(unsigned mode)
 {
-   static const unsigned faces[4] = {0, 0, 0, 0};
-   static const float tzero[4] = {0, 0, 0, 0};
-   sp_get_samples_2d_common(sampler, s, tzero, NULL,
-                            computeLambda, lodbias, rgba, faces);
+   switch (mode) {
+   case PIPE_TEX_WRAP_REPEAT:
+      return wrap_nearest_repeat;
+   case PIPE_TEX_WRAP_CLAMP:
+      return wrap_nearest_clamp;
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+      return wrap_nearest_clamp_to_edge;
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+      return wrap_nearest_clamp_to_border;
+   case PIPE_TEX_WRAP_MIRROR_REPEAT:
+      return wrap_nearest_mirror_repeat;
+   case PIPE_TEX_WRAP_MIRROR_CLAMP:
+      return wrap_nearest_mirror_clamp;
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+      return wrap_nearest_mirror_clamp_to_edge;
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+      return wrap_nearest_mirror_clamp_to_border;
+   default:
+      assert(0);
+      return wrap_nearest_repeat;
+   }
 }
 
 
-static INLINE void
-sp_get_samples_2d(const struct tgsi_sampler *sampler,
-                  const float s[QUAD_SIZE],
-                  const float t[QUAD_SIZE],
-                  const float p[QUAD_SIZE],
-                  boolean computeLambda,
-                  float lodbias,
-                  float rgba[NUM_CHANNELS][QUAD_SIZE])
+static wrap_linear_func
+get_linear_unorm_wrap(unsigned mode)
 {
-   static const unsigned faces[4] = {0, 0, 0, 0};
-   sp_get_samples_2d_common(sampler, s, t, p,
-                            computeLambda, lodbias, rgba, faces);
+   switch (mode) {
+   case PIPE_TEX_WRAP_CLAMP:
+      return wrap_linear_unorm_clamp;
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+      return wrap_linear_unorm_clamp_to_border;
+   default:
+      assert(0);
+      return wrap_linear_unorm_clamp;
+   }
 }
 
 
-static INLINE void
-sp_get_samples_3d(const struct tgsi_sampler *tgsi_sampler,
-                  const float s[QUAD_SIZE],
-                  const float t[QUAD_SIZE],
-                  const float p[QUAD_SIZE],
-                  boolean computeLambda,
-                  float lodbias,
-                  float rgba[NUM_CHANNELS][QUAD_SIZE])
+static wrap_linear_func
+get_linear_wrap(unsigned mode)
 {
-   const struct sp_shader_sampler *samp = sp_shader_sampler(tgsi_sampler);
-   const struct softpipe_context *sp = samp->sp;
-   const uint unit = samp->unit;
-   const struct pipe_texture *texture = sp->texture[unit];
-   const struct pipe_sampler_state *sampler = sp->sampler[unit];
-   /* get/map pipe_surfaces corresponding to 3D tex slices */
-   unsigned level0, level1, j, imgFilter;
-   int width, height, depth;
-   float levelBlend;
-   const uint face = 0;
-
-   choose_mipmap_levels(texture, sampler, s, t, p, computeLambda, lodbias,
-                        &level0, &level1, &levelBlend, &imgFilter);
-
-   assert(sampler->normalized_coords);
-
-   width = texture->width[level0];
-   height = texture->height[level0];
-   depth = texture->depth[level0];
-
-   assert(width > 0);
-   assert(height > 0);
-   assert(depth > 0);
-
-   switch (imgFilter) {
-   case PIPE_TEX_FILTER_NEAREST:
-      {
-         int x[4], y[4], z[4];
-         nearest_texcoord_4(sampler->wrap_s, s, width, x);
-         nearest_texcoord_4(sampler->wrap_t, t, height, y);
-         nearest_texcoord_4(sampler->wrap_r, p, depth, z);
-         for (j = 0; j < QUAD_SIZE; j++) {
-            get_texel(tgsi_sampler, face, level0, x[j], y[j], z[j], rgba, j);
-            if (level0 != level1) {
-               /* get texels from second mipmap level and blend */
-               float rgba2[4][4];
-               unsigned c;
-               x[j] /= 2;
-               y[j] /= 2;
-               z[j] /= 2;
-               get_texel(tgsi_sampler, face, level1, x[j], y[j], z[j], rgba2, j);
-               for (c = 0; c < NUM_CHANNELS; c++) {
-                  rgba[c][j] = lerp(levelBlend, rgba2[c][j], rgba[c][j]);
-               }
-            }
-         }
-      }
-      break;
-   case PIPE_TEX_FILTER_LINEAR:
-   case PIPE_TEX_FILTER_ANISO:
-      {
-         int x0[4], x1[4], y0[4], y1[4], z0[4], z1[4];
-         float xw[4], yw[4], zw[4]; /* interpolation weights */
-         linear_texcoord_4(sampler->wrap_s, s, width,  x0, x1, xw);
-         linear_texcoord_4(sampler->wrap_t, t, height, y0, y1, yw);
-         linear_texcoord_4(sampler->wrap_r, p, depth,  z0, z1, zw);
-
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int c;
-            float tx0[4][4], tx1[4][4];
-            get_texel(tgsi_sampler, face, level0, x0[j], y0[j], z0[j], tx0, 0);
-            get_texel(tgsi_sampler, face, level0, x1[j], y0[j], z0[j], tx0, 1);
-            get_texel(tgsi_sampler, face, level0, x0[j], y1[j], z0[j], tx0, 2);
-            get_texel(tgsi_sampler, face, level0, x1[j], y1[j], z0[j], tx0, 3);
-            get_texel(tgsi_sampler, face, level0, x0[j], y0[j], z1[j], tx1, 0);
-            get_texel(tgsi_sampler, face, level0, x1[j], y0[j], z1[j], tx1, 1);
-            get_texel(tgsi_sampler, face, level0, x0[j], y1[j], z1[j], tx1, 2);
-            get_texel(tgsi_sampler, face, level0, x1[j], y1[j], z1[j], tx1, 3);
-
-            /* interpolate R, G, B, A */
-            for (c = 0; c < 4; c++) {
-               rgba[c][j] = lerp_3d(xw[j], yw[j], zw[j],
-                                    tx0[c][0], tx0[c][1],
-                                    tx0[c][2], tx0[c][3],
-                                    tx1[c][0], tx1[c][1],
-                                    tx1[c][2], tx1[c][3]);
-            }
-
-            if (level0 != level1) {
-               /* get texels from second mipmap level and blend */
-               float rgba2[4][4];
-               x0[j] /= 2;
-               y0[j] /= 2;
-               z0[j] /= 2;
-               x1[j] /= 2;
-               y1[j] /= 2;
-               z1[j] /= 2;
-               get_texel(tgsi_sampler, face, level1, x0[j], y0[j], z0[j], tx0, 0);
-               get_texel(tgsi_sampler, face, level1, x1[j], y0[j], z0[j], tx0, 1);
-               get_texel(tgsi_sampler, face, level1, x0[j], y1[j], z0[j], tx0, 2);
-               get_texel(tgsi_sampler, face, level1, x1[j], y1[j], z0[j], tx0, 3);
-               get_texel(tgsi_sampler, face, level1, x0[j], y0[j], z1[j], tx1, 0);
-               get_texel(tgsi_sampler, face, level1, x1[j], y0[j], z1[j], tx1, 1);
-               get_texel(tgsi_sampler, face, level1, x0[j], y1[j], z1[j], tx1, 2);
-               get_texel(tgsi_sampler, face, level1, x1[j], y1[j], z1[j], tx1, 3);
-
-               /* interpolate R, G, B, A */
-               for (c = 0; c < 4; c++) {
-                  rgba2[c][j] = lerp_3d(xw[j], yw[j], zw[j],
-                                        tx0[c][0], tx0[c][1],
-                                        tx0[c][2], tx0[c][3],
-                                        tx1[c][0], tx1[c][1],
-                                        tx1[c][2], tx1[c][3]);
-               }
-
-               /* blend mipmap levels */
-               for (c = 0; c < NUM_CHANNELS; c++) {
-                  rgba[c][j] = lerp(levelBlend, rgba[c][j], rgba2[c][j]);
-               }
-            }
-         }
-      }
-      break;
+   switch (mode) {
+   case PIPE_TEX_WRAP_REPEAT:
+      return wrap_linear_repeat;
+   case PIPE_TEX_WRAP_CLAMP:
+      return wrap_linear_clamp;
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+      return wrap_linear_clamp_to_edge;
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+      return wrap_linear_clamp_to_border;
+   case PIPE_TEX_WRAP_MIRROR_REPEAT:
+      return wrap_linear_mirror_repeat;
+   case PIPE_TEX_WRAP_MIRROR_CLAMP:
+      return wrap_linear_mirror_clamp;
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+      return wrap_linear_mirror_clamp_to_edge;
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+      return wrap_linear_mirror_clamp_to_border;
    default:
       assert(0);
+      return wrap_linear_repeat;
    }
 }
 
 
-static void
-sp_get_samples_cube(const struct tgsi_sampler *sampler,
-                    const float s[QUAD_SIZE],
-                    const float t[QUAD_SIZE],
-                    const float p[QUAD_SIZE],
-                    boolean computeLambda,
-                    float lodbias,
-                    float rgba[NUM_CHANNELS][QUAD_SIZE])
+static compute_lambda_func
+get_lambda_func(const union sp_sampler_key key)
 {
-   unsigned faces[QUAD_SIZE], j;
-   float ssss[4], tttt[4];
-   for (j = 0; j < QUAD_SIZE; j++) {
-      faces[j] = choose_cube_face(s[j], t[j], p[j], ssss + j, tttt + j);
+   if (key.bits.processor == TGSI_PROCESSOR_VERTEX)
+      return compute_lambda_vert;
+   
+   switch (key.bits.target) {
+   case PIPE_TEXTURE_1D:
+      return compute_lambda_1d;
+   case PIPE_TEXTURE_2D:
+   case PIPE_TEXTURE_CUBE:
+      return compute_lambda_2d;
+   case PIPE_TEXTURE_3D:
+      return compute_lambda_3d;
+   default:
+      assert(0);
+      return compute_lambda_1d;
    }
-   sp_get_samples_2d_common(sampler, ssss, tttt, NULL,
-                            computeLambda, lodbias, rgba, faces);
 }
 
 
-static void
-sp_get_samples_rect(const struct tgsi_sampler *tgsi_sampler,
-                    const float s[QUAD_SIZE],
-                    const float t[QUAD_SIZE],
-                    const float p[QUAD_SIZE],
-                    boolean computeLambda,
-                    float lodbias,
-                    float rgba[NUM_CHANNELS][QUAD_SIZE])
-{
-   const struct sp_shader_sampler *samp = sp_shader_sampler(tgsi_sampler);
-   const struct softpipe_context *sp = samp->sp;
-   const uint unit = samp->unit;
-   const struct pipe_texture *texture = sp->texture[unit];
-   const struct pipe_sampler_state *sampler = sp->sampler[unit];
-   const uint face = 0;
-   unsigned level0, level1, j, imgFilter;
-   int width, height;
-   float levelBlend;
-
-   choose_mipmap_levels(texture, sampler, s, t, p, computeLambda, lodbias,
-                        &level0, &level1, &levelBlend, &imgFilter);
-
-   /* texture RECTS cannot be mipmapped */
-   assert(level0 == level1);
-
-   width = texture->width[level0];
-   height = texture->height[level0];
-
-   assert(width > 0);
-
-   switch (imgFilter) {
-   case PIPE_TEX_FILTER_NEAREST:
-      {
-         int x[4], y[4];
-         nearest_texcoord_unnorm_4(sampler->wrap_s, s, width, x);
-         nearest_texcoord_unnorm_4(sampler->wrap_t, t, height, y);
-         for (j = 0; j < QUAD_SIZE; j++) {
-            get_texel(tgsi_sampler, face, level0, x[j], y[j], 0, rgba, j);
-            if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
-               shadow_compare(sampler, rgba, p, j);
-            }
-         }
-      }
+static filter_func
+get_img_filter(const union sp_sampler_key key,
+               unsigned filter,
+               const struct pipe_sampler_state *sampler)
+{
+   switch (key.bits.target) {
+   case PIPE_TEXTURE_1D:
+      if (filter == PIPE_TEX_FILTER_NEAREST) 
+         return img_filter_1d_nearest;
+      else
+         return img_filter_1d_linear;
       break;
-   case PIPE_TEX_FILTER_LINEAR:
-   case PIPE_TEX_FILTER_ANISO:
+   case PIPE_TEXTURE_2D:
+      /* Try for fast path:
+       */
+      if (key.bits.is_pot &&
+          sampler->wrap_s == sampler->wrap_t &&
+          sampler->normalized_coords) 
       {
-         int x0[4], y0[4], x1[4], y1[4];
-         float xw[4], yw[4]; /* weights */
-         linear_texcoord_unnorm_4(sampler->wrap_s, s, width,  x0, x1, xw);
-         linear_texcoord_unnorm_4(sampler->wrap_t, t, height, y0, y1, yw);
-         for (j = 0; j < QUAD_SIZE; j++) {
-            float tx[4][4]; /* texels */
-            int c;
-            get_texel(tgsi_sampler, face, level0, x0[j], y0[j], 0, tx, 0);
-            get_texel(tgsi_sampler, face, level0, x1[j], y0[j], 0, tx, 1);
-            get_texel(tgsi_sampler, face, level0, x0[j], y1[j], 0, tx, 2);
-            get_texel(tgsi_sampler, face, level0, x1[j], y1[j], 0, tx, 3);
-            if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
-               shadow_compare4(sampler, tx, p);
+         switch (sampler->wrap_s) {
+         case PIPE_TEX_WRAP_REPEAT:
+            switch (filter) {
+            case PIPE_TEX_FILTER_NEAREST:
+               return img_filter_2d_nearest_repeat_POT;
+            case PIPE_TEX_FILTER_LINEAR:
+               return img_filter_2d_linear_repeat_POT;
+            default:
+               break;
             }
-            for (c = 0; c < 4; c++) {
-               rgba[c][j] = lerp_2d(xw[j], yw[j],
-                                    tx[c][0], tx[c][1], tx[c][2], tx[c][3]);
+            break;
+         case PIPE_TEX_WRAP_CLAMP:
+            switch (filter) {
+            case PIPE_TEX_FILTER_NEAREST:
+               return img_filter_2d_nearest_clamp_POT;
+            default:
+               break;
             }
          }
       }
+      /* Otherwise use default versions:
+       */
+      if (filter == PIPE_TEX_FILTER_NEAREST) 
+         return img_filter_2d_nearest;
+      else
+         return img_filter_2d_linear;
+      break;
+   case PIPE_TEXTURE_CUBE:
+      if (filter == PIPE_TEX_FILTER_NEAREST) 
+         return img_filter_cube_nearest;
+      else
+         return img_filter_cube_linear;
+      break;
+   case PIPE_TEXTURE_3D:
+      if (filter == PIPE_TEX_FILTER_NEAREST) 
+         return img_filter_3d_nearest;
+      else
+         return img_filter_3d_linear;
       break;
    default:
       assert(0);
+      return img_filter_1d_nearest;
    }
 }
 
 
 /**
- * Common code for vertex/fragment program texture sampling.
+ * Bind the given texture object and texture cache to the sampler varient.
  */
-static INLINE void
-sp_get_samples(struct tgsi_sampler *tgsi_sampler,
-               const float s[QUAD_SIZE],
-               const float t[QUAD_SIZE],
-               const float p[QUAD_SIZE],
-               boolean computeLambda,
-               float lodbias,
-               float rgba[NUM_CHANNELS][QUAD_SIZE])
+void
+sp_sampler_varient_bind_texture( struct sp_sampler_varient *samp,
+                                 struct softpipe_tex_tile_cache *tex_cache,
+                                 const struct pipe_texture *texture )
 {
-   const struct sp_shader_sampler *samp = sp_shader_sampler(tgsi_sampler);
-   const struct softpipe_context *sp = samp->sp;
-   const uint unit = samp->unit;
-   const struct pipe_texture *texture = sp->texture[unit];
-   const struct pipe_sampler_state *sampler = sp->sampler[unit];
-
-   if (!texture)
-      return;
+   const struct pipe_sampler_state *sampler = samp->sampler;
 
-   switch (texture->target) {
-   case PIPE_TEXTURE_1D:
-      assert(sampler->normalized_coords);
-      sp_get_samples_1d(tgsi_sampler, s, t, p, computeLambda, lodbias, rgba);
-      break;
-   case PIPE_TEXTURE_2D:
-      if (sampler->normalized_coords)
-         sp_get_samples_2d(tgsi_sampler, s, t, p, computeLambda, lodbias, rgba);
-      else
-         sp_get_samples_rect(tgsi_sampler, s, t, p, computeLambda, lodbias, rgba);
-      break;
-   case PIPE_TEXTURE_3D:
-      assert(sampler->normalized_coords);
-      sp_get_samples_3d(tgsi_sampler, s, t, p, computeLambda, lodbias, rgba);
-      break;
-   case PIPE_TEXTURE_CUBE:
-      assert(sampler->normalized_coords);
-      sp_get_samples_cube(tgsi_sampler, s, t, p, computeLambda, lodbias, rgba);
-      break;
-   default:
-      assert(0);
-   }
-
-#if 0 /* DEBUG */
-   {
-      int i;
-      printf("Sampled at %f, %f, %f:\n", s[0], t[0], p[0]);
-      for (i = 0; i < 4; i++) {
-         printf("Frag %d: %f %f %f %f\n", i,
-                rgba[0][i],
-                rgba[1][i],
-                rgba[2][i],
-                rgba[3][i]);
-      }
-   }
-#endif
+   samp->texture = texture;
+   samp->cache = tex_cache;
+   samp->xpot = util_unsigned_logbase2( texture->width[0] );
+   samp->ypot = util_unsigned_logbase2( texture->height[0] );
+   samp->level = CLAMP((int) sampler->min_lod, 0, (int) texture->last_level);
 }
 
 
-/**
- * Called via tgsi_sampler::get_samples() when running a fragment shader.
- * Get four filtered RGBA values from the sampler's texture.
- */
 void
-sp_get_samples_fragment(struct tgsi_sampler *tgsi_sampler,
-                        const float s[QUAD_SIZE],
-                        const float t[QUAD_SIZE],
-                        const float p[QUAD_SIZE],
-                        float lodbias,
-                        float rgba[NUM_CHANNELS][QUAD_SIZE])
+sp_sampler_varient_destroy( struct sp_sampler_varient *samp )
 {
-   sp_get_samples(tgsi_sampler, s, t, p, TRUE, lodbias, rgba);
+   FREE(samp);
 }
 
 
 /**
- * Called via tgsi_sampler::get_samples() when running a vertex shader.
- * Get four filtered RGBA values from the sampler's texture.
+ * Create a sampler varient for a given set of non-orthogonal state.
  */
-void
-sp_get_samples_vertex(struct tgsi_sampler *tgsi_sampler,
-                      const float s[QUAD_SIZE],
-                      const float t[QUAD_SIZE],
-                      const float p[QUAD_SIZE],
-                      float lodbias,
-                      float rgba[NUM_CHANNELS][QUAD_SIZE])
+struct sp_sampler_varient *
+sp_create_sampler_varient( const struct pipe_sampler_state *sampler,
+                           const union sp_sampler_key key )
 {
-   sp_get_samples(tgsi_sampler, s, t, p, FALSE, lodbias, rgba);
+   struct sp_sampler_varient *samp = CALLOC_STRUCT(sp_sampler_varient);
+   if (!samp)
+      return NULL;
+
+   samp->sampler = sampler;
+   samp->key = key;
+
+   /* Note that (for instance) linear_texcoord_s and
+    * nearest_texcoord_s may be active at the same time, if the
+    * sampler min_img_filter differs from its mag_img_filter.
+    */
+   if (sampler->normalized_coords) {
+      samp->linear_texcoord_s = get_linear_wrap( sampler->wrap_s );
+      samp->linear_texcoord_t = get_linear_wrap( sampler->wrap_t );
+      samp->linear_texcoord_p = get_linear_wrap( sampler->wrap_r );
+      
+      samp->nearest_texcoord_s = get_nearest_wrap( sampler->wrap_s );
+      samp->nearest_texcoord_t = get_nearest_wrap( sampler->wrap_t );
+      samp->nearest_texcoord_p = get_nearest_wrap( sampler->wrap_r );
+   }
+   else {
+      samp->linear_texcoord_s = get_linear_unorm_wrap( sampler->wrap_s );
+      samp->linear_texcoord_t = get_linear_unorm_wrap( sampler->wrap_t );
+      samp->linear_texcoord_p = get_linear_unorm_wrap( sampler->wrap_r );
+      
+      samp->nearest_texcoord_s = get_nearest_unorm_wrap( sampler->wrap_s );
+      samp->nearest_texcoord_t = get_nearest_unorm_wrap( sampler->wrap_t );
+      samp->nearest_texcoord_p = get_nearest_unorm_wrap( sampler->wrap_r );
+   }
+   
+   samp->compute_lambda = get_lambda_func( key );
+
+   samp->min_img_filter = get_img_filter(key, sampler->min_img_filter, sampler);
+   samp->mag_img_filter = get_img_filter(key, sampler->mag_img_filter, sampler);
+
+   switch (sampler->min_mip_filter) {
+   case PIPE_TEX_MIPFILTER_NONE:
+      if (sampler->min_img_filter == sampler->mag_img_filter) 
+         samp->mip_filter = samp->min_img_filter;         
+      else
+         samp->mip_filter = mip_filter_none;
+      break;
+
+   case PIPE_TEX_MIPFILTER_NEAREST:
+      samp->mip_filter = mip_filter_nearest;
+      break;
+
+   case PIPE_TEX_MIPFILTER_LINEAR:
+      if (key.bits.is_pot &&
+          sampler->min_img_filter == sampler->mag_img_filter &&
+          sampler->normalized_coords &&
+          sampler->wrap_s == PIPE_TEX_WRAP_REPEAT &&
+          sampler->wrap_t == PIPE_TEX_WRAP_REPEAT &&
+          sampler->min_img_filter == PIPE_TEX_FILTER_LINEAR)
+      {
+         samp->mip_filter = mip_filter_linear_2d_linear_repeat_POT;
+      }
+      else 
+      {
+         samp->mip_filter = mip_filter_linear;
+      }
+      break;
+   }
+
+   if (sampler->compare_mode != FALSE) {
+      samp->compare = sample_compare;
+   }
+   else {
+      /* Skip compare operation by promoting the mip_filter function
+       * pointer:
+       */
+      samp->compare = samp->mip_filter;
+   }
+   
+   if (key.bits.target == PIPE_TEXTURE_CUBE) {
+      samp->base.get_samples = sample_cube;
+   }
+   else {
+      samp->faces[0] = 0;
+      samp->faces[1] = 0;
+      samp->faces[2] = 0;
+      samp->faces[3] = 0;
+
+      /* Skip cube face determination by promoting the compare
+       * function pointer:
+       */
+      samp->base.get_samples = samp->compare;
+   }
+
+   return samp;
 }
diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.h b/src/gallium/drivers/softpipe/sp_tex_sample.h
index 40d8eb2c2a..b0797711d3 100644
--- a/src/gallium/drivers/softpipe/sp_tex_sample.h
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.h
@@ -31,43 +31,122 @@
 
 #include "tgsi/tgsi_exec.h"
 
+struct sp_sampler_varient;
+
+typedef void (*wrap_nearest_func)(const float s[4],
+                                  unsigned size,
+                                  int icoord[4]);
+
+typedef void (*wrap_linear_func)(const float s[4], 
+                                 unsigned size,
+                                 int icoord0[4],
+                                 int icoord1[4],
+                                 float w[4]);
+
+typedef float (*compute_lambda_func)(const struct sp_sampler_varient *sampler,
+                                     const float s[QUAD_SIZE],
+                                     const float t[QUAD_SIZE],
+                                     const float p[QUAD_SIZE],
+                                     float lodbias);
+
+typedef void (*filter_func)(struct tgsi_sampler *tgsi_sampler,
+                            const float s[QUAD_SIZE],
+                            const float t[QUAD_SIZE],
+                            const float p[QUAD_SIZE],
+                            float lodbias,
+                            float rgba[NUM_CHANNELS][QUAD_SIZE]);
+
+
+union sp_sampler_key {
+   struct {
+      unsigned target:3;
+      unsigned is_pot:1;
+      unsigned processor:2;
+      unsigned unit:4;
+      unsigned pad:22;
+   } bits;
+   unsigned value;
+};
 
 /**
  * Subclass of tgsi_sampler
  */
-struct sp_shader_sampler
+struct sp_sampler_varient
 {
    struct tgsi_sampler base;  /**< base class */
 
-   uint unit;
-   struct softpipe_context *sp;
-   struct softpipe_tile_cache *cache;
+   union sp_sampler_key key;
+
+   /* The owner of this struct:
+    */
+   const struct pipe_sampler_state *sampler;
+
+
+   /* Currently bound texture:
+    */
+   const struct pipe_texture *texture;
+   struct softpipe_tex_tile_cache *cache;
+
+   unsigned processor;
+
+   /* For sp_get_samples_2d_linear_POT:
+    */
+   unsigned xpot;
+   unsigned ypot;
+   unsigned level;
+
+   unsigned faces[4];
+   
+   wrap_nearest_func nearest_texcoord_s;
+   wrap_nearest_func nearest_texcoord_t;
+   wrap_nearest_func nearest_texcoord_p;
+
+   wrap_linear_func linear_texcoord_s;
+   wrap_linear_func linear_texcoord_t;
+   wrap_linear_func linear_texcoord_p;
+
+   filter_func min_img_filter;
+   filter_func mag_img_filter;
+
+   compute_lambda_func compute_lambda;
+
+   filter_func mip_filter;
+   filter_func compare;
+   
+   /* Linked list:
+    */
+   struct sp_sampler_varient *next;
 };
 
+struct sp_sampler;
 
+/* Create a sampler varient for a given set of non-orthogonal state.  Currently the 
+ */
+struct sp_sampler_varient *
+sp_create_sampler_varient( const struct pipe_sampler_state *sampler,
+                           const union sp_sampler_key key );
 
-static INLINE const struct sp_shader_sampler *
-sp_shader_sampler(const struct tgsi_sampler *sampler)
-{
-   return (const struct sp_shader_sampler *) sampler;
-}
+void sp_sampler_varient_bind_texture( struct sp_sampler_varient *varient,
+                                      struct softpipe_tex_tile_cache *tex_cache,
+                                      const struct pipe_texture *tex );
 
+void sp_sampler_varient_destroy( struct sp_sampler_varient * );
 
-extern void
-sp_get_samples_fragment(struct tgsi_sampler *tgsi_sampler,
-                        const float s[QUAD_SIZE],
-                        const float t[QUAD_SIZE],
-                        const float p[QUAD_SIZE],
-                        float lodbias,
-                        float rgba[NUM_CHANNELS][QUAD_SIZE]);
+
+
+static INLINE struct sp_sampler_varient *
+sp_sampler_varient(const struct tgsi_sampler *sampler)
+{
+   return (struct sp_sampler_varient *) sampler;
+}
 
 extern void
-sp_get_samples_vertex(struct tgsi_sampler *tgsi_sampler,
-                      const float s[QUAD_SIZE],
-                      const float t[QUAD_SIZE],
-                      const float p[QUAD_SIZE],
-                      float lodbias,
-                      float rgba[NUM_CHANNELS][QUAD_SIZE]);
+sp_get_samples(struct tgsi_sampler *tgsi_sampler,
+               const float s[QUAD_SIZE],
+               const float t[QUAD_SIZE],
+               const float p[QUAD_SIZE],
+               float lodbias,
+               float rgba[NUM_CHANNELS][QUAD_SIZE]);
 
 
 #endif /* SP_TEX_SAMPLE_H */
diff --git a/src/gallium/drivers/softpipe/sp_tex_tile_cache.c b/src/gallium/drivers/softpipe/sp_tex_tile_cache.c
new file mode 100644
index 0000000000..407a22a9f4
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_tex_tile_cache.c
@@ -0,0 +1,273 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * Texture tile caching.
+ *
+ * Author:
+ *    Brian Paul
+ */
+
+#include "pipe/p_inlines.h"
+#include "util/u_memory.h"
+#include "util/u_tile.h"
+#include "sp_context.h"
+#include "sp_surface.h"
+#include "sp_texture.h"
+#include "sp_tex_tile_cache.h"
+
+   
+
+struct softpipe_tex_tile_cache *
+sp_create_tex_tile_cache( struct pipe_screen *screen )
+{
+   struct softpipe_tex_tile_cache *tc;
+   uint pos;
+
+   tc = CALLOC_STRUCT( softpipe_tex_tile_cache );
+   if (tc) {
+      tc->screen = screen;
+      for (pos = 0; pos < NUM_ENTRIES; pos++) {
+         tc->entries[pos].addr.bits.invalid = 1;
+      }
+      tc->last_tile = &tc->entries[0]; /* any tile */
+   }
+   return tc;
+}
+
+
+void
+sp_destroy_tex_tile_cache(struct softpipe_tex_tile_cache *tc)
+{
+   struct pipe_screen *screen;
+   uint pos;
+
+   for (pos = 0; pos < NUM_ENTRIES; pos++) {
+      /*assert(tc->entries[pos].x < 0);*/
+   }
+   if (tc->transfer) {
+      screen = tc->transfer->texture->screen;
+      screen->tex_transfer_destroy(tc->transfer);
+   }
+   if (tc->tex_trans) {
+      screen = tc->tex_trans->texture->screen;
+      screen->tex_transfer_destroy(tc->tex_trans);
+   }
+
+   FREE( tc );
+}
+
+
+
+
+void
+sp_tex_tile_cache_map_transfers(struct softpipe_tex_tile_cache *tc)
+{
+   if (tc->tex_trans && !tc->tex_trans_map)
+      tc->tex_trans_map = tc->screen->transfer_map(tc->screen, tc->tex_trans);
+}
+
+
+void
+sp_tex_tile_cache_unmap_transfers(struct softpipe_tex_tile_cache *tc)
+{
+   if (tc->tex_trans_map) {
+      tc->screen->transfer_unmap(tc->screen, tc->tex_trans);
+      tc->tex_trans_map = NULL;
+   }
+}
+
+/**
+ * Invalidate all cached tiles for the cached texture.
+ * Should be called when the texture is modified.
+ */
+void
+sp_tex_tile_cache_validate_texture(struct softpipe_tex_tile_cache *tc)
+{
+   unsigned i;
+
+   assert(tc);
+   assert(tc->texture);
+
+   for (i = 0; i < NUM_ENTRIES; i++) {
+      tc->entries[i].addr.bits.invalid = 1;
+   }
+}
+
+/**
+ * Specify the texture to cache.
+ */
+void
+sp_tex_tile_cache_set_texture(struct softpipe_tex_tile_cache *tc,
+                          struct pipe_texture *texture)
+{
+   uint i;
+
+   assert(!tc->transfer);
+
+   if (tc->texture != texture) {
+      pipe_texture_reference(&tc->texture, texture);
+
+      if (tc->tex_trans) {
+         struct pipe_screen *screen = tc->tex_trans->texture->screen;
+         
+         if (tc->tex_trans_map) {
+            screen->transfer_unmap(screen, tc->tex_trans);
+            tc->tex_trans_map = NULL;
+         }
+
+         screen->tex_transfer_destroy(tc->tex_trans);
+         tc->tex_trans = NULL;
+      }
+
+      /* mark as entries as invalid/empty */
+      /* XXX we should try to avoid this when the teximage hasn't changed */
+      for (i = 0; i < NUM_ENTRIES; i++) {
+         tc->entries[i].addr.bits.invalid = 1;
+      }
+
+      tc->tex_face = -1; /* any invalid value here */
+   }
+}
+
+
+
+
+/**
+ * Flush the tile cache: write all dirty tiles back to the transfer.
+ * any tiles "flagged" as cleared will be "really" cleared.
+ */
+void
+sp_flush_tex_tile_cache(struct softpipe_tex_tile_cache *tc)
+{
+   int pos;
+
+   if (tc->texture) {
+      /* caching a texture, mark all entries as empty */
+      for (pos = 0; pos < NUM_ENTRIES; pos++) {
+         tc->entries[pos].addr.bits.invalid = 1;
+      }
+      tc->tex_face = -1;
+   }
+
+}
+
+
+/**
+ * Given the texture face, level, zslice, x and y values, compute
+ * the cache entry position/index where we'd hope to find the
+ * cached texture tile.
+ * This is basically a direct-map cache.
+ * XXX There's probably lots of ways in which we can improve this.
+ */
+static INLINE uint
+tex_cache_pos( union tex_tile_address addr )
+{
+   uint entry = (addr.bits.x + 
+                 addr.bits.y * 9 + 
+                 addr.bits.z * 3 + 
+                 addr.bits.face + 
+                 addr.bits.level * 7);
+
+   return entry % NUM_ENTRIES;
+}
+
+/**
+ * Similar to sp_get_cached_tile() but for textures.
+ * Tiles are read-only and indexed with more params.
+ */
+const struct softpipe_tex_cached_tile *
+sp_find_cached_tile_tex(struct softpipe_tex_tile_cache *tc, 
+                        union tex_tile_address addr )
+{
+   struct pipe_screen *screen = tc->screen;
+   struct softpipe_tex_cached_tile *tile;
+   
+   tile = tc->entries + tex_cache_pos( addr );
+
+   if (addr.value != tile->addr.value) {
+
+      /* cache miss.  Most misses are because we've invaldiated the
+       * texture cache previously -- most commonly on binding a new
+       * texture.  Currently we effectively flush the cache on texture
+       * bind.
+       */
+#if 0
+      _debug_printf("miss at %u:  x=%d y=%d z=%d face=%d level=%d\n"
+                    "   tile %u:  x=%d y=%d z=%d face=%d level=%d\n",
+                    pos, x/TILE_SIZE, y/TILE_SIZE, z, face, level,
+                    pos, tile->addr.bits.x, tile->addr.bits.y, tile->z, tile->face, tile->level);
+#endif
+
+      /* check if we need to get a new transfer */
+      if (!tc->tex_trans ||
+          tc->tex_face != addr.bits.face ||
+          tc->tex_level != addr.bits.level ||
+          tc->tex_z != addr.bits.z) {
+         /* get new transfer (view into texture) */
+
+         if (tc->tex_trans) {
+            if (tc->tex_trans_map) {
+               tc->screen->transfer_unmap(tc->screen, tc->tex_trans);
+               tc->tex_trans_map = NULL;
+            }
+
+            screen->tex_transfer_destroy(tc->tex_trans);
+            tc->tex_trans = NULL;
+         }
+
+         tc->tex_trans = 
+            screen->get_tex_transfer(screen, tc->texture, 
+                                     addr.bits.face, 
+                                     addr.bits.level, 
+                                     addr.bits.z, 
+                                     PIPE_TRANSFER_READ, 0, 0,
+                                     tc->texture->width[addr.bits.level],
+                                     tc->texture->height[addr.bits.level]);
+
+         tc->tex_trans_map = screen->transfer_map(screen, tc->tex_trans);
+
+         tc->tex_face = addr.bits.face;
+         tc->tex_level = addr.bits.level;
+         tc->tex_z = addr.bits.z;
+      }
+
+      /* get tile from the transfer (view into texture) */
+      pipe_get_tile_rgba(tc->tex_trans,
+                         addr.bits.x * TILE_SIZE, 
+                         addr.bits.y * TILE_SIZE,
+                         TILE_SIZE, TILE_SIZE,
+                         (float *) tile->data.color);
+      tile->addr = addr;
+   }
+
+   tc->last_tile = tile;
+   return tile;
+}
+
+
+
diff --git a/src/gallium/drivers/softpipe/sp_tex_tile_cache.h b/src/gallium/drivers/softpipe/sp_tex_tile_cache.h
new file mode 100644
index 0000000000..ac6886a3df
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_tex_tile_cache.h
@@ -0,0 +1,155 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef SP_TEX_TILE_CACHE_H
+#define SP_TEX_TILE_CACHE_H
+
+
+#include "pipe/p_compiler.h"
+
+
+struct softpipe_context;
+struct softpipe_tex_tile_cache;
+
+
+/**
+ * Cache tile size (width and height). This needs to be a power of two.
+ */
+#define TILE_SIZE 64
+
+
+/* If we need to support > 4096, just expand this to be a 64 bit
+ * union, or consider tiling in Z as well.
+ */
+union tex_tile_address {
+   struct {
+      unsigned x:6;             /* 4096 / TILE_SIZE */
+      unsigned y:6;             /* 4096 / TILE_SIZE */
+      unsigned z:12;            /* 4096 -- z not tiled */
+      unsigned face:3;
+      unsigned level:4;
+      unsigned invalid:1;
+   } bits;
+   unsigned value;
+};
+
+
+struct softpipe_tex_cached_tile
+{
+   union tex_tile_address addr;
+   union {
+      float color[TILE_SIZE][TILE_SIZE][4];
+   } data;
+};
+
+#define NUM_ENTRIES 50
+
+struct softpipe_tex_tile_cache
+{
+   struct pipe_screen *screen;
+   struct pipe_transfer *transfer;
+   void *transfer_map;
+
+   struct pipe_texture *texture;  /**< if caching a texture */
+   unsigned timestamp;
+
+   struct softpipe_tex_cached_tile entries[NUM_ENTRIES];
+
+   struct pipe_transfer *tex_trans;
+   void *tex_trans_map;
+   int tex_face, tex_level, tex_z;
+
+   struct softpipe_tex_cached_tile *last_tile;  /**< most recently retrieved tile */
+};
+
+
+extern struct softpipe_tex_tile_cache *
+sp_create_tex_tile_cache( struct pipe_screen *screen );
+
+extern void
+sp_destroy_tex_tile_cache(struct softpipe_tex_tile_cache *tc);
+
+
+extern void
+sp_tex_tile_cache_map_transfers(struct softpipe_tex_tile_cache *tc);
+
+extern void
+sp_tex_tile_cache_unmap_transfers(struct softpipe_tex_tile_cache *tc);
+
+extern void
+sp_tex_tile_cache_set_texture(struct softpipe_tex_tile_cache *tc,
+                          struct pipe_texture *texture);
+
+void
+sp_tex_tile_cache_validate_texture(struct softpipe_tex_tile_cache *tc);
+
+extern void
+sp_flush_tex_tile_cache(struct softpipe_tex_tile_cache *tc);
+
+
+
+extern const struct softpipe_tex_cached_tile *
+sp_find_cached_tile_tex(struct softpipe_tex_tile_cache *tc, 
+                         union tex_tile_address addr );
+
+static INLINE union tex_tile_address
+tex_tile_address( unsigned x,
+		  unsigned y,
+		  unsigned z,
+		  unsigned face,
+		  unsigned level )
+{
+   union tex_tile_address addr;
+
+   addr.value = 0;
+   addr.bits.x = x / TILE_SIZE;
+   addr.bits.y = y / TILE_SIZE;
+   addr.bits.z = z;
+   addr.bits.face = face;
+   addr.bits.level = level;
+      
+   return addr;
+}
+
+/* Quickly retrieve tile if it matches last lookup.
+ */
+static INLINE const struct softpipe_tex_cached_tile *
+sp_get_cached_tile_tex(struct softpipe_tex_tile_cache *tc, 
+                         union tex_tile_address addr )
+{
+   if (tc->last_tile->addr.value == addr.value)
+      return tc->last_tile;
+
+   return sp_find_cached_tile_tex( tc, addr );
+}
+
+
+
+
+
+#endif /* SP_TEX_TILE_CACHE_H */
+
diff --git a/src/gallium/drivers/softpipe/sp_texture.c b/src/gallium/drivers/softpipe/sp_texture.c
index 70f0932431..49b51afda7 100644
--- a/src/gallium/drivers/softpipe/sp_texture.c
+++ b/src/gallium/drivers/softpipe/sp_texture.c
@@ -30,26 +30,21 @@
   *   Michel Dänzer <michel@tungstengraphics.com>
   */
 
-#include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
-#include "pipe/internal/p_winsys_screen.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
 
 #include "sp_context.h"
 #include "sp_state.h"
 #include "sp_texture.h"
-#include "sp_tile_cache.h"
 #include "sp_screen.h"
 #include "sp_winsys.h"
 
 
-/* Simple, maximally packed layout.
- */
-
-
-/* Conventional allocation path for non-display textures:
+/**
+ * Conventional allocation path for non-display textures:
+ * Use a simple, maximally packed layout.
  */
 static boolean
 softpipe_texture_layout(struct pipe_screen *screen,
@@ -89,6 +84,10 @@ softpipe_texture_layout(struct pipe_screen *screen,
    return spt->buffer != NULL;
 }
 
+
+/**
+ * Texture layout for simple color buffers.
+ */
 static boolean
 softpipe_displaytarget_layout(struct pipe_screen *screen,
                               struct softpipe_texture * spt)
@@ -112,21 +111,22 @@ softpipe_displaytarget_layout(struct pipe_screen *screen,
 }
 
 
-
-
-
 static struct pipe_texture *
 softpipe_texture_create(struct pipe_screen *screen,
-                        const struct pipe_texture *templat)
+                        const struct pipe_texture *template)
 {
    struct softpipe_texture *spt = CALLOC_STRUCT(softpipe_texture);
    if (!spt)
       return NULL;
 
-   spt->base = *templat;
+   spt->base = *template;
    pipe_reference_init(&spt->base.reference, 1);
    spt->base.screen = screen;
 
+   spt->pot = (util_is_power_of_two(template->width[0]) &&
+               util_is_power_of_two(template->height[0]) &&
+               util_is_power_of_two(template->depth[0]));
+
    if (spt->base.tex_usage & (PIPE_TEXTURE_USAGE_DISPLAY_TARGET |
                               PIPE_TEXTURE_USAGE_PRIMARY)) {
       if (!softpipe_displaytarget_layout(screen, spt))
@@ -222,6 +222,13 @@ softpipe_get_tex_surface(struct pipe_screen *screen,
       if (ps->usage & PIPE_BUFFER_USAGE_GPU_READ)
          ps->usage |= PIPE_BUFFER_USAGE_CPU_READ;
 
+      if (ps->usage & (PIPE_BUFFER_USAGE_CPU_WRITE |
+                       PIPE_BUFFER_USAGE_GPU_WRITE)) {
+         /* Mark the surface as dirty.  The tile cache will look for this. */
+         spt->timestamp++;
+         softpipe_screen(screen)->timestamp++;
+      }
+
       ps->face = face;
       ps->level = level;
       ps->zslice = zslice;
@@ -342,14 +349,13 @@ softpipe_transfer_map( struct pipe_screen *screen,
    /* May want to different things here depending on read/write nature
     * of the map:
     */
-   if (transfer->texture && transfer->usage != PIPE_TRANSFER_READ) 
-   {
+   if (transfer->texture && transfer->usage != PIPE_TRANSFER_READ) {
       /* Do something to notify sharing contexts of a texture change.
        * In softpipe, that would mean flushing the texture cache.
        */
       softpipe_screen(screen)->timestamp++;
    }
-   
+
    xfer_map = map + softpipe_transfer(transfer)->offset +
       transfer->y / transfer->block.height * transfer->stride +
       transfer->x / transfer->block.width * transfer->block.size;
@@ -360,7 +366,7 @@ softpipe_transfer_map( struct pipe_screen *screen,
 
 static void
 softpipe_transfer_unmap(struct pipe_screen *screen,
-                       struct pipe_transfer *transfer)
+                        struct pipe_transfer *transfer)
 {
    struct softpipe_texture *spt;
 
@@ -371,18 +377,12 @@ softpipe_transfer_unmap(struct pipe_screen *screen,
 
    if (transfer->usage != PIPE_TRANSFER_READ) {
       /* Mark the texture as dirty to expire the tile caches. */
-      spt->modified = TRUE;
+      spt->timestamp++;
    }
 }
 
 
 void
-softpipe_init_texture_funcs(struct softpipe_context *sp)
-{
-}
-
-
-void
 softpipe_init_screen_texture_funcs(struct pipe_screen *screen)
 {
    screen->texture_create = softpipe_texture_create;
@@ -404,7 +404,7 @@ softpipe_get_texture_buffer( struct pipe_texture *texture,
                              struct pipe_buffer **buf,
                              unsigned *stride )
 {
-   struct softpipe_texture *tex = (struct softpipe_texture *)texture;
+   struct softpipe_texture *tex = (struct softpipe_texture *) texture;
 
    if (!tex)
       return FALSE;
diff --git a/src/gallium/drivers/softpipe/sp_texture.h b/src/gallium/drivers/softpipe/sp_texture.h
index 893aa7d11d..2537ab6a40 100644
--- a/src/gallium/drivers/softpipe/sp_texture.h
+++ b/src/gallium/drivers/softpipe/sp_texture.h
@@ -48,7 +48,11 @@ struct softpipe_texture
     */
    struct pipe_buffer *buffer;
 
-   boolean modified;
+   /* True if texture images are power-of-two in all dimensions:
+    */
+   boolean pot;
+
+   unsigned timestamp;
 };
 
 struct softpipe_transfer
@@ -74,9 +78,6 @@ softpipe_transfer(struct pipe_transfer *pt)
 
 
 extern void
-softpipe_init_texture_funcs( struct softpipe_context *softpipe );
-
-extern void
 softpipe_init_screen_texture_funcs(struct pipe_screen *screen);
 
 
diff --git a/src/gallium/drivers/softpipe/sp_tile_cache.c b/src/gallium/drivers/softpipe/sp_tile_cache.c
index 461cbb9f95..de47970985 100644
--- a/src/gallium/drivers/softpipe/sp_tile_cache.c
+++ b/src/gallium/drivers/softpipe/sp_tile_cache.c
@@ -26,7 +26,7 @@
  **************************************************************************/
 
 /**
- * Texture tile caching.
+ * Render target tile caching.
  *
  * Author:
  *    Brian Paul
@@ -35,38 +35,8 @@
 #include "pipe/p_inlines.h"
 #include "util/u_memory.h"
 #include "util/u_tile.h"
-#include "sp_context.h"
-#include "sp_surface.h"
-#include "sp_texture.h"
 #include "sp_tile_cache.h"
 
-#define NUM_ENTRIES 50
-
-
-/** XXX move these */
-#define MAX_WIDTH 4096
-#define MAX_HEIGHT 4096
-
-
-struct softpipe_tile_cache
-{
-   struct pipe_screen *screen;
-   struct pipe_surface *surface;  /**< the surface we're caching */
-   struct pipe_transfer *transfer;
-   void *transfer_map;
-   struct pipe_texture *texture;  /**< if caching a texture */
-   struct softpipe_cached_tile entries[NUM_ENTRIES];
-   uint clear_flags[(MAX_WIDTH / TILE_SIZE) * (MAX_HEIGHT / TILE_SIZE) / 32];
-   float clear_color[4];  /**< for color bufs */
-   uint clear_val;        /**< for z+stencil, or packed color clear value */
-   boolean depth_stencil; /**< Is the surface a depth/stencil format? */
-
-   struct pipe_transfer *tex_trans;
-   void *tex_trans_map;
-   int tex_face, tex_level, tex_z;
-
-   struct softpipe_cached_tile tile;  /**< scratch tile for clears */
-};
 
 
 /**
@@ -76,7 +46,7 @@ struct softpipe_tile_cache
  * a LRU replacement policy.
  */
 #define CACHE_POS(x, y) \
-   (((x) / TILE_SIZE + ((y) / TILE_SIZE) * 5) % NUM_ENTRIES)
+   (((x) + (y) * 5) % NUM_ENTRIES)
 
 
 
@@ -84,12 +54,10 @@ struct softpipe_tile_cache
  * Is the tile at (x,y) in cleared state?
  */
 static INLINE uint
-is_clear_flag_set(const uint *bitvec, int x, int y)
+is_clear_flag_set(const uint *bitvec, union tile_address addr)
 {
    int pos, bit;
-   x /= TILE_SIZE;
-   y /= TILE_SIZE;
-   pos = y * (MAX_WIDTH / TILE_SIZE) + x;
+   pos = addr.bits.y * (MAX_WIDTH / TILE_SIZE) + addr.bits.x;
    assert(pos / 32 < (MAX_WIDTH / TILE_SIZE) * (MAX_HEIGHT / TILE_SIZE) / 32);
    bit = bitvec[pos / 32] & (1 << (pos & 31));
    return bit;
@@ -100,12 +68,10 @@ is_clear_flag_set(const uint *bitvec, int x, int y)
  * Mark the tile at (x,y) as not cleared.
  */
 static INLINE void
-clear_clear_flag(uint *bitvec, int x, int y)
+clear_clear_flag(uint *bitvec, union tile_address addr)
 {
    int pos;
-   x /= TILE_SIZE;
-   y /= TILE_SIZE;
-   pos = y * (MAX_WIDTH / TILE_SIZE) + x;
+   pos = addr.bits.y * (MAX_WIDTH / TILE_SIZE) + addr.bits.x;
    assert(pos / 32 < (MAX_WIDTH / TILE_SIZE) * (MAX_HEIGHT / TILE_SIZE) / 32);
    bitvec[pos / 32] &= ~(1 << (pos & 31));
 }
@@ -127,9 +93,9 @@ sp_create_tile_cache( struct pipe_screen *screen )
    if (tc) {
       tc->screen = screen;
       for (pos = 0; pos < NUM_ENTRIES; pos++) {
-         tc->entries[pos].x =
-         tc->entries[pos].y = -1;
+         tc->entries[pos].addr.bits.invalid = 1;
       }
+      tc->last_tile = &tc->entries[0]; /* any tile */
    }
    return tc;
 }
@@ -148,10 +114,6 @@ sp_destroy_tile_cache(struct softpipe_tile_cache *tc)
       screen = tc->transfer->texture->screen;
       screen->tex_transfer_destroy(tc->transfer);
    }
-   if (tc->tex_trans) {
-      screen = tc->tex_trans->texture->screen;
-      screen->tex_transfer_destroy(tc->tex_trans);
-   }
 
    FREE( tc );
 }
@@ -164,8 +126,6 @@ void
 sp_tile_cache_set_surface(struct softpipe_tile_cache *tc,
                           struct pipe_surface *ps)
 {
-   assert(!tc->texture);
-
    if (tc->transfer) {
       struct pipe_screen *screen = tc->transfer->texture->screen;
 
@@ -217,9 +177,6 @@ sp_tile_cache_map_transfers(struct softpipe_tile_cache *tc)
 {
    if (tc->transfer && !tc->transfer_map)
       tc->transfer_map = tc->screen->transfer_map(tc->screen, tc->transfer);
-
-   if (tc->tex_trans && !tc->tex_trans_map)
-      tc->tex_trans_map = tc->screen->transfer_map(tc->screen, tc->tex_trans);
 }
 
 
@@ -230,47 +187,6 @@ sp_tile_cache_unmap_transfers(struct softpipe_tile_cache *tc)
       tc->screen->transfer_unmap(tc->screen, tc->transfer);
       tc->transfer_map = NULL;
    }
-
-   if (tc->tex_trans_map) {
-      tc->screen->transfer_unmap(tc->screen, tc->tex_trans);
-      tc->tex_trans_map = NULL;
-   }
-}
-
-
-/**
- * Specify the texture to cache.
- */
-void
-sp_tile_cache_set_texture(struct pipe_context *pipe,
-                          struct softpipe_tile_cache *tc,
-                          struct pipe_texture *texture)
-{
-   uint i;
-
-   assert(!tc->transfer);
-
-   pipe_texture_reference(&tc->texture, texture);
-
-   if (tc->tex_trans) {
-      struct pipe_screen *screen = tc->tex_trans->texture->screen;
-
-      if (tc->tex_trans_map) {
-         screen->transfer_unmap(screen, tc->tex_trans);
-         tc->tex_trans_map = NULL;
-      }
-
-      screen->tex_transfer_destroy(tc->tex_trans);
-      tc->tex_trans = NULL;
-   }
-
-   /* mark as entries as invalid/empty */
-   /* XXX we should try to avoid this when the teximage hasn't changed */
-   for (i = 0; i < NUM_ENTRIES; i++) {
-      tc->entries[i].x = -1;
-   }
-
-   tc->tex_face = -1; /* any invalid value here */
 }
 
 
@@ -314,7 +230,7 @@ clear_tile(struct softpipe_cached_tile *tile,
 
    switch (pf_get_size(format)) {
    case 1:
-      memset(tile->data.any, 0, TILE_SIZE * TILE_SIZE);
+      memset(tile->data.any, clear_value, TILE_SIZE * TILE_SIZE);
       break;
    case 2:
       if (clear_value == 0) {
@@ -350,8 +266,7 @@ clear_tile(struct softpipe_cached_tile *tile,
  * Actually clear the tiles which were flagged as being in a clear state.
  */
 static void
-sp_tile_cache_flush_clear(struct pipe_context *pipe,
-                          struct softpipe_tile_cache *tc)
+sp_tile_cache_flush_clear(struct softpipe_tile_cache *tc)
 {
    struct pipe_transfer *pt = tc->transfer;
    const uint w = tc->transfer->width;
@@ -365,13 +280,15 @@ sp_tile_cache_flush_clear(struct pipe_context *pipe,
    /* push the tile to all positions marked as clear */
    for (y = 0; y < h; y += TILE_SIZE) {
       for (x = 0; x < w; x += TILE_SIZE) {
-         if (is_clear_flag_set(tc->clear_flags, x, y)) {
+         union tile_address addr = tile_address(x, y);
+
+         if (is_clear_flag_set(tc->clear_flags, addr)) {
             pipe_put_tile_raw(pt,
                               x, y, TILE_SIZE, TILE_SIZE,
                               tc->tile.data.color32, 0/*STRIDE*/);
 
             /* do this? */
-            clear_clear_flag(tc->clear_flags, x, y);
+            clear_clear_flag(tc->clear_flags, addr);
 
             numCleared++;
          }
@@ -388,8 +305,7 @@ sp_tile_cache_flush_clear(struct pipe_context *pipe,
  * any tiles "flagged" as cleared will be "really" cleared.
  */
 void
-sp_flush_tile_cache(struct softpipe_context *softpipe,
-                    struct softpipe_tile_cache *tc)
+sp_flush_tile_cache(struct softpipe_tile_cache *tc)
 {
    struct pipe_transfer *pt = tc->transfer;
    int inuse = 0, pos;
@@ -398,33 +314,30 @@ sp_flush_tile_cache(struct softpipe_context *softpipe,
       /* caching a drawing transfer */
       for (pos = 0; pos < NUM_ENTRIES; pos++) {
          struct softpipe_cached_tile *tile = tc->entries + pos;
-         if (tile->x >= 0) {
+         if (!tile->addr.bits.invalid) {
             if (tc->depth_stencil) {
                pipe_put_tile_raw(pt,
-                                 tile->x, tile->y, TILE_SIZE, TILE_SIZE,
+                                 tile->addr.bits.x * TILE_SIZE, 
+                                 tile->addr.bits.y * TILE_SIZE, 
+                                 TILE_SIZE, TILE_SIZE,
                                  tile->data.depth32, 0/*STRIDE*/);
             }
             else {
                pipe_put_tile_rgba(pt,
-                                  tile->x, tile->y, TILE_SIZE, TILE_SIZE,
+                                  tile->addr.bits.x * TILE_SIZE, 
+                                  tile->addr.bits.y * TILE_SIZE, 
+                                  TILE_SIZE, TILE_SIZE,
                                   (float *) tile->data.color);
             }
-            tile->x = tile->y = -1;  /* mark as empty */
+            tile->addr.bits.invalid = 1;  /* mark as empty */
             inuse++;
          }
       }
 
 #if TILE_CLEAR_OPTIMIZATION
-      sp_tile_cache_flush_clear(&softpipe->pipe, tc);
+      sp_tile_cache_flush_clear(tc);
 #endif
    }
-   else if (tc->texture) {
-      /* caching a texture, mark all entries as empty */
-      for (pos = 0; pos < NUM_ENTRIES; pos++) {
-         tc->entries[pos].x = -1;
-      }
-      tc->tex_face = -1;
-   }
 
 #if 0
    debug_printf("flushed tiles in use: %d\n", inuse);
@@ -437,40 +350,39 @@ sp_flush_tile_cache(struct softpipe_context *softpipe,
  * \param x, y  position of tile, in pixels
  */
 struct softpipe_cached_tile *
-sp_get_cached_tile(struct softpipe_context *softpipe,
-                   struct softpipe_tile_cache *tc, int x, int y)
+sp_find_cached_tile(struct softpipe_tile_cache *tc, 
+                    union tile_address addr )
 {
    struct pipe_transfer *pt = tc->transfer;
-
-   /* tile pos in framebuffer: */
-   const int tile_x = x & ~(TILE_SIZE - 1);
-   const int tile_y = y & ~(TILE_SIZE - 1);
-
+   
    /* cache pos/entry: */
-   const int pos = CACHE_POS(x, y);
+   const int pos = CACHE_POS(addr.bits.x,
+                             addr.bits.y);
    struct softpipe_cached_tile *tile = tc->entries + pos;
 
-   if (tile_x != tile->x ||
-       tile_y != tile->y) {
+   if (addr.value != tile->addr.value) {
 
-      if (tile->x != -1) {
+      if (tile->addr.bits.invalid == 0) {
          /* put dirty tile back in framebuffer */
          if (tc->depth_stencil) {
             pipe_put_tile_raw(pt,
-                              tile->x, tile->y, TILE_SIZE, TILE_SIZE,
+                              tile->addr.bits.x * TILE_SIZE,
+                              tile->addr.bits.y * TILE_SIZE,
+                              TILE_SIZE, TILE_SIZE,
                               tile->data.depth32, 0/*STRIDE*/);
          }
          else {
             pipe_put_tile_rgba(pt,
-                               tile->x, tile->y, TILE_SIZE, TILE_SIZE,
+                               tile->addr.bits.x * TILE_SIZE,
+                               tile->addr.bits.y * TILE_SIZE,
+                               TILE_SIZE, TILE_SIZE,
                                (float *) tile->data.color);
          }
       }
 
-      tile->x = tile_x;
-      tile->y = tile_y;
+      tile->addr = addr;
 
-      if (is_clear_flag_set(tc->clear_flags, x, y)) {
+      if (is_clear_flag_set(tc->clear_flags, addr)) {
          /* don't get tile from framebuffer, just clear it */
          if (tc->depth_stencil) {
             clear_tile(tile, pt->format, tc->clear_val);
@@ -478,125 +390,33 @@ sp_get_cached_tile(struct softpipe_context *softpipe,
          else {
             clear_tile_rgba(tile, pt->format, tc->clear_color);
          }
-         clear_clear_flag(tc->clear_flags, x, y);
+         clear_clear_flag(tc->clear_flags, addr);
       }
       else {
          /* get new tile data from transfer */
          if (tc->depth_stencil) {
             pipe_get_tile_raw(pt,
-                              tile->x, tile->y, TILE_SIZE, TILE_SIZE,
+                              tile->addr.bits.x * TILE_SIZE, 
+                              tile->addr.bits.y * TILE_SIZE, 
+                              TILE_SIZE, TILE_SIZE,
                               tile->data.depth32, 0/*STRIDE*/);
          }
          else {
             pipe_get_tile_rgba(pt,
-                               tile->x, tile->y, TILE_SIZE, TILE_SIZE,
+                               tile->addr.bits.x * TILE_SIZE, 
+                               tile->addr.bits.y * TILE_SIZE,
+                               TILE_SIZE, TILE_SIZE,
                                (float *) tile->data.color);
          }
       }
    }
 
+   tc->last_tile = tile;
    return tile;
 }
 
 
-/**
- * Given the texture face, level, zslice, x and y values, compute
- * the cache entry position/index where we'd hope to find the
- * cached texture tile.
- * This is basically a direct-map cache.
- * XXX There's probably lots of ways in which we can improve this.
- */
-static INLINE uint
-tex_cache_pos(int x, int y, int z, int face, int level)
-{
-   uint entry = x + y * 9 + z * 3 + face + level * 7;
-   return entry % NUM_ENTRIES;
-}
-
 
-/**
- * Similar to sp_get_cached_tile() but for textures.
- * Tiles are read-only and indexed with more params.
- */
-const struct softpipe_cached_tile *
-sp_get_cached_tile_tex(struct softpipe_context *sp,
-                       struct softpipe_tile_cache *tc, int x, int y, int z,
-                       int face, int level)
-{
-   struct pipe_screen *screen = sp->pipe.screen;
-   /* tile pos in framebuffer: */
-   const int tile_x = x & ~(TILE_SIZE - 1);
-   const int tile_y = y & ~(TILE_SIZE - 1);
-   /* cache pos/entry: */
-   const uint pos = tex_cache_pos(x / TILE_SIZE, y / TILE_SIZE, z,
-                                  face, level);
-   struct softpipe_cached_tile *tile = tc->entries + pos;
-
-   if (tc->texture) {
-      struct softpipe_texture *spt = softpipe_texture(tc->texture);
-      if (spt->modified) {
-         /* texture was modified, invalidate all cached tiles */
-         uint p;
-         for (p = 0; p < NUM_ENTRIES; p++) {
-            tile = tc->entries + p;
-            tile->x = -1;
-         }
-         spt->modified = FALSE;
-      }
-   }
-
-   if (tile_x != tile->x ||
-       tile_y != tile->y ||
-       z != tile->z ||
-       face != tile->face ||
-       level != tile->level) {
-      /* cache miss */
-
-#if 0
-      printf("miss at %u  x=%d y=%d z=%d face=%d level=%d\n", pos,
-             x/TILE_SIZE, y/TILE_SIZE, z, face, level);
-#endif
-      /* check if we need to get a new transfer */
-      if (!tc->tex_trans ||
-          tc->tex_face != face ||
-          tc->tex_level != level ||
-          tc->tex_z != z) {
-         /* get new transfer (view into texture) */
-
-         if (tc->tex_trans) {
-            if (tc->tex_trans_map) {
-               tc->screen->transfer_unmap(tc->screen, tc->tex_trans);
-               tc->tex_trans_map = NULL;
-            }
-
-            screen->tex_transfer_destroy(tc->tex_trans);
-            tc->tex_trans = NULL;
-         }
-
-         tc->tex_trans = screen->get_tex_transfer(screen, tc->texture, face, level, z, 
-                                                  PIPE_TRANSFER_READ, 0, 0,
-                                                  tc->texture->width[level],
-                                                  tc->texture->height[level]);
-         tc->tex_trans_map = screen->transfer_map(screen, tc->tex_trans);
-
-         tc->tex_face = face;
-         tc->tex_level = level;
-         tc->tex_z = z;
-      }
-
-      /* get tile from the transfer (view into texture) */
-      pipe_get_tile_rgba(tc->tex_trans,
-                         tile_x, tile_y, TILE_SIZE, TILE_SIZE,
-                         (float *) tile->data.color);
-      tile->x = tile_x;
-      tile->y = tile_y;
-      tile->z = z;
-      tile->face = face;
-      tile->level = level;
-   }
-
-   return tile;
-}
 
 
 /**
@@ -627,6 +447,6 @@ sp_tile_cache_clear(struct softpipe_tile_cache *tc, const float *rgba,
 
    for (pos = 0; pos < NUM_ENTRIES; pos++) {
       struct softpipe_cached_tile *tile = tc->entries + pos;
-      tile->x = tile->y = -1;
+      tile->addr.bits.invalid = 1;
    }
 }
diff --git a/src/gallium/drivers/softpipe/sp_tile_cache.h b/src/gallium/drivers/softpipe/sp_tile_cache.h
index 8f247d0e58..a12092702a 100644
--- a/src/gallium/drivers/softpipe/sp_tile_cache.h
+++ b/src/gallium/drivers/softpipe/sp_tile_cache.h
@@ -34,7 +34,6 @@
 #include "pipe/p_compiler.h"
 
 
-struct softpipe_context;
 struct softpipe_tile_cache;
 
 
@@ -44,11 +43,23 @@ struct softpipe_tile_cache;
 #define TILE_SIZE 64
 
 
+/* If we need to support > 4096, just expand this to be a 64 bit
+ * union, or consider tiling in Z as well.
+ */
+union tile_address {
+   struct {
+      unsigned x:6;             /* 4096 / TILE_SIZE */
+      unsigned y:6;             /* 4096 / TILE_SIZE */
+      unsigned invalid:1;
+      unsigned pad:19;
+   } bits;
+   unsigned value;
+};
+
 
 struct softpipe_cached_tile
 {
-   int x, y;           /**< pos of tile in window coords */
-   int z, face, level; /**< Extra texture indexes */
+   union tile_address addr;
    union {
       float color[TILE_SIZE][TILE_SIZE][4];
       uint color32[TILE_SIZE][TILE_SIZE];
@@ -59,6 +70,32 @@ struct softpipe_cached_tile
    } data;
 };
 
+#define NUM_ENTRIES 50
+
+
+/** XXX move these */
+#define MAX_WIDTH 4096
+#define MAX_HEIGHT 4096
+
+
+struct softpipe_tile_cache
+{
+   struct pipe_screen *screen;
+   struct pipe_surface *surface;  /**< the surface we're caching */
+   struct pipe_transfer *transfer;
+   void *transfer_map;
+
+   struct softpipe_cached_tile entries[NUM_ENTRIES];
+   uint clear_flags[(MAX_WIDTH / TILE_SIZE) * (MAX_HEIGHT / TILE_SIZE) / 32];
+   float clear_color[4];  /**< for color bufs */
+   uint clear_val;        /**< for z+stencil, or packed color clear value */
+   boolean depth_stencil; /**< Is the surface a depth/stencil format? */
+
+   struct softpipe_cached_tile tile;  /**< scratch tile for clears */
+
+   struct softpipe_cached_tile *last_tile;  /**< most recently retrieved tile */
+};
+
 
 extern struct softpipe_tile_cache *
 sp_create_tile_cache( struct pipe_screen *screen );
@@ -80,26 +117,45 @@ extern void
 sp_tile_cache_unmap_transfers(struct softpipe_tile_cache *tc);
 
 extern void
-sp_tile_cache_set_texture(struct pipe_context *pipe,
-                          struct softpipe_tile_cache *tc,
-                          struct pipe_texture *texture);
-
-extern void
-sp_flush_tile_cache(struct softpipe_context *softpipe,
-                    struct softpipe_tile_cache *tc);
+sp_flush_tile_cache(struct softpipe_tile_cache *tc);
 
 extern void
 sp_tile_cache_clear(struct softpipe_tile_cache *tc, const float *rgba,
                     uint clearValue);
 
 extern struct softpipe_cached_tile *
-sp_get_cached_tile(struct softpipe_context *softpipe,
-                   struct softpipe_tile_cache *tc, int x, int y);
+sp_find_cached_tile(struct softpipe_tile_cache *tc, 
+                    union tile_address addr );
+
+
+static INLINE union tile_address
+tile_address( unsigned x,
+              unsigned y )
+{
+   union tile_address addr;
+
+   addr.value = 0;
+   addr.bits.x = x / TILE_SIZE;
+   addr.bits.y = y / TILE_SIZE;
+      
+   return addr;
+}
+
+/* Quickly retrieve tile if it matches last lookup.
+ */
+static INLINE struct softpipe_cached_tile *
+sp_get_cached_tile(struct softpipe_tile_cache *tc, 
+                   int x, int y )
+{
+   union tile_address addr = tile_address( x, y );
+
+   if (tc->last_tile->addr.value == addr.value)
+      return tc->last_tile;
+
+   return sp_find_cached_tile( tc, addr );
+}
+
 
-extern const struct softpipe_cached_tile *
-sp_get_cached_tile_tex(struct softpipe_context *softpipe,
-                       struct softpipe_tile_cache *tc, int x, int y, int z,
-                       int face, int level);
 
 
 #endif /* SP_TILE_CACHE_H */
diff --git a/src/gallium/drivers/trace/tr_context.c b/src/gallium/drivers/trace/tr_context.c
index ae0af4d055..bf470b46ae 100644
--- a/src/gallium/drivers/trace/tr_context.c
+++ b/src/gallium/drivers/trace/tr_context.c
@@ -125,11 +125,11 @@ trace_context_draw_block(struct trace_context *tr_ctx, int flag)
    } else if ((tr_ctx->draw_rule.blocker & flag) &&
               (tr_ctx->draw_blocker & 4)) {
       boolean block = FALSE;
-      debug_printf("%s (%lu %lu) (%lu %lu) (%lu %u) (%lu %u)\n", __FUNCTION__,
-					tr_ctx->draw_rule.fs, tr_ctx->curr.fs,
-					tr_ctx->draw_rule.vs, tr_ctx->curr.vs,
-					tr_ctx->draw_rule.surf, 0,
-					tr_ctx->draw_rule.tex, 0);
+      debug_printf("%s (%p %p) (%p %p) (%p %u) (%p %u)\n", __FUNCTION__,
+                   (void *) tr_ctx->draw_rule.fs, (void *) tr_ctx->curr.fs,
+                   (void *) tr_ctx->draw_rule.vs, (void *) tr_ctx->curr.vs,
+                   (void *) tr_ctx->draw_rule.surf, 0,
+                   (void *) tr_ctx->draw_rule.tex, 0);
       if (tr_ctx->draw_rule.fs &&
           tr_ctx->draw_rule.fs == tr_ctx->curr.fs)
          block = TRUE;