Diffstat (limited to 'src/gallium/auxiliary')
73 files changed, 4166 insertions, 1894 deletions
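The largest change below adds a small API to the draw module for stage-allocated vertex attributes, draw_alloc_extra_vertex_attrib() and draw_remove_extra_vertex_attribs(), replacing the old single-slot extra_shader_outputs fields. As a reading aid, here is a minimal usage sketch based only on the prototypes and the aaline/aapoint/wide_point call sites in this diff; the stage/function names, the generic_index parameter, and the include choices are illustrative assumptions, not part of the commit.

/*
 * Illustrative sketch (not part of the commit): how a draw pipeline stage
 * is expected to use the new extra-attribute helpers.  It mirrors the
 * aaline/wide_point call sites in this diff; "my_stage_first_prim",
 * "my_stage_flush" and "generic_index" are hypothetical names.
 */
#include "pipe/p_shader_tokens.h"
#include "draw_context.h"   /* draw_find_shader_output() */
#include "draw_private.h"   /* extra-attribute prototypes added by this commit */

static void
my_stage_first_prim(struct draw_context *draw, uint generic_index)
{
   /* Reuse an existing post-transform output if one matches, otherwise
    * allocate a new extra slot after the shader's own outputs.
    */
   int slot = draw_find_shader_output(draw, TGSI_SEMANTIC_GENERIC,
                                      generic_index);
   if (slot <= 0) {
      slot = draw_alloc_extra_vertex_attrib(draw, TGSI_SEMANTIC_GENERIC,
                                            generic_index);
   }
   /* ... remember 'slot' and write per-vertex data into v->data[slot] ... */
}

static void
my_stage_flush(struct draw_context *draw)
{
   /* Release every extra attribute this stage allocated. */
   draw_remove_extra_vertex_attribs(draw);
}

As the draw_context.c hunk shows, allocated attributes are appended after the current vertex/geometry shader outputs, and draw_num_shader_outputs() now includes them via extra_shader_outputs.num, so more than one stage-added attribute (e.g. simulated gl_FragCoord and gl_PointCoord) can coexist.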
diff --git a/src/gallium/auxiliary/Makefile b/src/gallium/auxiliary/Makefile index 2de764c4ee..abd33f6eef 100644 --- a/src/gallium/auxiliary/Makefile +++ b/src/gallium/auxiliary/Makefile @@ -8,6 +8,7 @@ C_SOURCES = \ cso_cache/cso_context.c \ cso_cache/cso_hash.c \ draw/draw_context.c \ + draw/draw_fs.c \ draw/draw_gs.c \ draw/draw_pipe.c \ draw/draw_pipe_aaline.c \ @@ -121,6 +122,7 @@ C_SOURCES = \ util/u_handle_table.c \ util/u_hash.c \ util/u_hash_table.c \ + util/u_index_modify.c \ util/u_keymap.c \ util/u_linear.c \ util/u_linkage.c \ @@ -174,6 +176,7 @@ GALLIVM_SOURCES = \ gallivm/lp_bld_struct.c \ gallivm/lp_bld_swizzle.c \ gallivm/lp_bld_tgsi_aos.c \ + gallivm/lp_bld_tgsi_info.c \ gallivm/lp_bld_tgsi_soa.c \ gallivm/lp_bld_type.c \ draw/draw_llvm.c \ diff --git a/src/gallium/auxiliary/SConscript b/src/gallium/auxiliary/SConscript index 294df30094..94cd74424a 100644 --- a/src/gallium/auxiliary/SConscript +++ b/src/gallium/auxiliary/SConscript @@ -54,6 +54,7 @@ source = [ 'cso_cache/cso_context.c', 'cso_cache/cso_hash.c', 'draw/draw_context.c', + 'draw/draw_fs.c', 'draw/draw_gs.c', 'draw/draw_pipe.c', 'draw/draw_pipe_aaline.c', @@ -170,6 +171,7 @@ source = [ 'util/u_handle_table.c', 'util/u_hash.c', 'util/u_hash_table.c', + 'util/u_index_modify.c', 'util/u_keymap.c', 'util/u_linear.c', 'util/u_linkage.c', @@ -225,6 +227,7 @@ if env['llvm']: 'gallivm/lp_bld_struct.c', 'gallivm/lp_bld_swizzle.c', 'gallivm/lp_bld_tgsi_aos.c', + 'gallivm/lp_bld_tgsi_info.c', 'gallivm/lp_bld_tgsi_soa.c', 'gallivm/lp_bld_type.c', 'draw/draw_llvm.c', diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c index 937b093479..40f654643b 100644 --- a/src/gallium/auxiliary/draw/draw_context.c +++ b/src/gallium/auxiliary/draw/draw_context.c @@ -413,6 +413,42 @@ draw_set_force_passthrough( struct draw_context *draw, boolean enable ) } + +/** + * Allocate an extra vertex/geometry shader vertex attribute. + * This is used by some of the optional draw module stages such + * as wide_point which may need to allocate additional generic/texcoord + * attributes. + */ +int +draw_alloc_extra_vertex_attrib(struct draw_context *draw, + uint semantic_name, uint semantic_index) +{ + const int num_outputs = draw_current_shader_outputs(draw); + const int n = draw->extra_shader_outputs.num; + + assert(n < Elements(draw->extra_shader_outputs.semantic_name)); + + draw->extra_shader_outputs.semantic_name[n] = semantic_name; + draw->extra_shader_outputs.semantic_index[n] = semantic_index; + draw->extra_shader_outputs.slot[n] = num_outputs + n; + draw->extra_shader_outputs.num++; + + return draw->extra_shader_outputs.slot[n]; +} + + +/** + * Remove all extra vertex attributes that were allocated with + * draw_alloc_extra_vertex_attrib(). + */ +void +draw_remove_extra_vertex_attribs(struct draw_context *draw) +{ + draw->extra_shader_outputs.num = 0; +} + + /** * Ask the draw module for the location/slot of the given vertex attribute in * a post-transformed vertex. @@ -446,12 +482,12 @@ draw_find_shader_output(const struct draw_context *draw, return i; } - /* XXX there may be more than one extra vertex attrib. - * For example, simulated gl_FragCoord and gl_PointCoord. 
- */ - if (draw->extra_shader_outputs.semantic_name == semantic_name && - draw->extra_shader_outputs.semantic_index == semantic_index) { - return draw->extra_shader_outputs.slot; + /* Search the extra vertex attributes */ + for (i = 0; i < draw->extra_shader_outputs.num; i++) { + if (draw->extra_shader_outputs.semantic_name[i] == semantic_name && + draw->extra_shader_outputs.semantic_index[i] == semantic_index) { + return draw->extra_shader_outputs.slot[i]; + } } return 0; @@ -470,16 +506,18 @@ draw_find_shader_output(const struct draw_context *draw, uint draw_num_shader_outputs(const struct draw_context *draw) { - uint count = draw->vs.vertex_shader->info.num_outputs; + uint count; /* If a geometry shader is present, its outputs go to the * driver, else the vertex shader's outputs. */ if (draw->gs.geometry_shader) count = draw->gs.geometry_shader->info.num_outputs; + else + count = draw->vs.vertex_shader->info.num_outputs; + + count += draw->extra_shader_outputs.num; - if (draw->extra_shader_outputs.slot > 0) - count++; return count; } @@ -671,6 +709,11 @@ draw_set_samplers(struct draw_context *draw, draw->samplers[i] = NULL; draw->num_samplers = num; + +#ifdef HAVE_LLVM + if (draw->llvm) + draw_llvm_set_sampler_state(draw); +#endif } void @@ -678,9 +721,9 @@ draw_set_mapped_texture(struct draw_context *draw, unsigned sampler_idx, uint32_t width, uint32_t height, uint32_t depth, uint32_t last_level, - uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS], - uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS], - const void *data[DRAW_MAX_TEXTURE_LEVELS]) + uint32_t row_stride[PIPE_MAX_TEXTURE_LEVELS], + uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS], + const void *data[PIPE_MAX_TEXTURE_LEVELS]) { #ifdef HAVE_LLVM if(draw->llvm) diff --git a/src/gallium/auxiliary/draw/draw_context.h b/src/gallium/auxiliary/draw/draw_context.h index 4f0d30123a..ff4f753604 100644 --- a/src/gallium/auxiliary/draw/draw_context.h +++ b/src/gallium/auxiliary/draw/draw_context.h @@ -46,9 +46,9 @@ struct draw_context; struct draw_stage; struct draw_vertex_shader; struct draw_geometry_shader; +struct draw_fragment_shader; struct tgsi_sampler; -#define DRAW_MAX_TEXTURE_LEVELS 13 /* 4K x 4K for now */ struct draw_context *draw_create( struct pipe_context *pipe ); @@ -119,9 +119,9 @@ draw_set_mapped_texture(struct draw_context *draw, unsigned sampler_idx, uint32_t width, uint32_t height, uint32_t depth, uint32_t last_level, - uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS], - uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS], - const void *data[DRAW_MAX_TEXTURE_LEVELS]); + uint32_t row_stride[PIPE_MAX_TEXTURE_LEVELS], + uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS], + const void *data[PIPE_MAX_TEXTURE_LEVELS]); /* @@ -138,6 +138,17 @@ void draw_delete_vertex_shader(struct draw_context *draw, /* + * Fragment shader functions + */ +struct draw_fragment_shader * +draw_create_fragment_shader(struct draw_context *draw, + const struct pipe_shader_state *shader); +void draw_bind_fragment_shader(struct draw_context *draw, + struct draw_fragment_shader *dvs); +void draw_delete_fragment_shader(struct draw_context *draw, + struct draw_fragment_shader *dvs); + +/* * Geometry shader functions */ struct draw_geometry_shader * diff --git a/src/gallium/auxiliary/draw/draw_fs.c b/src/gallium/auxiliary/draw/draw_fs.c new file mode 100644 index 0000000000..1543bd86f1 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_fs.c @@ -0,0 +1,73 @@ +/************************************************************************** + * + * Copyright 2010 VMware, Inc. 
+ * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "pipe/p_shader_tokens.h" + +#include "util/u_math.h" +#include "util/u_memory.h" +#include "util/u_prim.h" + +#include "tgsi/tgsi_parse.h" + +#include "draw_fs.h" +#include "draw_private.h" +#include "draw_context.h" + + +struct draw_fragment_shader * +draw_create_fragment_shader(struct draw_context *draw, + const struct pipe_shader_state *shader) +{ + struct draw_fragment_shader *dfs; + + dfs = CALLOC_STRUCT(draw_fragment_shader); + if (dfs) { + dfs->base = *shader; + tgsi_scan_shader(shader->tokens, &dfs->info); + } + + return dfs; +} + + +void +draw_bind_fragment_shader(struct draw_context *draw, + struct draw_fragment_shader *dfs) +{ + draw_do_flush(draw, DRAW_FLUSH_STATE_CHANGE); + + draw->fs.fragment_shader = dfs; +} + + +void +draw_delete_fragment_shader(struct draw_context *draw, + struct draw_fragment_shader *dfs) +{ + FREE(dfs); +} + diff --git a/src/gallium/auxiliary/draw/draw_fs.h b/src/gallium/auxiliary/draw/draw_fs.h new file mode 100644 index 0000000000..44995b8277 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_fs.h @@ -0,0 +1,42 @@ +/************************************************************************** + * + * Copyright 2010 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef DRAW_FS_H +#define DRAW_FS_H + + +#include "tgsi/tgsi_scan.h" + + +struct draw_fragment_shader +{ + struct pipe_shader_state base; + struct tgsi_shader_info info; +}; + + +#endif /* DRAW_FS_H */ diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c index 8759c38cab..d94340367c 100644 --- a/src/gallium/auxiliary/draw/draw_llvm.c +++ b/src/gallium/auxiliary/draw/draw_llvm.c @@ -44,6 +44,7 @@ #include "tgsi/tgsi_dump.h" #include "util/u_cpu_detect.h" +#include "util/u_math.h" #include "util/u_pointer.h" #include "util/u_string.h" @@ -71,12 +72,17 @@ init_globals(struct draw_llvm *llvm) elem_types[DRAW_JIT_TEXTURE_DEPTH] = LLVMInt32Type(); elem_types[DRAW_JIT_TEXTURE_LAST_LEVEL] = LLVMInt32Type(); elem_types[DRAW_JIT_TEXTURE_ROW_STRIDE] = - LLVMArrayType(LLVMInt32Type(), DRAW_MAX_TEXTURE_LEVELS); + LLVMArrayType(LLVMInt32Type(), PIPE_MAX_TEXTURE_LEVELS); elem_types[DRAW_JIT_TEXTURE_IMG_STRIDE] = - LLVMArrayType(LLVMInt32Type(), DRAW_MAX_TEXTURE_LEVELS); + LLVMArrayType(LLVMInt32Type(), PIPE_MAX_TEXTURE_LEVELS); elem_types[DRAW_JIT_TEXTURE_DATA] = LLVMArrayType(LLVMPointerType(LLVMInt8Type(), 0), - DRAW_MAX_TEXTURE_LEVELS); + PIPE_MAX_TEXTURE_LEVELS); + elem_types[DRAW_JIT_TEXTURE_MIN_LOD] = LLVMFloatType(); + elem_types[DRAW_JIT_TEXTURE_MAX_LOD] = LLVMFloatType(); + elem_types[DRAW_JIT_TEXTURE_LOD_BIAS] = LLVMFloatType(); + elem_types[DRAW_JIT_TEXTURE_BORDER_COLOR] = + LLVMArrayType(LLVMFloatType(), 4); texture_type = LLVMStructType(elem_types, Elements(elem_types), 0); @@ -101,6 +107,18 @@ init_globals(struct draw_llvm *llvm) LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, data, llvm->target, texture_type, DRAW_JIT_TEXTURE_DATA); + LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, min_lod, + llvm->target, texture_type, + DRAW_JIT_TEXTURE_MIN_LOD); + LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, max_lod, + llvm->target, texture_type, + DRAW_JIT_TEXTURE_MAX_LOD); + LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, lod_bias, + llvm->target, texture_type, + DRAW_JIT_TEXTURE_LOD_BIAS); + LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, border_color, + llvm->target, texture_type, + DRAW_JIT_TEXTURE_BORDER_COLOR); LP_CHECK_STRUCT_SIZE(struct draw_jit_texture, llvm->target, texture_type); @@ -1048,9 +1066,9 @@ draw_llvm_set_mapped_texture(struct draw_context *draw, unsigned sampler_idx, uint32_t width, uint32_t height, uint32_t depth, uint32_t last_level, - uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS], - uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS], - const void *data[DRAW_MAX_TEXTURE_LEVELS]) + uint32_t row_stride[PIPE_MAX_TEXTURE_LEVELS], + uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS], + const void *data[PIPE_MAX_TEXTURE_LEVELS]) { unsigned j; struct draw_jit_texture *jit_tex; @@ -1072,6 +1090,25 @@ draw_llvm_set_mapped_texture(struct draw_context *draw, } } + +void +draw_llvm_set_sampler_state(struct draw_context *draw) +{ + unsigned i; + + for (i = 0; i < draw->num_samplers; i++) { + struct draw_jit_texture *jit_tex = &draw->llvm->jit_context.textures[i]; + + if (draw->samplers[i]) { + jit_tex->min_lod = draw->samplers[i]->min_lod; + jit_tex->max_lod = draw->samplers[i]->max_lod; + jit_tex->lod_bias = 
draw->samplers[i]->lod_bias; + COPY_4V(jit_tex->border_color, draw->samplers[i]->border_color); + } + } +} + + void draw_llvm_destroy_variant(struct draw_llvm_variant *variant) { diff --git a/src/gallium/auxiliary/draw/draw_llvm.h b/src/gallium/auxiliary/draw/draw_llvm.h index 6196b2f983..de89b657f3 100644 --- a/src/gallium/auxiliary/draw/draw_llvm.h +++ b/src/gallium/auxiliary/draw/draw_llvm.h @@ -41,7 +41,6 @@ #include <llvm-c/Target.h> #include <llvm-c/ExecutionEngine.h> -#define DRAW_MAX_TEXTURE_LEVELS 13 /* 4K x 4K for now */ struct draw_llvm; struct llvm_vertex_shader; @@ -52,9 +51,13 @@ struct draw_jit_texture uint32_t height; uint32_t depth; uint32_t last_level; - uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS]; - uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS]; - const void *data[DRAW_MAX_TEXTURE_LEVELS]; + uint32_t row_stride[PIPE_MAX_TEXTURE_LEVELS]; + uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS]; + const void *data[PIPE_MAX_TEXTURE_LEVELS]; + float min_lod; + float max_lod; + float lod_bias; + float border_color[4]; }; enum { @@ -65,6 +68,10 @@ enum { DRAW_JIT_TEXTURE_ROW_STRIDE, DRAW_JIT_TEXTURE_IMG_STRIDE, DRAW_JIT_TEXTURE_DATA, + DRAW_JIT_TEXTURE_MIN_LOD, + DRAW_JIT_TEXTURE_MAX_LOD, + DRAW_JIT_TEXTURE_LOD_BIAS, + DRAW_JIT_TEXTURE_BORDER_COLOR, DRAW_JIT_TEXTURE_NUM_FIELDS /* number of fields above */ }; @@ -275,12 +282,15 @@ draw_llvm_sampler_soa_create(const struct lp_sampler_static_state *static_state, LLVMValueRef context_ptr); void +draw_llvm_set_sampler_state(struct draw_context *draw); + +void draw_llvm_set_mapped_texture(struct draw_context *draw, unsigned sampler_idx, uint32_t width, uint32_t height, uint32_t depth, uint32_t last_level, - uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS], - uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS], - const void *data[DRAW_MAX_TEXTURE_LEVELS]); + uint32_t row_stride[PIPE_MAX_TEXTURE_LEVELS], + uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS], + const void *data[PIPE_MAX_TEXTURE_LEVELS]); #endif diff --git a/src/gallium/auxiliary/draw/draw_llvm_sample.c b/src/gallium/auxiliary/draw/draw_llvm_sample.c index e9811010db..ac1fbb179c 100644 --- a/src/gallium/auxiliary/draw/draw_llvm_sample.c +++ b/src/gallium/auxiliary/draw/draw_llvm_sample.c @@ -146,6 +146,10 @@ DRAW_LLVM_TEXTURE_MEMBER(last_level, DRAW_JIT_TEXTURE_LAST_LEVEL, TRUE) DRAW_LLVM_TEXTURE_MEMBER(row_stride, DRAW_JIT_TEXTURE_ROW_STRIDE, FALSE) DRAW_LLVM_TEXTURE_MEMBER(img_stride, DRAW_JIT_TEXTURE_IMG_STRIDE, FALSE) DRAW_LLVM_TEXTURE_MEMBER(data_ptr, DRAW_JIT_TEXTURE_DATA, FALSE) +DRAW_LLVM_TEXTURE_MEMBER(min_lod, DRAW_JIT_TEXTURE_MIN_LOD, TRUE) +DRAW_LLVM_TEXTURE_MEMBER(max_lod, DRAW_JIT_TEXTURE_MAX_LOD, TRUE) +DRAW_LLVM_TEXTURE_MEMBER(lod_bias, DRAW_JIT_TEXTURE_LOD_BIAS, TRUE) +DRAW_LLVM_TEXTURE_MEMBER(border_color, DRAW_JIT_TEXTURE_BORDER_COLOR, FALSE) static void @@ -207,6 +211,10 @@ draw_llvm_sampler_soa_create(const struct lp_sampler_static_state *static_state, sampler->dynamic_state.base.row_stride = draw_llvm_texture_row_stride; sampler->dynamic_state.base.img_stride = draw_llvm_texture_img_stride; sampler->dynamic_state.base.data_ptr = draw_llvm_texture_data_ptr; + sampler->dynamic_state.base.min_lod = draw_llvm_texture_min_lod; + sampler->dynamic_state.base.max_lod = draw_llvm_texture_max_lod; + sampler->dynamic_state.base.lod_bias = draw_llvm_texture_lod_bias; + sampler->dynamic_state.base.border_color = draw_llvm_texture_border_color; sampler->dynamic_state.static_state = static_state; sampler->dynamic_state.context_ptr = context_ptr; diff --git 
a/src/gallium/auxiliary/draw/draw_pipe_aaline.c b/src/gallium/auxiliary/draw/draw_pipe_aaline.c index eac21110be..d1aba76309 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_aaline.c +++ b/src/gallium/auxiliary/draw/draw_pipe_aaline.c @@ -688,10 +688,9 @@ aaline_first_line(struct draw_stage *stage, struct prim_header *header) aaline->tex_slot = draw_current_shader_outputs(draw); aaline->pos_slot = draw_current_shader_position_output(draw);; - /* advertise the extra post-transformed vertex attribute */ - draw->extra_shader_outputs.semantic_name = TGSI_SEMANTIC_GENERIC; - draw->extra_shader_outputs.semantic_index = aaline->fs->generic_attrib; - draw->extra_shader_outputs.slot = aaline->tex_slot; + /* allocate the extra post-transformed vertex attribute */ + (void) draw_alloc_extra_vertex_attrib(draw, TGSI_SEMANTIC_GENERIC, + aaline->fs->generic_attrib); /* how many samplers? */ /* we'll use sampler/texture[pstip->sampler_unit] for the stipple */ @@ -744,7 +743,7 @@ aaline_flush(struct draw_stage *stage, unsigned flags) draw->suspend_flushing = FALSE; - draw->extra_shader_outputs.slot = 0; + draw_remove_extra_vertex_attribs(draw); } diff --git a/src/gallium/auxiliary/draw/draw_pipe_aapoint.c b/src/gallium/auxiliary/draw/draw_pipe_aapoint.c index d406a86ccb..5ea552f51c 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_aapoint.c +++ b/src/gallium/auxiliary/draw/draw_pipe_aapoint.c @@ -701,9 +701,9 @@ aapoint_first_point(struct draw_stage *stage, struct prim_header *header) aapoint->pos_slot = draw_current_shader_position_output(draw); - draw->extra_shader_outputs.semantic_name = TGSI_SEMANTIC_GENERIC; - draw->extra_shader_outputs.semantic_index = aapoint->fs->generic_attrib; - draw->extra_shader_outputs.slot = aapoint->tex_slot; + /* allocate the extra post-transformed vertex attribute */ + (void) draw_alloc_extra_vertex_attrib(draw, TGSI_SEMANTIC_GENERIC, + aapoint->fs->generic_attrib); /* find psize slot in post-transform vertex */ aapoint->psize_slot = -1; @@ -754,7 +754,7 @@ aapoint_flush(struct draw_stage *stage, unsigned flags) draw->suspend_flushing = FALSE; - draw->extra_shader_outputs.slot = 0; + draw_remove_extra_vertex_attribs(draw); } diff --git a/src/gallium/auxiliary/draw/draw_pipe_clip.c b/src/gallium/auxiliary/draw/draw_pipe_clip.c index 8a3d499feb..a10d8e9edc 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_clip.c +++ b/src/gallium/auxiliary/draw/draw_pipe_clip.c @@ -263,6 +263,8 @@ do_clip_tri( struct draw_stage *stage, clipmask &= ~(1<<plane_idx); assert(n < MAX_CLIPPED_VERTICES); + if (n >= MAX_CLIPPED_VERTICES) + return; inlist[n] = inlist[0]; /* prevent rotation of vertices */ for (i = 1; i <= n; i++) { @@ -272,16 +274,22 @@ do_clip_tri( struct draw_stage *stage, if (!IS_NEGATIVE(dp_prev)) { assert(outcount < MAX_CLIPPED_VERTICES); + if (outcount >= MAX_CLIPPED_VERTICES) + return; outlist[outcount++] = vert_prev; } if (DIFFERENT_SIGNS(dp, dp_prev)) { struct vertex_header *new_vert; - assert(tmpnr < MAX_CLIPPED_VERTICES+1); + assert(tmpnr < MAX_CLIPPED_VERTICES + 1); + if (tmpnr >= MAX_CLIPPED_VERTICES + 1) + return; new_vert = clipper->stage.tmp[tmpnr++]; assert(outcount < MAX_CLIPPED_VERTICES); + if (outcount >= MAX_CLIPPED_VERTICES) + return; outlist[outcount++] = new_vert; if (IS_NEGATIVE(dp)) { @@ -321,27 +329,32 @@ do_clip_tri( struct draw_stage *stage, /* If flat-shading, copy provoking vertex color to polygon vertex[0] */ - if (clipper->flat) { - if (stage->draw->rasterizer->flatshade_first) { - if (inlist[0] != header->v[0]) { - assert(tmpnr < 
MAX_CLIPPED_VERTICES + 1); - inlist[0] = dup_vert(stage, inlist[0], tmpnr++); - copy_colors(stage, inlist[0], header->v[0]); + if (n >= 3) { + if (clipper->flat) { + if (stage->draw->rasterizer->flatshade_first) { + if (inlist[0] != header->v[0]) { + assert(tmpnr < MAX_CLIPPED_VERTICES + 1); + if (tmpnr >= MAX_CLIPPED_VERTICES + 1) + return; + inlist[0] = dup_vert(stage, inlist[0], tmpnr++); + copy_colors(stage, inlist[0], header->v[0]); + } } - } - else { - if (inlist[0] != header->v[2]) { - assert(tmpnr < MAX_CLIPPED_VERTICES + 1); - inlist[0] = dup_vert(stage, inlist[0], tmpnr++); - copy_colors(stage, inlist[0], header->v[2]); + else { + if (inlist[0] != header->v[2]) { + assert(tmpnr < MAX_CLIPPED_VERTICES + 1); + if (tmpnr >= MAX_CLIPPED_VERTICES + 1) + return; + inlist[0] = dup_vert(stage, inlist[0], tmpnr++); + copy_colors(stage, inlist[0], header->v[2]); + } } } - } - - /* Emit the polygon as triangles to the setup stage: - */ - if (n >= 3) + + /* Emit the polygon as triangles to the setup stage: + */ emit_poly( stage, inlist, n, header ); + } } diff --git a/src/gallium/auxiliary/draw/draw_pipe_validate.c b/src/gallium/auxiliary/draw/draw_pipe_validate.c index 8b92543987..c575a8ac7c 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_validate.c +++ b/src/gallium/auxiliary/draw/draw_pipe_validate.c @@ -172,7 +172,7 @@ static struct draw_stage *validate_pipeline( struct draw_stage *stage ) wide_lines = (rast->line_width > draw->pipeline.wide_line_threshold && !rast->line_smooth); - /* drawing large points? */ + /* drawing large/sprite points (but not AA points)? */ if (rast->sprite_coord_enable && draw->pipeline.point_sprite) wide_points = TRUE; else if (rast->point_smooth && draw->pipeline.aapoint) @@ -207,7 +207,7 @@ static struct draw_stage *validate_pipeline( struct draw_stage *stage ) precalc_flat = TRUE; } - if (wide_points || rast->sprite_coord_enable) { + if (wide_points) { draw->pipeline.wide_point->next = next; next = draw->pipeline.wide_point; } diff --git a/src/gallium/auxiliary/draw/draw_pipe_wide_point.c b/src/gallium/auxiliary/draw/draw_pipe_wide_point.c index ee2945c7c9..3646c6a714 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_wide_point.c +++ b/src/gallium/auxiliary/draw/draw_pipe_wide_point.c @@ -57,26 +57,24 @@ #include "util/u_memory.h" #include "pipe/p_defines.h" #include "pipe/p_shader_tokens.h" +#include "draw_fs.h" #include "draw_vs.h" #include "draw_pipe.h" struct widepoint_stage { - struct draw_stage stage; + struct draw_stage stage; /**< base class */ float half_point_size; float xbias; float ybias; - uint texcoord_slot[PIPE_MAX_SHADER_OUTPUTS]; - uint texcoord_enable[PIPE_MAX_SHADER_OUTPUTS]; - uint num_texcoords; - uint texcoord_mode; + /** for automatic texcoord generation/replacement */ + uint num_texcoord_gen; + uint texcoord_gen_slot[PIPE_MAX_SHADER_OUTPUTS]; int psize_slot; - - int point_coord_fs_input; /**< input for pointcoord */ }; @@ -96,30 +94,20 @@ widepoint_stage( struct draw_stage *stage ) static void set_texcoords(const struct widepoint_stage *wide, struct vertex_header *v, const float tc[4]) { + const struct draw_context *draw = wide->stage.draw; + const struct pipe_rasterizer_state *rast = draw->rasterizer; + const uint texcoord_mode = rast->sprite_coord_mode; uint i; - for (i = 0; i < wide->num_texcoords; i++) { - if (wide->texcoord_enable[i]) { - uint j = wide->texcoord_slot[i]; - v->data[j][0] = tc[0]; - if (wide->texcoord_mode == PIPE_SPRITE_COORD_LOWER_LEFT) - v->data[j][1] = 1.0f - tc[1]; - else - v->data[j][1] = tc[1]; - 
v->data[j][2] = tc[2]; - v->data[j][3] = tc[3]; - } - } - if (wide->point_coord_fs_input >= 0) { - /* put gl_PointCoord into the extra vertex slot */ - uint slot = wide->stage.draw->extra_shader_outputs.slot; + for (i = 0; i < wide->num_texcoord_gen; i++) { + const uint slot = wide->texcoord_gen_slot[i]; v->data[slot][0] = tc[0]; - if (wide->texcoord_mode == PIPE_SPRITE_COORD_LOWER_LEFT) + if (texcoord_mode == PIPE_SPRITE_COORD_LOWER_LEFT) v->data[slot][1] = 1.0f - tc[1]; else v->data[slot][1] = tc[1]; - v->data[slot][2] = 0.0F; - v->data[slot][3] = 1.0F; + v->data[slot][2] = tc[2]; + v->data[slot][3] = tc[3]; } } @@ -201,18 +189,9 @@ static void widepoint_point( struct draw_stage *stage, } -static int -find_pntc_input_attrib(struct draw_context *draw) -{ - /* Scan the fragment program's input decls to find the pointcoord - * attribute. The xy components will store the point coord. - */ - return 0; /* XXX fix this */ -} - - -static void widepoint_first_point( struct draw_stage *stage, - struct prim_header *header ) +static void +widepoint_first_point(struct draw_stage *stage, + struct prim_header *header) { struct widepoint_stage *wide = widepoint_stage(stage); struct draw_context *draw = stage->draw; @@ -244,31 +223,49 @@ static void widepoint_first_point( struct draw_stage *stage, stage->point = draw_pipe_passthrough_point; } + draw_remove_extra_vertex_attribs(draw); + if (rast->point_quad_rasterization) { - /* find vertex shader texcoord outputs */ - const struct draw_vertex_shader *vs = draw->vs.vertex_shader; - uint i, j = 0; - wide->texcoord_mode = rast->sprite_coord_mode; - for (i = 0; i < vs->info.num_outputs; i++) { - if (vs->info.output_semantic_name[i] == TGSI_SEMANTIC_GENERIC) { - wide->texcoord_slot[j] = i; - wide->texcoord_enable[j] = (rast->sprite_coord_enable >> j) & 1; - j++; + const struct draw_fragment_shader *fs = draw->fs.fragment_shader; + uint i; + + wide->num_texcoord_gen = 0; + + /* Loop over fragment shader inputs looking for generic inputs + * for which bit 'k' in sprite_coord_enable is set. + */ + for (i = 0; i < fs->info.num_inputs; i++) { + if (fs->info.input_semantic_name[i] == TGSI_SEMANTIC_GENERIC) { + const int generic_index = fs->info.input_semantic_index[i]; + /* Note that sprite_coord enable is a bitfield of + * PIPE_MAX_SHADER_OUTPUTS bits. + */ + if (generic_index < PIPE_MAX_SHADER_OUTPUTS && + (rast->sprite_coord_enable & (1 << generic_index))) { + /* OK, this generic attribute needs to be replaced with a + * texcoord (see above). + */ + int slot = draw_find_shader_output(draw, + TGSI_SEMANTIC_GENERIC, + generic_index); + + if (slot > 0) { + /* there's already a post-vertex shader attribute + * for this fragment shader input attribute. 
+ */ + } + else { + /* need to allocate a new post-vertex shader attribute */ + slot = draw_alloc_extra_vertex_attrib(draw, + TGSI_SEMANTIC_GENERIC, + generic_index); + } + + /* add this slot to the texcoord-gen list */ + wide->texcoord_gen_slot[wide->num_texcoord_gen++] = slot; + } } } - wide->num_texcoords = j; - - /* find fragment shader PointCoord input */ - wide->point_coord_fs_input = find_pntc_input_attrib(draw); - - /* setup extra vp output (point coord implemented as a texcoord) */ - draw->extra_shader_outputs.semantic_name = TGSI_SEMANTIC_GENERIC; - draw->extra_shader_outputs.semantic_index = 0; - draw->extra_shader_outputs.slot = draw_current_shader_outputs(draw); - } - else { - wide->point_coord_fs_input = -1; - draw->extra_shader_outputs.slot = 0; } wide->psize_slot = -1; @@ -295,7 +292,8 @@ static void widepoint_flush( struct draw_stage *stage, unsigned flags ) stage->point = widepoint_first_point; stage->next->flush( stage->next, flags ); - stage->draw->extra_shader_outputs.slot = 0; + + draw_remove_extra_vertex_attribs(draw); /* restore original rasterizer state */ if (draw->rast_handle) { diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h index 362f563ba6..d417f825a0 100644 --- a/src/gallium/auxiliary/draw/draw_private.h +++ b/src/gallium/auxiliary/draw/draw_private.h @@ -250,6 +250,11 @@ struct draw_context struct tgsi_sampler **samplers; } gs; + /** Fragment shader state */ + struct { + struct draw_fragment_shader *fragment_shader; + } fs; + /** Stream output (vertex feedback) state */ struct { struct pipe_stream_output_state state; @@ -266,9 +271,10 @@ struct draw_context /* If a prim stage introduces new vertex attributes, they'll be stored here */ struct { - uint semantic_name; - uint semantic_index; - int slot; + uint num; + uint semantic_name[10]; + uint semantic_index[10]; + uint slot[10]; } extra_shader_outputs; unsigned reduced_prim; @@ -362,6 +368,11 @@ void draw_gs_destroy( struct draw_context *draw ); uint draw_current_shader_outputs(const struct draw_context *draw); uint draw_current_shader_position_output(const struct draw_context *draw); +int draw_alloc_extra_vertex_attrib(struct draw_context *draw, + uint semantic_name, uint semantic_index); +void draw_remove_extra_vertex_attribs(struct draw_context *draw); + + /******************************************************************************* * Vertex processing (was passthrough) code: */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c index dce3c3745b..00f419a486 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c @@ -614,17 +614,15 @@ lp_build_div(struct lp_build_context *bld, /** - * Linear interpolation. - * - * This also works for integer values with a few caveats. + * Linear interpolation -- without any checks. * * @sa http://www.stereopsis.com/doubleblend.html */ -LLVMValueRef -lp_build_lerp(struct lp_build_context *bld, - LLVMValueRef x, - LLVMValueRef v0, - LLVMValueRef v1) +static INLINE LLVMValueRef +lp_build_lerp_simple(struct lp_build_context *bld, + LLVMValueRef x, + LLVMValueRef v0, + LLVMValueRef v1) { LLVMValueRef delta; LLVMValueRef res; @@ -639,12 +637,80 @@ lp_build_lerp(struct lp_build_context *bld, res = lp_build_add(bld, v0, res); - if(bld->type.fixed) + if (bld->type.fixed) { /* XXX: This step is necessary for lerping 8bit colors stored on 16bits, * but it will be wrong for other uses. 
Basically we need a more * powerful lp_type, capable of further distinguishing the values * interpretation from the value storage. */ res = LLVMBuildAnd(bld->builder, res, lp_build_const_int_vec(bld->type, (1 << bld->type.width/2) - 1), ""); + } + + return res; +} + + +/** + * Linear interpolation. + */ +LLVMValueRef +lp_build_lerp(struct lp_build_context *bld, + LLVMValueRef x, + LLVMValueRef v0, + LLVMValueRef v1) +{ + const struct lp_type type = bld->type; + LLVMValueRef res; + + assert(lp_check_value(type, x)); + assert(lp_check_value(type, v0)); + assert(lp_check_value(type, v1)); + + if (type.norm) { + struct lp_type wide_type; + struct lp_build_context wide_bld; + LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh; + LLVMValueRef shift; + + assert(type.length >= 2); + assert(!type.sign); + + /* + * Create a wider type, enough to hold the intermediate result of the + * multiplication. + */ + memset(&wide_type, 0, sizeof wide_type); + wide_type.fixed = TRUE; + wide_type.width = type.width*2; + wide_type.length = type.length/2; + + lp_build_context_init(&wide_bld, bld->builder, wide_type); + + lp_build_unpack2(bld->builder, type, wide_type, x, &xl, &xh); + lp_build_unpack2(bld->builder, type, wide_type, v0, &v0l, &v0h); + lp_build_unpack2(bld->builder, type, wide_type, v1, &v1l, &v1h); + + /* + * Scale x from [0, 255] to [0, 256] + */ + + shift = lp_build_const_int_vec(wide_type, type.width - 1); + + xl = lp_build_add(&wide_bld, xl, + LLVMBuildAShr(bld->builder, xl, shift, "")); + xh = lp_build_add(&wide_bld, xh, + LLVMBuildAShr(bld->builder, xh, shift, "")); + + /* + * Lerp both halves. + */ + + resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l); + resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h); + + res = lp_build_pack2(bld->builder, wide_type, type, resl, resh); + } else { + res = lp_build_lerp_simple(bld, x, v0, v1); + } return res; } @@ -923,35 +989,122 @@ lp_build_round_sse41(struct lp_build_context *bld, enum lp_build_round_sse41_mode mode) { const struct lp_type type = bld->type; - LLVMTypeRef vec_type = lp_build_vec_type(type); + LLVMTypeRef i32t = LLVMInt32Type(); const char *intrinsic; + LLVMValueRef res; assert(type.floating); - assert(type.width*type.length == 128); + assert(lp_check_value(type, a)); assert(util_cpu_caps.has_sse4_1); - switch(type.width) { - case 32: - intrinsic = "llvm.x86.sse41.round.ps"; - break; - case 64: - intrinsic = "llvm.x86.sse41.round.pd"; - break; - default: - assert(0); - return bld->undef; + if (type.length == 1) { + LLVMTypeRef vec_type; + LLVMValueRef undef; + LLVMValueRef args[3]; + LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0); + + switch(type.width) { + case 32: + intrinsic = "llvm.x86.sse41.round.ss"; + break; + case 64: + intrinsic = "llvm.x86.sse41.round.sd"; + break; + default: + assert(0); + return bld->undef; + } + + vec_type = LLVMVectorType(bld->elem_type, 4); + + undef = LLVMGetUndef(vec_type); + + args[0] = undef; + args[1] = LLVMBuildInsertElement(bld->builder, undef, a, index0, ""); + args[2] = LLVMConstInt(i32t, mode, 0); + + res = lp_build_intrinsic(bld->builder, intrinsic, + vec_type, args, Elements(args)); + + res = LLVMBuildExtractElement(bld->builder, res, index0, ""); + } + else { + assert(type.width*type.length == 128); + + switch(type.width) { + case 32: + intrinsic = "llvm.x86.sse41.round.ps"; + break; + case 64: + intrinsic = "llvm.x86.sse41.round.pd"; + break; + default: + assert(0); + return bld->undef; + } + + res = lp_build_intrinsic_binary(bld->builder, intrinsic, + bld->vec_type, a, + 
LLVMConstInt(i32t, mode, 0)); + } + + return res; +} + + +static INLINE LLVMValueRef +lp_build_iround_nearest_sse2(struct lp_build_context *bld, + LLVMValueRef a) +{ + const struct lp_type type = bld->type; + LLVMTypeRef i32t = LLVMInt32Type(); + LLVMTypeRef ret_type = lp_build_int_vec_type(type); + const char *intrinsic; + LLVMValueRef res; + + assert(type.floating); + /* using the double precision conversions is a bit more complicated */ + assert(type.width == 32); + + assert(lp_check_value(type, a)); + assert(util_cpu_caps.has_sse2); + + /* This is relying on MXCSR rounding mode, which should always be nearest. */ + if (type.length == 1) { + LLVMTypeRef vec_type; + LLVMValueRef undef; + LLVMValueRef arg; + LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0); + + vec_type = LLVMVectorType(bld->elem_type, 4); + + intrinsic = "llvm.x86.sse.cvtss2si"; + + undef = LLVMGetUndef(vec_type); + + arg = LLVMBuildInsertElement(bld->builder, undef, a, index0, ""); + + res = lp_build_intrinsic_unary(bld->builder, intrinsic, + ret_type, arg); + } + else { + assert(type.width*type.length == 128); + + intrinsic = "llvm.x86.sse2.cvtps2dq"; + + res = lp_build_intrinsic_unary(bld->builder, intrinsic, + ret_type, a); } - return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a, - LLVMConstInt(LLVMInt32Type(), mode, 0)); + return res; } /** - * Return the integer part of a float (vector) value. The returned value is - * a float (vector). - * Ex: trunc(-1.5) = 1.0 + * Return the integer part of a float (vector) value (== round toward zero). + * The returned value is a float (vector). + * Ex: trunc(-1.5) = -1.0 */ LLVMValueRef lp_build_trunc(struct lp_build_context *bld, @@ -962,8 +1115,10 @@ lp_build_trunc(struct lp_build_context *bld, assert(type.floating); assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) + if (util_cpu_caps.has_sse4_1 && + (type.length == 1 || type.width*type.length == 128)) { return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE); + } else { LLVMTypeRef vec_type = lp_build_vec_type(type); LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); @@ -990,8 +1145,10 @@ lp_build_round(struct lp_build_context *bld, assert(type.floating); assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) + if (util_cpu_caps.has_sse4_1 && + (type.length == 1 || type.width*type.length == 128)) { return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST); + } else { LLVMTypeRef vec_type = lp_build_vec_type(type); LLVMValueRef res; @@ -1016,8 +1173,10 @@ lp_build_floor(struct lp_build_context *bld, assert(type.floating); assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) + if (util_cpu_caps.has_sse4_1 && + (type.length == 1 || type.width*type.length == 128)) { return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR); + } else { LLVMTypeRef vec_type = lp_build_vec_type(type); LLVMValueRef res; @@ -1042,8 +1201,10 @@ lp_build_ceil(struct lp_build_context *bld, assert(type.floating); assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) + if (util_cpu_caps.has_sse4_1 && + (type.length == 1 || type.width*type.length == 128)) { return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL); + } else { LLVMTypeRef vec_type = lp_build_vec_type(type); LLVMValueRef res; @@ -1068,9 +1229,9 @@ lp_build_fract(struct lp_build_context *bld, /** - * Return the integer part of a float (vector) value. 
The returned value is - * an integer (vector). - * Ex: itrunc(-1.5) = 1 + * Return the integer part of a float (vector) value (== round toward zero). + * The returned value is an integer (vector). + * Ex: itrunc(-1.5) = -1 */ LLVMValueRef lp_build_itrunc(struct lp_build_context *bld, @@ -1097,31 +1258,40 @@ lp_build_iround(struct lp_build_context *bld, LLVMValueRef a) { const struct lp_type type = bld->type; - LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); + LLVMTypeRef int_vec_type = bld->int_vec_type; LLVMValueRef res; assert(type.floating); assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) { + if (util_cpu_caps.has_sse2 && + ((type.width == 32) && (type.length == 1 || type.length == 4))) { + return lp_build_iround_nearest_sse2(bld, a); + } + else if (util_cpu_caps.has_sse4_1 && + (type.length == 1 || type.width*type.length == 128)) { res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST); } else { - LLVMTypeRef vec_type = lp_build_vec_type(type); - LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1)); - LLVMValueRef sign; LLVMValueRef half; - /* get sign bit */ - sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); - sign = LLVMBuildAnd(bld->builder, sign, mask, ""); - - /* sign * 0.5 */ half = lp_build_const_vec(type, 0.5); - half = LLVMBuildBitCast(bld->builder, half, int_vec_type, ""); - half = LLVMBuildOr(bld->builder, sign, half, ""); - half = LLVMBuildBitCast(bld->builder, half, vec_type, ""); + + if (type.sign) { + LLVMTypeRef vec_type = bld->vec_type; + LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1)); + LLVMValueRef sign; + + /* get sign bit */ + sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); + sign = LLVMBuildAnd(bld->builder, sign, mask, ""); + + /* sign * 0.5 */ + half = LLVMBuildBitCast(bld->builder, half, int_vec_type, ""); + half = LLVMBuildOr(bld->builder, sign, half, ""); + half = LLVMBuildBitCast(bld->builder, half, vec_type, ""); + } res = LLVMBuildFAdd(bld->builder, a, half, ""); } @@ -1142,37 +1312,42 @@ lp_build_ifloor(struct lp_build_context *bld, LLVMValueRef a) { const struct lp_type type = bld->type; - LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); + LLVMTypeRef int_vec_type = bld->int_vec_type; LLVMValueRef res; assert(type.floating); assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) { + if (util_cpu_caps.has_sse4_1 && + (type.length == 1 || type.width*type.length == 128)) { res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR); } else { - /* Take the sign bit and add it to 1 constant */ - LLVMTypeRef vec_type = lp_build_vec_type(type); - unsigned mantissa = lp_mantissa(type); - LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1)); - LLVMValueRef sign; - LLVMValueRef offset; - - /* sign = a < 0 ? ~0 : 0 */ - sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); - sign = LLVMBuildAnd(bld->builder, sign, mask, ""); - sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "ifloor.sign"); - - /* offset = -0.99999(9)f */ - offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa)); - offset = LLVMConstBitCast(offset, int_vec_type); - - /* offset = a < 0 ? 
offset : 0.0f */ - offset = LLVMBuildAnd(bld->builder, offset, sign, ""); - offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "ifloor.offset"); - - res = LLVMBuildFAdd(bld->builder, a, offset, "ifloor.res"); + res = a; + + if (type.sign) { + /* Take the sign bit and add it to 1 constant */ + LLVMTypeRef vec_type = bld->vec_type; + unsigned mantissa = lp_mantissa(type); + LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1)); + LLVMValueRef sign; + LLVMValueRef offset; + + /* sign = a < 0 ? ~0 : 0 */ + sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); + sign = LLVMBuildAnd(bld->builder, sign, mask, ""); + sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "ifloor.sign"); + + /* offset = -0.99999(9)f */ + offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa)); + offset = LLVMConstBitCast(offset, int_vec_type); + + /* offset = a < 0 ? offset : 0.0f */ + offset = LLVMBuildAnd(bld->builder, offset, sign, ""); + offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "ifloor.offset"); + + res = LLVMBuildFAdd(bld->builder, res, offset, "ifloor.res"); + } } /* round to nearest (toward zero) */ @@ -1192,35 +1367,39 @@ lp_build_iceil(struct lp_build_context *bld, LLVMValueRef a) { const struct lp_type type = bld->type; - LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); + LLVMTypeRef int_vec_type = bld->int_vec_type; LLVMValueRef res; assert(type.floating); assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) { + if (util_cpu_caps.has_sse4_1 && + (type.length == 1 || type.width*type.length == 128)) { res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL); } else { - LLVMTypeRef vec_type = lp_build_vec_type(type); + LLVMTypeRef vec_type = bld->vec_type; unsigned mantissa = lp_mantissa(type); - LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1)); - LLVMValueRef sign; LLVMValueRef offset; - /* sign = a < 0 ? 0 : ~0 */ - sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); - sign = LLVMBuildAnd(bld->builder, sign, mask, ""); - sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "iceil.sign"); - sign = LLVMBuildNot(bld->builder, sign, "iceil.not"); - /* offset = 0.99999(9)f */ offset = lp_build_const_vec(type, (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa)); - offset = LLVMConstBitCast(offset, int_vec_type); - /* offset = a < 0 ? 0.0 : offset */ - offset = LLVMBuildAnd(bld->builder, offset, sign, ""); - offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "iceil.offset"); + if (type.sign) { + LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1)); + LLVMValueRef sign; + + /* sign = a < 0 ? 0 : ~0 */ + sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); + sign = LLVMBuildAnd(bld->builder, sign, mask, ""); + sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "iceil.sign"); + sign = LLVMBuildNot(bld->builder, sign, "iceil.not"); + + /* offset = a < 0 ? 
0.0 : offset */ + offset = LLVMConstBitCast(offset, int_vec_type); + offset = LLVMBuildAnd(bld->builder, offset, sign, ""); + offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "iceil.offset"); + } res = LLVMBuildFAdd(bld->builder, a, offset, "iceil.res"); } @@ -1232,6 +1411,46 @@ lp_build_iceil(struct lp_build_context *bld, } +/** + * Combined ifloor() & fract(). + * + * Preferred to calling the functions separately, as it will ensure that the + * stratergy (floor() vs ifloor()) that results in less redundant work is used. + */ +void +lp_build_ifloor_fract(struct lp_build_context *bld, + LLVMValueRef a, + LLVMValueRef *out_ipart, + LLVMValueRef *out_fpart) +{ + const struct lp_type type = bld->type; + LLVMValueRef ipart; + + assert(type.floating); + assert(lp_check_value(type, a)); + + if (util_cpu_caps.has_sse4_1 && + (type.length == 1 || type.width*type.length == 128)) { + /* + * floor() is easier. + */ + + ipart = lp_build_floor(bld, a); + *out_fpart = LLVMBuildFSub(bld->builder, a, ipart, "fpart"); + *out_ipart = LLVMBuildFPToSI(bld->builder, ipart, bld->int_vec_type, "ipart"); + } + else { + /* + * ifloor() is easier. + */ + + *out_ipart = lp_build_ifloor(bld, a); + ipart = LLVMBuildSIToFP(bld->builder, *out_ipart, bld->vec_type, "ipart"); + *out_fpart = LLVMBuildFSub(bld->builder, a, ipart, "fpart"); + } +} + + LLVMValueRef lp_build_sqrt(struct lp_build_context *bld, LLVMValueRef a) @@ -2041,6 +2260,71 @@ lp_build_exp2(struct lp_build_context *bld, /** + * Extract the exponent of a IEEE-754 floating point value. + * + * Optionally apply an integer bias. + * + * Result is an integer value with + * + * ifloor(log2(x)) + bias + */ +LLVMValueRef +lp_build_extract_exponent(struct lp_build_context *bld, + LLVMValueRef x, + int bias) +{ + const struct lp_type type = bld->type; + unsigned mantissa = lp_mantissa(type); + LLVMValueRef res; + + assert(type.floating); + + assert(lp_check_value(bld->type, x)); + + x = LLVMBuildBitCast(bld->builder, x, bld->int_vec_type, ""); + + res = LLVMBuildLShr(bld->builder, x, lp_build_const_int_vec(type, mantissa), ""); + res = LLVMBuildAnd(bld->builder, res, lp_build_const_int_vec(type, 255), ""); + res = LLVMBuildSub(bld->builder, res, lp_build_const_int_vec(type, 127 - bias), ""); + + return res; +} + + +/** + * Extract the mantissa of the a floating. + * + * Result is a floating point value with + * + * x / floor(log2(x)) + */ +LLVMValueRef +lp_build_extract_mantissa(struct lp_build_context *bld, + LLVMValueRef x) +{ + const struct lp_type type = bld->type; + unsigned mantissa = lp_mantissa(type); + LLVMValueRef mantmask = lp_build_const_int_vec(type, (1ULL << mantissa) - 1); + LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type); + LLVMValueRef res; + + assert(lp_check_value(bld->type, x)); + + assert(type.floating); + + x = LLVMBuildBitCast(bld->builder, x, bld->int_vec_type, ""); + + /* res = x / 2**ipart */ + res = LLVMBuildAnd(bld->builder, x, mantmask, ""); + res = LLVMBuildOr(bld->builder, res, one, ""); + res = LLVMBuildBitCast(bld->builder, res, bld->vec_type, ""); + + return res; +} + + + +/** * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ * These coefficients can be generate with * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html @@ -2159,3 +2443,62 @@ lp_build_log2(struct lp_build_context *bld, lp_build_log2_approx(bld, x, NULL, NULL, &res); return res; } + + +/** + * Faster (and less accurate) log2. 
+ * + * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x)) + * + * Piece-wise linear approximation, with exact results when x is a + * power of two. + * + * See http://www.flipcode.com/archives/Fast_log_Function.shtml + */ +LLVMValueRef +lp_build_fast_log2(struct lp_build_context *bld, + LLVMValueRef x) +{ + LLVMValueRef ipart; + LLVMValueRef fpart; + + assert(lp_check_value(bld->type, x)); + + assert(bld->type.floating); + + /* ipart = floor(log2(x)) - 1 */ + ipart = lp_build_extract_exponent(bld, x, -1); + ipart = LLVMBuildSIToFP(bld->builder, ipart, bld->vec_type, ""); + + /* fpart = x / 2**ipart */ + fpart = lp_build_extract_mantissa(bld, x); + + /* ipart + fpart */ + return LLVMBuildFAdd(bld->builder, ipart, fpart, ""); +} + + +/** + * Fast implementation of iround(log2(x)). + * + * Not an approximation -- it should give accurate results all the time. + */ +LLVMValueRef +lp_build_ilog2(struct lp_build_context *bld, + LLVMValueRef x) +{ + LLVMValueRef sqrt2 = lp_build_const_vec(bld->type, M_SQRT2); + LLVMValueRef ipart; + + assert(bld->type.floating); + + assert(lp_check_value(bld->type, x)); + + /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */ + x = LLVMBuildFMul(bld->builder, x, sqrt2, ""); + + /* ipart = floor(log2(x) + 0.5) */ + ipart = lp_build_extract_exponent(bld, x, 0); + + return ipart; +} diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h b/src/gallium/auxiliary/gallivm/lp_bld_arit.h index 31efa9921c..c78b61decf 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h @@ -171,6 +171,12 @@ LLVMValueRef lp_build_itrunc(struct lp_build_context *bld, LLVMValueRef a); +void +lp_build_ifloor_fract(struct lp_build_context *bld, + LLVMValueRef a, + LLVMValueRef *out_ipart, + LLVMValueRef *out_fpart); + LLVMValueRef lp_build_sqrt(struct lp_build_context *bld, LLVMValueRef a); @@ -209,9 +215,26 @@ lp_build_exp2(struct lp_build_context *bld, LLVMValueRef a); LLVMValueRef +lp_build_extract_exponent(struct lp_build_context *bld, + LLVMValueRef x, + int bias); + +LLVMValueRef +lp_build_extract_mantissa(struct lp_build_context *bld, + LLVMValueRef x); + +LLVMValueRef lp_build_log2(struct lp_build_context *bld, LLVMValueRef a); +LLVMValueRef +lp_build_fast_log2(struct lp_build_context *bld, + LLVMValueRef a); + +LLVMValueRef +lp_build_ilog2(struct lp_build_context *bld, + LLVMValueRef x); + void lp_build_exp2_approx(struct lp_build_context *bld, LLVMValueRef x, diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c index 8b477313d4..6967dd2622 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c @@ -63,6 +63,7 @@ #include "util/u_debug.h" #include "util/u_math.h" +#include "util/u_cpu_detect.h" #include "lp_bld_type.h" #include "lp_bld_const.h" @@ -96,58 +97,104 @@ lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder, LLVMTypeRef int_vec_type = lp_build_int_vec_type(src_type); LLVMValueRef res; unsigned mantissa; - unsigned n; - unsigned long long ubound; - unsigned long long mask; - double scale; - double bias; assert(src_type.floating); + assert(dst_width <= src_type.width); + src_type.sign = FALSE; mantissa = lp_mantissa(src_type); - /* We cannot carry more bits than the mantissa */ - n = MIN2(mantissa, dst_width); + if (dst_width <= mantissa) { + /* + * Apply magic coefficients that will make the desired result to appear + * in the lowest significant bits of the mantissa, with correct rounding. 
+ * + * This only works if the destination width fits in the mantissa. + */ - /* This magic coefficients will make the desired result to appear in the - * lowest significant bits of the mantissa. - */ - ubound = ((unsigned long long)1 << n); - mask = ubound - 1; - scale = (double)mask/ubound; - bias = (double)((unsigned long long)1 << (mantissa - n)); + unsigned long long ubound; + unsigned long long mask; + double scale; + double bias; - res = LLVMBuildFMul(builder, src, lp_build_const_vec(src_type, scale), ""); - res = LLVMBuildFAdd(builder, res, lp_build_const_vec(src_type, bias), ""); - res = LLVMBuildBitCast(builder, res, int_vec_type, ""); + ubound = (1ULL << dst_width); + mask = ubound - 1; + scale = (double)mask/ubound; + bias = (double)(1ULL << (mantissa - dst_width)); - if(dst_width > n) { - int shift = dst_width - n; - res = LLVMBuildShl(builder, res, lp_build_const_int_vec(src_type, shift), ""); + res = LLVMBuildFMul(builder, src, lp_build_const_vec(src_type, scale), ""); + res = LLVMBuildFAdd(builder, res, lp_build_const_vec(src_type, bias), ""); + res = LLVMBuildBitCast(builder, res, int_vec_type, ""); + res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(src_type, mask), ""); + } + else if (dst_width == (mantissa + 1)) { + /* + * The destination width matches exactly what can be represented in + * floating point (i.e., mantissa + 1 bits). So do a straight + * multiplication followed by casting. No further rounding is necessary. + */ + + double scale; - /* TODO: Fill in the empty lower bits for additional precision? */ - /* YES: this fixes progs/trivial/tri-z-eq.c. - * Otherwise vertex Z=1.0 values get converted to something like - * 0xfffffb00 and the test for equality with 0xffffffff fails. + scale = (double)((1ULL << dst_width) - 1); + + res = LLVMBuildFMul(builder, src, lp_build_const_vec(src_type, scale), ""); + res = LLVMBuildFPToSI(builder, res, int_vec_type, ""); + } + else { + /* + * The destination exceeds what can be represented in the floating point. + * So multiply by the largest power two we get away with, and when + * subtract the most significant bit to rescale to normalized values. + * + * The largest power of two factor we can get away is + * (1 << (src_type.width - 1)), because we need to use signed . In theory it + * should be (1 << (src_type.width - 2)), but IEEE 754 rules states + * INT_MIN should be returned in FPToSI, which is the correct result for + * values near 1.0! + * + * This means we get (src_type.width - 1) correct bits for values near 0.0, + * and (mantissa + 1) correct bits for values near 1.0. Equally or more + * important, we also get exact results for 0.0 and 1.0. 
*/ -#if 0 - { - LLVMValueRef msb; - msb = LLVMBuildLShr(builder, res, lp_build_const_int_vec(src_type, dst_width - 1), ""); - msb = LLVMBuildShl(builder, msb, lp_build_const_int_vec(src_type, shift), ""); - msb = LLVMBuildSub(builder, msb, lp_build_const_int_vec(src_type, 1), ""); - res = LLVMBuildOr(builder, res, msb, ""); - } -#elif 0 - while(shift > 0) { - res = LLVMBuildOr(builder, res, LLVMBuildLShr(builder, res, lp_build_const_int_vec(src_type, n), ""), ""); - shift -= n; - n *= 2; + + unsigned n = MIN2(src_type.width - 1, dst_width); + + double scale = (double)(1ULL << n); + unsigned lshift = dst_width - n; + unsigned rshift = n; + LLVMValueRef lshifted; + LLVMValueRef rshifted; + + res = LLVMBuildFMul(builder, src, lp_build_const_vec(src_type, scale), ""); + res = LLVMBuildFPToSI(builder, res, int_vec_type, ""); + + /* + * Align the most significant bit to its final place. + * + * This will cause 1.0 to overflow to 0, but the later adjustment will + * get it right. + */ + if (lshift) { + lshifted = LLVMBuildShl(builder, res, + lp_build_const_int_vec(src_type, lshift), ""); + } else { + lshifted = res; } -#endif + + /* + * Align the most significant bit to the right. + */ + rshifted = LLVMBuildAShr(builder, res, + lp_build_const_int_vec(src_type, rshift), ""); + + /* + * Subtract the MSB to the LSB, therefore re-scaling from + * (1 << dst_width) to ((1 << dst_width) - 1). + */ + + res = LLVMBuildSub(builder, lshifted, rshifted, ""); } - else - res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(src_type, mask), ""); return res; } @@ -177,6 +224,16 @@ lp_build_unsigned_norm_to_float(LLVMBuilderRef builder, assert(dst_type.floating); + /* Special-case int8->float, though most cases could be handled + * this way: + */ + if (src_width == 8) { + scale = 1.0/255.0; + res = LLVMBuildSIToFP(builder, src, vec_type, ""); + res = LLVMBuildFMul(builder, res, lp_build_const_vec(dst_type, scale), ""); + return res; + } + mantissa = lp_mantissa(dst_type); n = MIN2(mantissa, src_width); @@ -241,6 +298,87 @@ lp_build_conv(LLVMBuilderRef builder, } num_tmps = num_srcs; + + /* Special case 4x4f --> 1x16ub + */ + if (src_type.floating == 1 && + src_type.fixed == 0 && + src_type.sign == 1 && + src_type.norm == 0 && + src_type.width == 32 && + src_type.length == 4 && + + dst_type.floating == 0 && + dst_type.fixed == 0 && + dst_type.sign == 0 && + dst_type.norm == 1 && + dst_type.width == 8 && + dst_type.length == 16 && + + util_cpu_caps.has_sse2) + { + int i; + + for (i = 0; i < num_dsts; i++, src += 4) { + struct lp_type int16_type = dst_type; + struct lp_type int32_type = dst_type; + LLVMValueRef lo, hi; + LLVMValueRef src_int0; + LLVMValueRef src_int1; + LLVMValueRef src_int2; + LLVMValueRef src_int3; + LLVMTypeRef int16_vec_type; + LLVMTypeRef int32_vec_type; + LLVMTypeRef src_vec_type; + LLVMTypeRef dst_vec_type; + LLVMValueRef const_255f; + LLVMValueRef a, b, c, d; + + int16_type.width *= 2; + int16_type.length /= 2; + int16_type.sign = 1; + + int32_type.width *= 4; + int32_type.length /= 4; + int32_type.sign = 1; + + src_vec_type = lp_build_vec_type(src_type); + dst_vec_type = lp_build_vec_type(dst_type); + int16_vec_type = lp_build_vec_type(int16_type); + int32_vec_type = lp_build_vec_type(int32_type); + + const_255f = lp_build_const_vec(src_type, 255.0f); + + a = LLVMBuildFMul(builder, src[0], const_255f, ""); + b = LLVMBuildFMul(builder, src[1], const_255f, ""); + c = LLVMBuildFMul(builder, src[2], const_255f, ""); + d = LLVMBuildFMul(builder, src[3], const_255f, ""); + + { + struct 
lp_build_context bld; + + bld.builder = builder; + bld.type = src_type; + bld.vec_type = src_vec_type; + bld.int_elem_type = lp_build_elem_type(int32_type); + bld.int_vec_type = int32_vec_type; + bld.undef = lp_build_undef(src_type); + bld.zero = lp_build_zero(src_type); + bld.one = lp_build_one(src_type); + + src_int0 = lp_build_iround(&bld, a); + src_int1 = lp_build_iround(&bld, b); + src_int2 = lp_build_iround(&bld, c); + src_int3 = lp_build_iround(&bld, d); + } + /* relying on clamping behavior of sse2 intrinsics here */ + lo = lp_build_pack2(builder, int32_type, int16_type, src_int0, src_int1); + hi = lp_build_pack2(builder, int32_type, int16_type, src_int2, src_int3); + dst[i] = lp_build_pack2(builder, int16_type, dst_type, lo, hi); + } + return; + } + /* * Clamp if necessary */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.c b/src/gallium/auxiliary/gallivm/lp_bld_debug.c index d3a5afff8c..93e56553d7 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_debug.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.c @@ -57,6 +57,8 @@ lp_disassemble(const void* func) #ifdef HAVE_UDIS86 ud_t ud_obj; uint64_t max_jmp_pc; + uint inst_no; + boolean emit_addrs = TRUE, emit_line_nos = FALSE; ud_init(&ud_obj); @@ -76,13 +78,18 @@ lp_disassemble(const void* func) while (ud_disassemble(&ud_obj)) { + if (emit_addrs) { #ifdef PIPE_ARCH_X86 - debug_printf("0x%08lx:\t", (unsigned long)ud_insn_off(&ud_obj)); + debug_printf("0x%08lx:\t", (unsigned long)ud_insn_off(&ud_obj)); #endif #ifdef PIPE_ARCH_X86_64 - debug_printf("0x%016llx:\t", (unsigned long long)ud_insn_off(&ud_obj)); + debug_printf("0x%016llx:\t", (unsigned long long)ud_insn_off(&ud_obj)); #endif - + } + else if (emit_line_nos) { + debug_printf("%6d:\t", inst_no); + inst_no++; + } #if 0 debug_printf("%-16s ", ud_insn_hex(&ud_obj)); #endif @@ -115,8 +122,10 @@ lp_disassemble(const void* func) } } - if ((ud_insn_off(&ud_obj) >= max_jmp_pc && ud_obj.mnemonic == UD_Iret) || - ud_obj.mnemonic == UD_Iinvalid) + if (ud_obj.mnemonic == UD_Iinvalid || + (ud_insn_off(&ud_obj) >= max_jmp_pc && + (ud_obj.mnemonic == UD_Iret || + ud_obj.mnemonic == UD_Ijmp))) break; } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.h b/src/gallium/auxiliary/gallivm/lp_bld_debug.h index 369c1bbf09..eb11dcd4ef 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_debug.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.h @@ -36,11 +36,12 @@ #include "util/u_string.h" -#define GALLIVM_DEBUG_TGSI 0x1 -#define GALLIVM_DEBUG_IR 0x2 -#define GALLIVM_DEBUG_ASM 0x4 -#define GALLIVM_DEBUG_NO_OPT 0x8 -#define GALLIVM_DEBUG_PERF 0x10 +#define GALLIVM_DEBUG_TGSI (1 << 0) +#define GALLIVM_DEBUG_IR (1 << 1) +#define GALLIVM_DEBUG_ASM (1 << 2) +#define GALLIVM_DEBUG_NO_OPT (1 << 3) +#define GALLIVM_DEBUG_PERF (1 << 4) +#define GALLIVM_DEBUG_NO_BRILINEAR (1 << 5) #ifdef DEBUG diff --git a/src/gallium/auxiliary/gallivm/lp_bld_flow.c b/src/gallium/auxiliary/gallivm/lp_bld_flow.c index 5bc9c741a8..a2cee199a0 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_flow.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.c @@ -38,273 +38,15 @@ #include "lp_bld_flow.h" -#define LP_BUILD_FLOW_MAX_VARIABLES 64 -#define LP_BUILD_FLOW_MAX_DEPTH 32 - -/** - * Enumeration of all possible flow constructs. - */ -enum lp_build_flow_construct_kind { - LP_BUILD_FLOW_SCOPE, - LP_BUILD_FLOW_SKIP, - LP_BUILD_FLOW_IF -}; - - -/** - * Variable declaration scope. - */ -struct lp_build_flow_scope -{ - /** Number of variables declared in this scope */ - unsigned num_variables; -}; - - -/** - * Early exit. 
Useful to skip to the end of a function or block when - * the execution mask becomes zero or when there is an error condition. - */ -struct lp_build_flow_skip -{ - /** Block to skip to */ - LLVMBasicBlockRef block; - - /** Number of variables declared at the beginning */ - unsigned num_variables; - - LLVMValueRef *phi; /**< array [num_variables] */ -}; - - -/** - * if/else/endif. - */ -struct lp_build_flow_if -{ - unsigned num_variables; - - LLVMValueRef *phi; /**< array [num_variables] */ - - LLVMValueRef condition; - LLVMBasicBlockRef entry_block, true_block, false_block, merge_block; -}; - - -/** - * Union of all possible flow constructs' data - */ -union lp_build_flow_construct_data -{ - struct lp_build_flow_scope scope; - struct lp_build_flow_skip skip; - struct lp_build_flow_if ifthen; -}; - - -/** - * Element of the flow construct stack. - */ -struct lp_build_flow_construct -{ - enum lp_build_flow_construct_kind kind; - union lp_build_flow_construct_data data; -}; - - /** - * All necessary data to generate LLVM control flow constructs. + * Insert a new block, right where builder is pointing to. * - * Besides keeping track of the control flow construct themselves we also - * need to keep track of variables in order to generate SSA Phi values. - */ -struct lp_build_flow_context -{ - LLVMBuilderRef builder; - - /** - * Control flow stack. - */ - struct lp_build_flow_construct constructs[LP_BUILD_FLOW_MAX_DEPTH]; - unsigned num_constructs; - - /** - * Variable stack - */ - LLVMValueRef *variables[LP_BUILD_FLOW_MAX_VARIABLES]; - unsigned num_variables; -}; - - -struct lp_build_flow_context * -lp_build_flow_create(LLVMBuilderRef builder) -{ - struct lp_build_flow_context *flow; - - flow = CALLOC_STRUCT(lp_build_flow_context); - if(!flow) - return NULL; - - flow->builder = builder; - - return flow; -} - - -void -lp_build_flow_destroy(struct lp_build_flow_context *flow) -{ - assert(flow->num_constructs == 0); - assert(flow->num_variables == 0); - FREE(flow); -} - - -/** - * Begin/push a new flow control construct, such as a loop, skip block - * or variable scope. - */ -static union lp_build_flow_construct_data * -lp_build_flow_push(struct lp_build_flow_context *flow, - enum lp_build_flow_construct_kind kind) -{ - assert(flow->num_constructs < LP_BUILD_FLOW_MAX_DEPTH); - if(flow->num_constructs >= LP_BUILD_FLOW_MAX_DEPTH) - return NULL; - - flow->constructs[flow->num_constructs].kind = kind; - return &flow->constructs[flow->num_constructs++].data; -} - - -/** - * Return the current/top flow control construct on the stack. - * \param kind the expected type of the top-most construct - */ -static union lp_build_flow_construct_data * -lp_build_flow_peek(struct lp_build_flow_context *flow, - enum lp_build_flow_construct_kind kind) -{ - assert(flow->num_constructs); - if(!flow->num_constructs) - return NULL; - - assert(flow->constructs[flow->num_constructs - 1].kind == kind); - if(flow->constructs[flow->num_constructs - 1].kind != kind) - return NULL; - - return &flow->constructs[flow->num_constructs - 1].data; -} - - -/** - * End/pop the current/top flow control construct on the stack. 
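
The Phi bookkeeping being deleted above is what the rest of this patch replaces with plain allocas: lp_build_alloca() places the alloca in the function's entry block, so LLVM's mem2reg pass can later promote it back to SSA form and insert the phi nodes automatically. A minimal sketch of the new idiom (builder, vec_type, cond and the *_value names are placeholders):

   struct lp_build_if_state ifthen;
   LLVMValueRef x_var = lp_build_alloca(builder, vec_type, "x");
   LLVMValueRef x;

   LLVMBuildStore(builder, else_value, x_var);   /* value if cond is false   */
   lp_build_if(&ifthen, builder, cond);
   LLVMBuildStore(builder, then_value, x_var);   /* value if cond is true    */
   lp_build_endif(&ifthen);
   x = LLVMBuildLoad(builder, x_var, "x");       /* mem2reg rebuilds the phi */
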
- * \param kind the expected type of the top-most construct - */ -static union lp_build_flow_construct_data * -lp_build_flow_pop(struct lp_build_flow_context *flow, - enum lp_build_flow_construct_kind kind) -{ - assert(flow->num_constructs); - if(!flow->num_constructs) - return NULL; - - assert(flow->constructs[flow->num_constructs - 1].kind == kind); - if(flow->constructs[flow->num_constructs - 1].kind != kind) - return NULL; - - return &flow->constructs[--flow->num_constructs].data; -} - - -/** - * Begin a variable scope. + * This is useful important not only for aesthetic reasons, but also for + * performance reasons, as frequently run blocks should be laid out next to + * each other and fall-throughs maximized. * + * See also llvm/lib/Transforms/Scalar/BasicBlockPlacement.cpp. * - */ -void -lp_build_flow_scope_begin(struct lp_build_flow_context *flow) -{ - struct lp_build_flow_scope *scope; - - scope = &lp_build_flow_push(flow, LP_BUILD_FLOW_SCOPE)->scope; - if(!scope) - return; - - scope->num_variables = 0; -} - - -/** - * Declare a variable. - * - * A variable is a named entity which can have different LLVMValueRef's at - * different points of the program. This is relevant for control flow because - * when there are multiple branches to a same location we need to replace - * the variable's value with a Phi function as explained in - * http://en.wikipedia.org/wiki/Static_single_assignment_form . - * - * We keep track of variables by keeping around a pointer to where they're - * current. - * - * There are a few cautions to observe: - * - * - Variable's value must not be NULL. If there is no initial value then - * LLVMGetUndef() should be used. - * - * - Variable's value must be kept up-to-date. If the variable is going to be - * modified by a function then a pointer should be passed so that its value - * is accurate. Failure to do this will cause some of the variables' - * transient values to be lost, leading to wrong results. - * - * - A program should be written from top to bottom, by always appending - * instructions to the bottom with a single LLVMBuilderRef. Inserting and/or - * modifying existing statements will most likely lead to wrong results. - * - */ -void -lp_build_flow_scope_declare(struct lp_build_flow_context *flow, - LLVMValueRef *variable) -{ - struct lp_build_flow_scope *scope; - - scope = &lp_build_flow_peek(flow, LP_BUILD_FLOW_SCOPE)->scope; - if(!scope) - return; - - assert(*variable); - if(!*variable) - return; - - assert(flow->num_variables < LP_BUILD_FLOW_MAX_VARIABLES); - if(flow->num_variables >= LP_BUILD_FLOW_MAX_VARIABLES) - return; - - flow->variables[flow->num_variables++] = variable; - ++scope->num_variables; -} - - -void -lp_build_flow_scope_end(struct lp_build_flow_context *flow) -{ - struct lp_build_flow_scope *scope; - - scope = &lp_build_flow_pop(flow, LP_BUILD_FLOW_SCOPE)->scope; - if(!scope) - return; - - assert(flow->num_variables >= scope->num_variables); - if(flow->num_variables < scope->num_variables) { - flow->num_variables = 0; - return; - } - - flow->num_variables -= scope->num_variables; -} - - -/** * Note: this function has no dependencies on the flow code and could * be used elsewhere. */ @@ -334,52 +76,18 @@ lp_build_insert_new_block(LLVMBuilderRef builder, const char *name) } -static LLVMBasicBlockRef -lp_build_flow_insert_block(struct lp_build_flow_context *flow) -{ - return lp_build_insert_new_block(flow->builder, ""); -} - - /** * Begin a "skip" block. 
Inside this block we can test a condition and * skip to the end of the block if the condition is false. */ void -lp_build_flow_skip_begin(struct lp_build_flow_context *flow) +lp_build_flow_skip_begin(struct lp_build_skip_context *skip, + LLVMBuilderRef builder) { - struct lp_build_flow_skip *skip; - LLVMBuilderRef builder; - unsigned i; - - skip = &lp_build_flow_push(flow, LP_BUILD_FLOW_SKIP)->skip; - if(!skip) - return; + skip->builder = builder; /* create new basic block */ - skip->block = lp_build_flow_insert_block(flow); - - skip->num_variables = flow->num_variables; - if(!skip->num_variables) { - skip->phi = NULL; - return; - } - - /* Allocate a Phi node for each variable in this skip scope */ - skip->phi = MALLOC(skip->num_variables * sizeof *skip->phi); - if(!skip->phi) { - skip->num_variables = 0; - return; - } - - builder = LLVMCreateBuilder(); - LLVMPositionBuilderAtEnd(builder, skip->block); - - /* create a Phi node for each variable */ - for(i = 0; i < skip->num_variables; ++i) - skip->phi[i] = LLVMBuildPhi(builder, LLVMTypeOf(*flow->variables[i]), ""); - - LLVMDisposeBuilder(builder); + skip->block = lp_build_insert_new_block(skip->builder, "skip"); } @@ -388,83 +96,50 @@ lp_build_flow_skip_begin(struct lp_build_flow_context *flow) * skip block if the condition is true. */ void -lp_build_flow_skip_cond_break(struct lp_build_flow_context *flow, +lp_build_flow_skip_cond_break(struct lp_build_skip_context *skip, LLVMValueRef cond) { - struct lp_build_flow_skip *skip; - LLVMBasicBlockRef current_block; LLVMBasicBlockRef new_block; - unsigned i; - - skip = &lp_build_flow_peek(flow, LP_BUILD_FLOW_SKIP)->skip; - if(!skip) - return; - current_block = LLVMGetInsertBlock(flow->builder); - - new_block = lp_build_flow_insert_block(flow); - - /* for each variable, update the Phi node with a (variable, block) pair */ - for(i = 0; i < skip->num_variables; ++i) { - assert(*flow->variables[i]); - assert(LLVMTypeOf(skip->phi[i]) == LLVMTypeOf(*flow->variables[i])); - LLVMAddIncoming(skip->phi[i], flow->variables[i], ¤t_block, 1); - } + new_block = lp_build_insert_new_block(skip->builder, ""); /* if cond is true, goto skip->block, else goto new_block */ - LLVMBuildCondBr(flow->builder, cond, skip->block, new_block); + LLVMBuildCondBr(skip->builder, cond, skip->block, new_block); - LLVMPositionBuilderAtEnd(flow->builder, new_block); + LLVMPositionBuilderAtEnd(skip->builder, new_block); } void -lp_build_flow_skip_end(struct lp_build_flow_context *flow) +lp_build_flow_skip_end(struct lp_build_skip_context *skip) { - struct lp_build_flow_skip *skip; - LLVMBasicBlockRef current_block; - unsigned i; - - skip = &lp_build_flow_pop(flow, LP_BUILD_FLOW_SKIP)->skip; - if(!skip) - return; - - current_block = LLVMGetInsertBlock(flow->builder); - - /* add (variable, block) tuples to the phi nodes */ - for(i = 0; i < skip->num_variables; ++i) { - assert(*flow->variables[i]); - assert(LLVMTypeOf(skip->phi[i]) == LLVMTypeOf(*flow->variables[i])); - LLVMAddIncoming(skip->phi[i], flow->variables[i], ¤t_block, 1); - *flow->variables[i] = skip->phi[i]; - } - /* goto block */ - LLVMBuildBr(flow->builder, skip->block); - LLVMPositionBuilderAtEnd(flow->builder, skip->block); - - FREE(skip->phi); + LLVMBuildBr(skip->builder, skip->block); + LLVMPositionBuilderAtEnd(skip->builder, skip->block); } /** * Check if the mask predicate is zero. If so, jump to the end of the block. 
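
Typical use of the reduced skip construct, as lp_build_mask_check() below does; a sketch, with cond standing for any i1 condition:

   struct lp_build_skip_context skip;

   lp_build_flow_skip_begin(&skip, builder);
   lp_build_flow_skip_cond_break(&skip, cond);   /* if cond, jump to the end   */
   /* ... code that only runs when cond was false ... */
   lp_build_flow_skip_end(&skip);                /* the skip target lands here */
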
*/ -static void +void lp_build_mask_check(struct lp_build_mask_context *mask) { - LLVMBuilderRef builder = mask->flow->builder; + LLVMBuilderRef builder = mask->skip.builder; + LLVMValueRef value; LLVMValueRef cond; + value = lp_build_mask_value(mask); + /* cond = (mask == 0) */ cond = LLVMBuildICmp(builder, LLVMIntEQ, - LLVMBuildBitCast(builder, mask->value, mask->reg_type, ""), + LLVMBuildBitCast(builder, value, mask->reg_type, ""), LLVMConstNull(mask->reg_type), ""); /* if cond, goto end of block */ - lp_build_flow_skip_cond_break(mask->flow, cond); + lp_build_flow_skip_cond_break(&mask->skip, cond); } @@ -477,21 +152,27 @@ lp_build_mask_check(struct lp_build_mask_context *mask) */ void lp_build_mask_begin(struct lp_build_mask_context *mask, - struct lp_build_flow_context *flow, + LLVMBuilderRef builder, struct lp_type type, LLVMValueRef value) { memset(mask, 0, sizeof *mask); - mask->flow = flow; mask->reg_type = LLVMIntType(type.width * type.length); - mask->value = value; + mask->var = lp_build_alloca(builder, + lp_build_int_vec_type(type), + "execution_mask"); - lp_build_flow_scope_begin(flow); - lp_build_flow_scope_declare(flow, &mask->value); - lp_build_flow_skip_begin(flow); + LLVMBuildStore(builder, value, mask->var); - lp_build_mask_check(mask); + lp_build_flow_skip_begin(&mask->skip, builder); +} + + +LLVMValueRef +lp_build_mask_value(struct lp_build_mask_context *mask) +{ + return LLVMBuildLoad(mask->skip.builder, mask->var, ""); } @@ -504,9 +185,10 @@ void lp_build_mask_update(struct lp_build_mask_context *mask, LLVMValueRef value) { - mask->value = LLVMBuildAnd( mask->flow->builder, mask->value, value, ""); - - lp_build_mask_check(mask); + value = LLVMBuildAnd(mask->skip.builder, + lp_build_mask_value(mask), + value, ""); + LLVMBuildStore(mask->skip.builder, value, mask->var); } @@ -516,9 +198,8 @@ lp_build_mask_update(struct lp_build_mask_context *mask, LLVMValueRef lp_build_mask_end(struct lp_build_mask_context *mask) { - lp_build_flow_skip_end(mask->flow); - lp_build_flow_scope_end(mask->flow); - return mask->value; + lp_build_flow_skip_end(&mask->skip); + return lp_build_mask_value(mask); } @@ -528,59 +209,27 @@ lp_build_loop_begin(LLVMBuilderRef builder, LLVMValueRef start, struct lp_build_loop_state *state) { - LLVMBasicBlockRef block = LLVMGetInsertBlock(builder); - LLVMValueRef function = LLVMGetBasicBlockParent(block); + state->block = lp_build_insert_new_block(builder, "loop_begin"); - state->block = LLVMAppendBasicBlock(function, "loop"); + state->counter_var = lp_build_alloca(builder, LLVMTypeOf(start), "loop_counter"); + + LLVMBuildStore(builder, start, state->counter_var); LLVMBuildBr(builder, state->block); LLVMPositionBuilderAtEnd(builder, state->block); - state->counter = LLVMBuildPhi(builder, LLVMTypeOf(start), ""); - - LLVMAddIncoming(state->counter, &start, &block, 1); - + state->counter = LLVMBuildLoad(builder, state->counter_var, ""); } void -lp_build_loop_end(LLVMBuilderRef builder, - LLVMValueRef end, - LLVMValueRef step, - struct lp_build_loop_state *state) -{ - LLVMBasicBlockRef block = LLVMGetInsertBlock(builder); - LLVMValueRef function = LLVMGetBasicBlockParent(block); - LLVMValueRef next; - LLVMValueRef cond; - LLVMBasicBlockRef after_block; - - if (!step) - step = LLVMConstInt(LLVMTypeOf(end), 1, 0); - - next = LLVMBuildAdd(builder, state->counter, step, ""); - - cond = LLVMBuildICmp(builder, LLVMIntNE, next, end, ""); - - after_block = LLVMAppendBasicBlock(function, ""); - - LLVMBuildCondBr(builder, cond, after_block, state->block); - - 
LLVMAddIncoming(state->counter, &next, &block, 1); - - LLVMPositionBuilderAtEnd(builder, after_block); -} - -void lp_build_loop_end_cond(LLVMBuilderRef builder, LLVMValueRef end, LLVMValueRef step, - int llvm_cond, + LLVMIntPredicate llvm_cond, struct lp_build_loop_state *state) { - LLVMBasicBlockRef block = LLVMGetInsertBlock(builder); - LLVMValueRef function = LLVMGetBasicBlockParent(block); LLVMValueRef next; LLVMValueRef cond; LLVMBasicBlockRef after_block; @@ -590,15 +239,27 @@ lp_build_loop_end_cond(LLVMBuilderRef builder, next = LLVMBuildAdd(builder, state->counter, step, ""); + LLVMBuildStore(builder, next, state->counter_var); + cond = LLVMBuildICmp(builder, llvm_cond, next, end, ""); - after_block = LLVMAppendBasicBlock(function, ""); + after_block = lp_build_insert_new_block(builder, "loop_end"); LLVMBuildCondBr(builder, cond, after_block, state->block); - LLVMAddIncoming(state->counter, &next, &block, 1); - LLVMPositionBuilderAtEnd(builder, after_block); + + state->counter = LLVMBuildLoad(builder, state->counter_var, ""); +} + + +void +lp_build_loop_end(LLVMBuilderRef builder, + LLVMValueRef end, + LLVMValueRef step, + struct lp_build_loop_state *state) +{ + lp_build_loop_end_cond(builder, end, step, LLVMIntNE, state); } @@ -616,24 +277,16 @@ lp_build_loop_end_cond(LLVMBuilderRef builder, Is built with: - LLVMValueRef x = LLVMGetUndef(); // or something else + // x needs an alloca variable + x = lp_build_alloca(builder, type, "x"); - flow = lp_build_flow_create(builder); - lp_build_flow_scope_begin(flow); + lp_build_if(ctx, builder, cond); + LLVMBuildStore(LLVMBuildAdd(1, 2), x); + lp_build_else(ctx); + LLVMBuildStore(LLVMBuildAdd(2, 3). x); + lp_build_endif(ctx); - // x needs a phi node - lp_build_flow_scope_declare(flow, &x); - - lp_build_if(ctx, flow, builder, cond); - x = LLVMAdd(1, 2); - lp_build_else(ctx); - x = LLVMAdd(2, 3); - lp_build_endif(ctx); - - lp_build_flow_scope_end(flow); - - lp_build_flow_destroy(flow); */ @@ -642,47 +295,19 @@ lp_build_loop_end_cond(LLVMBuilderRef builder, * Begin an if/else/endif construct. 
*/ void -lp_build_if(struct lp_build_if_state *ctx, - struct lp_build_flow_context *flow, +lp_build_if(struct lp_build_if_state *ifthen, LLVMBuilderRef builder, LLVMValueRef condition) { LLVMBasicBlockRef block = LLVMGetInsertBlock(builder); - struct lp_build_flow_if *ifthen; - unsigned i; - - memset(ctx, 0, sizeof(*ctx)); - ctx->builder = builder; - ctx->flow = flow; - /* push/create new scope */ - ifthen = &lp_build_flow_push(flow, LP_BUILD_FLOW_IF)->ifthen; - assert(ifthen); - - ifthen->num_variables = flow->num_variables; + memset(ifthen, 0, sizeof *ifthen); + ifthen->builder = builder; ifthen->condition = condition; ifthen->entry_block = block; - /* create a Phi node for each variable in this flow scope */ - ifthen->phi = MALLOC(ifthen->num_variables * sizeof(*ifthen->phi)); - if (!ifthen->phi) { - ifthen->num_variables = 0; - return; - } - /* create endif/merge basic block for the phi functions */ ifthen->merge_block = lp_build_insert_new_block(builder, "endif-block"); - LLVMPositionBuilderAtEnd(builder, ifthen->merge_block); - - /* create a phi node for each variable */ - for (i = 0; i < flow->num_variables; i++) { - ifthen->phi[i] = LLVMBuildPhi(builder, LLVMTypeOf(*flow->variables[i]), ""); - - /* add add the initial value of the var from the entry block */ - if (!LLVMIsUndef(*flow->variables[i])) - LLVMAddIncoming(ifthen->phi[i], flow->variables[i], - &ifthen->entry_block, 1); - } /* create/insert true_block before merge_block */ ifthen->true_block = LLVMInsertBasicBlock(ifthen->merge_block, "if-true-block"); @@ -696,27 +321,16 @@ lp_build_if(struct lp_build_if_state *ctx, * Begin else-part of a conditional */ void -lp_build_else(struct lp_build_if_state *ctx) +lp_build_else(struct lp_build_if_state *ifthen) { - struct lp_build_flow_context *flow = ctx->flow; - struct lp_build_flow_if *ifthen; - unsigned i; - - ifthen = &lp_build_flow_peek(flow, LP_BUILD_FLOW_IF)->ifthen; - assert(ifthen); - - /* for each variable, update the Phi node with a (variable, block) pair */ - LLVMPositionBuilderAtEnd(ctx->builder, ifthen->merge_block); - for (i = 0; i < flow->num_variables; i++) { - assert(*flow->variables[i]); - LLVMAddIncoming(ifthen->phi[i], flow->variables[i], &ifthen->true_block, 1); - } + /* Append an unconditional Br(anch) instruction on the true_block */ + LLVMBuildBr(ifthen->builder, ifthen->merge_block); /* create/insert false_block before the merge block */ ifthen->false_block = LLVMInsertBasicBlock(ifthen->merge_block, "if-false-block"); /* successive code goes into the else block */ - LLVMPositionBuilderAtEnd(ctx->builder, ifthen->false_block); + LLVMPositionBuilderAtEnd(ifthen->builder, ifthen->false_block); } @@ -724,75 +338,30 @@ lp_build_else(struct lp_build_if_state *ctx) * End a conditional. 
*/ void -lp_build_endif(struct lp_build_if_state *ctx) +lp_build_endif(struct lp_build_if_state *ifthen) { - struct lp_build_flow_context *flow = ctx->flow; - struct lp_build_flow_if *ifthen; - LLVMBasicBlockRef curBlock = LLVMGetInsertBlock(ctx->builder); - unsigned i; - - ifthen = &lp_build_flow_pop(flow, LP_BUILD_FLOW_IF)->ifthen; - assert(ifthen); - /* Insert branch to the merge block from current block */ - LLVMBuildBr(ctx->builder, ifthen->merge_block); + LLVMBuildBr(ifthen->builder, ifthen->merge_block); - if (ifthen->false_block) { - LLVMPositionBuilderAtEnd(ctx->builder, ifthen->merge_block); - /* for each variable, update the Phi node with a (variable, block) pair */ - for (i = 0; i < flow->num_variables; i++) { - assert(*flow->variables[i]); - LLVMAddIncoming(ifthen->phi[i], flow->variables[i], &curBlock, 1); - /* replace the variable ref with the phi function */ - *flow->variables[i] = ifthen->phi[i]; - } - } - else { - /* no else clause */ - LLVMPositionBuilderAtEnd(ctx->builder, ifthen->merge_block); - for (i = 0; i < flow->num_variables; i++) { - assert(*flow->variables[i]); - LLVMAddIncoming(ifthen->phi[i], flow->variables[i], &ifthen->true_block, 1); - - /* replace the variable ref with the phi function */ - *flow->variables[i] = ifthen->phi[i]; - } - } - - FREE(ifthen->phi); - - /*** - *** Now patch in the various branch instructions. - ***/ + /* + * Now patch in the various branch instructions. + */ /* Insert the conditional branch instruction at the end of entry_block */ - LLVMPositionBuilderAtEnd(ctx->builder, ifthen->entry_block); + LLVMPositionBuilderAtEnd(ifthen->builder, ifthen->entry_block); if (ifthen->false_block) { /* we have an else clause */ - LLVMBuildCondBr(ctx->builder, ifthen->condition, + LLVMBuildCondBr(ifthen->builder, ifthen->condition, ifthen->true_block, ifthen->false_block); } else { /* no else clause */ - LLVMBuildCondBr(ctx->builder, ifthen->condition, + LLVMBuildCondBr(ifthen->builder, ifthen->condition, ifthen->true_block, ifthen->merge_block); } - /* Insert branch from end of true_block to merge_block */ - if (ifthen->false_block) { - /* Append an unconditional Br(anch) instruction on the true_block */ - LLVMPositionBuilderAtEnd(ctx->builder, ifthen->true_block); - LLVMBuildBr(ctx->builder, ifthen->merge_block); - } - else { - /* No else clause. - * Note that we've already inserted the branch at the end of - * true_block. See the very first LLVMBuildBr() call in this function. - */ - } - /* Resume building code at end of the ifthen->merge_block */ - LLVMPositionBuilderAtEnd(ctx->builder, ifthen->merge_block); + LLVMPositionBuilderAtEnd(ifthen->builder, ifthen->merge_block); } @@ -830,6 +399,7 @@ lp_build_alloca(LLVMBuilderRef builder, } res = LLVMBuildAlloca(first_builder, type, name); + LLVMBuildStore(builder, LLVMConstNull(type), res); LLVMDisposeBuilder(first_builder); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_flow.h b/src/gallium/auxiliary/gallivm/lp_bld_flow.h index fffb493a93..e729ee6eaa 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_flow.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.h @@ -41,52 +41,49 @@ struct lp_type; -struct lp_build_flow_context; - - -struct lp_build_flow_context * -lp_build_flow_create(LLVMBuilderRef builder); - -void -lp_build_flow_destroy(struct lp_build_flow_context *flow); - -void -lp_build_flow_scope_begin(struct lp_build_flow_context *flow); - -void -lp_build_flow_scope_declare(struct lp_build_flow_context *flow, - LLVMValueRef *variable); +/** + * Early exit. 
Useful to skip to the end of a function or block when + * the execution mask becomes zero or when there is an error condition. + */ +struct lp_build_skip_context +{ + LLVMBuilderRef builder; -void -lp_build_flow_scope_end(struct lp_build_flow_context *flow); + /** Block to skip to */ + LLVMBasicBlockRef block; +}; void -lp_build_flow_skip_begin(struct lp_build_flow_context *flow); +lp_build_flow_skip_begin(struct lp_build_skip_context *ctx, + LLVMBuilderRef builder); void -lp_build_flow_skip_cond_break(struct lp_build_flow_context *flow, +lp_build_flow_skip_cond_break(struct lp_build_skip_context *ctx, LLVMValueRef cond); void -lp_build_flow_skip_end(struct lp_build_flow_context *flow); +lp_build_flow_skip_end(struct lp_build_skip_context *ctx); struct lp_build_mask_context { - struct lp_build_flow_context *flow; + struct lp_build_skip_context skip; LLVMTypeRef reg_type; - LLVMValueRef value; + LLVMValueRef var; }; void lp_build_mask_begin(struct lp_build_mask_context *mask, - struct lp_build_flow_context *flow, + LLVMBuilderRef builder, struct lp_type type, LLVMValueRef value); +LLVMValueRef +lp_build_mask_value(struct lp_build_mask_context *mask); + /** * Bitwise AND the mask with the given value, if a previous mask was set. */ @@ -94,6 +91,9 @@ void lp_build_mask_update(struct lp_build_mask_context *mask, LLVMValueRef value); +void +lp_build_mask_check(struct lp_build_mask_context *mask); + LLVMValueRef lp_build_mask_end(struct lp_build_mask_context *mask); @@ -108,6 +108,7 @@ lp_build_mask_end(struct lp_build_mask_context *mask); struct lp_build_loop_state { LLVMBasicBlockRef block; + LLVMValueRef counter_var; LLVMValueRef counter; }; @@ -128,22 +129,28 @@ void lp_build_loop_end_cond(LLVMBuilderRef builder, LLVMValueRef end, LLVMValueRef step, - int cond, /* LLVM condition */ + LLVMIntPredicate cond, struct lp_build_loop_state *state); +/** + * if/else/endif. + */ struct lp_build_if_state { LLVMBuilderRef builder; - struct lp_build_flow_context *flow; + LLVMValueRef condition; + LLVMBasicBlockRef entry_block; + LLVMBasicBlockRef true_block; + LLVMBasicBlockRef false_block; + LLVMBasicBlockRef merge_block; }; void lp_build_if(struct lp_build_if_state *ctx, - struct lp_build_flow_context *flow, LLVMBuilderRef builder, LLVMValueRef condition); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c index 0a5038bc98..2bce289555 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c @@ -35,6 +35,7 @@ #include "util/u_format.h" +#include "util/u_cpu_detect.h" #include "lp_bld_arit.h" #include "lp_bld_type.h" @@ -42,7 +43,7 @@ #include "lp_bld_conv.h" #include "lp_bld_gather.h" #include "lp_bld_format.h" - +#include "lp_bld_logic.h" /** * Extract Y, U, V channels from packed UYVY. 
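
For reference, the per-pixel extraction that the SoA code in the next hunk performs, written as scalar C (a sketch; i selects the even or odd pixel of the 32-bit pair, byte order as in the corrected comment below):

   #include <stdint.h>

   static inline void uyvy_extract(uint32_t packed, unsigned i,  /* i = 0 or 1 */
                                   uint8_t *y, uint8_t *u, uint8_t *v)
   {
      *y = (packed >> (16 * i + 8)) & 0xff;   /* Y0 or Y1 */
      *u =  packed                  & 0xff;   /* shared U */
      *v = (packed >> 16)           & 0xff;   /* shared V */
   }
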
@@ -59,7 +60,7 @@ uyvy_to_yuv_soa(LLVMBuilderRef builder, LLVMValueRef *v) { struct lp_type type; - LLVMValueRef shift, mask; + LLVMValueRef mask; memset(&type, 0, sizeof type); type.width = 32; @@ -69,14 +70,37 @@ uyvy_to_yuv_soa(LLVMBuilderRef builder, assert(lp_check_value(type, i)); /* - * y = (uyvy >> 16*i) & 0xff + * y = (uyvy >> (16*i + 8)) & 0xff * u = (uyvy ) & 0xff * v = (uyvy >> 16 ) & 0xff */ - shift = LLVMBuildMul(builder, i, lp_build_const_int_vec(type, 16), ""); - shift = LLVMBuildAdd(builder, shift, lp_build_const_int_vec(type, 8), ""); - *y = LLVMBuildLShr(builder, packed, shift, ""); +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) + /* + * Avoid shift with per-element count. + * No support on x86, gets translated to roughly 5 instructions + * per element. Didn't measure performance but cuts shader size + * by quite a bit (less difference if cpu has no sse4.1 support). + */ + if (util_cpu_caps.has_sse2 && n == 4) { + LLVMValueRef sel, tmp, tmp2; + struct lp_build_context bld32; + + lp_build_context_init(&bld32, builder, type); + + tmp = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(type, 8), ""); + tmp2 = LLVMBuildLShr(builder, tmp, lp_build_const_int_vec(type, 16), ""); + sel = lp_build_compare(builder, type, PIPE_FUNC_EQUAL, i, lp_build_const_int_vec(type, 0)); + *y = lp_build_select(&bld32, sel, tmp, tmp2); + } else +#endif + { + LLVMValueRef shift; + shift = LLVMBuildMul(builder, i, lp_build_const_int_vec(type, 16), ""); + shift = LLVMBuildAdd(builder, shift, lp_build_const_int_vec(type, 8), ""); + *y = LLVMBuildLShr(builder, packed, shift, ""); + } + *u = packed; *v = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(type, 16), ""); @@ -103,7 +127,7 @@ yuyv_to_yuv_soa(LLVMBuilderRef builder, LLVMValueRef *v) { struct lp_type type; - LLVMValueRef shift, mask; + LLVMValueRef mask; memset(&type, 0, sizeof type); type.width = 32; @@ -118,8 +142,30 @@ yuyv_to_yuv_soa(LLVMBuilderRef builder, * v = (yuyv >> 24 ) & 0xff */ - shift = LLVMBuildMul(builder, i, lp_build_const_int_vec(type, 16), ""); - *y = LLVMBuildLShr(builder, packed, shift, ""); +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) + /* + * Avoid shift with per-element count. + * No support on x86, gets translated to roughly 5 instructions + * per element. Didn't measure performance but cuts shader size + * by quite a bit (less difference if cpu has no sse4.1 support). 
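
In scalar terms, the SSE2 path above trades the per-lane variable shift for two constant shifts plus a select, roughly as follows (a sketch; the final & 0xff masking still happens separately):

   uint32_t tmp  = packed >> 8;            /* the i == 0 candidate (Y0)     */
   uint32_t tmp2 = tmp >> 16;              /* the i == 1 candidate (Y1)     */
   uint32_t y    = (i == 0) ? tmp : tmp2;  /* one compare + select per lane */
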
+ */ + if (util_cpu_caps.has_sse2 && n == 4) { + LLVMValueRef sel, tmp; + struct lp_build_context bld32; + + lp_build_context_init(&bld32, builder, type); + + tmp = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(type, 16), ""); + sel = lp_build_compare(builder, type, PIPE_FUNC_EQUAL, i, lp_build_const_int_vec(type, 0)); + *y = lp_build_select(&bld32, sel, packed, tmp); + } else +#endif + { + LLVMValueRef shift; + shift = LLVMBuildMul(builder, i, lp_build_const_int_vec(type, 16), ""); + *y = LLVMBuildLShr(builder, packed, shift, ""); + } + *u = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(type, 8), ""); *v = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(type, 24), ""); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c index 761f33b578..5598ca5c48 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_init.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c @@ -44,6 +44,7 @@ static const struct debug_named_value lp_bld_debug_flags[] = { { "asm", GALLIVM_DEBUG_ASM, NULL }, { "nopt", GALLIVM_DEBUG_NO_OPT, NULL }, { "perf", GALLIVM_DEBUG_PERF, NULL }, + { "no_brilinear", GALLIVM_DEBUG_NO_BRILINEAR, NULL }, DEBUG_NAMED_VALUE_END }; diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.h b/src/gallium/auxiliary/gallivm/lp_bld_init.h index f26fdac466..0b4b1ca7d1 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_init.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_init.h @@ -47,4 +47,10 @@ lp_build_init(void); extern void lp_func_delete_body(LLVMValueRef func); + +extern LLVMValueRef +lp_build_load_volatile(LLVMBuilderRef B, LLVMValueRef PointerVal, + const char *Name); + + #endif /* !LP_BLD_INIT_H */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c index d5c62a3f73..026b60ac36 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c @@ -92,9 +92,23 @@ lp_build_compare(LLVMBuilderRef builder, if(func == PIPE_FUNC_ALWAYS) return ones; - /* TODO: optimize the constant case */ +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) + /* + * There are no unsigned integer comparison instructions in SSE. 
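
For reference, the usual workaround is to flip the sign bit of both operands and use the signed comparison; the patch only warns about the inefficient case and does not do this, so the following is just a scalar sketch:

   #include <stdint.h>

   static inline int ult32(uint32_t a, uint32_t b)
   {
      /* unsigned a < b  is equivalent to  signed (a ^ 0x80000000) < (b ^ 0x80000000) */
      return (int32_t)(a ^ 0x80000000u) < (int32_t)(b ^ 0x80000000u);
   }
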
+ */ - /* XXX: It is not clear if we should use the ordered or unordered operators */ + if (!type.floating && !type.sign && + type.width * type.length == 128 && + util_cpu_caps.has_sse2 && + (func == PIPE_FUNC_LESS || + func == PIPE_FUNC_LEQUAL || + func == PIPE_FUNC_GREATER || + func == PIPE_FUNC_GEQUAL) && + (gallivm_debug & GALLIVM_DEBUG_PERF)) { + debug_printf("%s: inefficient <%u x i%u> unsigned comparison\n", + __FUNCTION__, type.length, type.width); + } +#endif #if HAVE_LLVM < 0x0207 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) @@ -225,6 +239,8 @@ lp_build_compare(LLVMBuilderRef builder, #endif #endif /* HAVE_LLVM < 0x0207 */ + /* XXX: It is not clear if we should use the ordered or unordered operators */ + if(type.floating) { LLVMRealPredicate op; switch(func) { @@ -446,10 +462,12 @@ lp_build_select(struct lp_build_context *bld, LLVMTypeRef arg_type; LLVMValueRef args[3]; - if (type.width == 64) { + if (type.floating && + type.width == 64) { intrinsic = "llvm.x86.sse41.blendvpd"; arg_type = LLVMVectorType(LLVMDoubleType(), 2); - } else if (type.width == 32) { + } else if (type.floating && + type.width == 32) { intrinsic = "llvm.x86.sse41.blendvps"; arg_type = LLVMVectorType(LLVMFloatType(), 4); } else { diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp index 48baf7c425..f56ddee7fd 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp +++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp @@ -178,3 +178,13 @@ lp_func_delete_body(LLVMValueRef FF) llvm::Function *func = llvm::unwrap<llvm::Function>(FF); func->deleteBody(); } + + +extern "C" +LLVMValueRef +lp_build_load_volatile(LLVMBuilderRef B, LLVMValueRef PointerVal, + const char *Name) +{ + return llvm::wrap(llvm::unwrap(B)->CreateLoad(llvm::unwrap(PointerVal), true, Name)); +} + diff --git a/src/gallium/auxiliary/gallivm/lp_bld_printf.c b/src/gallium/auxiliary/gallivm/lp_bld_printf.c index 153ba5b15b..f418e96aff 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_printf.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_printf.c @@ -29,6 +29,8 @@ #include "util/u_debug.h" #include "util/u_memory.h" +#include "util/u_string.h" +#include "lp_bld_const.h" #include "lp_bld_printf.h" @@ -119,3 +121,22 @@ lp_build_printf(LLVMBuilderRef builder, const char *fmt, ...) return LLVMBuildCall(builder, func_printf, params, argcount + 1, ""); } + + +/** + * Print a float[4] vector. 
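
Typical debugging use of the new helper (a sketch; the label string and the coords vector are placeholders):

   /* prints something like "texcoord 0.500000 0.250000 0.000000 1.000000" at run time */
   lp_build_print_vec4(builder, "texcoord", coords);
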
+ */ +LLVMValueRef +lp_build_print_vec4(LLVMBuilderRef builder, const char *msg, LLVMValueRef vec) +{ + char format[1000]; + LLVMValueRef x, y, z, w; + + x = LLVMBuildExtractElement(builder, vec, lp_build_const_int32(0), ""); + y = LLVMBuildExtractElement(builder, vec, lp_build_const_int32(1), ""); + z = LLVMBuildExtractElement(builder, vec, lp_build_const_int32(2), ""); + w = LLVMBuildExtractElement(builder, vec, lp_build_const_int32(3), ""); + + util_snprintf(format, sizeof(format), "%s %%f %%f %%f %%f\n", msg); + return lp_build_printf(builder, format, x, y, z, w); +} diff --git a/src/gallium/auxiliary/gallivm/lp_bld_printf.h b/src/gallium/auxiliary/gallivm/lp_bld_printf.h index 83bd8f1d55..b6222c62eb 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_printf.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_printf.h @@ -35,5 +35,9 @@ LLVMValueRef lp_build_const_string_variable(LLVMModuleRef module, const char *str, int len); LLVMValueRef lp_build_printf(LLVMBuilderRef builder, const char *fmt, ...); +LLVMValueRef +lp_build_print_vec4(LLVMBuilderRef builder, const char *msg, LLVMValueRef vec); + + #endif diff --git a/src/gallium/auxiliary/gallivm/lp_bld_quad.c b/src/gallium/auxiliary/gallivm/lp_bld_quad.c index 7b1088939b..c18c8b4710 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_quad.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_quad.c @@ -81,11 +81,15 @@ LLVMValueRef lp_build_scalar_ddx(struct lp_build_context *bld, LLVMValueRef a) { - LLVMValueRef idx_left = LLVMConstInt(LLVMInt32Type(), LP_BLD_QUAD_TOP_LEFT, 0); - LLVMValueRef idx_right = LLVMConstInt(LLVMInt32Type(), LP_BLD_QUAD_TOP_RIGHT, 0); - LLVMValueRef a_left = LLVMBuildExtractElement(bld->builder, a, idx_left, ""); - LLVMValueRef a_right = LLVMBuildExtractElement(bld->builder, a, idx_right, ""); - return lp_build_sub(bld, a_right, a_left); + LLVMTypeRef i32t = LLVMInt32Type(); + LLVMValueRef idx_left = LLVMConstInt(i32t, LP_BLD_QUAD_TOP_LEFT, 0); + LLVMValueRef idx_right = LLVMConstInt(i32t, LP_BLD_QUAD_TOP_RIGHT, 0); + LLVMValueRef a_left = LLVMBuildExtractElement(bld->builder, a, idx_left, "left"); + LLVMValueRef a_right = LLVMBuildExtractElement(bld->builder, a, idx_right, "right"); + if (bld->type.floating) + return LLVMBuildFSub(bld->builder, a_right, a_left, "ddx"); + else + return LLVMBuildSub(bld->builder, a_right, a_left, "ddx"); } @@ -93,9 +97,13 @@ LLVMValueRef lp_build_scalar_ddy(struct lp_build_context *bld, LLVMValueRef a) { - LLVMValueRef idx_top = LLVMConstInt(LLVMInt32Type(), LP_BLD_QUAD_TOP_LEFT, 0); - LLVMValueRef idx_bottom = LLVMConstInt(LLVMInt32Type(), LP_BLD_QUAD_BOTTOM_LEFT, 0); - LLVMValueRef a_top = LLVMBuildExtractElement(bld->builder, a, idx_top, ""); - LLVMValueRef a_bottom = LLVMBuildExtractElement(bld->builder, a, idx_bottom, ""); - return lp_build_sub(bld, a_bottom, a_top); + LLVMTypeRef i32t = LLVMInt32Type(); + LLVMValueRef idx_top = LLVMConstInt(i32t, LP_BLD_QUAD_TOP_LEFT, 0); + LLVMValueRef idx_bottom = LLVMConstInt(i32t, LP_BLD_QUAD_BOTTOM_LEFT, 0); + LLVMValueRef a_top = LLVMBuildExtractElement(bld->builder, a, idx_top, "top"); + LLVMValueRef a_bottom = LLVMBuildExtractElement(bld->builder, a, idx_bottom, "bottom"); + if (bld->type.floating) + return LLVMBuildFSub(bld->builder, a_bottom, a_top, "ddy"); + else + return LLVMBuildSub(bld->builder, a_bottom, a_top, "ddy"); } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c index e89ee7c230..844d1d935b 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c +++ 
b/src/gallium/auxiliary/gallivm/lp_bld_sample.c @@ -39,12 +39,52 @@ #include "lp_bld_arit.h" #include "lp_bld_const.h" #include "lp_bld_debug.h" +#include "lp_bld_printf.h" #include "lp_bld_flow.h" #include "lp_bld_sample.h" #include "lp_bld_swizzle.h" #include "lp_bld_type.h" +/* + * Bri-linear factor. Should be greater than one. + */ +#define BRILINEAR_FACTOR 2 + + +/** + * Does the given texture wrap mode allow sampling the texture border color? + * XXX maybe move this into gallium util code. + */ +boolean +lp_sampler_wrap_mode_uses_border_color(unsigned mode, + unsigned min_img_filter, + unsigned mag_img_filter) +{ + switch (mode) { + case PIPE_TEX_WRAP_REPEAT: + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + case PIPE_TEX_WRAP_MIRROR_REPEAT: + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: + return FALSE; + case PIPE_TEX_WRAP_CLAMP: + case PIPE_TEX_WRAP_MIRROR_CLAMP: + if (min_img_filter == PIPE_TEX_FILTER_NEAREST && + mag_img_filter == PIPE_TEX_FILTER_NEAREST) { + return FALSE; + } else { + return TRUE; + } + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: + return TRUE; + default: + assert(0 && "unexpected wrap mode"); + return FALSE; + } +} + + /** * Initialize lp_sampler_static_state object with the gallium sampler * and texture state. @@ -93,31 +133,40 @@ lp_sampler_static_state(struct lp_sampler_static_state *state, state->wrap_r = sampler->wrap_r; state->min_img_filter = sampler->min_img_filter; state->mag_img_filter = sampler->mag_img_filter; - if (view->last_level) { + + if (view->last_level && sampler->max_lod > 0.0f) { state->min_mip_filter = sampler->min_mip_filter; } else { state->min_mip_filter = PIPE_TEX_MIPFILTER_NONE; } + if (state->min_mip_filter != PIPE_TEX_MIPFILTER_NONE) { + if (sampler->lod_bias != 0.0f) { + state->lod_bias_non_zero = 1; + } + + /* If min_lod == max_lod we can greatly simplify mipmap selection. + * This is a case that occurs during automatic mipmap generation. + */ + if (sampler->min_lod == sampler->max_lod) { + state->min_max_lod_equal = 1; + } else { + if (sampler->min_lod > 0.0f) { + state->apply_min_lod = 1; + } + + if (sampler->max_lod < (float)view->last_level) { + state->apply_max_lod = 1; + } + } + } + state->compare_mode = sampler->compare_mode; if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE) { state->compare_func = sampler->compare_func; } state->normalized_coords = sampler->normalized_coords; - state->lod_bias = sampler->lod_bias; - if (!view->last_level && - sampler->min_img_filter == sampler->mag_img_filter) { - state->min_lod = 0.0f; - state->max_lod = 0.0f; - } else { - state->min_lod = MAX2(sampler->min_lod, 0.0f); - state->max_lod = sampler->max_lod; - } - state->border_color[0] = sampler->border_color[0]; - state->border_color[1] = sampler->border_color[1]; - state->border_color[2] = sampler->border_color[2]; - state->border_color[3] = sampler->border_color[3]; /* * FIXME: Handle the remainder of pipe_sampler_view. @@ -126,6 +175,220 @@ lp_sampler_static_state(struct lp_sampler_static_state *state, /** + * Generate code to compute coordinate gradient (rho). + * \param ddx partial derivatives of (s, t, r, q) with respect to X + * \param ddy partial derivatives of (s, t, r, q) with respect to Y + * + * XXX: The resulting rho is scalar, so we ignore all but the first element of + * derivatives that are passed by the shader. 
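
In scalar terms, for the 2D case, the quantity computed below is roughly the following (a sketch; dsdx/dsdy/dtdx/dtdy stand for the first elements of the derivative vectors):

   float rho_s = MAX2(fabsf(dsdx), fabsf(dsdy)) * (float)width;
   float rho_t = MAX2(fabsf(dtdx), fabsf(dtdy)) * (float)height;
   float rho   = MAX2(rho_s, rho_t);   /* lod = log2(rho) is taken later */
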
+ */ +static LLVMValueRef +lp_build_rho(struct lp_build_sample_context *bld, + const LLVMValueRef ddx[4], + const LLVMValueRef ddy[4]) +{ + struct lp_build_context *float_size_bld = &bld->float_size_bld; + struct lp_build_context *float_bld = &bld->float_bld; + const unsigned dims = bld->dims; + LLVMTypeRef i32t = LLVMInt32Type(); + LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0); + LLVMValueRef index1 = LLVMConstInt(i32t, 1, 0); + LLVMValueRef index2 = LLVMConstInt(i32t, 2, 0); + LLVMValueRef dsdx, dsdy, dtdx, dtdy, drdx, drdy; + LLVMValueRef rho_x, rho_y; + LLVMValueRef rho_vec; + LLVMValueRef float_size; + LLVMValueRef rho; + + dsdx = ddx[0]; + dsdy = ddy[0]; + + if (dims <= 1) { + rho_x = dsdx; + rho_y = dsdy; + } + else { + rho_x = float_size_bld->undef; + rho_y = float_size_bld->undef; + + rho_x = LLVMBuildInsertElement(bld->builder, rho_x, dsdx, index0, ""); + rho_y = LLVMBuildInsertElement(bld->builder, rho_y, dsdy, index0, ""); + + dtdx = ddx[1]; + dtdy = ddy[1]; + + rho_x = LLVMBuildInsertElement(bld->builder, rho_x, dtdx, index1, ""); + rho_y = LLVMBuildInsertElement(bld->builder, rho_y, dtdy, index1, ""); + + if (dims >= 3) { + drdx = ddx[2]; + drdy = ddy[2]; + + rho_x = LLVMBuildInsertElement(bld->builder, rho_x, drdx, index2, ""); + rho_y = LLVMBuildInsertElement(bld->builder, rho_y, drdy, index2, ""); + } + } + + rho_x = lp_build_abs(float_size_bld, rho_x); + rho_y = lp_build_abs(float_size_bld, rho_y); + + rho_vec = lp_build_max(float_size_bld, rho_x, rho_y); + + float_size = lp_build_int_to_float(float_size_bld, bld->int_size); + + rho_vec = lp_build_mul(float_size_bld, rho_vec, float_size); + + if (dims <= 1) { + rho = rho_vec; + } + else { + if (dims >= 2) { + LLVMValueRef rho_s, rho_t, rho_r; + + rho_s = LLVMBuildExtractElement(bld->builder, rho_vec, index0, ""); + rho_t = LLVMBuildExtractElement(bld->builder, rho_vec, index1, ""); + + rho = lp_build_max(float_bld, rho_s, rho_t); + + if (dims >= 3) { + rho_r = LLVMBuildExtractElement(bld->builder, rho_vec, index0, ""); + rho = lp_build_max(float_bld, rho, rho_r); + } + } + } + + return rho; +} + + +/* + * Bri-linear lod computation + * + * Use a piece-wise linear approximation of log2 such that: + * - round to nearest, for values in the neighborhood of -1, 0, 1, 2, etc. + * - linear approximation for values in the neighborhood of 0.5, 1.5., etc, + * with the steepness specified in 'factor' + * - exact result for 0.5, 1.5, etc. + * + * + * 1.0 - /----* + * / + * / + * / + * 0.5 - * + * / + * / + * / + * 0.0 - *----/ + * + * | | + * 2^0 2^1 + * + * This is a technique also commonly used in hardware: + * - http://ixbtlabs.com/articles2/gffx/nv40-rx800-3.html + * + * TODO: For correctness, this should only be applied when texture is known to + * have regular mipmaps, i.e., mipmaps derived from the base level. + * + * TODO: This could be done in fixed point, where applicable. 
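
A scalar sketch of the adjustment implemented below, with f standing for BRILINEAR_FACTOR and floorf() playing the role of lp_build_ifloor_fract():

   float pre_offset  = (f - 0.5f) / f - 0.5f;
   float post_offset = 1.0f - f;
   float ipart, fpart;

   lod   = lod + pre_offset;
   ipart = floorf(lod);
   fpart = (lod - ipart) * f + post_offset;
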
+ */ +static void +lp_build_brilinear_lod(struct lp_build_context *bld, + LLVMValueRef lod, + double factor, + LLVMValueRef *out_lod_ipart, + LLVMValueRef *out_lod_fpart) +{ + LLVMValueRef lod_fpart; + double pre_offset = (factor - 0.5)/factor - 0.5; + double post_offset = 1 - factor; + + if (0) { + lp_build_printf(bld->builder, "lod = %f\n", lod); + } + + lod = lp_build_add(bld, lod, + lp_build_const_vec(bld->type, pre_offset)); + + lp_build_ifloor_fract(bld, lod, out_lod_ipart, &lod_fpart); + + lod_fpart = lp_build_mul(bld, lod_fpart, + lp_build_const_vec(bld->type, factor)); + + lod_fpart = lp_build_add(bld, lod_fpart, + lp_build_const_vec(bld->type, post_offset)); + + /* + * It's not necessary to clamp lod_fpart since: + * - the above expression will never produce numbers greater than one. + * - the mip filtering branch is only taken if lod_fpart is positive + */ + + *out_lod_fpart = lod_fpart; + + if (0) { + lp_build_printf(bld->builder, "lod_ipart = %i\n", *out_lod_ipart); + lp_build_printf(bld->builder, "lod_fpart = %f\n\n", *out_lod_fpart); + } +} + + +/* + * Combined log2 and brilinear lod computation. + * + * It's in all identical to calling lp_build_fast_log2() and + * lp_build_brilinear_lod() above, but by combining we can compute the interger + * and fractional part independently. + */ +static void +lp_build_brilinear_rho(struct lp_build_context *bld, + LLVMValueRef rho, + double factor, + LLVMValueRef *out_lod_ipart, + LLVMValueRef *out_lod_fpart) +{ + LLVMValueRef lod_ipart; + LLVMValueRef lod_fpart; + + const double pre_factor = (2*factor - 0.5)/(M_SQRT2*factor); + const double post_offset = 1 - 2*factor; + + assert(bld->type.floating); + + assert(lp_check_value(bld->type, rho)); + + /* + * The pre factor will make the intersections with the exact powers of two + * happen precisely where we want then to be, which means that the integer + * part will not need any post adjustments. + */ + rho = lp_build_mul(bld, rho, + lp_build_const_vec(bld->type, pre_factor)); + + /* ipart = ifloor(log2(rho)) */ + lod_ipart = lp_build_extract_exponent(bld, rho, 0); + + /* fpart = rho / 2**ipart */ + lod_fpart = lp_build_extract_mantissa(bld, rho); + + lod_fpart = lp_build_mul(bld, lod_fpart, + lp_build_const_vec(bld->type, factor)); + + lod_fpart = lp_build_add(bld, lod_fpart, + lp_build_const_vec(bld->type, post_offset)); + + /* + * Like lp_build_brilinear_lod, it's not necessary to clamp lod_fpart since: + * - the above expression will never produce numbers greater than one. + * - the mip filtering branch is only taken if lod_fpart is positive + */ + + *out_lod_ipart = lod_ipart; + *out_lod_fpart = lod_fpart; +} + + +/** * Generate code to compute texture level of detail (lambda). * \param ddx partial derivatives of (s, t, r, q) with respect to X * \param ddy partial derivatives of (s, t, r, q) with respect to Y @@ -138,83 +401,81 @@ lp_sampler_static_state(struct lp_sampler_static_state *state, * XXX: The resulting lod is scalar, so ignore all but the first element of * derivatives, lod_bias, etc that are passed by the shader. 
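
The lp_build_brilinear_rho() variant just above takes the integer part straight from the float's exponent field and the starting fraction from its mantissa; in scalar terms, ignoring the pre_factor scaling (a sketch, assuming rho is positive and normalized, with factor standing for BRILINEAR_FACTOR):

   union { float f; uint32_t i; } u = { rho };

   int   ipart = (int)((u.i >> 23) & 0xff) - 127;                     /* floor(log2(rho)) */
   float m     = 1.0f + (float)(u.i & 0x7fffff) * (1.0f / (1 << 23)); /* mantissa, [1, 2) */
   float fpart = m * factor + (1.0f - 2.0f * factor);
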
*/ -LLVMValueRef +void lp_build_lod_selector(struct lp_build_sample_context *bld, + unsigned unit, const LLVMValueRef ddx[4], const LLVMValueRef ddy[4], LLVMValueRef lod_bias, /* optional */ LLVMValueRef explicit_lod, /* optional */ - LLVMValueRef width, - LLVMValueRef height, - LLVMValueRef depth) + unsigned mip_filter, + LLVMValueRef *out_lod_ipart, + LLVMValueRef *out_lod_fpart) { - if (bld->static_state->min_lod == bld->static_state->max_lod) { + struct lp_build_context *float_bld = &bld->float_bld; + LLVMValueRef lod; + + *out_lod_ipart = bld->int_bld.zero; + *out_lod_fpart = bld->float_bld.zero; + + if (bld->static_state->min_max_lod_equal) { /* User is forcing sampling from a particular mipmap level. * This is hit during mipmap generation. */ - return LLVMConstReal(LLVMFloatType(), bld->static_state->min_lod); + LLVMValueRef min_lod = + bld->dynamic_state->min_lod(bld->dynamic_state, bld->builder, unit); + + lod = min_lod; } else { - struct lp_build_context *float_bld = &bld->float_bld; - LLVMValueRef sampler_lod_bias = LLVMConstReal(LLVMFloatType(), - bld->static_state->lod_bias); - LLVMValueRef min_lod = LLVMConstReal(LLVMFloatType(), - bld->static_state->min_lod); - LLVMValueRef max_lod = LLVMConstReal(LLVMFloatType(), - bld->static_state->max_lod); + LLVMValueRef sampler_lod_bias = + bld->dynamic_state->lod_bias(bld->dynamic_state, bld->builder, unit); LLVMValueRef index0 = LLVMConstInt(LLVMInt32Type(), 0, 0); - LLVMValueRef lod; if (explicit_lod) { lod = LLVMBuildExtractElement(bld->builder, explicit_lod, index0, ""); } else { - const int dims = texture_dims(bld->static_state->target); - LLVMValueRef dsdx, dsdy; - LLVMValueRef dtdx = NULL, dtdy = NULL, drdx = NULL, drdy = NULL; LLVMValueRef rho; - dsdx = LLVMBuildExtractElement(bld->builder, ddx[0], index0, "dsdx"); - dsdx = lp_build_abs(float_bld, dsdx); - dsdy = LLVMBuildExtractElement(bld->builder, ddy[0], index0, "dsdy"); - dsdy = lp_build_abs(float_bld, dsdy); - if (dims > 1) { - dtdx = LLVMBuildExtractElement(bld->builder, ddx[1], index0, "dtdx"); - dtdx = lp_build_abs(float_bld, dtdx); - dtdy = LLVMBuildExtractElement(bld->builder, ddy[1], index0, "dtdy"); - dtdy = lp_build_abs(float_bld, dtdy); - if (dims > 2) { - drdx = LLVMBuildExtractElement(bld->builder, ddx[2], index0, "drdx"); - drdx = lp_build_abs(float_bld, drdx); - drdy = LLVMBuildExtractElement(bld->builder, ddy[2], index0, "drdy"); - drdy = lp_build_abs(float_bld, drdy); - } - } + rho = lp_build_rho(bld, ddx, ddy); - /* Compute rho = max of all partial derivatives scaled by texture size. - * XXX this could be vectorized somewhat + /* + * Compute lod = log2(rho) */ - rho = LLVMBuildFMul(bld->builder, - lp_build_max(float_bld, dsdx, dsdy), - lp_build_int_to_float(float_bld, width), ""); - if (dims > 1) { - LLVMValueRef max; - max = LLVMBuildFMul(bld->builder, - lp_build_max(float_bld, dtdx, dtdy), - lp_build_int_to_float(float_bld, height), ""); - rho = lp_build_max(float_bld, rho, max); - if (dims > 2) { - max = LLVMBuildFMul(bld->builder, - lp_build_max(float_bld, drdx, drdy), - lp_build_int_to_float(float_bld, depth), ""); - rho = lp_build_max(float_bld, rho, max); + + if (!lod_bias && + !bld->static_state->lod_bias_non_zero && + !bld->static_state->apply_max_lod && + !bld->static_state->apply_min_lod) { + /* + * Special case when there are no post-log2 adjustments, which + * saves instructions but keeping the integer and fractional lod + * computations separate from the start. 
+ */ + + if (mip_filter == PIPE_TEX_MIPFILTER_NONE || + mip_filter == PIPE_TEX_MIPFILTER_NEAREST) { + *out_lod_ipart = lp_build_ilog2(float_bld, rho); + *out_lod_fpart = bld->float_bld.zero; + return; + } + if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR && + !(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) { + lp_build_brilinear_rho(float_bld, rho, BRILINEAR_FACTOR, + out_lod_ipart, out_lod_fpart); + return; } } - /* compute lod = log2(rho) */ - lod = lp_build_log2(float_bld, rho); + if (0) { + lod = lp_build_log2(float_bld, rho); + } + else { + lod = lp_build_fast_log2(float_bld, rho); + } /* add shader lod bias */ if (lod_bias) { @@ -225,13 +486,43 @@ lp_build_lod_selector(struct lp_build_sample_context *bld, } /* add sampler lod bias */ - lod = LLVMBuildFAdd(bld->builder, lod, sampler_lod_bias, "sampler_lod_bias"); + if (bld->static_state->lod_bias_non_zero) + lod = LLVMBuildFAdd(bld->builder, lod, sampler_lod_bias, "sampler_lod_bias"); + /* clamp lod */ - lod = lp_build_clamp(float_bld, lod, min_lod, max_lod); + if (bld->static_state->apply_max_lod) { + LLVMValueRef max_lod = + bld->dynamic_state->max_lod(bld->dynamic_state, bld->builder, unit); - return lod; + lod = lp_build_min(float_bld, lod, max_lod); + } + if (bld->static_state->apply_min_lod) { + LLVMValueRef min_lod = + bld->dynamic_state->min_lod(bld->dynamic_state, bld->builder, unit); + + lod = lp_build_max(float_bld, lod, min_lod); + } + } + + if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { + if (!(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) { + lp_build_brilinear_lod(float_bld, lod, BRILINEAR_FACTOR, + out_lod_ipart, out_lod_fpart); + } + else { + lp_build_ifloor_fract(float_bld, lod, out_lod_ipart, out_lod_fpart); + } + + lp_build_name(*out_lod_fpart, "lod_fpart"); + } + else { + *out_lod_ipart = lp_build_iround(float_bld, lod); } + + lp_build_name(*out_lod_ipart, "lod_ipart"); + + return; } @@ -245,10 +536,9 @@ lp_build_lod_selector(struct lp_build_sample_context *bld, void lp_build_nearest_mip_level(struct lp_build_sample_context *bld, unsigned unit, - LLVMValueRef lod, + LLVMValueRef lod_ipart, LLVMValueRef *level_out) { - struct lp_build_context *float_bld = &bld->float_bld; struct lp_build_context *int_bld = &bld->int_bld; LLVMValueRef last_level, level; @@ -258,7 +548,7 @@ lp_build_nearest_mip_level(struct lp_build_sample_context *bld, bld->builder, unit); /* convert float lod to integer */ - level = lp_build_iround(float_bld, lod); + level = lod_ipart; /* clamp level to legal range of levels */ *level_out = lp_build_clamp(int_bld, level, zero, last_level); @@ -273,43 +563,77 @@ lp_build_nearest_mip_level(struct lp_build_sample_context *bld, void lp_build_linear_mip_levels(struct lp_build_sample_context *bld, unsigned unit, - LLVMValueRef lod, + LLVMValueRef lod_ipart, + LLVMValueRef *lod_fpart_inout, LLVMValueRef *level0_out, - LLVMValueRef *level1_out, - LLVMValueRef *weight_out) + LLVMValueRef *level1_out) { - struct lp_build_context *float_bld = &bld->float_bld; + LLVMBuilderRef builder = bld->builder; struct lp_build_context *int_bld = &bld->int_bld; - LLVMValueRef last_level, level; + struct lp_build_context *float_bld = &bld->float_bld; + LLVMValueRef last_level; + LLVMValueRef clamp_min; + LLVMValueRef clamp_max; + + *level0_out = lod_ipart; + *level1_out = lp_build_add(int_bld, lod_ipart, int_bld->one); last_level = bld->dynamic_state->last_level(bld->dynamic_state, bld->builder, unit); - /* convert float lod to integer */ - level = lp_build_ifloor(float_bld, lod); - - /* compute level 0 and clamp to legal range 
of levels */ - *level0_out = lp_build_clamp(int_bld, level, - int_bld->zero, - last_level); - /* compute level 1 and clamp to legal range of levels */ - level = lp_build_add(int_bld, level, int_bld->one); - *level1_out = lp_build_clamp(int_bld, level, - int_bld->zero, - last_level); - - *weight_out = lp_build_fract(float_bld, lod); + /* + * Clamp both lod_ipart and lod_ipart + 1 to [0, last_level], with the + * minimum number of comparisons, and zeroing lod_fpart in the extreme + * ends in the process. + */ + + /* lod_ipart < 0 */ + clamp_min = LLVMBuildICmp(builder, LLVMIntSLT, + lod_ipart, int_bld->zero, + "clamp_lod_to_zero"); + + *level0_out = LLVMBuildSelect(builder, clamp_min, + int_bld->zero, *level0_out, ""); + + *level1_out = LLVMBuildSelect(builder, clamp_min, + int_bld->zero, *level1_out, ""); + + *lod_fpart_inout = LLVMBuildSelect(builder, clamp_min, + float_bld->zero, *lod_fpart_inout, ""); + + /* lod_ipart >= last_level */ + clamp_max = LLVMBuildICmp(builder, LLVMIntSGE, + lod_ipart, last_level, + "clamp_lod_to_last"); + + *level0_out = LLVMBuildSelect(builder, clamp_max, + last_level, *level0_out, ""); + + *level1_out = LLVMBuildSelect(builder, clamp_max, + last_level, *level1_out, ""); + + *lod_fpart_inout = LLVMBuildSelect(builder, clamp_max, + float_bld->zero, *lod_fpart_inout, ""); + + lp_build_name(*level0_out, "sampler%u_miplevel0", unit); + lp_build_name(*level1_out, "sampler%u_miplevel1", unit); + lp_build_name(*lod_fpart_inout, "sampler%u_mipweight", unit); } +/** + * Return pointer to a single mipmap level. + * \param data_array array of pointers to mipmap levels + * \param level integer mipmap level + */ LLVMValueRef lp_build_get_mipmap_level(struct lp_build_sample_context *bld, - LLVMValueRef data_array, LLVMValueRef level) + LLVMValueRef level) { LLVMValueRef indexes[2], data_ptr; indexes[0] = LLVMConstInt(LLVMInt32Type(), 0, 0); indexes[1] = level; - data_ptr = LLVMBuildGEP(bld->builder, data_array, indexes, 2, ""); + data_ptr = LLVMBuildGEP(bld->builder, bld->data_array, indexes, 2, ""); data_ptr = LLVMBuildLoad(bld->builder, data_ptr, ""); return data_ptr; } @@ -317,10 +641,10 @@ lp_build_get_mipmap_level(struct lp_build_sample_context *bld, LLVMValueRef lp_build_get_const_mipmap_level(struct lp_build_sample_context *bld, - LLVMValueRef data_array, int level) + int level) { LLVMValueRef lvl = LLVMConstInt(LLVMInt32Type(), level, 0); - return lp_build_get_mipmap_level(bld, data_array, lvl); + return lp_build_get_mipmap_level(bld, lvl); } @@ -329,13 +653,24 @@ lp_build_get_const_mipmap_level(struct lp_build_sample_context *bld, * Return max(1, base_size >> level); */ static LLVMValueRef -lp_build_minify(struct lp_build_sample_context *bld, +lp_build_minify(struct lp_build_context *bld, LLVMValueRef base_size, LLVMValueRef level) { - LLVMValueRef size = LLVMBuildLShr(bld->builder, base_size, level, "minify"); - size = lp_build_max(&bld->int_coord_bld, size, bld->int_coord_bld.one); - return size; + assert(lp_check_value(bld->type, base_size)); + assert(lp_check_value(bld->type, level)); + + if (level == bld->zero) { + /* if we're using mipmap level zero, no minification is needed */ + return base_size; + } + else { + LLVMValueRef size = + LLVMBuildLShr(bld->builder, base_size, level, "minify"); + assert(bld->type.sign); + size = lp_build_max(bld, size, bld->one); + return size; + } } @@ -364,71 +699,113 @@ lp_build_get_level_stride_vec(struct lp_build_sample_context *bld, */ void lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld, - unsigned dims, - 
LLVMValueRef width_vec, - LLVMValueRef height_vec, - LLVMValueRef depth_vec, - LLVMValueRef ilevel0, - LLVMValueRef ilevel1, - LLVMValueRef row_stride_array, - LLVMValueRef img_stride_array, - LLVMValueRef *width0_vec, - LLVMValueRef *width1_vec, - LLVMValueRef *height0_vec, - LLVMValueRef *height1_vec, - LLVMValueRef *depth0_vec, - LLVMValueRef *depth1_vec, - LLVMValueRef *row_stride0_vec, - LLVMValueRef *row_stride1_vec, - LLVMValueRef *img_stride0_vec, - LLVMValueRef *img_stride1_vec) + LLVMValueRef ilevel, + LLVMValueRef *out_size, + LLVMValueRef *row_stride_vec, + LLVMValueRef *img_stride_vec) { - const unsigned mip_filter = bld->static_state->min_mip_filter; - LLVMValueRef ilevel0_vec, ilevel1_vec; + const unsigned dims = bld->dims; + LLVMValueRef ilevel_vec; - ilevel0_vec = lp_build_broadcast_scalar(&bld->int_coord_bld, ilevel0); - if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) - ilevel1_vec = lp_build_broadcast_scalar(&bld->int_coord_bld, ilevel1); + ilevel_vec = lp_build_broadcast_scalar(&bld->int_size_bld, ilevel); /* - * Compute width, height, depth at mipmap level 'ilevel0' + * Compute width, height, depth at mipmap level 'ilevel' */ - *width0_vec = lp_build_minify(bld, width_vec, ilevel0_vec); + *out_size = lp_build_minify(&bld->int_size_bld, bld->int_size, ilevel_vec); + if (dims >= 2) { - *height0_vec = lp_build_minify(bld, height_vec, ilevel0_vec); - *row_stride0_vec = lp_build_get_level_stride_vec(bld, - row_stride_array, - ilevel0); + *row_stride_vec = lp_build_get_level_stride_vec(bld, + bld->row_stride_array, + ilevel); if (dims == 3 || bld->static_state->target == PIPE_TEXTURE_CUBE) { - *img_stride0_vec = lp_build_get_level_stride_vec(bld, - img_stride_array, - ilevel0); - if (dims == 3) { - *depth0_vec = lp_build_minify(bld, depth_vec, ilevel0_vec); - } + *img_stride_vec = lp_build_get_level_stride_vec(bld, + bld->img_stride_array, + ilevel); } } - if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { - /* compute width, height, depth for second mipmap level at 'ilevel1' */ - *width1_vec = lp_build_minify(bld, width_vec, ilevel1_vec); - if (dims >= 2) { - *height1_vec = lp_build_minify(bld, height_vec, ilevel1_vec); - *row_stride1_vec = lp_build_get_level_stride_vec(bld, - row_stride_array, - ilevel1); - if (dims == 3 || bld->static_state->target == PIPE_TEXTURE_CUBE) { - *img_stride1_vec = lp_build_get_level_stride_vec(bld, - img_stride_array, - ilevel1); - if (dims == 3) { - *depth1_vec = lp_build_minify(bld, depth_vec, ilevel1_vec); - } - } +} + + +/** + * Extract and broadcast texture size. 
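For reference, the vectorized size computation above (lp_build_minify() together with lp_build_mipmap_level_sizes()) reduces to the following scalar logic; the helper names below are illustrative only and do not exist in the patch:

/* Scalar sketch: each mip level halves the base size but never drops
 * below one texel, i.e. max(1, base_size >> level). */
static unsigned
minify_sketch(unsigned base_size, unsigned level)
{
   unsigned size = base_size >> level;
   return size ? size : 1;
}

/* Per-level sizes for a hypothetical 2D texture. */
static void
mip_level_size_sketch(unsigned base_width, unsigned base_height,
                      unsigned level,
                      unsigned *width, unsigned *height)
{
   *width  = minify_sketch(base_width, level);
   *height = minify_sketch(base_height, level);
}
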
+ * + * @param size_type type of the texture size vector (either + * bld->int_size_type or bld->float_size_type) + * @param coord_type type of the texture size vector (either + * bld->int_coord_type or bld->coord_type) + * @param int_size vector with the integer texture size (width, height, + * depth) + */ +void +lp_build_extract_image_sizes(struct lp_build_sample_context *bld, + struct lp_type size_type, + struct lp_type coord_type, + LLVMValueRef size, + LLVMValueRef *out_width, + LLVMValueRef *out_height, + LLVMValueRef *out_depth) +{ + const unsigned dims = bld->dims; + LLVMTypeRef i32t = LLVMInt32Type(); + + *out_width = lp_build_extract_broadcast(bld->builder, + size_type, + coord_type, + size, + LLVMConstInt(i32t, 0, 0)); + if (dims >= 2) { + *out_height = lp_build_extract_broadcast(bld->builder, + size_type, + coord_type, + size, + LLVMConstInt(i32t, 1, 0)); + if (dims == 3) { + *out_depth = lp_build_extract_broadcast(bld->builder, + size_type, + coord_type, + size, + LLVMConstInt(i32t, 2, 0)); } } } +/** + * Unnormalize coords. + * + * @param int_size vector with the integer texture size (width, height, depth) + */ +void +lp_build_unnormalized_coords(struct lp_build_sample_context *bld, + LLVMValueRef flt_size, + LLVMValueRef *s, + LLVMValueRef *t, + LLVMValueRef *r) +{ + const unsigned dims = bld->dims; + LLVMValueRef width; + LLVMValueRef height; + LLVMValueRef depth; + + lp_build_extract_image_sizes(bld, + bld->float_size_type, + bld->coord_type, + flt_size, + &width, + &height, + &depth); + + /* s = s * width, t = t * height */ + *s = lp_build_mul(&bld->coord_bld, *s, width); + if (dims >= 2) { + *t = lp_build_mul(&bld->coord_bld, *t, height); + if (dims >= 3) { + *r = lp_build_mul(&bld->coord_bld, *r, depth); + } + } +} + /** Helper used by lp_build_cube_lookup() */ static LLVMValueRef @@ -547,25 +924,16 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld, rz_pos = LLVMBuildFCmp(bld->builder, LLVMRealUGE, rz, float_bld->zero, ""); { - struct lp_build_flow_context *flow_ctx; struct lp_build_if_state if_ctx; + LLVMValueRef face_s_var; + LLVMValueRef face_t_var; + LLVMValueRef face_var; - flow_ctx = lp_build_flow_create(bld->builder); - lp_build_flow_scope_begin(flow_ctx); - - *face_s = bld->coord_bld.undef; - *face_t = bld->coord_bld.undef; - *face = bld->int_bld.undef; - - lp_build_name(*face_s, "face_s"); - lp_build_name(*face_t, "face_t"); - lp_build_name(*face, "face"); + face_s_var = lp_build_alloca(bld->builder, bld->coord_bld.vec_type, "face_s_var"); + face_t_var = lp_build_alloca(bld->builder, bld->coord_bld.vec_type, "face_t_var"); + face_var = lp_build_alloca(bld->builder, bld->int_bld.vec_type, "face_var"); - lp_build_flow_scope_declare(flow_ctx, face_s); - lp_build_flow_scope_declare(flow_ctx, face_t); - lp_build_flow_scope_declare(flow_ctx, face); - - lp_build_if(&if_ctx, flow_ctx, bld->builder, arx_ge_ary_arz); + lp_build_if(&if_ctx, bld->builder, arx_ge_ary_arz); { /* +/- X face */ LLVMValueRef sign = lp_build_sgn(float_bld, rx); @@ -575,57 +943,52 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld, *face = lp_build_cube_face(bld, rx, PIPE_TEX_FACE_POS_X, PIPE_TEX_FACE_NEG_X); + LLVMBuildStore(bld->builder, *face_s, face_s_var); + LLVMBuildStore(bld->builder, *face_t, face_t_var); + LLVMBuildStore(bld->builder, *face, face_var); } lp_build_else(&if_ctx); { - struct lp_build_flow_context *flow_ctx2; struct lp_build_if_state if_ctx2; - LLVMValueRef face_s2 = bld->coord_bld.undef; - LLVMValueRef face_t2 = bld->coord_bld.undef; - LLVMValueRef face2 = 
bld->int_bld.undef; - - flow_ctx2 = lp_build_flow_create(bld->builder); - lp_build_flow_scope_begin(flow_ctx2); - lp_build_flow_scope_declare(flow_ctx2, &face_s2); - lp_build_flow_scope_declare(flow_ctx2, &face_t2); - lp_build_flow_scope_declare(flow_ctx2, &face2); - ary_ge_arx_arz = LLVMBuildAnd(bld->builder, ary_ge_arx, ary_ge_arz, ""); - lp_build_if(&if_ctx2, flow_ctx2, bld->builder, ary_ge_arx_arz); + lp_build_if(&if_ctx2, bld->builder, ary_ge_arx_arz); { /* +/- Y face */ LLVMValueRef sign = lp_build_sgn(float_bld, ry); LLVMValueRef ima = lp_build_cube_ima(coord_bld, t); - face_s2 = lp_build_cube_coord(coord_bld, NULL, -1, s, ima); - face_t2 = lp_build_cube_coord(coord_bld, sign, -1, r, ima); - face2 = lp_build_cube_face(bld, ry, + *face_s = lp_build_cube_coord(coord_bld, NULL, -1, s, ima); + *face_t = lp_build_cube_coord(coord_bld, sign, -1, r, ima); + *face = lp_build_cube_face(bld, ry, PIPE_TEX_FACE_POS_Y, PIPE_TEX_FACE_NEG_Y); + LLVMBuildStore(bld->builder, *face_s, face_s_var); + LLVMBuildStore(bld->builder, *face_t, face_t_var); + LLVMBuildStore(bld->builder, *face, face_var); } lp_build_else(&if_ctx2); { /* +/- Z face */ LLVMValueRef sign = lp_build_sgn(float_bld, rz); LLVMValueRef ima = lp_build_cube_ima(coord_bld, r); - face_s2 = lp_build_cube_coord(coord_bld, sign, -1, s, ima); - face_t2 = lp_build_cube_coord(coord_bld, NULL, +1, t, ima); - face2 = lp_build_cube_face(bld, rz, + *face_s = lp_build_cube_coord(coord_bld, sign, -1, s, ima); + *face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima); + *face = lp_build_cube_face(bld, rz, PIPE_TEX_FACE_POS_Z, PIPE_TEX_FACE_NEG_Z); + LLVMBuildStore(bld->builder, *face_s, face_s_var); + LLVMBuildStore(bld->builder, *face_t, face_t_var); + LLVMBuildStore(bld->builder, *face, face_var); } lp_build_endif(&if_ctx2); - lp_build_flow_scope_end(flow_ctx2); - lp_build_flow_destroy(flow_ctx2); - *face_s = face_s2; - *face_t = face_t2; - *face = face2; } lp_build_endif(&if_ctx); - lp_build_flow_scope_end(flow_ctx); - lp_build_flow_destroy(flow_ctx); + + *face_s = LLVMBuildLoad(bld->builder, face_s_var, "face_s"); + *face_t = LLVMBuildLoad(bld->builder, face_t_var, "face_t"); + *face = LLVMBuildLoad(bld->builder, face_var, "face"); } } @@ -659,11 +1022,21 @@ lp_build_sample_partial_offset(struct lp_build_context *bld, * Pixel blocks have power of two dimensions. LLVM should convert the * rem/div to bit arithmetic. * TODO: Verify this. + * It does indeed BUT it does transform it to scalar (and back) when doing so + * (using roughly extract, shift/and, mov, unpack) (llvm 2.7). + * The generated code looks seriously unfunny and is quite expensive. 
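The replacement below leans on the usual power-of-two identities: for a power-of-two block_length, coord % block_length equals coord & (block_length - 1), and coord / block_length equals coord >> log2(block_length). A scalar sketch of the same split, with illustrative names:

/* Split a texel coordinate into a block coordinate and a sub-block
 * coordinate; block_length must be a power of two. */
static void
split_block_coord_sketch(unsigned coord, unsigned block_length,
                         unsigned *block_coord, unsigned *subcoord)
{
   unsigned shift = 0;
   while ((1u << shift) < block_length)
      shift++;                               /* shift == log2(block_length) */

   *subcoord    = coord & (block_length - 1);   /* coord % block_length */
   *block_coord = coord >> shift;               /* coord / block_length */
}
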
*/ - +#if 0 LLVMValueRef block_width = lp_build_const_int_vec(bld->type, block_length); subcoord = LLVMBuildURem(bld->builder, coord, block_width, ""); coord = LLVMBuildUDiv(bld->builder, coord, block_width, ""); +#else + unsigned logbase2 = util_unsigned_logbase2(block_length); + LLVMValueRef block_shift = lp_build_const_int_vec(bld->type, logbase2); + LLVMValueRef block_mask = lp_build_const_int_vec(bld->type, block_length - 1); + subcoord = LLVMBuildAnd(bld->builder, coord, block_mask, ""); + coord = LLVMBuildLShr(bld->builder, coord, block_shift, ""); +#endif } offset = lp_build_mul(bld, coord, stride); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h index 8b042d5242..ffed27cee8 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h @@ -82,12 +82,10 @@ struct lp_sampler_static_state unsigned compare_mode:1; unsigned compare_func:3; unsigned normalized_coords:1; - float lod_bias, min_lod, max_lod; - float border_color[4]; - - /* Aero hacks */ - unsigned force_nearest_s:1; - unsigned force_nearest_t:1; + unsigned min_max_lod_equal:1; /**< min_lod == max_lod ? */ + unsigned lod_bias_non_zero:1; + unsigned apply_min_lod:1; /**< min_lod > 0 ? */ + unsigned apply_max_lod:1; /**< max_lod < last_level ? */ }; @@ -104,45 +102,67 @@ struct lp_sampler_static_state struct lp_sampler_dynamic_state { - /** Obtain the base texture width. */ + /** Obtain the base texture width (returns int32) */ LLVMValueRef (*width)( const struct lp_sampler_dynamic_state *state, LLVMBuilderRef builder, unsigned unit); - /** Obtain the base texture height. */ + /** Obtain the base texture height (returns int32) */ LLVMValueRef (*height)( const struct lp_sampler_dynamic_state *state, LLVMBuilderRef builder, unsigned unit); - /** Obtain the base texture depth. */ + /** Obtain the base texture depth (returns int32) */ LLVMValueRef (*depth)( const struct lp_sampler_dynamic_state *state, LLVMBuilderRef builder, unsigned unit); - /** Obtain the number of mipmap levels (minus one). 
*/ + /** Obtain the number of mipmap levels minus one (returns int32) */ LLVMValueRef (*last_level)( const struct lp_sampler_dynamic_state *state, LLVMBuilderRef builder, unsigned unit); + /** Obtain stride in bytes between image rows/blocks (returns int32) */ LLVMValueRef (*row_stride)( const struct lp_sampler_dynamic_state *state, LLVMBuilderRef builder, unsigned unit); + /** Obtain stride in bytes between image slices (returns int32) */ LLVMValueRef (*img_stride)( const struct lp_sampler_dynamic_state *state, LLVMBuilderRef builder, unsigned unit); + /** Obtain pointer to array of pointers to mimpap levels */ LLVMValueRef (*data_ptr)( const struct lp_sampler_dynamic_state *state, LLVMBuilderRef builder, unsigned unit); + /** Obtain texture min lod (returns float) */ + LLVMValueRef + (*min_lod)(const struct lp_sampler_dynamic_state *state, + LLVMBuilderRef builder, unsigned unit); + + /** Obtain texture max lod (returns float) */ + LLVMValueRef + (*max_lod)(const struct lp_sampler_dynamic_state *state, + LLVMBuilderRef builder, unsigned unit); + + /** Obtain texture lod bias (returns float) */ + LLVMValueRef + (*lod_bias)(const struct lp_sampler_dynamic_state *state, + LLVMBuilderRef builder, unsigned unit); + + /** Obtain texture border color (returns ptr to float[4]) */ + LLVMValueRef + (*border_color)(const struct lp_sampler_dynamic_state *state, + LLVMBuilderRef builder, unsigned unit); }; @@ -159,10 +179,16 @@ struct lp_build_sample_context const struct util_format_description *format_desc; + /* See texture_dims() */ + unsigned dims; + /** regular scalar float type */ struct lp_type float_type; struct lp_build_context float_bld; + /** float vector type */ + struct lp_build_context float_vec_bld; + /** regular scalar float type */ struct lp_type int_type; struct lp_build_context int_bld; @@ -171,17 +197,32 @@ struct lp_build_sample_context struct lp_type coord_type; struct lp_build_context coord_bld; - /** Unsigned integer coordinates */ - struct lp_type uint_coord_type; - struct lp_build_context uint_coord_bld; - /** Signed integer coordinates */ struct lp_type int_coord_type; struct lp_build_context int_coord_bld; + /** Unsigned integer texture size */ + struct lp_type int_size_type; + struct lp_build_context int_size_bld; + + /** Unsigned integer texture size */ + struct lp_type float_size_type; + struct lp_build_context float_size_bld; + /** Output texels type and build context */ struct lp_type texel_type; struct lp_build_context texel_bld; + + /* Common dynamic state values */ + LLVMValueRef width; + LLVMValueRef height; + LLVMValueRef depth; + LLVMValueRef row_stride_array; + LLVMValueRef img_stride_array; + LLVMValueRef data_array; + + /** Integer vector with texture width, height, depth */ + LLVMValueRef int_size; }; @@ -218,7 +259,7 @@ apply_sampler_swizzle(struct lp_build_sample_context *bld, } -static INLINE int +static INLINE unsigned texture_dims(enum pipe_texture_target tex) { switch (tex) { @@ -237,6 +278,11 @@ texture_dims(enum pipe_texture_target tex) } +boolean +lp_sampler_wrap_mode_uses_border_color(unsigned mode, + unsigned min_img_filter, + unsigned mag_img_filter); + /** * Derive the sampler static state. 
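The new bld->dims field is documented as coming from texture_dims(); the switch body is not part of this hunk, but it presumably maps targets to coordinate counts along these lines (the case list below is an assumption, not taken from the patch):

#include <assert.h>
#include "pipe/p_defines.h"

/* Presumed mapping: 1D targets use one coordinate, 2D and cube targets
 * two, 3D targets three.  Illustrative sketch only. */
static unsigned
texture_dims_sketch(enum pipe_texture_target tex)
{
   switch (tex) {
   case PIPE_TEXTURE_1D:
      return 1;
   case PIPE_TEXTURE_2D:
   case PIPE_TEXTURE_CUBE:
      return 2;
   case PIPE_TEXTURE_3D:
      return 3;
   default:
      assert(0 && "unexpected texture target");
      return 2;
   }
}
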
*/ @@ -246,15 +292,16 @@ lp_sampler_static_state(struct lp_sampler_static_state *state, const struct pipe_sampler_state *sampler); -LLVMValueRef +void lp_build_lod_selector(struct lp_build_sample_context *bld, + unsigned unit, const LLVMValueRef ddx[4], const LLVMValueRef ddy[4], LLVMValueRef lod_bias, /* optional */ LLVMValueRef explicit_lod, /* optional */ - LLVMValueRef width, - LLVMValueRef height, - LLVMValueRef depth); + unsigned mip_filter, + LLVMValueRef *out_lod_ipart, + LLVMValueRef *out_lod_fpart); void lp_build_nearest_mip_level(struct lp_build_sample_context *bld, @@ -265,40 +312,44 @@ lp_build_nearest_mip_level(struct lp_build_sample_context *bld, void lp_build_linear_mip_levels(struct lp_build_sample_context *bld, unsigned unit, - LLVMValueRef lod, + LLVMValueRef lod_ipart, + LLVMValueRef *lod_fpart_inout, LLVMValueRef *level0_out, - LLVMValueRef *level1_out, - LLVMValueRef *weight_out); + LLVMValueRef *level1_out); LLVMValueRef lp_build_get_mipmap_level(struct lp_build_sample_context *bld, - LLVMValueRef data_array, LLVMValueRef level); + LLVMValueRef level); LLVMValueRef lp_build_get_const_mipmap_level(struct lp_build_sample_context *bld, - LLVMValueRef data_array, int level); + int level); void lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld, - unsigned dims, - LLVMValueRef width_vec, - LLVMValueRef height_vec, - LLVMValueRef depth_vec, - LLVMValueRef ilevel0, - LLVMValueRef ilevel1, - LLVMValueRef row_stride_array, - LLVMValueRef img_stride_array, - LLVMValueRef *width0_vec, - LLVMValueRef *width1_vec, - LLVMValueRef *height0_vec, - LLVMValueRef *height1_vec, - LLVMValueRef *depth0_vec, - LLVMValueRef *depth1_vec, - LLVMValueRef *row_stride0_vec, - LLVMValueRef *row_stride1_vec, - LLVMValueRef *img_stride0_vec, - LLVMValueRef *img_stride1_vec); + LLVMValueRef ilevel, + LLVMValueRef *out_size_vec, + LLVMValueRef *row_stride_vec, + LLVMValueRef *img_stride_vec); + + +void +lp_build_extract_image_sizes(struct lp_build_sample_context *bld, + struct lp_type size_type, + struct lp_type coord_type, + LLVMValueRef size, + LLVMValueRef *out_width, + LLVMValueRef *out_height, + LLVMValueRef *out_depth); + + +void +lp_build_unnormalized_coords(struct lp_build_sample_context *bld, + LLVMValueRef flt_size, + LLVMValueRef *s, + LLVMValueRef *t, + LLVMValueRef *r); void diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c index 7e064900e7..d3e3b242af 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c @@ -45,6 +45,7 @@ #include "lp_bld_const.h" #include "lp_bld_conv.h" #include "lp_bld_arit.h" +#include "lp_bld_bitarit.h" #include "lp_bld_logic.h" #include "lp_bld_swizzle.h" #include "lp_bld_pack.h" @@ -80,20 +81,21 @@ lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld, LLVMValueRef *out_offset, LLVMValueRef *out_i) { - struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld; struct lp_build_context *int_coord_bld = &bld->int_coord_bld; LLVMValueRef length_minus_one; - length_minus_one = lp_build_sub(uint_coord_bld, length, uint_coord_bld->one); + length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one); switch(wrap_mode) { case PIPE_TEX_WRAP_REPEAT: if(is_pot) coord = LLVMBuildAnd(bld->builder, coord, length_minus_one, ""); - else - /* Signed remainder won't give the right results for negative - * dividends but unsigned remainder does.*/ + else { + /* Add a bias to the texcoord to handle negative 
coords */ + LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024); + coord = LLVMBuildAdd(bld->builder, coord, bias, ""); coord = LLVMBuildURem(bld->builder, coord, length, ""); + } break; case PIPE_TEX_WRAP_CLAMP_TO_EDGE: @@ -111,7 +113,7 @@ lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld, assert(0); } - lp_build_sample_partial_offset(uint_coord_bld, block_length, coord, stride, + lp_build_sample_partial_offset(int_coord_bld, block_length, coord, stride, out_offset, out_i); } @@ -144,7 +146,6 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld, LLVMValueRef *i0, LLVMValueRef *i1) { - struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld; struct lp_build_context *int_coord_bld = &bld->int_coord_bld; LLVMValueRef length_minus_one; LLVMValueRef lmask, umask, mask; @@ -186,8 +187,8 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld, * multiplication. */ - *i0 = uint_coord_bld->zero; - *i1 = uint_coord_bld->zero; + *i0 = int_coord_bld->zero; + *i1 = int_coord_bld->zero; length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one); @@ -197,17 +198,18 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld, coord0 = LLVMBuildAnd(bld->builder, coord0, length_minus_one, ""); } else { - /* Signed remainder won't give the right results for negative - * dividends but unsigned remainder does.*/ + /* Add a bias to the texcoord to handle negative coords */ + LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024); + coord0 = LLVMBuildAdd(bld->builder, coord0, bias, ""); coord0 = LLVMBuildURem(bld->builder, coord0, length, ""); } mask = lp_build_compare(bld->builder, int_coord_bld->type, PIPE_FUNC_NOTEQUAL, coord0, length_minus_one); - *offset0 = lp_build_mul(uint_coord_bld, coord0, stride); + *offset0 = lp_build_mul(int_coord_bld, coord0, stride); *offset1 = LLVMBuildAnd(bld->builder, - lp_build_add(uint_coord_bld, *offset0, stride), + lp_build_add(int_coord_bld, *offset0, stride), mask, ""); break; @@ -222,8 +224,8 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld, mask = LLVMBuildAnd(bld->builder, lmask, umask, ""); - *offset0 = lp_build_mul(uint_coord_bld, coord0, stride); - *offset1 = lp_build_add(uint_coord_bld, + *offset0 = lp_build_mul(int_coord_bld, coord0, stride); + *offset1 = lp_build_add(int_coord_bld, *offset0, LLVMBuildAnd(bld->builder, stride, mask, "")); break; @@ -236,8 +238,8 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld, case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: default: assert(0); - *offset0 = uint_coord_bld->zero; - *offset1 = uint_coord_bld->zero; + *offset0 = int_coord_bld->zero; + *offset1 = int_coord_bld->zero; break; } } @@ -250,9 +252,7 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld, */ static void lp_build_sample_image_nearest(struct lp_build_sample_context *bld, - LLVMValueRef width_vec, - LLVMValueRef height_vec, - LLVMValueRef depth_vec, + LLVMValueRef int_size, LLVMValueRef row_stride_vec, LLVMValueRef img_stride_vec, LLVMValueRef data_ptr, @@ -262,11 +262,12 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld, LLVMValueRef *colors_lo, LLVMValueRef *colors_hi) { - const int dims = texture_dims(bld->static_state->target); + const unsigned dims = bld->dims; LLVMBuilderRef builder = bld->builder; struct lp_build_context i32, h16, u8n; LLVMTypeRef i32_vec_type, h16_vec_type, u8n_vec_type; LLVMValueRef i32_c8; + LLVMValueRef width_vec, height_vec, depth_vec; LLVMValueRef 
s_ipart, t_ipart, r_ipart; LLVMValueRef x_stride; LLVMValueRef x_offset, offset; @@ -280,30 +281,33 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld, h16_vec_type = lp_build_vec_type(h16.type); u8n_vec_type = lp_build_vec_type(u8n.type); + lp_build_extract_image_sizes(bld, + bld->int_size_type, + bld->int_coord_type, + int_size, + &width_vec, + &height_vec, + &depth_vec); + if (bld->static_state->normalized_coords) { - /* s = s * width, t = t * height */ - LLVMTypeRef coord_vec_type = lp_build_vec_type(bld->coord_type); - LLVMValueRef fp_width = LLVMBuildSIToFP(bld->builder, width_vec, - coord_vec_type, ""); - s = lp_build_mul(&bld->coord_bld, s, fp_width); - if (dims >= 2) { - LLVMValueRef fp_height = LLVMBuildSIToFP(bld->builder, height_vec, - coord_vec_type, ""); - t = lp_build_mul(&bld->coord_bld, t, fp_height); - if (dims >= 3) { - LLVMValueRef fp_depth = LLVMBuildSIToFP(bld->builder, depth_vec, - coord_vec_type, ""); - r = lp_build_mul(&bld->coord_bld, r, fp_depth); - } - } - } + LLVMValueRef scaled_size; + LLVMValueRef flt_size; - /* scale coords by 256 (8 fractional bits) */ - s = lp_build_mul_imm(&bld->coord_bld, s, 256); - if (dims >= 2) - t = lp_build_mul_imm(&bld->coord_bld, t, 256); - if (dims >= 3) - r = lp_build_mul_imm(&bld->coord_bld, r, 256); + /* scale size by 256 (8 fractional bits) */ + scaled_size = lp_build_shl_imm(&bld->int_size_bld, int_size, 8); + + flt_size = lp_build_int_to_float(&bld->float_size_bld, scaled_size); + + lp_build_unnormalized_coords(bld, flt_size, &s, &t, &r); + } + else { + /* scale coords by 256 (8 fractional bits) */ + s = lp_build_mul_imm(&bld->coord_bld, s, 256); + if (dims >= 2) + t = lp_build_mul_imm(&bld->coord_bld, t, 256); + if (dims >= 3) + r = lp_build_mul_imm(&bld->coord_bld, r, 256); + } /* convert float to int */ s = LLVMBuildFPToSI(builder, s, i32_vec_type, ""); @@ -321,7 +325,7 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld, r_ipart = LLVMBuildAShr(builder, r, i32_c8, ""); /* get pixel, row, image strides */ - x_stride = lp_build_const_vec(bld->uint_coord_bld.type, + x_stride = lp_build_const_vec(bld->int_coord_bld.type, bld->format_desc->block.bits/8); /* Do texcoord wrapping, compute texel offset */ @@ -340,7 +344,7 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld, bld->static_state->pot_height, bld->static_state->wrap_t, &y_offset, &y_subcoord); - offset = lp_build_add(&bld->uint_coord_bld, offset, y_offset); + offset = lp_build_add(&bld->int_coord_bld, offset, y_offset); if (dims >= 3) { LLVMValueRef z_offset; lp_build_sample_wrap_nearest_int(bld, @@ -349,13 +353,13 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld, bld->static_state->pot_height, bld->static_state->wrap_r, &z_offset, &z_subcoord); - offset = lp_build_add(&bld->uint_coord_bld, offset, z_offset); + offset = lp_build_add(&bld->int_coord_bld, offset, z_offset); } else if (bld->static_state->target == PIPE_TEXTURE_CUBE) { LLVMValueRef z_offset; /* The r coord is the cube face in [0,5] */ - z_offset = lp_build_mul(&bld->uint_coord_bld, r, img_stride_vec); - offset = lp_build_add(&bld->uint_coord_bld, offset, z_offset); + z_offset = lp_build_mul(&bld->int_coord_bld, r, img_stride_vec); + offset = lp_build_add(&bld->int_coord_bld, offset, z_offset); } } @@ -414,9 +418,7 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld, */ static void lp_build_sample_image_linear(struct lp_build_sample_context *bld, - LLVMValueRef width_vec, - LLVMValueRef height_vec, - LLVMValueRef 
depth_vec, + LLVMValueRef int_size, LLVMValueRef row_stride_vec, LLVMValueRef img_stride_vec, LLVMValueRef data_ptr, @@ -426,11 +428,12 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, LLVMValueRef *colors_lo, LLVMValueRef *colors_hi) { - const int dims = texture_dims(bld->static_state->target); + const unsigned dims = bld->dims; LLVMBuilderRef builder = bld->builder; struct lp_build_context i32, h16, u8n; LLVMTypeRef i32_vec_type, h16_vec_type, u8n_vec_type; LLVMValueRef i32_c8, i32_c128, i32_c255; + LLVMValueRef width_vec, height_vec, depth_vec; LLVMValueRef s_ipart, s_fpart, s_fpart_lo, s_fpart_hi; LLVMValueRef t_ipart, t_fpart, t_fpart_lo, t_fpart_hi; LLVMValueRef r_ipart, r_fpart, r_fpart_lo, r_fpart_hi; @@ -455,30 +458,33 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, h16_vec_type = lp_build_vec_type(h16.type); u8n_vec_type = lp_build_vec_type(u8n.type); + lp_build_extract_image_sizes(bld, + bld->int_size_type, + bld->int_coord_type, + int_size, + &width_vec, + &height_vec, + &depth_vec); + if (bld->static_state->normalized_coords) { - /* s = s * width, t = t * height */ - LLVMTypeRef coord_vec_type = lp_build_vec_type(bld->coord_type); - LLVMValueRef fp_width = LLVMBuildSIToFP(bld->builder, width_vec, - coord_vec_type, ""); - s = lp_build_mul(&bld->coord_bld, s, fp_width); - if (dims >= 2) { - LLVMValueRef fp_height = LLVMBuildSIToFP(bld->builder, height_vec, - coord_vec_type, ""); - t = lp_build_mul(&bld->coord_bld, t, fp_height); - } - if (dims >= 3) { - LLVMValueRef fp_depth = LLVMBuildSIToFP(bld->builder, depth_vec, - coord_vec_type, ""); - r = lp_build_mul(&bld->coord_bld, r, fp_depth); - } - } + LLVMValueRef scaled_size; + LLVMValueRef flt_size; - /* scale coords by 256 (8 fractional bits) */ - s = lp_build_mul_imm(&bld->coord_bld, s, 256); - if (dims >= 2) - t = lp_build_mul_imm(&bld->coord_bld, t, 256); - if (dims >= 3) - r = lp_build_mul_imm(&bld->coord_bld, r, 256); + /* scale size by 256 (8 fractional bits) */ + scaled_size = lp_build_shl_imm(&bld->int_size_bld, int_size, 8); + + flt_size = lp_build_int_to_float(&bld->float_size_bld, scaled_size); + + lp_build_unnormalized_coords(bld, flt_size, &s, &t, &r); + } + else { + /* scale coords by 256 (8 fractional bits) */ + s = lp_build_mul_imm(&bld->coord_bld, s, 256); + if (dims >= 2) + t = lp_build_mul_imm(&bld->coord_bld, t, 256); + if (dims >= 3) + r = lp_build_mul_imm(&bld->coord_bld, r, 256); + } /* convert float to int */ s = LLVMBuildFPToSI(builder, s, i32_vec_type, ""); @@ -489,10 +495,8 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, /* subtract 0.5 (add -128) */ i32_c128 = lp_build_const_int_vec(i32.type, -128); - if (!bld->static_state->force_nearest_s) { - s = LLVMBuildAdd(builder, s, i32_c128, ""); - } - if (dims >= 2 && !bld->static_state->force_nearest_t) { + s = LLVMBuildAdd(builder, s, i32_c128, ""); + if (dims >= 2) { t = LLVMBuildAdd(builder, t, i32_c128, ""); } if (dims >= 3) { @@ -516,7 +520,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, r_fpart = LLVMBuildAnd(builder, r, i32_c255, ""); /* get pixel, row and image strides */ - x_stride = lp_build_const_vec(bld->uint_coord_bld.type, + x_stride = lp_build_const_vec(bld->int_coord_bld.type, bld->format_desc->block.bits/8); y_stride = row_stride_vec; z_stride = img_stride_vec; @@ -547,9 +551,9 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, for (z = 0; z < 2; z++) { for (x = 0; x < 2; x++) { - offset[z][0][x] = lp_build_add(&bld->uint_coord_bld, + 
offset[z][0][x] = lp_build_add(&bld->int_coord_bld, offset[z][0][x], y_offset0); - offset[z][1][x] = lp_build_add(&bld->uint_coord_bld, + offset[z][1][x] = lp_build_add(&bld->int_coord_bld, offset[z][1][x], y_offset1); } } @@ -565,20 +569,20 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, &z_subcoord[0], &z_subcoord[1]); for (y = 0; y < 2; y++) { for (x = 0; x < 2; x++) { - offset[0][y][x] = lp_build_add(&bld->uint_coord_bld, + offset[0][y][x] = lp_build_add(&bld->int_coord_bld, offset[0][y][x], z_offset0); - offset[1][y][x] = lp_build_add(&bld->uint_coord_bld, + offset[1][y][x] = lp_build_add(&bld->int_coord_bld, offset[1][y][x], z_offset1); } } } else if (bld->static_state->target == PIPE_TEXTURE_CUBE) { LLVMValueRef z_offset; - z_offset = lp_build_mul(&bld->uint_coord_bld, r, img_stride_vec); + z_offset = lp_build_mul(&bld->int_coord_bld, r, img_stride_vec); for (y = 0; y < 2; y++) { for (x = 0; x < 2; x++) { /* The r coord is the cube face in [0,5] */ - offset[0][y][x] = lp_build_add(&bld->uint_coord_bld, + offset[0][y][x] = lp_build_add(&bld->int_coord_bld, offset[0][y][x], z_offset); } } @@ -709,82 +713,56 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, /* * Linear interpolation with 8.8 fixed point. */ - if (bld->static_state->force_nearest_s) { - /* special case 1-D lerp */ - packed_lo = lp_build_lerp(&h16, - t_fpart_lo, - neighbors_lo[0][0][0], - neighbors_lo[0][0][1]); - - packed_hi = lp_build_lerp(&h16, - t_fpart_hi, - neighbors_hi[0][1][0], - neighbors_hi[0][1][0]); - } - else if (bld->static_state->force_nearest_t) { - /* special case 1-D lerp */ + if (dims == 1) { + /* 1-D lerp */ packed_lo = lp_build_lerp(&h16, - s_fpart_lo, - neighbors_lo[0][0][0], - neighbors_lo[0][0][1]); + s_fpart_lo, + neighbors_lo[0][0][0], + neighbors_lo[0][0][1]); packed_hi = lp_build_lerp(&h16, - s_fpart_hi, - neighbors_hi[0][0][0], - neighbors_hi[0][0][1]); + s_fpart_hi, + neighbors_hi[0][0][0], + neighbors_hi[0][0][1]); } else { - /* general 1/2/3-D lerping */ - if (dims == 1) { - packed_lo = lp_build_lerp(&h16, - s_fpart_lo, - neighbors_lo[0][0][0], - neighbors_lo[0][0][1]); - - packed_hi = lp_build_lerp(&h16, - s_fpart_hi, - neighbors_hi[0][0][0], - neighbors_hi[0][0][1]); - } - else { - /* 2-D lerp */ - packed_lo = lp_build_lerp_2d(&h16, - s_fpart_lo, t_fpart_lo, - neighbors_lo[0][0][0], - neighbors_lo[0][0][1], - neighbors_lo[0][1][0], - neighbors_lo[0][1][1]); - - packed_hi = lp_build_lerp_2d(&h16, - s_fpart_hi, t_fpart_hi, - neighbors_hi[0][0][0], - neighbors_hi[0][0][1], - neighbors_hi[0][1][0], - neighbors_hi[0][1][1]); - - if (dims >= 3) { - LLVMValueRef packed_lo2, packed_hi2; - - /* lerp in the second z slice */ - packed_lo2 = lp_build_lerp_2d(&h16, - s_fpart_lo, t_fpart_lo, - neighbors_lo[1][0][0], - neighbors_lo[1][0][1], - neighbors_lo[1][1][0], - neighbors_lo[1][1][1]); - - packed_hi2 = lp_build_lerp_2d(&h16, - s_fpart_hi, t_fpart_hi, - neighbors_hi[1][0][0], - neighbors_hi[1][0][1], - neighbors_hi[1][1][0], - neighbors_hi[1][1][1]); - /* interp between two z slices */ - packed_lo = lp_build_lerp(&h16, r_fpart_lo, - packed_lo, packed_lo2); - packed_hi = lp_build_lerp(&h16, r_fpart_hi, - packed_hi, packed_hi2); - } + /* 2-D lerp */ + packed_lo = lp_build_lerp_2d(&h16, + s_fpart_lo, t_fpart_lo, + neighbors_lo[0][0][0], + neighbors_lo[0][0][1], + neighbors_lo[0][1][0], + neighbors_lo[0][1][1]); + + packed_hi = lp_build_lerp_2d(&h16, + s_fpart_hi, t_fpart_hi, + neighbors_hi[0][0][0], + neighbors_hi[0][0][1], + neighbors_hi[0][1][0], + 
neighbors_hi[0][1][1]); + + if (dims >= 3) { + LLVMValueRef packed_lo2, packed_hi2; + + /* lerp in the second z slice */ + packed_lo2 = lp_build_lerp_2d(&h16, + s_fpart_lo, t_fpart_lo, + neighbors_lo[1][0][0], + neighbors_lo[1][0][1], + neighbors_lo[1][1][0], + neighbors_lo[1][1][1]); + + packed_hi2 = lp_build_lerp_2d(&h16, + s_fpart_hi, t_fpart_hi, + neighbors_hi[1][0][0], + neighbors_hi[1][0][1], + neighbors_hi[1][1][0], + neighbors_hi[1][1][1]); + /* interp between two z slices */ + packed_lo = lp_build_lerp(&h16, r_fpart_lo, + packed_lo, packed_lo2); + packed_hi = lp_build_lerp(&h16, r_fpart_hi, + packed_hi, packed_hi2); } } @@ -806,76 +784,124 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld, LLVMValueRef s, LLVMValueRef t, LLVMValueRef r, + LLVMValueRef ilevel0, + LLVMValueRef ilevel1, LLVMValueRef lod_fpart, - LLVMValueRef width0_vec, - LLVMValueRef width1_vec, - LLVMValueRef height0_vec, - LLVMValueRef height1_vec, - LLVMValueRef depth0_vec, - LLVMValueRef depth1_vec, - LLVMValueRef row_stride0_vec, - LLVMValueRef row_stride1_vec, - LLVMValueRef img_stride0_vec, - LLVMValueRef img_stride1_vec, - LLVMValueRef data_ptr0, - LLVMValueRef data_ptr1, - LLVMValueRef *colors_lo, - LLVMValueRef *colors_hi) + LLVMValueRef colors_lo_var, + LLVMValueRef colors_hi_var) { + LLVMBuilderRef builder = bld->builder; + LLVMValueRef size0; + LLVMValueRef size1; + LLVMValueRef row_stride0_vec; + LLVMValueRef row_stride1_vec; + LLVMValueRef img_stride0_vec; + LLVMValueRef img_stride1_vec; + LLVMValueRef data_ptr0; + LLVMValueRef data_ptr1; LLVMValueRef colors0_lo, colors0_hi; LLVMValueRef colors1_lo, colors1_hi; + + /* sample the first mipmap level */ + lp_build_mipmap_level_sizes(bld, ilevel0, + &size0, + &row_stride0_vec, &img_stride0_vec); + data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0); if (img_filter == PIPE_TEX_FILTER_NEAREST) { - /* sample the first mipmap level */ lp_build_sample_image_nearest(bld, - width0_vec, height0_vec, depth0_vec, + size0, row_stride0_vec, img_stride0_vec, data_ptr0, s, t, r, &colors0_lo, &colors0_hi); - - if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { - /* sample the second mipmap level */ - lp_build_sample_image_nearest(bld, - width1_vec, height1_vec, depth1_vec, - row_stride1_vec, img_stride1_vec, - data_ptr1, s, t, r, - &colors1_lo, &colors1_hi); - } } else { assert(img_filter == PIPE_TEX_FILTER_LINEAR); - - /* sample the first mipmap level */ lp_build_sample_image_linear(bld, - width0_vec, height0_vec, depth0_vec, + size0, row_stride0_vec, img_stride0_vec, data_ptr0, s, t, r, &colors0_lo, &colors0_hi); - - if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { - /* sample the second mipmap level */ - lp_build_sample_image_linear(bld, - width1_vec, height1_vec, depth1_vec, - row_stride1_vec, img_stride1_vec, - data_ptr1, s, t, r, - &colors1_lo, &colors1_hi); - } } + /* Store the first level's colors in the output variables */ + LLVMBuildStore(builder, colors0_lo, colors_lo_var); + LLVMBuildStore(builder, colors0_hi, colors_hi_var); + if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { - /* interpolate samples from the two mipmap levels */ - struct lp_build_context h16; - lp_build_context_init(&h16, bld->builder, lp_type_ufixed(16)); - - *colors_lo = lp_build_lerp(&h16, lod_fpart, - colors0_lo, colors1_lo); - *colors_hi = lp_build_lerp(&h16, lod_fpart, - colors0_hi, colors1_hi); - } - else { - /* use first/only level's colors */ - *colors_lo = colors0_lo; - *colors_hi = colors0_hi; + LLVMValueRef h16_scale = LLVMConstReal(LLVMFloatType(), 256.0); + LLVMTypeRef 
i32_type = LLVMIntType(32); + struct lp_build_if_state if_ctx; + LLVMValueRef need_lerp; + + lod_fpart = LLVMBuildFMul(builder, lod_fpart, h16_scale, ""); + lod_fpart = LLVMBuildFPToSI(builder, lod_fpart, i32_type, "lod_fpart.fixed16"); + + /* need_lerp = lod_fpart > 0 */ + need_lerp = LLVMBuildICmp(builder, LLVMIntSGT, + lod_fpart, LLVMConstNull(i32_type), + "need_lerp"); + + lp_build_if(&if_ctx, builder, need_lerp); + { + struct lp_build_context h16_bld; + + lp_build_context_init(&h16_bld, builder, lp_type_ufixed(16)); + + /* sample the second mipmap level */ + lp_build_mipmap_level_sizes(bld, ilevel1, + &size1, + &row_stride1_vec, &img_stride1_vec); + data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1); + if (img_filter == PIPE_TEX_FILTER_NEAREST) { + lp_build_sample_image_nearest(bld, + size1, + row_stride1_vec, img_stride1_vec, + data_ptr1, s, t, r, + &colors1_lo, &colors1_hi); + } + else { + lp_build_sample_image_linear(bld, + size1, + row_stride1_vec, img_stride1_vec, + data_ptr1, s, t, r, + &colors1_lo, &colors1_hi); + } + + /* interpolate samples from the two mipmap levels */ + + lod_fpart = LLVMBuildTrunc(builder, lod_fpart, h16_bld.elem_type, ""); + lod_fpart = lp_build_broadcast_scalar(&h16_bld, lod_fpart); + +#if HAVE_LLVM == 0x208 + /* This is a work-around for a bug in LLVM 2.8. + * Evidently, something goes wrong in the construction of the + * lod_fpart short[8] vector. Adding this no-effect shuffle seems + * to force the vector to be properly constructed. + * Tested with mesa-demos/src/tests/mipmap_limits.c (press t, f). + */ + { + LLVMValueRef shuffles[8], shuffle; + int i; + assert(h16_bld.type.length <= Elements(shuffles)); + for (i = 0; i < h16_bld.type.length; i++) + shuffles[i] = lp_build_const_int32(2 * (i & 1)); + shuffle = LLVMConstVector(shuffles, h16_bld.type.length); + lod_fpart = LLVMBuildShuffleVector(builder, + lod_fpart, lod_fpart, + shuffle, ""); + } +#endif + + colors0_lo = lp_build_lerp(&h16_bld, lod_fpart, + colors0_lo, colors1_lo); + colors0_hi = lp_build_lerp(&h16_bld, lod_fpart, + colors0_hi, colors1_hi); + + LLVMBuildStore(builder, colors0_lo, colors_lo_var); + LLVMBuildStore(builder, colors0_hi, colors_hi_var); + } + lp_build_endif(&if_ctx); } } @@ -896,35 +922,22 @@ lp_build_sample_aos(struct lp_build_sample_context *bld, const LLVMValueRef *ddy, LLVMValueRef lod_bias, /* optional */ LLVMValueRef explicit_lod, /* optional */ - LLVMValueRef width, - LLVMValueRef height, - LLVMValueRef depth, - LLVMValueRef width_vec, - LLVMValueRef height_vec, - LLVMValueRef depth_vec, - LLVMValueRef row_stride_array, - LLVMValueRef img_stride_array, - LLVMValueRef data_array, LLVMValueRef texel_out[4]) { - struct lp_build_context *float_bld = &bld->float_bld; + struct lp_build_context *int_bld = &bld->int_bld; LLVMBuilderRef builder = bld->builder; const unsigned mip_filter = bld->static_state->min_mip_filter; const unsigned min_filter = bld->static_state->min_img_filter; const unsigned mag_filter = bld->static_state->mag_img_filter; - const int dims = texture_dims(bld->static_state->target); - LLVMValueRef lod = NULL, lod_fpart = NULL; + const unsigned dims = bld->dims; + LLVMValueRef lod_ipart = NULL, lod_fpart = NULL; LLVMValueRef ilevel0, ilevel1 = NULL; - LLVMValueRef width0_vec = NULL, height0_vec = NULL, depth0_vec = NULL; - LLVMValueRef width1_vec = NULL, height1_vec = NULL, depth1_vec = NULL; - LLVMValueRef row_stride0_vec = NULL, row_stride1_vec = NULL; - LLVMValueRef img_stride0_vec = NULL, img_stride1_vec = NULL; - LLVMValueRef data_ptr0, data_ptr1 = 
NULL; LLVMValueRef packed, packed_lo, packed_hi; LLVMValueRef unswizzled[4]; LLVMValueRef face_ddx[4], face_ddy[4]; - struct lp_build_context h16; - LLVMTypeRef h16_vec_type; + struct lp_build_context h16_bld; + LLVMTypeRef i32t = LLVMInt32Type(); + LLVMValueRef i32t_zero = LLVMConstInt(i32t, 0, 0); /* we only support the common/simple wrap modes at this time */ assert(lp_is_simple_wrap_mode(bld->static_state->wrap_s)); @@ -935,9 +948,7 @@ lp_build_sample_aos(struct lp_build_sample_context *bld, /* make 16-bit fixed-pt builder context */ - lp_build_context_init(&h16, builder, lp_type_ufixed(16)); - h16_vec_type = lp_build_vec_type(h16.type); - + lp_build_context_init(&h16_bld, builder, lp_type_ufixed(16)); /* cube face selection, compute pre-face coords, etc. */ if (bld->static_state->target == PIPE_TEXTURE_CUBE) { @@ -949,19 +960,18 @@ lp_build_sample_aos(struct lp_build_sample_context *bld, r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */ /* recompute ddx, ddy using the new (s,t) face texcoords */ - face_ddx[0] = lp_build_ddx(&bld->coord_bld, s); - face_ddx[1] = lp_build_ddx(&bld->coord_bld, t); + face_ddx[0] = lp_build_scalar_ddx(&bld->coord_bld, s); + face_ddx[1] = lp_build_scalar_ddx(&bld->coord_bld, t); face_ddx[2] = NULL; face_ddx[3] = NULL; - face_ddy[0] = lp_build_ddy(&bld->coord_bld, s); - face_ddy[1] = lp_build_ddy(&bld->coord_bld, t); + face_ddy[0] = lp_build_scalar_ddy(&bld->coord_bld, s); + face_ddy[1] = lp_build_scalar_ddy(&bld->coord_bld, t); face_ddy[2] = NULL; face_ddy[3] = NULL; ddx = face_ddx; ddy = face_ddy; } - /* * Compute the level of detail (float). */ @@ -970,15 +980,16 @@ lp_build_sample_aos(struct lp_build_sample_context *bld, /* Need to compute lod either to choose mipmap levels or to * distinguish between minification/magnification with one mipmap level. */ - lod = lp_build_lod_selector(bld, ddx, ddy, - lod_bias, explicit_lod, - width, height, depth); + lp_build_lod_selector(bld, unit, ddx, ddy, + lod_bias, explicit_lod, + mip_filter, + &lod_ipart, &lod_fpart); + } else { + lod_ipart = i32t_zero; } /* * Compute integer mipmap level(s) to fetch texels from: ilevel0, ilevel1 - * If mipfilter=linear, also compute the weight between the two - * mipmap levels: lod_fpart */ switch (mip_filter) { default: @@ -991,135 +1002,81 @@ lp_build_sample_aos(struct lp_build_sample_context *bld, * We should be able to set ilevel0 = const(0) but that causes * bad x86 code to be emitted. 
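The switch above picks the mipmap level(s) to fetch from. In scalar terms it amounts to the sketch below (names illustrative; lod_ipart is assumed to be already rounded as required for the NEAREST case, and when a min_lod is in effect the NONE case clamps to it, as the comment above notes). For MIPFILTER_LINEAR the two level colors are afterwards blended with lod_fpart, and the second fetch is skipped entirely when that weight is zero, which is what the need_lerp branch further up does.

#include "pipe/p_defines.h"
#include "util/u_math.h"   /* CLAMP() */

static void
choose_mip_levels_sketch(unsigned mip_filter, int lod_ipart, int last_level,
                         int *level0, int *level1)
{
   switch (mip_filter) {
   case PIPE_TEX_MIPFILTER_NONE:
      *level0 = *level1 = 0;                       /* sample the base level */
      break;
   case PIPE_TEX_MIPFILTER_NEAREST:
      *level0 = *level1 = CLAMP(lod_ipart, 0, last_level);
      break;
   case PIPE_TEX_MIPFILTER_LINEAR:
      *level0 = CLAMP(lod_ipart,     0, last_level);
      *level1 = CLAMP(lod_ipart + 1, 0, last_level);
      break;
   }
}
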
*/ - lod = lp_build_const_elem(bld->coord_bld.type, 0.0); - lp_build_nearest_mip_level(bld, unit, lod, &ilevel0); + assert(lod_ipart); + lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0); } else { - ilevel0 = LLVMConstInt(LLVMInt32Type(), 0, 0); + ilevel0 = i32t_zero; } break; case PIPE_TEX_MIPFILTER_NEAREST: - assert(lod); - lp_build_nearest_mip_level(bld, unit, lod, &ilevel0); + assert(lod_ipart); + lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0); break; case PIPE_TEX_MIPFILTER_LINEAR: - { - LLVMValueRef f256 = LLVMConstReal(LLVMFloatType(), 256.0); - LLVMValueRef i255 = lp_build_const_int32(255); - LLVMTypeRef i16_type = LLVMIntType(16); - - assert(lod); - - lp_build_linear_mip_levels(bld, unit, lod, &ilevel0, &ilevel1, - &lod_fpart); - lod_fpart = LLVMBuildFMul(builder, lod_fpart, f256, ""); - lod_fpart = lp_build_ifloor(&bld->float_bld, lod_fpart); - lod_fpart = LLVMBuildAnd(builder, lod_fpart, i255, ""); - lod_fpart = LLVMBuildTrunc(builder, lod_fpart, i16_type, ""); - lod_fpart = lp_build_broadcast_scalar(&h16, lod_fpart); - - /* the lod_fpart values will be fixed pt values in [0,1) */ - } + assert(lod_ipart); + assert(lod_fpart); + lp_build_linear_mip_levels(bld, unit, + lod_ipart, &lod_fpart, + &ilevel0, &ilevel1); break; } - /* compute image size(s) of source mipmap level(s) */ - lp_build_mipmap_level_sizes(bld, dims, width_vec, height_vec, depth_vec, - ilevel0, ilevel1, - row_stride_array, img_stride_array, - &width0_vec, &width1_vec, - &height0_vec, &height1_vec, - &depth0_vec, &depth1_vec, - &row_stride0_vec, &row_stride1_vec, - &img_stride0_vec, &img_stride1_vec); - /* - * Get pointer(s) to image data for mipmap level(s). + * Get/interpolate texture colors. */ - data_ptr0 = lp_build_get_mipmap_level(bld, data_array, ilevel0); - if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { - data_ptr1 = lp_build_get_mipmap_level(bld, data_array, ilevel1); - } + packed_lo = lp_build_alloca(builder, h16_bld.vec_type, "packed_lo"); + packed_hi = lp_build_alloca(builder, h16_bld.vec_type, "packed_hi"); - /* - * Get/interpolate texture colors. - */ if (min_filter == mag_filter) { /* no need to distinquish between minification and magnification */ - lp_build_sample_mipmap(bld, min_filter, mip_filter, - s, t, r, lod_fpart, - width0_vec, width1_vec, - height0_vec, height1_vec, - depth0_vec, depth1_vec, - row_stride0_vec, row_stride1_vec, - img_stride0_vec, img_stride1_vec, - data_ptr0, data_ptr1, - &packed_lo, &packed_hi); + lp_build_sample_mipmap(bld, + min_filter, mip_filter, + s, t, r, + ilevel0, ilevel1, lod_fpart, + packed_lo, packed_hi); } else { /* Emit conditional to choose min image filter or mag image filter * depending on the lod being > 0 or <= 0, respectively. 
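In scalar terms, the conditional emitted below reduces to a choice of image filter based on the sign of the integer LOD (the generated code tests lod_ipart >= 0):

/* Scalar equivalent of the minify/magnify decision (illustrative). */
static unsigned
choose_img_filter_sketch(int lod_ipart,
                         unsigned min_filter, unsigned mag_filter)
{
   return (lod_ipart >= 0) ? min_filter : mag_filter;
}
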
*/ - struct lp_build_flow_context *flow_ctx; struct lp_build_if_state if_ctx; LLVMValueRef minify; - flow_ctx = lp_build_flow_create(builder); - lp_build_flow_scope_begin(flow_ctx); - - packed_lo = LLVMGetUndef(h16_vec_type); - packed_hi = LLVMGetUndef(h16_vec_type); + /* minify = lod >= 0.0 */ + minify = LLVMBuildICmp(builder, LLVMIntSGE, + lod_ipart, int_bld->zero, ""); - lp_build_flow_scope_declare(flow_ctx, &packed_lo); - lp_build_flow_scope_declare(flow_ctx, &packed_hi); - - /* minify = lod > 0.0 */ - minify = LLVMBuildFCmp(builder, LLVMRealUGE, - lod, float_bld->zero, ""); - - lp_build_if(&if_ctx, flow_ctx, builder, minify); + lp_build_if(&if_ctx, builder, minify); { /* Use the minification filter */ - lp_build_sample_mipmap(bld, min_filter, mip_filter, - s, t, r, lod_fpart, - width0_vec, width1_vec, - height0_vec, height1_vec, - depth0_vec, depth1_vec, - row_stride0_vec, row_stride1_vec, - img_stride0_vec, img_stride1_vec, - data_ptr0, data_ptr1, - &packed_lo, &packed_hi); + lp_build_sample_mipmap(bld, + min_filter, mip_filter, + s, t, r, + ilevel0, ilevel1, lod_fpart, + packed_lo, packed_hi); } lp_build_else(&if_ctx); { /* Use the magnification filter */ - lp_build_sample_mipmap(bld, mag_filter, mip_filter, - s, t, r, lod_fpart, - width0_vec, width1_vec, - height0_vec, height1_vec, - depth0_vec, depth1_vec, - row_stride0_vec, row_stride1_vec, - img_stride0_vec, img_stride1_vec, - data_ptr0, data_ptr1, - &packed_lo, &packed_hi); + lp_build_sample_mipmap(bld, + mag_filter, PIPE_TEX_MIPFILTER_NONE, + s, t, r, + i32t_zero, NULL, NULL, + packed_lo, packed_hi); } lp_build_endif(&if_ctx); - - lp_build_flow_scope_end(flow_ctx); - lp_build_flow_destroy(flow_ctx); } - /* combine 'packed_lo', 'packed_hi' into 'packed' */ - { - struct lp_build_context h16, u8n; - - lp_build_context_init(&h16, builder, lp_type_ufixed(16)); - lp_build_context_init(&u8n, builder, lp_type_unorm(8)); - - packed = lp_build_pack2(builder, h16.type, u8n.type, - packed_lo, packed_hi); - } + /* + * combine the values stored in 'packed_lo' and 'packed_hi' variables + * into 'packed' + */ + packed = lp_build_pack2(builder, + h16_bld.type, lp_type_unorm(8), + LLVMBuildLoad(builder, packed_lo, ""), + LLVMBuildLoad(builder, packed_hi, "")); /* * Convert to SoA and swizzle. 
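Throughout the AoS path the colors are kept in unsigned 8.8 fixed point (the h16 build context) and only packed back to unorm8 at the end via lp_build_pack2(), as above. The per-channel blend that lp_build_lerp() performs in that type is roughly the following scalar operation (illustrative, assuming a weight scaled so that 256 represents 1.0):

#include <stdint.h>

/* 8.8 fixed-point lerp: a + (b - a) * w, with w in [0, 256]. */
static uint16_t
lerp_8_8_sketch(uint16_t a, uint16_t b, uint16_t w)
{
   int32_t delta = (int32_t)b - (int32_t)a;
   return (uint16_t)((int32_t)a + ((delta * (int32_t)w) >> 8));
}
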
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h index e1045bbbc2..5d9ecac4d5 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h @@ -50,15 +50,6 @@ lp_build_sample_aos(struct lp_build_sample_context *bld, const LLVMValueRef *ddy, LLVMValueRef lod_bias, /* optional */ LLVMValueRef explicit_lod, /* optional */ - LLVMValueRef width, - LLVMValueRef height, - LLVMValueRef depth, - LLVMValueRef width_vec, - LLVMValueRef height_vec, - LLVMValueRef depth_vec, - LLVMValueRef row_stride_array, - LLVMValueRef img_stride_array, - LLVMValueRef data_array, LLVMValueRef texel_out[4]); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c index 2d80db6dc9..53cc0c5f34 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c @@ -47,41 +47,18 @@ #include "lp_bld_arit.h" #include "lp_bld_bitarit.h" #include "lp_bld_logic.h" +#include "lp_bld_printf.h" #include "lp_bld_swizzle.h" #include "lp_bld_flow.h" #include "lp_bld_gather.h" #include "lp_bld_format.h" #include "lp_bld_sample.h" #include "lp_bld_sample_aos.h" +#include "lp_bld_struct.h" #include "lp_bld_quad.h" /** - * Does the given texture wrap mode allow sampling the texture border color? - * XXX maybe move this into gallium util code. - */ -static boolean -wrap_mode_uses_border_color(unsigned mode) -{ - switch (mode) { - case PIPE_TEX_WRAP_REPEAT: - case PIPE_TEX_WRAP_CLAMP_TO_EDGE: - case PIPE_TEX_WRAP_MIRROR_REPEAT: - case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: - return FALSE; - case PIPE_TEX_WRAP_CLAMP: - case PIPE_TEX_WRAP_CLAMP_TO_BORDER: - case PIPE_TEX_WRAP_MIRROR_CLAMP: - case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: - return TRUE; - default: - assert(0 && "unexpected wrap mode"); - return FALSE; - } -} - - -/** * Generate code to fetch a texel from a texture at int coords (x, y, z). * The computation depends on whether the texture is 1D, 2D or 3D. 
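The wrap_mode_uses_border_color() helper removed above moves into lp_bld_sample.c as lp_sampler_wrap_mode_uses_border_color(), now taking the min/mag filters as well. Its body is not part of this section; a plausible reading, kept here as an assumption, is that CLAMP and MIRROR_CLAMP stop counting as border-sampling modes when both filters are nearest, since nearest filtering never reaches past the edge texel:

#include <assert.h>
#include "pipe/p_compiler.h"
#include "pipe/p_defines.h"

/* Hedged sketch of the new predicate; the filter-dependent cases are an
 * assumption, the rest matches the removed helper above. */
static boolean
wrap_mode_uses_border_color_sketch(unsigned mode,
                                   unsigned min_img_filter,
                                   unsigned mag_img_filter)
{
   switch (mode) {
   case PIPE_TEX_WRAP_REPEAT:
   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
   case PIPE_TEX_WRAP_MIRROR_REPEAT:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
      return FALSE;
   case PIPE_TEX_WRAP_CLAMP:
   case PIPE_TEX_WRAP_MIRROR_CLAMP:
      /* assumed: only linear filtering can actually hit the border here */
      return min_img_filter != PIPE_TEX_FILTER_NEAREST ||
             mag_img_filter != PIPE_TEX_FILTER_NEAREST;
   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
      return TRUE;
   default:
      assert(0 && "unexpected wrap mode");
      return FALSE;
   }
}
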
* The result, texel, will be float vectors: @@ -92,6 +69,7 @@ wrap_mode_uses_border_color(unsigned mode) */ static void lp_build_sample_texel_soa(struct lp_build_sample_context *bld, + unsigned unit, LLVMValueRef width, LLVMValueRef height, LLVMValueRef depth, @@ -103,21 +81,27 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld, LLVMValueRef data_ptr, LLVMValueRef texel_out[4]) { - const int dims = texture_dims(bld->static_state->target); + const struct lp_sampler_static_state *static_state = bld->static_state; + const unsigned dims = bld->dims; struct lp_build_context *int_coord_bld = &bld->int_coord_bld; LLVMValueRef offset; LLVMValueRef i, j; LLVMValueRef use_border = NULL; /* use_border = x < 0 || x >= width || y < 0 || y >= height */ - if (wrap_mode_uses_border_color(bld->static_state->wrap_s)) { + if (lp_sampler_wrap_mode_uses_border_color(static_state->wrap_s, + static_state->min_img_filter, + static_state->mag_img_filter)) { LLVMValueRef b1, b2; b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, x, int_coord_bld->zero); b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, x, width); use_border = LLVMBuildOr(bld->builder, b1, b2, "b1_or_b2"); } - if (dims >= 2 && wrap_mode_uses_border_color(bld->static_state->wrap_t)) { + if (dims >= 2 && + lp_sampler_wrap_mode_uses_border_color(static_state->wrap_t, + static_state->min_img_filter, + static_state->mag_img_filter)) { LLVMValueRef b1, b2; b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, y, int_coord_bld->zero); b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, y, height); @@ -130,7 +114,10 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld, } } - if (dims == 3 && wrap_mode_uses_border_color(bld->static_state->wrap_r)) { + if (dims == 3 && + lp_sampler_wrap_mode_uses_border_color(static_state->wrap_r, + static_state->min_img_filter, + static_state->mag_img_filter)) { LLVMValueRef b1, b2; b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, z, int_coord_bld->zero); b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, z, depth); @@ -144,7 +131,7 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld, } /* convert x,y,z coords to linear offset from start of texture, in bytes */ - lp_build_sample_offset(&bld->uint_coord_bld, + lp_build_sample_offset(&bld->int_coord_bld, bld->format_desc, x, y, z, y_stride, z_stride, &offset, &i, &j); @@ -158,7 +145,7 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld, * coords which are out of bounds to become zero. Zero's guaranteed * to be inside the texture image. 
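Spelled out per lane, the branch-free border handling described above is: force the offset of any out-of-range texel to zero so the unconditional fetch stays inside the image, then overwrite those lanes with the border color afterwards via the lp_build_select() calls below. A scalar sketch with illustrative names:

/* Returns an offset that is safe to fetch from; *use_border is set for
 * coordinates outside [0, width).  Vectorized, the zeroing is done with
 * the andnot(offset, use_border) applied just below. */
static unsigned
border_safe_offset_sketch(int x, int width, unsigned offset, int *use_border)
{
   *use_border = (x < 0) || (x >= width);
   return *use_border ? 0u : offset;
}
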
*/ - offset = lp_build_andnot(&bld->uint_coord_bld, offset, use_border); + offset = lp_build_andnot(&bld->int_coord_bld, offset, use_border); } lp_build_fetch_rgba_soa(bld->builder, @@ -168,8 +155,6 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld, i, j, texel_out); - apply_sampler_swizzle(bld, texel_out); - /* * Note: if we find an app which frequently samples the texture border * we might want to implement a true conditional here to avoid sampling @@ -187,15 +172,22 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld, if (use_border) { /* select texel color or border color depending on use_border */ + LLVMValueRef border_color_ptr = + bld->dynamic_state->border_color(bld->dynamic_state, + bld->builder, unit); int chan; for (chan = 0; chan < 4; chan++) { LLVMValueRef border_chan = - lp_build_const_vec(bld->texel_type, - bld->static_state->border_color[chan]); + lp_build_array_get(bld->builder, border_color_ptr, + lp_build_const_int32(chan)); + LLVMValueRef border_chan_vec = + lp_build_broadcast_scalar(&bld->float_vec_bld, border_chan); texel_out[chan] = lp_build_select(&bld->texel_bld, use_border, - border_chan, texel_out[chan]); + border_chan_vec, texel_out[chan]); } } + + apply_sampler_swizzle(bld, texel_out); } @@ -210,11 +202,7 @@ lp_build_coord_mirror(struct lp_build_sample_context *bld, struct lp_build_context *int_coord_bld = &bld->int_coord_bld; LLVMValueRef fract, flr, isOdd; - /* fract = coord - floor(coord) */ - fract = lp_build_sub(coord_bld, coord, lp_build_floor(coord_bld, coord)); - - /* flr = ifloor(coord); */ - flr = lp_build_ifloor(coord_bld, coord); + lp_build_ifloor_fract(coord_bld, coord, &flr, &fract); /* isOdd = flr & 1 */ isOdd = LLVMBuildAnd(bld->builder, flr, int_coord_bld->one, ""); @@ -242,6 +230,7 @@ static void lp_build_sample_wrap_linear(struct lp_build_sample_context *bld, LLVMValueRef coord, LLVMValueRef length, + LLVMValueRef length_f, boolean is_pot, unsigned wrap_mode, LLVMValueRef *x0_out, @@ -250,10 +239,8 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld, { struct lp_build_context *coord_bld = &bld->coord_bld; struct lp_build_context *int_coord_bld = &bld->int_coord_bld; - struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld; LLVMValueRef half = lp_build_const_vec(coord_bld->type, 0.5); - LLVMValueRef length_f = lp_build_int_to_float(coord_bld, length); - LLVMValueRef length_minus_one = lp_build_sub(uint_coord_bld, length, uint_coord_bld->one); + LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one); LLVMValueRef coord0, coord1, weight; switch(wrap_mode) { @@ -261,21 +248,25 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld, /* mul by size and subtract 0.5 */ coord = lp_build_mul(coord_bld, coord, length_f); coord = lp_build_sub(coord_bld, coord, half); - /* convert to int */ - coord0 = lp_build_ifloor(coord_bld, coord); - coord1 = lp_build_add(uint_coord_bld, coord0, uint_coord_bld->one); - /* compute lerp weight */ - weight = lp_build_fract(coord_bld, coord); + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight); /* repeat wrap */ if (is_pot) { + coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); coord0 = LLVMBuildAnd(bld->builder, coord0, length_minus_one, ""); coord1 = LLVMBuildAnd(bld->builder, coord1, length_minus_one, ""); } else { - /* Signed remainder won't give the right results for negative - * dividends but unsigned remainder does.*/ + /* Add a bias to the texcoord to 
handle negative coords */ + LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024); + LLVMValueRef mask; + coord0 = LLVMBuildAdd(bld->builder, coord0, bias, ""); coord0 = LLVMBuildURem(bld->builder, coord0, length, ""); - coord1 = LLVMBuildURem(bld->builder, coord1, length, ""); + mask = lp_build_compare(bld->builder, int_coord_bld->type, + PIPE_FUNC_NOTEQUAL, coord0, length_minus_one); + coord1 = LLVMBuildAnd(bld->builder, + lp_build_add(int_coord_bld, coord0, int_coord_bld->one), + mask, ""); } break; @@ -290,53 +281,47 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld, coord = lp_build_sub(coord_bld, coord, half); - weight = lp_build_fract(coord_bld, coord); - coord0 = lp_build_ifloor(coord_bld, coord); + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight); coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); break; case PIPE_TEX_WRAP_CLAMP_TO_EDGE: - if (bld->static_state->normalized_coords) { - /* clamp to [0,1] */ - coord = lp_build_clamp(coord_bld, coord, coord_bld->zero, coord_bld->one); - /* mul by tex size and subtract 0.5 */ - coord = lp_build_mul(coord_bld, coord, length_f); + { + struct lp_build_context abs_coord_bld = bld->coord_bld; + abs_coord_bld.type.sign = FALSE; + + if (bld->static_state->normalized_coords) { + /* mul by tex size */ + coord = lp_build_mul(coord_bld, coord, length_f); + } + /* clamp to length max */ + coord = lp_build_min(coord_bld, coord, length_f); + /* subtract 0.5 */ coord = lp_build_sub(coord_bld, coord, half); + /* clamp to [0, length - 0.5] */ + coord = lp_build_max(coord_bld, coord, coord_bld->zero); + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight); + coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); + /* coord1 = min(coord1, length-1) */ + coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one); + break; } - else { - LLVMValueRef min, max; - /* clamp to [0.5, length - 0.5] */ - min = half; - max = lp_build_sub(coord_bld, length_f, min); - coord = lp_build_clamp(coord_bld, coord, min, max); - } - /* compute lerp weight */ - weight = lp_build_fract(coord_bld, coord); - /* coord0 = floor(coord); */ - coord0 = lp_build_ifloor(coord_bld, coord); - coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); - /* coord0 = max(coord0, 0) */ - coord0 = lp_build_max(int_coord_bld, coord0, int_coord_bld->zero); - /* coord1 = min(coord1, length-1) */ - coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one); - break; case PIPE_TEX_WRAP_CLAMP_TO_BORDER: { - LLVMValueRef min, max; + LLVMValueRef min; if (bld->static_state->normalized_coords) { /* scale coord to length */ coord = lp_build_mul(coord_bld, coord, length_f); } - /* clamp to [-0.5, length + 0.5] */ - min = lp_build_const_vec(coord_bld->type, -0.5F); - max = lp_build_sub(coord_bld, length_f, min); - coord = lp_build_clamp(coord_bld, coord, min, max); + /* was: clamp to [-0.5, length + 0.5], then sub 0.5 */ coord = lp_build_sub(coord_bld, coord, half); - /* compute lerp weight */ - weight = lp_build_fract(coord_bld, coord); - /* convert to int */ - coord0 = lp_build_ifloor(coord_bld, coord); + min = lp_build_const_vec(coord_bld->type, -1.0F); + coord = lp_build_clamp(coord_bld, coord, min, length_f); + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight); coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); } break; @@ -349,11 +334,8 @@ 
lp_build_sample_wrap_linear(struct lp_build_sample_context *bld, coord = lp_build_mul(coord_bld, coord, length_f); coord = lp_build_sub(coord_bld, coord, half); - /* compute lerp weight */ - weight = lp_build_fract(coord_bld, coord); - - /* convert to int coords */ - coord0 = lp_build_ifloor(coord_bld, coord); + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight); coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); /* coord0 = max(coord0, 0) */ @@ -375,15 +357,16 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld, coord = lp_build_sub(coord_bld, coord, half); - weight = lp_build_fract(coord_bld, coord); - coord0 = lp_build_ifloor(coord_bld, coord); + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight); coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); break; case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: { LLVMValueRef min, max; - + struct lp_build_context abs_coord_bld = bld->coord_bld; + abs_coord_bld.type.sign = FALSE; coord = lp_build_abs(coord_bld, coord); if (bld->static_state->normalized_coords) { @@ -398,16 +381,14 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld, coord = lp_build_sub(coord_bld, coord, half); - weight = lp_build_fract(coord_bld, coord); - coord0 = lp_build_ifloor(coord_bld, coord); + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight); coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); } break; case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: { - LLVMValueRef min, max; - coord = lp_build_abs(coord_bld, coord); if (bld->static_state->normalized_coords) { @@ -415,15 +396,13 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld, coord = lp_build_mul(coord_bld, coord, length_f); } - /* clamp to [-0.5, length + 0.5] */ - min = lp_build_negate(coord_bld, half); - max = lp_build_sub(coord_bld, length_f, min); - coord = lp_build_clamp(coord_bld, coord, min, max); - + /* was: clamp to [-0.5, length + 0.5] then sub 0.5 */ + /* skip -0.5 clamp (always positive), do sub first */ coord = lp_build_sub(coord_bld, coord, half); + coord = lp_build_min(coord_bld, coord, length_f); - weight = lp_build_fract(coord_bld, coord); - coord0 = lp_build_ifloor(coord_bld, coord); + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight); coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); } break; @@ -452,14 +431,13 @@ static LLVMValueRef lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld, LLVMValueRef coord, LLVMValueRef length, + LLVMValueRef length_f, boolean is_pot, unsigned wrap_mode) { struct lp_build_context *coord_bld = &bld->coord_bld; struct lp_build_context *int_coord_bld = &bld->int_coord_bld; - struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld; - LLVMValueRef length_f = lp_build_int_to_float(coord_bld, length); - LLVMValueRef length_minus_one = lp_build_sub(uint_coord_bld, length, uint_coord_bld->one); + LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one); LLVMValueRef icoord; switch(wrap_mode) { @@ -468,10 +446,12 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld, icoord = lp_build_ifloor(coord_bld, coord); if (is_pot) icoord = LLVMBuildAnd(bld->builder, icoord, length_minus_one, ""); - else - /* Signed remainder won't give the right results for negative - * dividends but unsigned remainder does.*/ + else { + 
/* Add a bias to the texcoord to handle negative coords */ + LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024); + icoord = LLVMBuildAdd(bld->builder, icoord, bias, ""); icoord = LLVMBuildURem(bld->builder, icoord, length, ""); + } break; case PIPE_TEX_WRAP_CLAMP: @@ -482,7 +462,8 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld, } /* floor */ - icoord = lp_build_ifloor(coord_bld, coord); + /* use itrunc instead since we clamp to 0 anyway */ + icoord = lp_build_itrunc(coord_bld, coord); /* clamp to [0, length - 1]. */ icoord = lp_build_clamp(int_coord_bld, icoord, int_coord_bld->zero, @@ -516,7 +497,8 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld, assert(bld->static_state->normalized_coords); coord = lp_build_mul(coord_bld, coord, length_f); - icoord = lp_build_ifloor(coord_bld, coord); + /* itrunc == ifloor here */ + icoord = lp_build_itrunc(coord_bld, coord); /* clamp to [0, length - 1] */ icoord = lp_build_min(int_coord_bld, icoord, length_minus_one); @@ -531,7 +513,8 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld, coord = lp_build_mul(coord_bld, coord, length_f); } - icoord = lp_build_ifloor(coord_bld, coord); + /* itrunc == ifloor here */ + icoord = lp_build_itrunc(coord_bld, coord); /* clamp to [0, length - 1] */ icoord = lp_build_min(int_coord_bld, icoord, length_minus_one); @@ -545,7 +528,8 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld, coord = lp_build_mul(coord_bld, coord, length_f); } - icoord = lp_build_ifloor(coord_bld, coord); + /* itrunc == ifloor here */ + icoord = lp_build_itrunc(coord_bld, coord); /* clamp to [0, length] */ icoord = lp_build_min(int_coord_bld, icoord, length); @@ -566,9 +550,8 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld, */ static void lp_build_sample_image_nearest(struct lp_build_sample_context *bld, - LLVMValueRef width_vec, - LLVMValueRef height_vec, - LLVMValueRef depth_vec, + unsigned unit, + LLVMValueRef size, LLVMValueRef row_stride_vec, LLVMValueRef img_stride_vec, LLVMValueRef data_ptr, @@ -577,26 +560,47 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld, LLVMValueRef r, LLVMValueRef colors_out[4]) { - const int dims = texture_dims(bld->static_state->target); + const unsigned dims = bld->dims; + LLVMValueRef width_vec; + LLVMValueRef height_vec; + LLVMValueRef depth_vec; + LLVMValueRef flt_size; + LLVMValueRef flt_width_vec; + LLVMValueRef flt_height_vec; + LLVMValueRef flt_depth_vec; LLVMValueRef x, y, z; + lp_build_extract_image_sizes(bld, + bld->int_size_type, + bld->int_coord_type, + size, + &width_vec, &height_vec, &depth_vec); + + flt_size = lp_build_int_to_float(&bld->float_size_bld, size); + + lp_build_extract_image_sizes(bld, + bld->float_size_type, + bld->coord_type, + flt_size, + &flt_width_vec, &flt_height_vec, &flt_depth_vec); + /* * Compute integer texcoords. 
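The CLAMP* nearest paths above switch from ifloor to itrunc, which is only equivalent because the coordinate is clamped to a non-negative range around the same point; for negative values floor and truncation diverge, which is why REPEAT still needs the floor (or the bias trick). A minimal illustration:

#include <math.h>
#include <stdio.h>

int main(void)
{
   float pos = 2.7f, neg = -2.7f;

   /* floor and truncation agree for non-negative values... */
   printf("%d %d\n", (int)floorf(pos), (int)pos);   /* 2 2 */

   /* ...but not for negative ones. */
   printf("%d %d\n", (int)floorf(neg), (int)neg);   /* -3 -2 */
   return 0;
}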
*/ - x = lp_build_sample_wrap_nearest(bld, s, width_vec, + x = lp_build_sample_wrap_nearest(bld, s, width_vec, flt_width_vec, bld->static_state->pot_width, bld->static_state->wrap_s); lp_build_name(x, "tex.x.wrapped"); if (dims >= 2) { - y = lp_build_sample_wrap_nearest(bld, t, height_vec, + y = lp_build_sample_wrap_nearest(bld, t, height_vec, flt_height_vec, bld->static_state->pot_height, bld->static_state->wrap_t); lp_build_name(y, "tex.y.wrapped"); if (dims == 3) { - z = lp_build_sample_wrap_nearest(bld, r, depth_vec, - bld->static_state->pot_height, + z = lp_build_sample_wrap_nearest(bld, r, depth_vec, flt_depth_vec, + bld->static_state->pot_depth, bld->static_state->wrap_r); lp_build_name(z, "tex.z.wrapped"); } @@ -614,7 +618,8 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld, /* * Get texture colors. */ - lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec, + lp_build_sample_texel_soa(bld, unit, + width_vec, height_vec, depth_vec, x, y, z, row_stride_vec, img_stride_vec, data_ptr, colors_out); @@ -627,9 +632,8 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld, */ static void lp_build_sample_image_linear(struct lp_build_sample_context *bld, - LLVMValueRef width_vec, - LLVMValueRef height_vec, - LLVMValueRef depth_vec, + unsigned unit, + LLVMValueRef size, LLVMValueRef row_stride_vec, LLVMValueRef img_stride_vec, LLVMValueRef data_ptr, @@ -638,16 +642,37 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, LLVMValueRef r, LLVMValueRef colors_out[4]) { - const int dims = texture_dims(bld->static_state->target); + const unsigned dims = bld->dims; + LLVMValueRef width_vec; + LLVMValueRef height_vec; + LLVMValueRef depth_vec; + LLVMValueRef flt_size; + LLVMValueRef flt_width_vec; + LLVMValueRef flt_height_vec; + LLVMValueRef flt_depth_vec; LLVMValueRef x0, y0, z0, x1, y1, z1; LLVMValueRef s_fpart, t_fpart, r_fpart; LLVMValueRef neighbors[2][2][4]; int chan; + lp_build_extract_image_sizes(bld, + bld->int_size_type, + bld->int_coord_type, + size, + &width_vec, &height_vec, &depth_vec); + + flt_size = lp_build_int_to_float(&bld->float_size_bld, size); + + lp_build_extract_image_sizes(bld, + bld->float_size_type, + bld->coord_type, + flt_size, + &flt_width_vec, &flt_height_vec, &flt_depth_vec); + /* * Compute integer texcoords. */ - lp_build_sample_wrap_linear(bld, s, width_vec, + lp_build_sample_wrap_linear(bld, s, width_vec, flt_width_vec, bld->static_state->pot_width, bld->static_state->wrap_s, &x0, &x1, &s_fpart); @@ -655,7 +680,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, lp_build_name(x1, "tex.x1.wrapped"); if (dims >= 2) { - lp_build_sample_wrap_linear(bld, t, height_vec, + lp_build_sample_wrap_linear(bld, t, height_vec, flt_height_vec, bld->static_state->pot_height, bld->static_state->wrap_t, &y0, &y1, &t_fpart); @@ -663,7 +688,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, lp_build_name(y1, "tex.y1.wrapped"); if (dims == 3) { - lp_build_sample_wrap_linear(bld, r, depth_vec, + lp_build_sample_wrap_linear(bld, r, depth_vec, flt_depth_vec, bld->static_state->pot_depth, bld->static_state->wrap_r, &z0, &z1, &r_fpart); @@ -688,11 +713,13 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, * Get texture colors. 
*/ /* get x0/x1 texels */ - lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec, + lp_build_sample_texel_soa(bld, unit, + width_vec, height_vec, depth_vec, x0, y0, z0, row_stride_vec, img_stride_vec, data_ptr, neighbors[0][0]); - lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec, + lp_build_sample_texel_soa(bld, unit, + width_vec, height_vec, depth_vec, x1, y0, z0, row_stride_vec, img_stride_vec, data_ptr, neighbors[0][1]); @@ -710,11 +737,13 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, LLVMValueRef colors0[4]; /* get x0/x1 texels at y1 */ - lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec, + lp_build_sample_texel_soa(bld, unit, + width_vec, height_vec, depth_vec, x0, y1, z0, row_stride_vec, img_stride_vec, data_ptr, neighbors[1][0]); - lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec, + lp_build_sample_texel_soa(bld, unit, + width_vec, height_vec, depth_vec, x1, y1, z0, row_stride_vec, img_stride_vec, data_ptr, neighbors[1][1]); @@ -734,19 +763,23 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, LLVMValueRef colors1[4]; /* get x0/x1/y0/y1 texels at z1 */ - lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec, + lp_build_sample_texel_soa(bld, unit, + width_vec, height_vec, depth_vec, x0, y0, z1, row_stride_vec, img_stride_vec, data_ptr, neighbors1[0][0]); - lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec, + lp_build_sample_texel_soa(bld, unit, + width_vec, height_vec, depth_vec, x1, y0, z1, row_stride_vec, img_stride_vec, data_ptr, neighbors1[0][1]); - lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec, + lp_build_sample_texel_soa(bld, unit, + width_vec, height_vec, depth_vec, x0, y1, z1, row_stride_vec, img_stride_vec, data_ptr, neighbors1[1][0]); - lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec, + lp_build_sample_texel_soa(bld, unit, + width_vec, height_vec, depth_vec, x1, y1, z1, row_stride_vec, img_stride_vec, data_ptr, neighbors1[1][1]); @@ -786,74 +819,98 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, */ static void lp_build_sample_mipmap(struct lp_build_sample_context *bld, + unsigned unit, unsigned img_filter, unsigned mip_filter, LLVMValueRef s, LLVMValueRef t, LLVMValueRef r, + LLVMValueRef ilevel0, + LLVMValueRef ilevel1, LLVMValueRef lod_fpart, - LLVMValueRef width0_vec, - LLVMValueRef width1_vec, - LLVMValueRef height0_vec, - LLVMValueRef height1_vec, - LLVMValueRef depth0_vec, - LLVMValueRef depth1_vec, - LLVMValueRef row_stride0_vec, - LLVMValueRef row_stride1_vec, - LLVMValueRef img_stride0_vec, - LLVMValueRef img_stride1_vec, - LLVMValueRef data_ptr0, - LLVMValueRef data_ptr1, LLVMValueRef *colors_out) { + LLVMBuilderRef builder = bld->builder; + LLVMValueRef size0; + LLVMValueRef size1; + LLVMValueRef row_stride0_vec; + LLVMValueRef row_stride1_vec; + LLVMValueRef img_stride0_vec; + LLVMValueRef img_stride1_vec; + LLVMValueRef data_ptr0; + LLVMValueRef data_ptr1; LLVMValueRef colors0[4], colors1[4]; - int chan; + unsigned chan; + /* sample the first mipmap level */ + lp_build_mipmap_level_sizes(bld, ilevel0, + &size0, + &row_stride0_vec, &img_stride0_vec); + data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0); if (img_filter == PIPE_TEX_FILTER_NEAREST) { - /* sample the first mipmap level */ - lp_build_sample_image_nearest(bld, - width0_vec, height0_vec, depth0_vec, + lp_build_sample_image_nearest(bld, unit, + size0, row_stride0_vec, img_stride0_vec, - data_ptr0, s, t, r, colors0); - - if 
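Once the x0/x1 neighbors have been fetched for each row (and, for 3D, for each slice), the filter itself is just nested lerps weighted by the fractional coordinates. A scalar sketch of the 2D case for one color channel; the names are ours:

static float
lerp(float a, float b, float w)
{
   return a + w * (b - a);
}

/* Bilinear blend of a 2x2 neighborhood n[y][x],
 * with s_fpart/t_fpart in [0,1). */
static float
bilinear(float n00, float n01, float n10, float n11,
         float s_fpart, float t_fpart)
{
   float top    = lerp(n00, n01, s_fpart);
   float bottom = lerp(n10, n11, s_fpart);
   return lerp(top, bottom, t_fpart);
}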
(mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { - /* sample the second mipmap level */ - lp_build_sample_image_nearest(bld, - width1_vec, height1_vec, depth1_vec, - row_stride1_vec, img_stride1_vec, - data_ptr1, s, t, r, colors1); - } + data_ptr0, s, t, r, + colors0); } else { assert(img_filter == PIPE_TEX_FILTER_LINEAR); - - /* sample the first mipmap level */ - lp_build_sample_image_linear(bld, - width0_vec, height0_vec, depth0_vec, + lp_build_sample_image_linear(bld, unit, + size0, row_stride0_vec, img_stride0_vec, - data_ptr0, s, t, r, colors0); + data_ptr0, s, t, r, + colors0); + } - if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { - /* sample the second mipmap level */ - lp_build_sample_image_linear(bld, - width1_vec, height1_vec, depth1_vec, - row_stride1_vec, img_stride1_vec, - data_ptr1, s, t, r, colors1); - } + /* Store the first level's colors in the output variables */ + for (chan = 0; chan < 4; chan++) { + LLVMBuildStore(builder, colors0[chan], colors_out[chan]); } if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { - /* interpolate samples from the two mipmap levels */ - for (chan = 0; chan < 4; chan++) { - colors_out[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart, + struct lp_build_if_state if_ctx; + LLVMValueRef need_lerp; + + /* need_lerp = lod_fpart > 0 */ + need_lerp = LLVMBuildFCmp(builder, LLVMRealUGT, + lod_fpart, + bld->float_bld.zero, + "need_lerp"); + + lp_build_if(&if_ctx, builder, need_lerp); + { + /* sample the second mipmap level */ + lp_build_mipmap_level_sizes(bld, ilevel1, + &size1, + &row_stride1_vec, &img_stride1_vec); + data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1); + if (img_filter == PIPE_TEX_FILTER_NEAREST) { + lp_build_sample_image_nearest(bld, unit, + size1, + row_stride1_vec, img_stride1_vec, + data_ptr1, s, t, r, + colors1); + } + else { + lp_build_sample_image_linear(bld, unit, + size1, + row_stride1_vec, img_stride1_vec, + data_ptr1, s, t, r, + colors1); + } + + /* interpolate samples from the two mipmap levels */ + + lod_fpart = lp_build_broadcast_scalar(&bld->texel_bld, lod_fpart); + + for (chan = 0; chan < 4; chan++) { + colors0[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart, colors0[chan], colors1[chan]); + LLVMBuildStore(builder, colors0[chan], colors_out[chan]); + } } - } - else { - /* use first/only level's colors */ - for (chan = 0; chan < 4; chan++) { - colors_out[chan] = colors0[chan]; - } + lp_build_endif(&if_ctx); } } @@ -874,30 +931,20 @@ lp_build_sample_general(struct lp_build_sample_context *bld, const LLVMValueRef *ddy, LLVMValueRef lod_bias, /* optional */ LLVMValueRef explicit_lod, /* optional */ - LLVMValueRef width, - LLVMValueRef height, - LLVMValueRef depth, - LLVMValueRef width_vec, - LLVMValueRef height_vec, - LLVMValueRef depth_vec, - LLVMValueRef row_stride_array, - LLVMValueRef img_stride_array, - LLVMValueRef data_array, LLVMValueRef *colors_out) { - struct lp_build_context *float_bld = &bld->float_bld; + struct lp_build_context *int_bld = &bld->int_bld; + LLVMBuilderRef builder = bld->builder; const unsigned mip_filter = bld->static_state->min_mip_filter; const unsigned min_filter = bld->static_state->min_img_filter; const unsigned mag_filter = bld->static_state->mag_img_filter; - const int dims = texture_dims(bld->static_state->target); - LLVMValueRef lod = NULL, lod_fpart = NULL; + LLVMValueRef lod_ipart = NULL, lod_fpart = NULL; LLVMValueRef ilevel0, ilevel1 = NULL; - LLVMValueRef width0_vec = NULL, height0_vec = NULL, depth0_vec = NULL; - LLVMValueRef width1_vec = NULL, height1_vec = NULL, depth1_vec = NULL; - 
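The reworked lp_build_sample_mipmap stores the level-0 result into the output variables first and only samples and lerps the second level inside an if (lod_fpart > 0) branch, so a fetch that lands exactly on a mip level pays for a single filter pass. The control flow in scalar form, with sample_level() as a stand-in for a full image filter:

/* Placeholder for a complete nearest/linear filter pass over one level. */
static float
sample_level(int level, float s, float t)
{
   (void)s; (void)t;
   return (float)level;
}

static float
sample_mipmap(int level0, int level1, float lod_fpart, float s, float t)
{
   float color = sample_level(level0, s, t);

   /* Only pay for the second level when the LOD really falls between
    * two levels. */
   if (lod_fpart > 0.0f) {
      float color1 = sample_level(level1, s, t);
      color = color + lod_fpart * (color1 - color);
   }
   return color;
}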
LLVMValueRef row_stride0_vec = NULL, row_stride1_vec = NULL; - LLVMValueRef img_stride0_vec = NULL, img_stride1_vec = NULL; - LLVMValueRef data_ptr0, data_ptr1 = NULL; LLVMValueRef face_ddx[4], face_ddy[4]; + LLVMValueRef texels[4]; + LLVMTypeRef i32t = LLVMInt32Type(); + LLVMValueRef i32t_zero = LLVMConstInt(i32t, 0, 0); + unsigned chan; /* printf("%s mip %d min %d mag %d\n", __FUNCTION__, @@ -916,12 +963,12 @@ lp_build_sample_general(struct lp_build_sample_context *bld, r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */ /* recompute ddx, ddy using the new (s,t) face texcoords */ - face_ddx[0] = lp_build_ddx(&bld->coord_bld, s); - face_ddx[1] = lp_build_ddx(&bld->coord_bld, t); + face_ddx[0] = lp_build_scalar_ddx(&bld->coord_bld, s); + face_ddx[1] = lp_build_scalar_ddx(&bld->coord_bld, t); face_ddx[2] = NULL; face_ddx[3] = NULL; - face_ddy[0] = lp_build_ddy(&bld->coord_bld, s); - face_ddy[1] = lp_build_ddy(&bld->coord_bld, t); + face_ddy[0] = lp_build_scalar_ddy(&bld->coord_bld, s); + face_ddy[1] = lp_build_scalar_ddy(&bld->coord_bld, t); face_ddy[2] = NULL; face_ddy[3] = NULL; ddx = face_ddx; @@ -936,127 +983,109 @@ lp_build_sample_general(struct lp_build_sample_context *bld, /* Need to compute lod either to choose mipmap levels or to * distinguish between minification/magnification with one mipmap level. */ - lod = lp_build_lod_selector(bld, ddx, ddy, - lod_bias, explicit_lod, - width, height, depth); + lp_build_lod_selector(bld, unit, ddx, ddy, + lod_bias, explicit_lod, + mip_filter, + &lod_ipart, &lod_fpart); + } else { + lod_ipart = i32t_zero; } /* - * Compute integer mipmap level(s) to fetch texels from. + * Compute integer mipmap level(s) to fetch texels from: ilevel0, ilevel1 */ - if (mip_filter == PIPE_TEX_MIPFILTER_NONE) { + switch (mip_filter) { + default: + assert(0 && "bad mip_filter value in lp_build_sample_soa()"); + /* fall-through */ + case PIPE_TEX_MIPFILTER_NONE: /* always use mip level 0 */ if (bld->static_state->target == PIPE_TEXTURE_CUBE) { /* XXX this is a work-around for an apparent bug in LLVM 2.7. * We should be able to set ilevel0 = const(0) but that causes * bad x86 code to be emitted. */ - lod = lp_build_const_elem(bld->coord_bld.type, 0.0); - lp_build_nearest_mip_level(bld, unit, lod, &ilevel0); - } - else { - ilevel0 = LLVMConstInt(LLVMInt32Type(), 0, 0); - } - } - else { - assert(lod); - if (mip_filter == PIPE_TEX_MIPFILTER_NEAREST) { - lp_build_nearest_mip_level(bld, unit, lod, &ilevel0); + assert(lod_ipart); + lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0); } else { - assert(mip_filter == PIPE_TEX_MIPFILTER_LINEAR); - lp_build_linear_mip_levels(bld, unit, lod, &ilevel0, &ilevel1, - &lod_fpart); - lod_fpart = lp_build_broadcast_scalar(&bld->coord_bld, lod_fpart); + ilevel0 = i32t_zero; } + break; + case PIPE_TEX_MIPFILTER_NEAREST: + assert(lod_ipart); + lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0); + break; + case PIPE_TEX_MIPFILTER_LINEAR: + assert(lod_ipart); + assert(lod_fpart); + lp_build_linear_mip_levels(bld, unit, + lod_ipart, &lod_fpart, + &ilevel0, &ilevel1); + break; } - /* compute image size(s) of source mipmap level(s) */ - lp_build_mipmap_level_sizes(bld, dims, width_vec, height_vec, depth_vec, - ilevel0, ilevel1, - row_stride_array, img_stride_array, - &width0_vec, &width1_vec, - &height0_vec, &height1_vec, - &depth0_vec, &depth1_vec, - &row_stride0_vec, &row_stride1_vec, - &img_stride0_vec, &img_stride1_vec); - /* - * Get pointer(s) to image data for mipmap level(s). 
+ * Get/interpolate texture colors. */ - data_ptr0 = lp_build_get_mipmap_level(bld, data_array, ilevel0); - if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { - data_ptr1 = lp_build_get_mipmap_level(bld, data_array, ilevel1); + + for (chan = 0; chan < 4; ++chan) { + texels[chan] = lp_build_alloca(builder, bld->texel_bld.vec_type, ""); + lp_build_name(texels[chan], "sampler%u_texel_%c_var", unit, "xyzw"[chan]); } - /* - * Get/interpolate texture colors. - */ if (min_filter == mag_filter) { /* no need to distinquish between minification and magnification */ - lp_build_sample_mipmap(bld, min_filter, mip_filter, s, t, r, lod_fpart, - width0_vec, width1_vec, - height0_vec, height1_vec, - depth0_vec, depth1_vec, - row_stride0_vec, row_stride1_vec, - img_stride0_vec, img_stride1_vec, - data_ptr0, data_ptr1, - colors_out); + lp_build_sample_mipmap(bld, unit, + min_filter, mip_filter, + s, t, r, + ilevel0, ilevel1, lod_fpart, + texels); } else { /* Emit conditional to choose min image filter or mag image filter - * depending on the lod being >0 or <= 0, respectively. + * depending on the lod being > 0 or <= 0, respectively. */ - struct lp_build_flow_context *flow_ctx; struct lp_build_if_state if_ctx; LLVMValueRef minify; - flow_ctx = lp_build_flow_create(bld->builder); - lp_build_flow_scope_begin(flow_ctx); - - lp_build_flow_scope_declare(flow_ctx, &colors_out[0]); - lp_build_flow_scope_declare(flow_ctx, &colors_out[1]); - lp_build_flow_scope_declare(flow_ctx, &colors_out[2]); - lp_build_flow_scope_declare(flow_ctx, &colors_out[3]); + /* minify = lod >= 0.0 */ + minify = LLVMBuildICmp(builder, LLVMIntSGE, + lod_ipart, int_bld->zero, ""); - /* minify = lod > 0.0 */ - minify = LLVMBuildFCmp(bld->builder, LLVMRealUGE, - lod, float_bld->zero, ""); - - lp_build_if(&if_ctx, flow_ctx, bld->builder, minify); + lp_build_if(&if_ctx, builder, minify); { /* Use the minification filter */ - lp_build_sample_mipmap(bld, min_filter, mip_filter, - s, t, r, lod_fpart, - width0_vec, width1_vec, - height0_vec, height1_vec, - depth0_vec, depth1_vec, - row_stride0_vec, row_stride1_vec, - img_stride0_vec, img_stride1_vec, - data_ptr0, data_ptr1, - colors_out); + lp_build_sample_mipmap(bld, unit, + min_filter, mip_filter, + s, t, r, + ilevel0, ilevel1, lod_fpart, + texels); } lp_build_else(&if_ctx); { /* Use the magnification filter */ - lp_build_sample_mipmap(bld, mag_filter, mip_filter, - s, t, r, lod_fpart, - width0_vec, width1_vec, - height0_vec, height1_vec, - depth0_vec, depth1_vec, - row_stride0_vec, row_stride1_vec, - img_stride0_vec, img_stride1_vec, - data_ptr0, data_ptr1, - colors_out); + lp_build_sample_mipmap(bld, unit, + mag_filter, PIPE_TEX_MIPFILTER_NONE, + s, t, r, + i32t_zero, NULL, NULL, + texels); } lp_build_endif(&if_ctx); + } - lp_build_flow_scope_end(flow_ctx); - lp_build_flow_destroy(flow_ctx); + for (chan = 0; chan < 4; ++chan) { + colors_out[chan] = LLVMBuildLoad(builder, texels[chan], ""); + lp_build_name(colors_out[chan], "sampler%u_texel_%c", unit, "xyzw"[chan]); } } +/** + * Do shadow test/comparison. 
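lp_build_lod_selector now returns the LOD pre-split into an integer and a fractional part, so mip level selection works on integers and the minify/magnify decision becomes the integer test lod_ipart >= 0. A simplified scalar sketch of the split and of linear level selection (names are ours; the rounding used for the NEAREST mip filter is omitted):

#include <math.h>

static int
clampi(int x, int lo, int hi)
{
   return x < lo ? lo : (x > hi ? hi : x);
}

/* Split a float LOD into ipart/fpart and pick the two levels to blend,
 * last_level being the smallest mipmap. */
static void
split_lod(float lod, int last_level,
          int *level0, int *level1, float *lod_fpart)
{
   int lod_ipart = (int)floorf(lod);

   *lod_fpart = lod - (float)lod_ipart;          /* in [0,1) */
   *level0 = clampi(lod_ipart, 0, last_level);
   *level1 = clampi(lod_ipart + 1, 0, last_level);
}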
+ * \param p the texcoord Z (aka R, aka P) component + * \param texel the texel to compare against (use the X channel) + */ static void lp_build_sample_compare(struct lp_build_sample_context *bld, LLVMValueRef p, @@ -1064,30 +1093,30 @@ lp_build_sample_compare(struct lp_build_sample_context *bld, { struct lp_build_context *texel_bld = &bld->texel_bld; LLVMValueRef res; - unsigned chan; + const unsigned chan = 0; - if(bld->static_state->compare_mode == PIPE_TEX_COMPARE_NONE) + if (bld->static_state->compare_mode == PIPE_TEX_COMPARE_NONE) return; - /* TODO: Compare before swizzling, to avoid redundant computations */ - res = NULL; - for(chan = 0; chan < 4; ++chan) { - LLVMValueRef cmp; - cmp = lp_build_cmp(texel_bld, bld->static_state->compare_func, p, texel[chan]); - cmp = lp_build_select(texel_bld, cmp, texel_bld->one, texel_bld->zero); - - if(res) - res = lp_build_add(texel_bld, res, cmp); - else - res = cmp; + /* debug code */ + if (0) { + LLVMValueRef indx = lp_build_const_int32(0); + LLVMValueRef coord = LLVMBuildExtractElement(bld->builder, p, indx, ""); + LLVMValueRef tex = LLVMBuildExtractElement(bld->builder, + texel[chan], indx, ""); + lp_build_printf(bld->builder, "shadow compare coord %f to texture %f\n", + coord, tex); } - assert(res); - res = lp_build_mul(texel_bld, res, lp_build_const_vec(texel_bld->type, 0.25)); + /* result = (p FUNC texel) ? 1 : 0 */ + res = lp_build_cmp(texel_bld, bld->static_state->compare_func, + p, texel[chan]); + res = lp_build_select(texel_bld, res, texel_bld->one, texel_bld->zero); /* XXX returning result for default GL_DEPTH_TEXTURE_MODE = GL_LUMINANCE */ - for(chan = 0; chan < 3; ++chan) - texel[chan] = res; + texel[0] = + texel[1] = + texel[2] = res; texel[3] = texel_bld->one; } @@ -1131,15 +1160,14 @@ lp_build_sample_soa(LLVMBuilderRef builder, LLVMValueRef explicit_lod, /* optional */ LLVMValueRef texel_out[4]) { + unsigned dims = texture_dims(static_state->target); struct lp_build_sample_context bld; - LLVMValueRef width, width_vec; - LLVMValueRef height, height_vec; - LLVMValueRef depth, depth_vec; - LLVMValueRef row_stride_array, img_stride_array; - LLVMValueRef data_array; + LLVMTypeRef i32t = LLVMInt32Type(); + LLVMValueRef s; LLVMValueRef t; LLVMValueRef r; + struct lp_type float_vec_type; if (0) { enum pipe_format fmt = static_state->format; @@ -1154,38 +1182,57 @@ lp_build_sample_soa(LLVMBuilderRef builder, bld.static_state = static_state; bld.dynamic_state = dynamic_state; bld.format_desc = util_format_description(static_state->format); + bld.dims = dims; bld.float_type = lp_type_float(32); bld.int_type = lp_type_int(32); bld.coord_type = type; - bld.uint_coord_type = lp_uint_type(type); bld.int_coord_type = lp_int_type(type); + bld.float_size_type = lp_type_float(32); + bld.float_size_type.length = dims > 1 ? 
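The simplified shadow test compares the reference value against the X channel only and replicates the 0/1 outcome into RGB, matching the default GL_DEPTH_TEXTURE_MODE of GL_LUMINANCE, with alpha forced to one. A scalar sketch (the FUNC_* constants stand in for PIPE_FUNC_*):

enum { FUNC_LESS, FUNC_LEQUAL, FUNC_GREATER };

static void
shadow_compare(int func, float p, float texel[4])
{
   float depth = texel[0];               /* only the X channel matters */
   int pass;

   switch (func) {
   case FUNC_LESS:    pass = p <  depth; break;
   case FUNC_LEQUAL:  pass = p <= depth; break;
   case FUNC_GREATER: pass = p >  depth; break;
   default:           pass = 1;          break;
   }

   texel[0] = texel[1] = texel[2] = pass ? 1.0f : 0.0f;
   texel[3] = 1.0f;
}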
4 : 1; + bld.int_size_type = lp_int_type(bld.float_size_type); bld.texel_type = type; + float_vec_type = lp_type_float_vec(32); + lp_build_context_init(&bld.float_bld, builder, bld.float_type); + lp_build_context_init(&bld.float_vec_bld, builder, float_vec_type); lp_build_context_init(&bld.int_bld, builder, bld.int_type); lp_build_context_init(&bld.coord_bld, builder, bld.coord_type); - lp_build_context_init(&bld.uint_coord_bld, builder, bld.uint_coord_type); lp_build_context_init(&bld.int_coord_bld, builder, bld.int_coord_type); + lp_build_context_init(&bld.int_size_bld, builder, bld.int_size_type); + lp_build_context_init(&bld.float_size_bld, builder, bld.float_size_type); lp_build_context_init(&bld.texel_bld, builder, bld.texel_type); /* Get the dynamic state */ - width = dynamic_state->width(dynamic_state, builder, unit); - height = dynamic_state->height(dynamic_state, builder, unit); - depth = dynamic_state->depth(dynamic_state, builder, unit); - row_stride_array = dynamic_state->row_stride(dynamic_state, builder, unit); - img_stride_array = dynamic_state->img_stride(dynamic_state, builder, unit); - data_array = dynamic_state->data_ptr(dynamic_state, builder, unit); + bld.width = dynamic_state->width(dynamic_state, builder, unit); + bld.height = dynamic_state->height(dynamic_state, builder, unit); + bld.depth = dynamic_state->depth(dynamic_state, builder, unit); + bld.row_stride_array = dynamic_state->row_stride(dynamic_state, builder, unit); + bld.img_stride_array = dynamic_state->img_stride(dynamic_state, builder, unit); + bld.data_array = dynamic_state->data_ptr(dynamic_state, builder, unit); /* Note that data_array is an array[level] of pointers to texture images */ s = coords[0]; t = coords[1]; r = coords[2]; - /* width, height, depth as uint vectors */ - width_vec = lp_build_broadcast_scalar(&bld.uint_coord_bld, width); - height_vec = lp_build_broadcast_scalar(&bld.uint_coord_bld, height); - depth_vec = lp_build_broadcast_scalar(&bld.uint_coord_bld, depth); + /* width, height, depth as single int vector */ + if (dims <= 1) { + bld.int_size = bld.width; + } + else { + bld.int_size = LLVMBuildInsertElement(builder, bld.int_size_bld.undef, + bld.width, LLVMConstInt(i32t, 0, 0), ""); + if (dims >= 2) { + bld.int_size = LLVMBuildInsertElement(builder, bld.int_size, + bld.height, LLVMConstInt(i32t, 1, 0), ""); + if (dims >= 3) { + bld.int_size = LLVMBuildInsertElement(builder, bld.int_size, + bld.depth, LLVMConstInt(i32t, 2, 0), ""); + } + } + } if (0) { /* For debug: no-op texture sampling */ @@ -1195,13 +1242,9 @@ lp_build_sample_soa(LLVMBuilderRef builder, lp_is_simple_wrap_mode(static_state->wrap_s) && lp_is_simple_wrap_mode(static_state->wrap_t)) { /* do sampling/filtering with fixed pt arithmetic */ - printf("new sample\n"); lp_build_sample_aos(&bld, unit, s, t, r, ddx, ddy, lod_bias, explicit_lod, - width, height, depth, - width_vec, height_vec, depth_vec, - row_stride_array, img_stride_array, - data_array, texel_out); + texel_out); } else { @@ -1217,13 +1260,8 @@ lp_build_sample_soa(LLVMBuilderRef builder, static_state->wrap_t); } - printf("old sample\n"); lp_build_sample_general(&bld, unit, s, t, r, ddx, ddy, lod_bias, explicit_lod, - width, height, depth, - width_vec, height_vec, depth_vec, - row_stride_array, img_stride_array, - data_array, texel_out); } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c index 2e9e8386de..4685a90e41 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c +++ 
b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c @@ -101,6 +101,83 @@ lp_build_broadcast_scalar(struct lp_build_context *bld, /** + * Combined extract and broadcast (or a mere shuffle when the two types match) + */ +LLVMValueRef +lp_build_extract_broadcast(LLVMBuilderRef builder, + struct lp_type src_type, + struct lp_type dst_type, + LLVMValueRef vector, + LLVMValueRef index) +{ + LLVMTypeRef i32t = LLVMInt32Type(); + LLVMValueRef res; + + assert(src_type.floating == dst_type.floating); + assert(src_type.width == dst_type.width); + + assert(lp_check_value(src_type, vector)); + assert(LLVMTypeOf(index) == i32t); + + if (src_type.length == 1) { + if (dst_type.length == 1) { + /* + * Trivial scalar -> scalar. + */ + + res = vector; + } + else { + /* + * Broadcast scalar -> vector. + */ + + res = lp_build_broadcast(builder, + lp_build_vec_type(dst_type), + vector); + } + } + else { + if (dst_type.length == src_type.length) { + /* + * Special shuffle of the same size. + */ + + LLVMValueRef shuffle; + shuffle = lp_build_broadcast(builder, + LLVMVectorType(i32t, dst_type.length), + index); + res = LLVMBuildShuffleVector(builder, vector, + LLVMGetUndef(lp_build_vec_type(dst_type)), + shuffle, ""); + } + else { + LLVMValueRef scalar; + scalar = LLVMBuildExtractElement(builder, vector, index, ""); + if (dst_type.length == 1) { + /* + * Trivial extract scalar from vector. + */ + + res = scalar; + } + else { + /* + * General case of different sized vectors. + */ + + res = lp_build_broadcast(builder, + lp_build_vec_type(dst_type), + vector); + } + } + } + + return res; +} + + +/** * Swizzle one channel into all other three channels. */ LLVMValueRef diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h index f9b6a5e725..fdea8442ae 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h @@ -55,6 +55,14 @@ lp_build_broadcast_scalar(struct lp_build_context *bld, LLVMValueRef scalar); +LLVMValueRef +lp_build_extract_broadcast(LLVMBuilderRef builder, + struct lp_type src_type, + struct lp_type dst_type, + LLVMValueRef vector, + LLVMValueRef index); + + /** * Broadcast one channel of a vector composed of arrays of XYZW structures into * all four channel. diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h index 97318b3456..a4d3b750c3 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h @@ -36,6 +36,9 @@ #define LP_BLD_TGSI_H #include "gallivm/lp_bld.h" +#include "pipe/p_compiler.h" +#include "pipe/p_state.h" +#include "tgsi/tgsi_scan.h" struct tgsi_token; @@ -55,6 +58,75 @@ enum lp_build_tex_modifier { /** + * Describe a channel of a register. + * + * The value can be a: + * - immediate value (i.e. derived from a IMM register) + * - CONST[n].x/y/z/w + * - IN[n].x/y/z/w + * - undetermined (when .file == TGSI_FILE_NULL) + * + * This is one of the analysis results, and is used to described + * the output color in terms of inputs. + */ +struct lp_tgsi_channel_info +{ + unsigned file:4; /* TGSI_FILE_* */ + unsigned swizzle:3; /* PIPE_SWIZZLE_x */ + union { + uint32_t index; + float value; /* for TGSI_FILE_IMMEDIATE */ + } u; +}; + + +/** + * Describe a texture sampler interpolator. + * + * The interpolation is described in terms of regular inputs. 
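One detail worth double-checking in lp_build_extract_broadcast above: in the "General case of different sized vectors" branch the element has just been pulled out into scalar, yet vector is what gets broadcast; broadcasting the extracted scalar looks like the intent. Conceptually the helper reduces to the following scalar model (plain arrays stand in for LLVM vectors):

/* Extract lane `index` of src and replicate it across every dst lane;
 * a single-element src is treated as a scalar. */
static void
extract_broadcast(const float *src, int src_len,
                  float *dst, int dst_len, int index)
{
   float s = (src_len == 1) ? src[0] : src[index];
   int i;

   for (i = 0; i < dst_len; i++)
      dst[i] = s;
}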
+ */ +struct lp_tgsi_texture_info +{ + struct lp_tgsi_channel_info coord[4]; + unsigned target:8; /* TGSI_TEXTURE_* */ + unsigned unit:8; /* Sampler unit */ + unsigned modifier:8; /* LP_BLD_TEX_MODIFIER_* */ +}; + + +struct lp_tgsi_info +{ + struct tgsi_shader_info base; + + /* + * Whether any of the texture opcodes access a register file other than + * TGSI_FILE_INPUT. + * + * We could also handle TGSI_FILE_CONST/IMMEDIATE here, but there is little + * benefit. + */ + unsigned indirect_textures:1; + + /* + * Texture opcode description. Aimed at detecting and described direct + * texture opcodes. + */ + unsigned num_texs; + struct lp_tgsi_texture_info tex[PIPE_MAX_SAMPLERS]; + + /* + * Output description. Aimed at detecting and describing simple blit + * shaders. + */ + struct lp_tgsi_channel_info output[PIPE_MAX_SHADER_OUTPUTS][4]; + + /* + * Shortcut pointers into the above (for fragment shaders). + */ + const struct lp_tgsi_channel_info *cbuf[PIPE_MAX_COLOR_BUFS]; +}; + +/** * Sampler code generation interface. * * Although texture sampling is a requirement for TGSI translation, it is @@ -97,6 +169,11 @@ struct lp_build_sampler_aos void +lp_build_tgsi_info(const struct tgsi_token *tokens, + struct lp_tgsi_info *info); + + +void lp_build_tgsi_soa(LLVMBuilderRef builder, const struct tgsi_token *tokens, struct lp_type type, diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c new file mode 100644 index 0000000000..ad514463de --- /dev/null +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c @@ -0,0 +1,479 @@ +/************************************************************************** + * + * Copyright 2010 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + **************************************************************************/ + + +#include "util/u_memory.h" +#include "util/u_math.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" +#include "tgsi/tgsi_dump.h" +#include "lp_bld_debug.h" +#include "lp_bld_tgsi.h" + + +/** + * Analysis context. + * + * This is where we keep store the value of each channel of the IMM/TEMP/OUT + * register values, as we walk the shader. + */ +struct analysis_context +{ + struct lp_tgsi_info *info; + + unsigned num_imms; + float imm[32][4]; + + struct lp_tgsi_channel_info temp[32][4]; +}; + + +/** + * Describe the specified channel of the src register. 
+ */ +static void +analyse_src(struct analysis_context *ctx, + struct lp_tgsi_channel_info *chan_info, + const struct tgsi_src_register *src, + unsigned chan) +{ + chan_info->file = TGSI_FILE_NULL; + if (!src->Indirect && !src->Absolute && !src->Negate) { + unsigned swizzle = tgsi_util_get_src_register_swizzle(src, chan); + if (src->File == TGSI_FILE_TEMPORARY) { + if (src->Index < Elements(ctx->temp)) { + *chan_info = ctx->temp[src->Index][swizzle]; + } + } else { + chan_info->file = src->File; + if (src->File == TGSI_FILE_IMMEDIATE) { + assert(src->Index < Elements(ctx->imm)); + if (src->Index < Elements(ctx->imm)) { + chan_info->u.value = ctx->imm[src->Index][swizzle]; + } + } else { + chan_info->u.index = src->Index; + chan_info->swizzle = swizzle; + } + } + } +} + + +/** + * Whether this register channel refers to a specific immediate value. + */ +static boolean +is_immediate(const struct lp_tgsi_channel_info *chan_info, float value) +{ + return chan_info->file == TGSI_FILE_IMMEDIATE && + chan_info->u.value == value; +} + + +static void +analyse_tex(struct analysis_context *ctx, + const struct tgsi_full_instruction *inst, + enum lp_build_tex_modifier modifier) +{ + struct lp_tgsi_info *info = ctx->info; + unsigned chan; + + if (info->num_texs < Elements(info->tex)) { + struct lp_tgsi_texture_info *tex_info = &info->tex[info->num_texs]; + bool indirect = FALSE; + unsigned readmask = 0; + + tex_info->target = inst->Texture.Texture; + switch (inst->Texture.Texture) { + case TGSI_TEXTURE_1D: + readmask = TGSI_WRITEMASK_X; + break; + case TGSI_TEXTURE_2D: + case TGSI_TEXTURE_RECT: + readmask = TGSI_WRITEMASK_XY; + break; + case TGSI_TEXTURE_SHADOW1D: + case TGSI_TEXTURE_SHADOW2D: + case TGSI_TEXTURE_SHADOWRECT: + case TGSI_TEXTURE_3D: + case TGSI_TEXTURE_CUBE: + readmask = TGSI_WRITEMASK_XYZ; + break; + default: + assert(0); + return; + } + + if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) { + /* We don't track explicit derivatives, although we could */ + indirect = TRUE; + tex_info->unit = inst->Src[3].Register.Index; + } else { + if (modifier == LP_BLD_TEX_MODIFIER_PROJECTED || + modifier == LP_BLD_TEX_MODIFIER_LOD_BIAS || + modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_LOD) { + readmask |= TGSI_WRITEMASK_W; + } + tex_info->unit = inst->Src[1].Register.Index; + } + + for (chan = 0; chan < 4; ++chan) { + struct lp_tgsi_channel_info *chan_info = &tex_info->coord[chan]; + if (readmask & (1 << chan)) { + analyse_src(ctx, chan_info, &inst->Src[0].Register, chan); + if (chan_info->file != TGSI_FILE_INPUT) { + indirect = TRUE; + } + } else { + memset(chan_info, 0, sizeof *chan_info); + } + } + + if (indirect) { + info->indirect_textures = TRUE; + } + + ++info->num_texs; + } else { + info->indirect_textures = TRUE; + } +} + + +/** + * Process an instruction, and update the register values accordingly. + */ +static void +analyse_instruction(struct analysis_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct lp_tgsi_info *info = ctx->info; + struct lp_tgsi_channel_info (*regs)[4]; + unsigned max_regs; + unsigned i; + unsigned index; + unsigned chan; + + for (i = 0; i < inst->Instruction.NumDstRegs; ++i) { + const struct tgsi_dst_register *dst = &inst->Dst[i].Register; + + /* + * Get the lp_tgsi_channel_info array corresponding to the destination + * register file. 
+ */ + + if (dst->File == TGSI_FILE_TEMPORARY) { + regs = ctx->temp; + max_regs = Elements(ctx->temp); + } else if (dst->File == TGSI_FILE_OUTPUT) { + regs = info->output; + max_regs = Elements(info->output); + } else if (dst->File == TGSI_FILE_ADDRESS || + dst->File == TGSI_FILE_PREDICATE) { + continue; + } else { + assert(0); + continue; + } + + /* + * Detect direct TEX instructions + */ + + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_TEX: + analyse_tex(ctx, inst, LP_BLD_TEX_MODIFIER_NONE); + break; + case TGSI_OPCODE_TXD: + analyse_tex(ctx, inst, LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV); + break; + case TGSI_OPCODE_TXB: + analyse_tex(ctx, inst, LP_BLD_TEX_MODIFIER_LOD_BIAS); + break; + case TGSI_OPCODE_TXL: + analyse_tex(ctx, inst, LP_BLD_TEX_MODIFIER_EXPLICIT_LOD); + break; + case TGSI_OPCODE_TXP: + analyse_tex(ctx, inst, LP_BLD_TEX_MODIFIER_PROJECTED); + break; + default: + break; + } + + /* + * Keep track of assignments and writes + */ + + if (dst->Indirect) { + /* + * It could be any register index so clear all register indices. + */ + + for (chan = 0; chan < 4; ++chan) { + if (dst->WriteMask & (1 << chan)) { + for (index = 0; index < max_regs; ++index) { + regs[index][chan].file = TGSI_FILE_NULL; + } + } + } + } else if (dst->Index < max_regs) { + /* + * Update this destination register value. + */ + + struct lp_tgsi_channel_info res[4]; + + memset(res, 0, sizeof res); + + if (!inst->Instruction.Predicate && + !inst->Instruction.Saturate) { + for (chan = 0; chan < 4; ++chan) { + if (dst->WriteMask & (1 << chan)) { + if (inst->Instruction.Opcode == TGSI_OPCODE_MOV) { + analyse_src(ctx, &res[chan], + &inst->Src[0].Register, chan); + } else if (inst->Instruction.Opcode == TGSI_OPCODE_MUL) { + /* + * Propagate values across 1.0 and 0.0 multiplications. + */ + + struct lp_tgsi_channel_info src0; + struct lp_tgsi_channel_info src1; + + analyse_src(ctx, &src0, &inst->Src[0].Register, chan); + analyse_src(ctx, &src1, &inst->Src[1].Register, chan); + + if (is_immediate(&src0, 0.0f)) { + res[chan] = src0; + } else if (is_immediate(&src1, 0.0f)) { + res[chan] = src1; + } else if (is_immediate(&src0, 1.0f)) { + res[chan] = src1; + } else if (is_immediate(&src1, 1.0f)) { + res[chan] = src0; + } + } + } + } + } + + for (chan = 0; chan < 4; ++chan) { + if (dst->WriteMask & (1 << chan)) { + regs[dst->Index][chan] = res[chan]; + } + } + } + } + + /* + * Clear all temporaries information in presence of a control flow opcode. + */ + + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_IF: + case TGSI_OPCODE_IFC: + case TGSI_OPCODE_ELSE: + case TGSI_OPCODE_ENDIF: + case TGSI_OPCODE_BGNLOOP: + case TGSI_OPCODE_BRK: + case TGSI_OPCODE_BREAKC: + case TGSI_OPCODE_CONT: + case TGSI_OPCODE_ENDLOOP: + case TGSI_OPCODE_CALLNZ: + case TGSI_OPCODE_CAL: + case TGSI_OPCODE_BGNSUB: + case TGSI_OPCODE_ENDSUB: + case TGSI_OPCODE_SWITCH: + case TGSI_OPCODE_CASE: + case TGSI_OPCODE_DEFAULT: + case TGSI_OPCODE_ENDSWITCH: + case TGSI_OPCODE_RET: + case TGSI_OPCODE_END: + /* XXX: Are there more cases? 
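The MUL case above only forwards operands across multiplications by the immediates 0.0 and 1.0, which is enough to see through the scale/bias patterns typical of blit-like shaders while keeping the analysis trivially safe. The propagation rule in isolation, with an abbreviated channel description in place of lp_tgsi_channel_info:

#include <stdbool.h>

/* Abbreviated channel description: file 0 means "unknown". */
enum { FILE_NULL = 0, FILE_IMMEDIATE = 1 };

struct chan_info {
   unsigned file;
   unsigned index;
   float imm;        /* valid when file == FILE_IMMEDIATE */
};

static bool
is_immediate(const struct chan_info *c, float value)
{
   return c->file == FILE_IMMEDIATE && c->imm == value;
}

/* MUL dst, a, b: x*0 -> 0, x*1 -> x, anything else -> unknown. */
static struct chan_info
propagate_mul(struct chan_info a, struct chan_info b)
{
   struct chan_info unknown = { FILE_NULL, 0, 0.0f };

   if (is_immediate(&a, 0.0f)) return a;
   if (is_immediate(&b, 0.0f)) return b;
   if (is_immediate(&a, 1.0f)) return b;
   if (is_immediate(&b, 1.0f)) return a;
   return unknown;
}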
*/ + memset(&ctx->temp, 0, sizeof ctx->temp); + memset(&info->output, 0, sizeof info->output); + default: + break; + } +} + + +static INLINE void +dump_info(const struct tgsi_token *tokens, + struct lp_tgsi_info *info) +{ + unsigned index; + unsigned chan; + + tgsi_dump(tokens, 0); + + for (index = 0; index < info->num_texs; ++index) { + const struct lp_tgsi_texture_info *tex_info = &info->tex[index]; + debug_printf("TEX[%u] =", index); + for (chan = 0; chan < 4; ++chan) { + const struct lp_tgsi_channel_info *chan_info = + &tex_info->coord[chan]; + if (chan_info->file != TGSI_FILE_NULL) { + debug_printf(" %s[%u].%c", + tgsi_file_names[chan_info->file], + chan_info->u.index, + "xyzw01"[chan_info->swizzle]); + } else { + debug_printf(" _"); + } + } + debug_printf(", SAMP[%u], %s\n", + tex_info->unit, + tgsi_texture_names[tex_info->target]); + } + + for (index = 0; index < PIPE_MAX_SHADER_OUTPUTS; ++index) { + for (chan = 0; chan < 4; ++chan) { + const struct lp_tgsi_channel_info *chan_info = + &info->output[index][chan]; + if (chan_info->file != TGSI_FILE_NULL) { + debug_printf("OUT[%u].%c = ", index, "xyzw"[chan]); + if (chan_info->file == TGSI_FILE_IMMEDIATE) { + debug_printf("%f", chan_info->u.value); + } else { + const char *file_name; + switch (chan_info->file) { + case TGSI_FILE_CONSTANT: + file_name = "CONST"; + break; + case TGSI_FILE_INPUT: + file_name = "IN"; + break; + default: + file_name = "???"; + break; + } + debug_printf("%s[%u].%c", + file_name, + chan_info->u.index, + "xyzw01"[chan_info->swizzle]); + } + debug_printf("\n"); + } + } + } +} + + +/** + * Detect any direct relationship between the output color + */ +void +lp_build_tgsi_info(const struct tgsi_token *tokens, + struct lp_tgsi_info *info) +{ + struct tgsi_parse_context parse; + struct analysis_context ctx; + unsigned index; + unsigned chan; + + memset(info, 0, sizeof *info); + + tgsi_scan_shader(tokens, &info->base); + + memset(&ctx, 0, sizeof ctx); + ctx.info = info; + + tgsi_parse_init(&parse, tokens); + + while (!tgsi_parse_end_of_tokens(&parse)) { + tgsi_parse_token(&parse); + + switch (parse.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_DECLARATION: + break; + + case TGSI_TOKEN_TYPE_INSTRUCTION: + { + struct tgsi_full_instruction *inst = + &parse.FullToken.FullInstruction; + + if (inst->Instruction.Opcode == TGSI_OPCODE_END || + inst->Instruction.Opcode == TGSI_OPCODE_BGNSUB) { + /* We reached the end of main function body. */ + goto finished; + } + + analyse_instruction(&ctx, inst); + } + break; + + case TGSI_TOKEN_TYPE_IMMEDIATE: + { + const unsigned size = + parse.FullToken.FullImmediate.Immediate.NrTokens - 1; + assert(size <= 4); + if (ctx.num_imms < Elements(ctx.imm)) { + for (chan = 0; chan < size; ++chan) { + ctx.imm[ctx.num_imms][chan] = + parse.FullToken.FullImmediate.u[chan].Float; + } + ++ctx.num_imms; + } + } + break; + + case TGSI_TOKEN_TYPE_PROPERTY: + break; + + default: + assert(0); + } + } +finished: + + tgsi_parse_free(&parse); + + + /* + * Link the output color values. 
+ */ + + for (index = 0; index < PIPE_MAX_COLOR_BUFS; ++index) { + const struct lp_tgsi_channel_info null_output[4]; + info->cbuf[index] = null_output; + } + + for (index = 0; index < info->base.num_outputs; ++index) { + unsigned semantic_name = info->base.output_semantic_name[index]; + unsigned semantic_index = info->base.output_semantic_index[index]; + if (semantic_name == TGSI_SEMANTIC_COLOR && + semantic_index < PIPE_MAX_COLOR_BUFS) { + info->cbuf[semantic_index] = info->output[index]; + } + } + + if (gallivm_debug & GALLIVM_DEBUG_TGSI) { + dump_info(tokens, info); + } +} diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c index 441aebae29..2bc90579a2 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c @@ -887,21 +887,25 @@ emit_tex( struct lp_build_tgsi_soa_context *bld, } if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) { + LLVMTypeRef i32t = LLVMInt32Type(); + LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0); for (i = 0; i < num_coords; i++) { - ddx[i] = emit_fetch( bld, inst, 1, i ); - ddy[i] = emit_fetch( bld, inst, 2, i ); + LLVMValueRef src1 = emit_fetch( bld, inst, 1, i ); + LLVMValueRef src2 = emit_fetch( bld, inst, 2, i ); + ddx[i] = LLVMBuildExtractElement(bld->base.builder, src1, index0, ""); + ddy[i] = LLVMBuildExtractElement(bld->base.builder, src2, index0, ""); } unit = inst->Src[3].Register.Index; } else { for (i = 0; i < num_coords; i++) { - ddx[i] = lp_build_ddx( &bld->base, coords[i] ); - ddy[i] = lp_build_ddy( &bld->base, coords[i] ); + ddx[i] = lp_build_scalar_ddx( &bld->base, coords[i] ); + ddy[i] = lp_build_scalar_ddy( &bld->base, coords[i] ); } unit = inst->Src[1].Register.Index; } for (i = num_coords; i < 3; i++) { - ddx[i] = bld->base.undef; - ddy[i] = bld->base.undef; + ddx[i] = LLVMGetUndef(bld->base.elem_type); + ddy[i] = LLVMGetUndef(bld->base.elem_type); } bld->sampler->emit_fetch_texel(bld->sampler, @@ -913,6 +917,43 @@ emit_tex( struct lp_build_tgsi_soa_context *bld, texel); } +static boolean +near_end_of_shader(struct lp_build_tgsi_soa_context *bld, + int pc) +{ + int i; + + for (i = 0; i < 5; i++) { + unsigned opcode; + + if (pc + i >= bld->info->num_instructions) + return TRUE; + + opcode = bld->instructions[pc + i].Instruction.Opcode; + + if (opcode == TGSI_OPCODE_END) + return TRUE; + + if (opcode == TGSI_OPCODE_TEX || + opcode == TGSI_OPCODE_TXP || + opcode == TGSI_OPCODE_TXD || + opcode == TGSI_OPCODE_TXB || + opcode == TGSI_OPCODE_TXL || + opcode == TGSI_OPCODE_TXF || + opcode == TGSI_OPCODE_TXQ || + opcode == TGSI_OPCODE_CAL || + opcode == TGSI_OPCODE_CALLNZ || + opcode == TGSI_OPCODE_IF || + opcode == TGSI_OPCODE_IFC || + opcode == TGSI_OPCODE_BGNLOOP || + opcode == TGSI_OPCODE_SWITCH) + return FALSE; + } + + return TRUE; +} + + /** * Kill fragment if any of the src register values are negative. 
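One thing to double-check in the cbuf defaulting above: null_output is declared inside the loop body, so info->cbuf[index] is left pointing at storage that is dead (and never initialized) once the loop moves on; a zero-initialized array with static storage duration would provide the same "no output" default without the dangling pointer. A minimal sketch of that variant, with the lp_tgsi types abbreviated:

/* Abbreviated stand-in for struct lp_tgsi_channel_info. */
struct channel_info { unsigned file; unsigned swizzle; };

#define MAX_COLOR_BUFS 8

/* Static storage: zero-filled and alive for the whole program,
 * so handing out its address is safe. */
static const struct channel_info null_output[4];

static void
link_default_outputs(const struct channel_info *cbuf[MAX_COLOR_BUFS])
{
   int i;

   for (i = 0; i < MAX_COLOR_BUFS; i++)
      cbuf[i] = null_output;
}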
@@ -920,7 +961,8 @@ emit_tex( struct lp_build_tgsi_soa_context *bld, static void emit_kil( struct lp_build_tgsi_soa_context *bld, - const struct tgsi_full_instruction *inst ) + const struct tgsi_full_instruction *inst, + int pc) { const struct tgsi_full_src_register *reg = &inst->Src[0]; LLVMValueRef terms[NUM_CHANNELS]; @@ -959,8 +1001,12 @@ emit_kil( } } - if(mask) + if(mask) { lp_build_mask_update(bld->mask, mask); + + if (!near_end_of_shader(bld, pc)) + lp_build_mask_check(bld->mask); + } } @@ -972,7 +1018,8 @@ emit_kil( */ static void emit_kilp(struct lp_build_tgsi_soa_context *bld, - const struct tgsi_full_instruction *inst) + const struct tgsi_full_instruction *inst, + int pc) { LLVMValueRef mask; @@ -987,6 +1034,9 @@ emit_kilp(struct lp_build_tgsi_soa_context *bld, } lp_build_mask_update(bld->mask, mask); + + if (!near_end_of_shader(bld, pc)) + lp_build_mask_check(bld->mask); } static void @@ -1535,12 +1585,12 @@ emit_instruction( case TGSI_OPCODE_KILP: /* predicated kill */ - emit_kilp( bld, inst ); + emit_kilp( bld, inst, (*pc)-1 ); break; case TGSI_OPCODE_KIL: /* conditional kill */ - emit_kil( bld, inst ); + emit_kil( bld, inst, (*pc)-1 ); break; case TGSI_OPCODE_PK2H: diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c index b4d8107372..a6eb403962 100644 --- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c +++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c @@ -222,7 +222,7 @@ pb_cache_buffer_vtbl = { }; -static INLINE boolean +static INLINE int pb_cache_is_buffer_compat(struct pb_cache_buffer *buf, pb_size size, const struct pb_desc *desc) @@ -230,26 +230,26 @@ pb_cache_is_buffer_compat(struct pb_cache_buffer *buf, void *map; if(buf->base.base.size < size) - return FALSE; + return 0; /* be lenient with size */ if(buf->base.base.size >= 2*size) - return FALSE; + return 0; if(!pb_check_alignment(desc->alignment, buf->base.base.alignment)) - return FALSE; + return 0; if(!pb_check_usage(desc->usage, buf->base.base.usage)) - return FALSE; + return 0; map = pb_map(buf->buffer, PB_USAGE_DONTBLOCK, NULL); if (!map) { - return FALSE; + return -1; } pb_unmap(buf->buffer); - return TRUE; + return 1; } @@ -263,7 +263,8 @@ pb_cache_manager_create_buffer(struct pb_manager *_mgr, struct pb_cache_buffer *curr_buf; struct list_head *curr, *next; int64_t now; - + int ret = 0; + pipe_mutex_lock(mgr->mutex); buf = NULL; @@ -274,25 +275,30 @@ pb_cache_manager_create_buffer(struct pb_manager *_mgr, now = os_time_get(); while(curr != &mgr->delayed) { curr_buf = LIST_ENTRY(struct pb_cache_buffer, curr, head); - if(!buf && pb_cache_is_buffer_compat(curr_buf, size, desc)) - buf = curr_buf; + if(!buf && (ret = pb_cache_is_buffer_compat(curr_buf, size, desc) > 0)) + buf = curr_buf; else if(os_time_timeout(curr_buf->start, curr_buf->end, now)) - _pb_cache_buffer_destroy(curr_buf); + _pb_cache_buffer_destroy(curr_buf); else /* This buffer (and all hereafter) are still hot in cache */ break; + if (ret == -1) + break; curr = next; next = curr->next; } /* keep searching in the hot buffers */ - if(!buf) { + if(!buf && ret != -1) { while(curr != &mgr->delayed) { curr_buf = LIST_ENTRY(struct pb_cache_buffer, curr, head); - if(pb_cache_is_buffer_compat(curr_buf, size, desc)) { + ret = pb_cache_is_buffer_compat(curr_buf, size, desc); + if (ret > 0) { buf = curr_buf; break; } + if (ret == -1) + break; /* no need to check the timeout here */ curr = next; next = curr->next; @@ -301,6 +307,7 @@ pb_cache_manager_create_buffer(struct pb_manager 
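In the first buffer-reuse loop above, ret = pb_cache_is_buffer_compat(...) > 0 groups as ret = (compat(...) > 0) because relational operators bind tighter than assignment, so ret can only ever be 0 or 1 there and the early break on a failed map (-1) is never taken; parenthesizing the assignment restores the intended tri-state. A small demonstration of the grouping:

#include <stdio.h>

static int tri_state(void) { return -1; }   /* -1 means: stop searching */

int main(void)
{
   int ret;

   if ((ret = tri_state() > 0))      /* grouping as written in the patch */
      ;
   printf("without parens: ret = %d\n", ret);   /* prints 0 */

   if ((ret = tri_state()) > 0)      /* intended grouping */
      ;
   printf("with parens:    ret = %d\n", ret);   /* prints -1 */
   return 0;
}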
*_mgr, if(buf) { LIST_DEL(&buf->head); + --mgr->numDelayed; pipe_mutex_unlock(mgr->mutex); /* Increase refcount */ pipe_reference_init(&buf->base.base.reference, 1); diff --git a/src/gallium/auxiliary/rbug/rbug_context.c b/src/gallium/auxiliary/rbug/rbug_context.c index 1832425658..a3fd7e8430 100644 --- a/src/gallium/auxiliary/rbug/rbug_context.c +++ b/src/gallium/auxiliary/rbug/rbug_context.c @@ -480,7 +480,7 @@ struct rbug_proto_context_list * rbug_demarshal_context_list(struct rbug_proto_h if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_CONTEXT_LIST) + if (header->opcode != (int32_t)RBUG_OP_CONTEXT_LIST) return NULL; pos = 0; @@ -506,7 +506,7 @@ struct rbug_proto_context_info * rbug_demarshal_context_info(struct rbug_proto_h if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_CONTEXT_INFO) + if (header->opcode != (int32_t)RBUG_OP_CONTEXT_INFO) return NULL; pos = 0; @@ -533,7 +533,7 @@ struct rbug_proto_context_draw_block * rbug_demarshal_context_draw_block(struct if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_CONTEXT_DRAW_BLOCK) + if (header->opcode != (int32_t)RBUG_OP_CONTEXT_DRAW_BLOCK) return NULL; pos = 0; @@ -561,7 +561,7 @@ struct rbug_proto_context_draw_step * rbug_demarshal_context_draw_step(struct rb if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_CONTEXT_DRAW_STEP) + if (header->opcode != (int32_t)RBUG_OP_CONTEXT_DRAW_STEP) return NULL; pos = 0; @@ -589,7 +589,7 @@ struct rbug_proto_context_draw_unblock * rbug_demarshal_context_draw_unblock(str if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_CONTEXT_DRAW_UNBLOCK) + if (header->opcode != (int32_t)RBUG_OP_CONTEXT_DRAW_UNBLOCK) return NULL; pos = 0; @@ -617,7 +617,7 @@ struct rbug_proto_context_draw_rule * rbug_demarshal_context_draw_rule(struct rb if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_CONTEXT_DRAW_RULE) + if (header->opcode != (int32_t)RBUG_OP_CONTEXT_DRAW_RULE) return NULL; pos = 0; @@ -649,7 +649,7 @@ struct rbug_proto_context_flush * rbug_demarshal_context_flush(struct rbug_proto if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_CONTEXT_FLUSH) + if (header->opcode != (int32_t)RBUG_OP_CONTEXT_FLUSH) return NULL; pos = 0; @@ -677,7 +677,7 @@ struct rbug_proto_context_list_reply * rbug_demarshal_context_list_reply(struct if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_CONTEXT_LIST_REPLY) + if (header->opcode != (int32_t)RBUG_OP_CONTEXT_LIST_REPLY) return NULL; pos = 0; @@ -705,7 +705,7 @@ struct rbug_proto_context_info_reply * rbug_demarshal_context_info_reply(struct if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_CONTEXT_INFO_REPLY) + if (header->opcode != (int32_t)RBUG_OP_CONTEXT_INFO_REPLY) return NULL; pos = 0; @@ -739,7 +739,7 @@ struct rbug_proto_context_draw_blocked * rbug_demarshal_context_draw_blocked(str if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_CONTEXT_DRAW_BLOCKED) + if (header->opcode != (int32_t)RBUG_OP_CONTEXT_DRAW_BLOCKED) return NULL; pos = 0; diff --git a/src/gallium/auxiliary/rbug/rbug_core.c b/src/gallium/auxiliary/rbug/rbug_core.c index 876ae5a0ce..1d47d13c9f 100644 --- a/src/gallium/auxiliary/rbug/rbug_core.c +++ b/src/gallium/auxiliary/rbug/rbug_core.c @@ -233,7 +233,7 @@ struct rbug_proto_noop * rbug_demarshal_noop(struct rbug_proto_header *header) if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_NOOP) + if (header->opcode != (int32_t)RBUG_OP_NOOP) return NULL; pos = 0; @@ -259,7 +259,7 @@ 
struct rbug_proto_ping * rbug_demarshal_ping(struct rbug_proto_header *header) if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_PING) + if (header->opcode != (int32_t)RBUG_OP_PING) return NULL; pos = 0; @@ -285,7 +285,7 @@ struct rbug_proto_error * rbug_demarshal_error(struct rbug_proto_header *header) if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_ERROR) + if (header->opcode != (int32_t)RBUG_OP_ERROR) return NULL; pos = 0; @@ -312,7 +312,7 @@ struct rbug_proto_ping_reply * rbug_demarshal_ping_reply(struct rbug_proto_heade if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_PING_REPLY) + if (header->opcode != (int32_t)RBUG_OP_PING_REPLY) return NULL; pos = 0; @@ -339,7 +339,7 @@ struct rbug_proto_error_reply * rbug_demarshal_error_reply(struct rbug_proto_hea if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_ERROR_REPLY) + if (header->opcode != (int32_t)RBUG_OP_ERROR_REPLY) return NULL; pos = 0; diff --git a/src/gallium/auxiliary/rbug/rbug_demarshal.c b/src/gallium/auxiliary/rbug/rbug_demarshal.c index 47390fbcee..06caa45469 100644 --- a/src/gallium/auxiliary/rbug/rbug_demarshal.c +++ b/src/gallium/auxiliary/rbug/rbug_demarshal.c @@ -91,3 +91,67 @@ struct rbug_header * rbug_demarshal(struct rbug_proto_header *header) return NULL; } } + +const char* rbug_proto_get_name(enum rbug_opcode opcode) +{ + switch(opcode) { + case RBUG_OP_NOOP: + return "RBUG_OP_NOOP"; + case RBUG_OP_PING: + return "RBUG_OP_PING"; + case RBUG_OP_ERROR: + return "RBUG_OP_ERROR"; + case RBUG_OP_PING_REPLY: + return "RBUG_OP_PING_REPLY"; + case RBUG_OP_ERROR_REPLY: + return "RBUG_OP_ERROR_REPLY"; + case RBUG_OP_TEXTURE_LIST: + return "RBUG_OP_TEXTURE_LIST"; + case RBUG_OP_TEXTURE_INFO: + return "RBUG_OP_TEXTURE_INFO"; + case RBUG_OP_TEXTURE_WRITE: + return "RBUG_OP_TEXTURE_WRITE"; + case RBUG_OP_TEXTURE_READ: + return "RBUG_OP_TEXTURE_READ"; + case RBUG_OP_TEXTURE_LIST_REPLY: + return "RBUG_OP_TEXTURE_LIST_REPLY"; + case RBUG_OP_TEXTURE_INFO_REPLY: + return "RBUG_OP_TEXTURE_INFO_REPLY"; + case RBUG_OP_TEXTURE_READ_REPLY: + return "RBUG_OP_TEXTURE_READ_REPLY"; + case RBUG_OP_CONTEXT_LIST: + return "RBUG_OP_CONTEXT_LIST"; + case RBUG_OP_CONTEXT_INFO: + return "RBUG_OP_CONTEXT_INFO"; + case RBUG_OP_CONTEXT_DRAW_BLOCK: + return "RBUG_OP_CONTEXT_DRAW_BLOCK"; + case RBUG_OP_CONTEXT_DRAW_STEP: + return "RBUG_OP_CONTEXT_DRAW_STEP"; + case RBUG_OP_CONTEXT_DRAW_UNBLOCK: + return "RBUG_OP_CONTEXT_DRAW_UNBLOCK"; + case RBUG_OP_CONTEXT_DRAW_RULE: + return "RBUG_OP_CONTEXT_DRAW_RULE"; + case RBUG_OP_CONTEXT_FLUSH: + return "RBUG_OP_CONTEXT_FLUSH"; + case RBUG_OP_CONTEXT_LIST_REPLY: + return "RBUG_OP_CONTEXT_LIST_REPLY"; + case RBUG_OP_CONTEXT_INFO_REPLY: + return "RBUG_OP_CONTEXT_INFO_REPLY"; + case RBUG_OP_CONTEXT_DRAW_BLOCKED: + return "RBUG_OP_CONTEXT_DRAW_BLOCKED"; + case RBUG_OP_SHADER_LIST: + return "RBUG_OP_SHADER_LIST"; + case RBUG_OP_SHADER_INFO: + return "RBUG_OP_SHADER_INFO"; + case RBUG_OP_SHADER_DISABLE: + return "RBUG_OP_SHADER_DISABLE"; + case RBUG_OP_SHADER_REPLACE: + return "RBUG_OP_SHADER_REPLACE"; + case RBUG_OP_SHADER_LIST_REPLY: + return "RBUG_OP_SHADER_LIST_REPLY"; + case RBUG_OP_SHADER_INFO_REPLY: + return "RBUG_OP_SHADER_INFO_REPLY"; + default: + return NULL; + } +} diff --git a/src/gallium/auxiliary/rbug/rbug_proto.h b/src/gallium/auxiliary/rbug/rbug_proto.h index 4f3eb75dc4..2fce725bc9 100644 --- a/src/gallium/auxiliary/rbug/rbug_proto.h +++ b/src/gallium/auxiliary/rbug/rbug_proto.h @@ -91,4 +91,9 @@ struct rbug_proto_header */ struct 
rbug_connection; +/** + * Get printable string for opcode. + */ +const char* rbug_proto_get_name(enum rbug_opcode opcode); + #endif diff --git a/src/gallium/auxiliary/rbug/rbug_shader.c b/src/gallium/auxiliary/rbug/rbug_shader.c index fccd2f55ef..1742941cc1 100644 --- a/src/gallium/auxiliary/rbug/rbug_shader.c +++ b/src/gallium/auxiliary/rbug/rbug_shader.c @@ -305,7 +305,7 @@ struct rbug_proto_shader_list * rbug_demarshal_shader_list(struct rbug_proto_hea if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_SHADER_LIST) + if (header->opcode != (int32_t)RBUG_OP_SHADER_LIST) return NULL; pos = 0; @@ -332,7 +332,7 @@ struct rbug_proto_shader_info * rbug_demarshal_shader_info(struct rbug_proto_hea if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_SHADER_INFO) + if (header->opcode != (int32_t)RBUG_OP_SHADER_INFO) return NULL; pos = 0; @@ -360,7 +360,7 @@ struct rbug_proto_shader_disable * rbug_demarshal_shader_disable(struct rbug_pro if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_SHADER_DISABLE) + if (header->opcode != (int32_t)RBUG_OP_SHADER_DISABLE) return NULL; pos = 0; @@ -389,7 +389,7 @@ struct rbug_proto_shader_replace * rbug_demarshal_shader_replace(struct rbug_pro if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_SHADER_REPLACE) + if (header->opcode != (int32_t)RBUG_OP_SHADER_REPLACE) return NULL; pos = 0; @@ -418,7 +418,7 @@ struct rbug_proto_shader_list_reply * rbug_demarshal_shader_list_reply(struct rb if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_SHADER_LIST_REPLY) + if (header->opcode != (int32_t)RBUG_OP_SHADER_LIST_REPLY) return NULL; pos = 0; @@ -446,7 +446,7 @@ struct rbug_proto_shader_info_reply * rbug_demarshal_shader_info_reply(struct rb if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_SHADER_INFO_REPLY) + if (header->opcode != (int32_t)RBUG_OP_SHADER_INFO_REPLY) return NULL; pos = 0; diff --git a/src/gallium/auxiliary/rbug/rbug_texture.c b/src/gallium/auxiliary/rbug/rbug_texture.c index 5a918fe6bc..2ad577915e 100644 --- a/src/gallium/auxiliary/rbug/rbug_texture.c +++ b/src/gallium/auxiliary/rbug/rbug_texture.c @@ -417,7 +417,7 @@ struct rbug_proto_texture_list * rbug_demarshal_texture_list(struct rbug_proto_h if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_TEXTURE_LIST) + if (header->opcode != (int32_t)RBUG_OP_TEXTURE_LIST) return NULL; pos = 0; @@ -443,7 +443,7 @@ struct rbug_proto_texture_info * rbug_demarshal_texture_info(struct rbug_proto_h if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_TEXTURE_INFO) + if (header->opcode != (int32_t)RBUG_OP_TEXTURE_INFO) return NULL; pos = 0; @@ -470,7 +470,7 @@ struct rbug_proto_texture_write * rbug_demarshal_texture_write(struct rbug_proto if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_TEXTURE_WRITE) + if (header->opcode != (int32_t)RBUG_OP_TEXTURE_WRITE) return NULL; pos = 0; @@ -506,7 +506,7 @@ struct rbug_proto_texture_read * rbug_demarshal_texture_read(struct rbug_proto_h if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_TEXTURE_READ) + if (header->opcode != (int32_t)RBUG_OP_TEXTURE_READ) return NULL; pos = 0; @@ -540,7 +540,7 @@ struct rbug_proto_texture_list_reply * rbug_demarshal_texture_list_reply(struct if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_TEXTURE_LIST_REPLY) + if (header->opcode != (int32_t)RBUG_OP_TEXTURE_LIST_REPLY) return NULL; pos = 0; @@ -568,7 +568,7 @@ struct rbug_proto_texture_info_reply * 
rbug_demarshal_texture_info_reply(struct if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_TEXTURE_INFO_REPLY) + if (header->opcode != (int32_t)RBUG_OP_TEXTURE_INFO_REPLY) return NULL; pos = 0; @@ -606,7 +606,7 @@ struct rbug_proto_texture_read_reply * rbug_demarshal_texture_read_reply(struct if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_TEXTURE_READ_REPLY) + if (header->opcode != (int32_t)RBUG_OP_TEXTURE_READ_REPLY) return NULL; pos = 0; diff --git a/src/gallium/auxiliary/rtasm/rtasm_execmem.c b/src/gallium/auxiliary/rtasm/rtasm_execmem.c index 65d5ce795b..fbde1d191a 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_execmem.c +++ b/src/gallium/auxiliary/rtasm/rtasm_execmem.c @@ -58,7 +58,6 @@ #include <unistd.h> #include <sys/mman.h> -#include "os/os_thread.h" #include "util/u_mm.h" #define EXEC_HEAP_SIZE (10*1024*1024) diff --git a/src/gallium/auxiliary/target-helpers/inline_sw_helper.h b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h index 036c1ee48a..34bfa527db 100644 --- a/src/gallium/auxiliary/target-helpers/inline_sw_helper.h +++ b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h @@ -23,26 +23,13 @@ #include "cell/ppu/cell_public.h" #endif + static INLINE struct pipe_screen * -sw_screen_create(struct sw_winsys *winsys) +sw_screen_create_named(struct sw_winsys *winsys, const char *driver) { - const char *default_driver; - const char *driver; struct pipe_screen *screen = NULL; #if defined(GALLIUM_CELL) - default_driver = "cell"; -#elif defined(GALLIUM_LLVMPIPE) - default_driver = "llvmpipe"; -#elif defined(GALLIUM_SOFTPIPE) - default_driver = "softpipe"; -#else - default_driver = ""; -#endif - - driver = debug_get_option("GALLIUM_DRIVER", default_driver); - -#if defined(GALLIUM_CELL) if (screen == NULL && strcmp(driver, "cell") == 0) screen = cell_create_screen(winsys); #endif @@ -60,4 +47,26 @@ sw_screen_create(struct sw_winsys *winsys) return screen; } + +static INLINE struct pipe_screen * +sw_screen_create(struct sw_winsys *winsys) +{ + const char *default_driver; + const char *driver; + +#if defined(GALLIUM_CELL) + default_driver = "cell"; +#elif defined(GALLIUM_LLVMPIPE) + default_driver = "llvmpipe"; +#elif defined(GALLIUM_SOFTPIPE) + default_driver = "softpipe"; +#else + default_driver = ""; +#endif + + driver = debug_get_option("GALLIUM_DRIVER", default_driver); + return sw_screen_create_named(winsys, driver); +} + + #endif diff --git a/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h b/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h index 0b4e740403..e4effa713e 100644 --- a/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h +++ b/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h @@ -13,22 +13,28 @@ static INLINE struct pipe_screen * sw_screen_wrap(struct pipe_screen *screen) { struct sw_winsys *sws; - struct pipe_screen *sw_screen; + struct pipe_screen *sw_screen = NULL; + const char *driver; - sws = wrapper_sw_winsys_warp_pipe_screen(screen); + driver = debug_get_option("GALLIUM_DRIVER", "native"); + if (strcmp(driver, "native") == 0) + return screen; + + sws = wrapper_sw_winsys_wrap_pipe_screen(screen); if (!sws) goto err; - sw_screen = sw_screen_create(sws); - if (sw_screen == screen) + sw_screen = sw_screen_create_named(sws, driver); + + if (!sw_screen) goto err_winsys; return sw_screen; err_winsys: - sws->destroy(sws); + return wrapper_sw_winsys_dewrap_pipe_screen(sws); err: - return screen; + return screen; } #endif diff --git 
a/src/gallium/auxiliary/tgsi/tgsi_dump.c b/src/gallium/auxiliary/tgsi/tgsi_dump.c index f71ffb7030..77bde86684 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_dump.c +++ b/src/gallium/auxiliary/tgsi/tgsi_dump.c @@ -90,7 +90,8 @@ static const char *processor_type_names[] = "GEOM" }; -static const char *file_names[TGSI_FILE_COUNT] = +const char * +tgsi_file_names[TGSI_FILE_COUNT] = { "NULL", "CONST", @@ -125,7 +126,8 @@ static const char *semantic_names[] = "FACE", "EDGEFLAG", "PRIM_ID", - "INSTANCEID" + "INSTANCEID", + "STENCIL" }; static const char *immediate_type_names[] = @@ -135,7 +137,8 @@ static const char *immediate_type_names[] = "INT32" }; -static const char *swizzle_names[] = +const char * +tgsi_swizzle_names[] = { "x", "y", @@ -143,7 +146,8 @@ static const char *swizzle_names[] = "w" }; -static const char *texture_names[] = +const char * +tgsi_texture_names[] = { "UNKNOWN", "1D", @@ -201,15 +205,15 @@ _dump_register_src( struct dump_ctx *ctx, const struct tgsi_full_src_register *src ) { - ENM(src->Register.File, file_names); + ENM(src->Register.File, tgsi_file_names); if (src->Register.Dimension) { if (src->Dimension.Indirect) { CHR( '[' ); - ENM( src->DimIndirect.File, file_names ); + ENM( src->DimIndirect.File, tgsi_file_names ); CHR( '[' ); SID( src->DimIndirect.Index ); TXT( "]." ); - ENM( src->DimIndirect.SwizzleX, swizzle_names ); + ENM( src->DimIndirect.SwizzleX, tgsi_swizzle_names ); if (src->Dimension.Index != 0) { if (src->Dimension.Index > 0) CHR( '+' ); @@ -224,11 +228,11 @@ _dump_register_src( } if (src->Register.Indirect) { CHR( '[' ); - ENM( src->Indirect.File, file_names ); + ENM( src->Indirect.File, tgsi_file_names ); CHR( '[' ); SID( src->Indirect.Index ); TXT( "]." ); - ENM( src->Indirect.SwizzleX, swizzle_names ); + ENM( src->Indirect.SwizzleX, tgsi_swizzle_names ); if (src->Register.Index != 0) { if (src->Register.Index > 0) CHR( '+' ); @@ -248,15 +252,15 @@ _dump_register_dst( struct dump_ctx *ctx, const struct tgsi_full_dst_register *dst ) { - ENM(dst->Register.File, file_names); + ENM(dst->Register.File, tgsi_file_names); if (dst->Register.Dimension) { if (dst->Dimension.Indirect) { CHR( '[' ); - ENM( dst->DimIndirect.File, file_names ); + ENM( dst->DimIndirect.File, tgsi_file_names ); CHR( '[' ); SID( dst->DimIndirect.Index ); TXT( "]." ); - ENM( dst->DimIndirect.SwizzleX, swizzle_names ); + ENM( dst->DimIndirect.SwizzleX, tgsi_swizzle_names ); if (dst->Dimension.Index != 0) { if (dst->Dimension.Index > 0) CHR( '+' ); @@ -271,11 +275,11 @@ _dump_register_dst( } if (dst->Register.Indirect) { CHR( '[' ); - ENM( dst->Indirect.File, file_names ); + ENM( dst->Indirect.File, tgsi_file_names ); CHR( '[' ); SID( dst->Indirect.Index ); TXT( "]." ); - ENM( dst->Indirect.SwizzleX, swizzle_names ); + ENM( dst->Indirect.SwizzleX, tgsi_swizzle_names ); if (dst->Register.Index != 0) { if (dst->Register.Index > 0) CHR( '+' ); @@ -351,7 +355,7 @@ iter_declaration( TXT( "DCL " ); - ENM(decl->Declaration.File, file_names); + ENM(decl->Declaration.File, tgsi_file_names); /* all geometry shader inputs are two dimensional */ if (decl->Declaration.File == TGSI_FILE_INPUT && @@ -585,10 +589,10 @@ iter_instruction( inst->Predicate.SwizzleZ != TGSI_SWIZZLE_Z || inst->Predicate.SwizzleW != TGSI_SWIZZLE_W) { CHR( '.' 
); - ENM( inst->Predicate.SwizzleX, swizzle_names ); - ENM( inst->Predicate.SwizzleY, swizzle_names ); - ENM( inst->Predicate.SwizzleZ, swizzle_names ); - ENM( inst->Predicate.SwizzleW, swizzle_names ); + ENM( inst->Predicate.SwizzleX, tgsi_swizzle_names ); + ENM( inst->Predicate.SwizzleY, tgsi_swizzle_names ); + ENM( inst->Predicate.SwizzleZ, tgsi_swizzle_names ); + ENM( inst->Predicate.SwizzleW, tgsi_swizzle_names ); } TXT( ") " ); @@ -641,10 +645,10 @@ iter_instruction( src->Register.SwizzleZ != TGSI_SWIZZLE_Z || src->Register.SwizzleW != TGSI_SWIZZLE_W) { CHR( '.' ); - ENM( src->Register.SwizzleX, swizzle_names ); - ENM( src->Register.SwizzleY, swizzle_names ); - ENM( src->Register.SwizzleZ, swizzle_names ); - ENM( src->Register.SwizzleW, swizzle_names ); + ENM( src->Register.SwizzleX, tgsi_swizzle_names ); + ENM( src->Register.SwizzleY, tgsi_swizzle_names ); + ENM( src->Register.SwizzleZ, tgsi_swizzle_names ); + ENM( src->Register.SwizzleW, tgsi_swizzle_names ); } if (src->Register.Absolute) @@ -655,7 +659,7 @@ iter_instruction( if (inst->Instruction.Texture) { TXT( ", " ); - ENM( inst->Texture.Texture, texture_names ); + ENM( inst->Texture.Texture, tgsi_texture_names ); } switch (inst->Instruction.Opcode) { diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.h b/src/gallium/auxiliary/tgsi/tgsi_dump.h index dd78b36100..fc0429ad8d 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_dump.h +++ b/src/gallium/auxiliary/tgsi/tgsi_dump.h @@ -35,6 +35,15 @@ extern "C" { #endif +extern const char * +tgsi_file_names[TGSI_FILE_COUNT]; + +extern const char * +tgsi_swizzle_names[]; + +extern const char * +tgsi_texture_names[]; + void tgsi_dump_str( const struct tgsi_token *tokens, diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c index 0757f05dfa..3a71540506 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_exec.c +++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c @@ -605,8 +605,10 @@ tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst) for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { if ((inst->Src[i].Register.File == inst->Dst[0].Register.File) && - (inst->Src[i].Register.Index == - inst->Dst[0].Register.Index)) { + ((inst->Src[i].Register.Index == + inst->Dst[0].Register.Index) || + inst->Src[i].Register.Indirect || + inst->Dst[0].Register.Indirect)) { /* loop over dest channels */ uint channelsWritten = 0x0; FOR_EACH_ENABLED_CHANNEL(*inst, chan) { diff --git a/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h b/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h index e472947507..b3123ed016 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h +++ b/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h @@ -163,6 +163,10 @@ OP12(USGE) OP12(USHR) OP12(USLT) OP12(USNE) +OP01(SWITCH) +OP01(CASE) +OP00(DEFAULT) +OP00(ENDSWITCH) #undef OP00 diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c index 90198a4f60..6585da3e83 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_scan.c +++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c @@ -147,6 +147,7 @@ tgsi_scan_shader(const struct tgsi_token *tokens, info->input_semantic_name[reg] = (ubyte)fulldecl->Semantic.Name; info->input_semantic_index[reg] = (ubyte)fulldecl->Semantic.Index; info->input_interpolate[reg] = (ubyte)fulldecl->Declaration.Interpolate; + info->input_centroid[reg] = (ubyte)fulldecl->Declaration.Centroid; info->input_cylindrical_wrap[reg] = (ubyte)fulldecl->Declaration.CylindricalWrap; info->num_inputs++; } @@ -157,9 +158,11 @@ tgsi_scan_shader(const struct tgsi_token *tokens, /* 
extra info for special outputs */ if (procType == TGSI_PROCESSOR_FRAGMENT && - fulldecl->Semantic.Name == TGSI_SEMANTIC_POSITION) { - info->writes_z = TRUE; - } + fulldecl->Semantic.Name == TGSI_SEMANTIC_POSITION) + info->writes_z = TRUE; + if (procType == TGSI_PROCESSOR_FRAGMENT && + fulldecl->Semantic.Name == TGSI_SEMANTIC_STENCIL) + info->writes_stencil = TRUE; if (procType == TGSI_PROCESSOR_VERTEX && fulldecl->Semantic.Name == TGSI_SEMANTIC_EDGEFLAG) { info->writes_edgeflag = TRUE; diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.h b/src/gallium/auxiliary/tgsi/tgsi_scan.h index f8aa90cf06..104097fbc0 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_scan.h +++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h @@ -45,6 +45,7 @@ struct tgsi_shader_info ubyte input_semantic_name[PIPE_MAX_SHADER_INPUTS]; /**< TGSI_SEMANTIC_x */ ubyte input_semantic_index[PIPE_MAX_SHADER_INPUTS]; ubyte input_interpolate[PIPE_MAX_SHADER_INPUTS]; + ubyte input_centroid[PIPE_MAX_SHADER_INPUTS]; ubyte input_usage_mask[PIPE_MAX_SHADER_INPUTS]; ubyte input_cylindrical_wrap[PIPE_MAX_SHADER_INPUTS]; ubyte output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; /**< TGSI_SEMANTIC_x */ @@ -60,6 +61,7 @@ struct tgsi_shader_info uint opcode_count[TGSI_OPCODE_LAST]; /**< opcode histogram */ boolean writes_z; /**< does fragment shader write Z value? */ + boolean writes_stencil; /**< does fragment shader write stencil value? */ boolean writes_edgeflag; /**< vertex shader outputs edgeflag */ boolean uses_kill; /**< KIL or KILP instruction used? */ diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c index 13e2e8eb99..086d983a73 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c +++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c @@ -2830,31 +2830,52 @@ static void soa_to_aos( struct x86_function *func, * Check if the instructions dst register is the same as any src * register and warn if there's a posible SOA dependency. */ -static void +static boolean check_soa_dependencies(const struct tgsi_full_instruction *inst) { - switch (inst->Instruction.Opcode) { + uint opcode = inst->Instruction.Opcode; + + /* XXX: we only handle src/dst aliasing in a few opcodes currently. + * Need to use an additional temporay to hold the result in the + * cases where the code is too opaque to fix. + */ + + switch (opcode) { case TGSI_OPCODE_ADD: case TGSI_OPCODE_MOV: case TGSI_OPCODE_MUL: + case TGSI_OPCODE_RCP: + case TGSI_OPCODE_RSQ: + case TGSI_OPCODE_EXP: + case TGSI_OPCODE_LOG: + case TGSI_OPCODE_DP3: + case TGSI_OPCODE_DP4: + case TGSI_OPCODE_DP2A: + case TGSI_OPCODE_EX2: + case TGSI_OPCODE_LG2: + case TGSI_OPCODE_POW: case TGSI_OPCODE_XPD: + case TGSI_OPCODE_DPH: + case TGSI_OPCODE_COS: + case TGSI_OPCODE_SIN: + case TGSI_OPCODE_TEX: + case TGSI_OPCODE_TXB: + case TGSI_OPCODE_TXP: + case TGSI_OPCODE_NRM: + case TGSI_OPCODE_NRM4: + case TGSI_OPCODE_DP2: /* OK - these opcodes correctly handle SOA dependencies */ - break; + return TRUE; default: - if (tgsi_check_soa_dependencies(inst)) { - uint opcode = inst->Instruction.Opcode; + if (!tgsi_check_soa_dependencies(inst)) + return TRUE; - /* XXX: we only handle src/dst aliasing in a few opcodes - * currently. Need to use an additional temporay to hold - * the result in the cases where the code is too opaque to - * fix. 
- */ - if (opcode != TGSI_OPCODE_MOV) { - debug_printf("Warning: src/dst aliasing in instruction" - " is not handled:\n"); - tgsi_dump_instruction(inst, 1); - } - } + debug_printf("Warning: src/dst aliasing in instruction" + " is not handled:\n"); + debug_printf("Warning: "); + tgsi_dump_instruction(inst, 1); + + return FALSE; } } @@ -2954,7 +2975,8 @@ tgsi_emit_sse2( tgsi_get_processor_name(proc)); } - check_soa_dependencies(&parse.FullToken.FullInstruction); + if (ok) + ok = check_soa_dependencies(&parse.FullToken.FullInstruction); break; case TGSI_TOKEN_TYPE_IMMEDIATE: diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.c b/src/gallium/auxiliary/tgsi/tgsi_ureg.c index 3cf6893a9b..7d13a17bdb 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_ureg.c +++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.c @@ -96,7 +96,8 @@ struct ureg_program unsigned semantic_name; unsigned semantic_index; unsigned interp; - unsigned cylindrical_wrap; + unsigned char cylindrical_wrap; + unsigned char centroid; } fs_input[UREG_MAX_INPUT]; unsigned nr_fs_inputs; @@ -286,11 +287,12 @@ ureg_property_fs_coord_pixel_center(struct ureg_program *ureg, struct ureg_src -ureg_DECL_fs_input_cyl(struct ureg_program *ureg, +ureg_DECL_fs_input_cyl_centroid(struct ureg_program *ureg, unsigned semantic_name, unsigned semantic_index, unsigned interp_mode, - unsigned cylindrical_wrap) + unsigned cylindrical_wrap, + unsigned centroid) { unsigned i; @@ -306,6 +308,7 @@ ureg_DECL_fs_input_cyl(struct ureg_program *ureg, ureg->fs_input[i].semantic_index = semantic_index; ureg->fs_input[i].interp = interp_mode; ureg->fs_input[i].cylindrical_wrap = cylindrical_wrap; + ureg->fs_input[i].centroid = centroid; ureg->nr_fs_inputs++; } else { set_bad(ureg); @@ -1126,7 +1129,8 @@ emit_decl_fs(struct ureg_program *ureg, unsigned semantic_name, unsigned semantic_index, unsigned interpolate, - unsigned cylindrical_wrap) + unsigned cylindrical_wrap, + unsigned centroid) { union tgsi_any_token *out = get_tokens(ureg, DOMAIN_DECL, 3); @@ -1138,6 +1142,7 @@ emit_decl_fs(struct ureg_program *ureg, out[0].decl.Interpolate = interpolate; out[0].decl.Semantic = 1; out[0].decl.CylindricalWrap = cylindrical_wrap; + out[0].decl.Centroid = centroid; out[1].value = 0; out[1].decl_range.First = index; @@ -1287,7 +1292,8 @@ static void emit_decls( struct ureg_program *ureg ) ureg->fs_input[i].semantic_name, ureg->fs_input[i].semantic_index, ureg->fs_input[i].interp, - ureg->fs_input[i].cylindrical_wrap); + ureg->fs_input[i].cylindrical_wrap, + ureg->fs_input[i].centroid); } } else { for (i = 0; i < ureg->nr_gs_inputs; i++) { diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.h b/src/gallium/auxiliary/tgsi/tgsi_ureg.h index 07fb01ab7b..acc463200a 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_ureg.h +++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.h @@ -158,11 +158,27 @@ ureg_property_fs_coord_pixel_center(struct ureg_program *ureg, */ struct ureg_src -ureg_DECL_fs_input_cyl(struct ureg_program *, +ureg_DECL_fs_input_cyl_centroid(struct ureg_program *, unsigned semantic_name, unsigned semantic_index, unsigned interp_mode, - unsigned cylindrical_wrap); + unsigned cylindrical_wrap, + unsigned centroid); + +static INLINE struct ureg_src +ureg_DECL_fs_input_cyl(struct ureg_program *ureg, + unsigned semantic_name, + unsigned semantic_index, + unsigned interp_mode, + unsigned cylindrical_wrap) +{ + return ureg_DECL_fs_input_cyl_centroid(ureg, + semantic_name, + semantic_index, + interp_mode, + cylindrical_wrap, + 0); +} static INLINE struct ureg_src ureg_DECL_fs_input(struct ureg_program 
*ureg, @@ -170,11 +186,11 @@ ureg_DECL_fs_input(struct ureg_program *ureg, unsigned semantic_index, unsigned interp_mode) { - return ureg_DECL_fs_input_cyl(ureg, + return ureg_DECL_fs_input_cyl_centroid(ureg, semantic_name, semantic_index, interp_mode, - 0); + 0, 0); } struct ureg_src diff --git a/src/gallium/auxiliary/util/u_atomic.h b/src/gallium/auxiliary/util/u_atomic.h index a156823390..8434491a42 100644 --- a/src/gallium/auxiliary/util/u_atomic.h +++ b/src/gallium/auxiliary/util/u_atomic.h @@ -29,6 +29,8 @@ #define PIPE_ATOMIC_ASM_MSVC_X86 #elif (defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86)) #define PIPE_ATOMIC_ASM_GCC_X86 +#elif (defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86_64)) +#define PIPE_ATOMIC_ASM_GCC_X86_64 #elif defined(PIPE_CC_GCC) && (PIPE_CC_GCC_VERSION >= 401) #define PIPE_ATOMIC_GCC_INTRINSIC #else @@ -36,6 +38,51 @@ #endif +#if defined(PIPE_ATOMIC_ASM_GCC_X86_64) +#define PIPE_ATOMIC "GCC x86_64 assembly" + +#ifdef __cplusplus +extern "C" { +#endif + +#define p_atomic_set(_v, _i) (*(_v) = (_i)) +#define p_atomic_read(_v) (*(_v)) + +static INLINE boolean +p_atomic_dec_zero(int32_t *v) +{ + unsigned char c; + + __asm__ __volatile__("lock; decl %0; sete %1":"+m"(*v), "=qm"(c) + ::"memory"); + + return c != 0; +} + +static INLINE void +p_atomic_inc(int32_t *v) +{ + __asm__ __volatile__("lock; incl %0":"+m"(*v)); +} + +static INLINE void +p_atomic_dec(int32_t *v) +{ + __asm__ __volatile__("lock; decl %0":"+m"(*v)); +} + +static INLINE int32_t +p_atomic_cmpxchg(int32_t *v, int32_t old, int32_t _new) +{ + return __sync_val_compare_and_swap(v, old, _new); +} + +#ifdef __cplusplus +} +#endif + +#endif /* PIPE_ATOMIC_ASM_GCC_X86_64 */ + #if defined(PIPE_ATOMIC_ASM_GCC_X86) diff --git a/src/gallium/auxiliary/util/u_blitter.c b/src/gallium/auxiliary/util/u_blitter.c index f93ef26ae7..a163f93cb8 100644 --- a/src/gallium/auxiliary/util/u_blitter.c +++ b/src/gallium/auxiliary/util/u_blitter.c @@ -268,7 +268,7 @@ void util_blitter_destroy(struct blitter_context *blitter) pipe->delete_fs_state(pipe, ctx->fs_texfetch_depth[i]); } - for (i = 0; i <= PIPE_MAX_COLOR_BUFS && ctx->fs_col[i]; i++) + for (i = 0; i <= PIPE_MAX_COLOR_BUFS; i++) if (ctx->fs_col[i]) pipe->delete_fs_state(pipe, ctx->fs_col[i]); @@ -964,16 +964,18 @@ void util_blitter_clear_depth_stencil(struct blitter_context *blitter, blitter_restore_CSOs(ctx); } -/* Clear a region of a depth stencil surface. 
*/ -void util_blitter_flush_depth_stencil(struct blitter_context *blitter, - struct pipe_surface *dstsurf) +/* draw a rectangle across a region using a custom dsa stage - for r600g */ +void util_blitter_custom_depth_stencil(struct blitter_context *blitter, + struct pipe_surface *zsurf, + struct pipe_surface *cbsurf, + void *dsa_stage, float depth) { struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter; struct pipe_context *pipe = ctx->base.pipe; struct pipe_framebuffer_state fb_state; - assert(dstsurf->texture); - if (!dstsurf->texture) + assert(zsurf->texture); + if (!zsurf->texture) return; /* check the saved state */ @@ -981,8 +983,8 @@ void util_blitter_flush_depth_stencil(struct blitter_context *blitter, assert(blitter->saved_fb_state.nr_cbufs != ~0); /* bind CSOs */ - pipe->bind_blend_state(pipe, ctx->blend_keep_color); - pipe->bind_depth_stencil_alpha_state(pipe, ctx->dsa_flush_depth_stencil); + pipe->bind_blend_state(pipe, ctx->blend_write_color); + pipe->bind_depth_stencil_alpha_state(pipe, dsa_stage); pipe->bind_rasterizer_state(pipe, ctx->rs_state); pipe->bind_fs_state(pipe, blitter_get_fs_col(ctx, 0)); @@ -990,15 +992,30 @@ void util_blitter_flush_depth_stencil(struct blitter_context *blitter, pipe->bind_vertex_elements_state(pipe, ctx->velem_state); /* set a framebuffer state */ - fb_state.width = dstsurf->width; - fb_state.height = dstsurf->height; - fb_state.nr_cbufs = 0; - fb_state.cbufs[0] = 0; - fb_state.zsbuf = dstsurf; + fb_state.width = zsurf->width; + fb_state.height = zsurf->height; + fb_state.nr_cbufs = 1; + if (cbsurf) { + fb_state.cbufs[0] = cbsurf; + fb_state.nr_cbufs = 1; + } else { + fb_state.cbufs[0] = NULL; + fb_state.nr_cbufs = 0; + } + fb_state.zsbuf = zsurf; pipe->set_framebuffer_state(pipe, &fb_state); - blitter_set_dst_dimensions(ctx, dstsurf->width, dstsurf->height); - blitter->draw_rectangle(blitter, 0, 0, dstsurf->width, dstsurf->height, 0, + blitter_set_dst_dimensions(ctx, zsurf->width, zsurf->height); + blitter->draw_rectangle(blitter, 0, 0, zsurf->width, zsurf->height, depth, UTIL_BLITTER_ATTRIB_NONE, NULL); blitter_restore_CSOs(ctx); } + +/* flush a region of a depth stencil surface for r300g */ +void util_blitter_flush_depth_stencil(struct blitter_context *blitter, + struct pipe_surface *dstsurf) +{ + struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter; + util_blitter_custom_depth_stencil(blitter, dstsurf, NULL, + ctx->dsa_flush_depth_stencil, 0.0f); +} diff --git a/src/gallium/auxiliary/util/u_blitter.h b/src/gallium/auxiliary/util/u_blitter.h index e33d2e283f..f9f96f25c7 100644 --- a/src/gallium/auxiliary/util/u_blitter.h +++ b/src/gallium/auxiliary/util/u_blitter.h @@ -203,6 +203,12 @@ void util_blitter_clear_depth_stencil(struct blitter_context *blitter, void util_blitter_flush_depth_stencil(struct blitter_context *blitter, struct pipe_surface *dstsurf); + +void util_blitter_custom_depth_stencil(struct blitter_context *blitter, + struct pipe_surface *zsurf, + struct pipe_surface *cbsurf, + void *dsa_stage, float depth); + /* The functions below should be used to save currently bound constant state * objects inside a driver. 
The objects are automatically restored at the end * of the util_blitter_{clear, copy_region, fill_region} functions and then diff --git a/src/gallium/auxiliary/util/u_format.csv b/src/gallium/auxiliary/util/u_format.csv index 016e73c4a1..1fbd83841c 100644 --- a/src/gallium/auxiliary/util/u_format.csv +++ b/src/gallium/auxiliary/util/u_format.csv @@ -109,9 +109,12 @@ PIPE_FORMAT_Z32_UNORM , plain, 1, 1, un32, , , , x___, PIPE_FORMAT_Z32_FLOAT , plain, 1, 1, f32 , , , , x___, zs PIPE_FORMAT_Z24_UNORM_S8_USCALED , plain, 1, 1, un24, u8 , , , xy__, zs PIPE_FORMAT_S8_USCALED_Z24_UNORM , plain, 1, 1, u8 , un24, , , yx__, zs +PIPE_FORMAT_X24S8_USCALED , plain, 1, 1, x24, u8 , , , _y__, zs +PIPE_FORMAT_S8X24_USCALED , plain, 1, 1, u8 , x24 , , , _x__, zs PIPE_FORMAT_Z24X8_UNORM , plain, 1, 1, un24, x8 , , , x___, zs PIPE_FORMAT_X8Z24_UNORM , plain, 1, 1, x8 , un24, , , y___, zs PIPE_FORMAT_Z32_FLOAT_S8X24_USCALED , plain, 1, 1, f32, u8 , x24 , , xy__, zs +PIPE_FORMAT_X32_S8X24_USCALED , plain, 1, 1, x32, u8 , x24 , , _y__, zs # YUV formats # http://www.fourcc.org/yuv.php#UYVY diff --git a/src/gallium/auxiliary/util/u_format_zs.c b/src/gallium/auxiliary/util/u_format_zs.c index 792d69c214..80081e22f7 100644 --- a/src/gallium/auxiliary/util/u_format_zs.c +++ b/src/gallium/auxiliary/util/u_format_zs.c @@ -918,3 +918,56 @@ util_format_z32_float_s8x24_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned d } } + +void +util_format_x24s8_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) +{ + util_format_z24_unorm_s8_uscaled_unpack_s_8uscaled(dst_row, dst_stride, + src_row, src_stride, + width, height); +} + +void +util_format_x24s8_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) +{ + util_format_z24_unorm_s8_uscaled_pack_s_8uscaled(dst_row, dst_stride, + src_row, src_stride, + width, height); +} + +void +util_format_s8x24_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) +{ + util_format_s8_uscaled_z24_unorm_unpack_s_8uscaled(dst_row, dst_stride, + src_row, src_stride, + width, height); +} + +void +util_format_s8x24_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height) +{ + util_format_s8_uscaled_z24_unorm_pack_s_8uscaled(dst_row, dst_stride, + src_row, src_stride, + width, height); +} + +void +util_format_x32_s8x24_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, + const uint8_t *src_row, unsigned src_stride, + unsigned width, unsigned height) +{ + util_format_z32_float_s8x24_uscaled_unpack_s_8uscaled(dst_row, dst_stride, + src_row, src_stride, + width, height); + +} + +void +util_format_x32_s8x24_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, + const uint8_t *src_row, unsigned src_stride, + unsigned width, unsigned height) +{ + util_format_z32_float_s8x24_uscaled_pack_s_8uscaled(dst_row, dst_stride, + src_row, src_stride, + width, height); +} diff --git a/src/gallium/auxiliary/util/u_format_zs.h b/src/gallium/auxiliary/util/u_format_zs.h index 650db4b95f..1604cc3eee 100644 --- a/src/gallium/auxiliary/util/u_format_zs.h +++ b/src/gallium/auxiliary/util/u_format_zs.h @@ -192,5 +192,21 @@ util_format_z32_float_s8x24_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned void 
util_format_z32_float_s8x24_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); +void +util_format_x24s8_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); + +void +util_format_x24s8_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); + +void +util_format_s8x24_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); +void +util_format_s8x24_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); + +void +util_format_x32_s8x24_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); + +void +util_format_x32_s8x24_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_sride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height); #endif /* U_FORMAT_ZS_H_ */ diff --git a/src/gallium/auxiliary/util/u_index_modify.c b/src/gallium/auxiliary/util/u_index_modify.c new file mode 100644 index 0000000000..65b079ed53 --- /dev/null +++ b/src/gallium/auxiliary/util/u_index_modify.c @@ -0,0 +1,127 @@ +/* + * Copyright 2010 Marek Olšák <maraeo@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ + +#include "pipe/p_context.h" +#include "util/u_index_modify.h" +#include "util/u_inlines.h" + +void util_shorten_ubyte_elts(struct pipe_context *context, + struct pipe_resource **elts, + int index_bias, + unsigned start, + unsigned count) +{ + struct pipe_screen* screen = context->screen; + struct pipe_resource* new_elts; + unsigned char *in_map; + unsigned short *out_map; + struct pipe_transfer *src_transfer, *dst_transfer; + unsigned i; + + new_elts = pipe_buffer_create(screen, + PIPE_BIND_INDEX_BUFFER, + 2 * count); + + in_map = pipe_buffer_map(context, *elts, PIPE_TRANSFER_READ, &src_transfer); + out_map = pipe_buffer_map(context, new_elts, PIPE_TRANSFER_WRITE, &dst_transfer); + + in_map += start; + + for (i = 0; i < count; i++) { + *out_map = (unsigned short)(*in_map + index_bias); + in_map++; + out_map++; + } + + pipe_buffer_unmap(context, *elts, src_transfer); + pipe_buffer_unmap(context, new_elts, dst_transfer); + + *elts = new_elts; +} + +void util_rebuild_ushort_elts(struct pipe_context *context, + struct pipe_resource **elts, + int index_bias, + unsigned start, unsigned count) +{ + struct pipe_transfer *in_transfer = NULL; + struct pipe_transfer *out_transfer = NULL; + struct pipe_resource *new_elts; + unsigned short *in_map; + unsigned short *out_map; + unsigned i; + + new_elts = pipe_buffer_create(context->screen, + PIPE_BIND_INDEX_BUFFER, + 2 * count); + + in_map = pipe_buffer_map(context, *elts, + PIPE_TRANSFER_READ, &in_transfer); + out_map = pipe_buffer_map(context, new_elts, + PIPE_TRANSFER_WRITE, &out_transfer); + + in_map += start; + for (i = 0; i < count; i++) { + *out_map = (unsigned short)(*in_map + index_bias); + in_map++; + out_map++; + } + + pipe_buffer_unmap(context, *elts, in_transfer); + pipe_buffer_unmap(context, new_elts, out_transfer); + + *elts = new_elts; +} + +void util_rebuild_uint_elts(struct pipe_context *context, + struct pipe_resource **elts, + int index_bias, + unsigned start, unsigned count) +{ + struct pipe_transfer *in_transfer = NULL; + struct pipe_transfer *out_transfer = NULL; + struct pipe_resource *new_elts; + unsigned int *in_map; + unsigned int *out_map; + unsigned i; + + new_elts = pipe_buffer_create(context->screen, + PIPE_BIND_INDEX_BUFFER, + 2 * count); + + in_map = pipe_buffer_map(context, *elts, + PIPE_TRANSFER_READ, &in_transfer); + out_map = pipe_buffer_map(context, new_elts, + PIPE_TRANSFER_WRITE, &out_transfer); + + in_map += start; + for (i = 0; i < count; i++) { + *out_map = (unsigned int)(*in_map + index_bias); + in_map++; + out_map++; + } + + pipe_buffer_unmap(context, *elts, in_transfer); + pipe_buffer_unmap(context, new_elts, out_transfer); + + *elts = new_elts; +} diff --git a/src/gallium/auxiliary/util/u_index_modify.h b/src/gallium/auxiliary/util/u_index_modify.h new file mode 100644 index 0000000000..01a6cae94f --- /dev/null +++ b/src/gallium/auxiliary/util/u_index_modify.h @@ -0,0 +1,41 @@ +/* + * Copyright 2010 Marek Olšák <maraeo@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in 
all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +#ifndef UTIL_INDEX_MODIFY_H +#define UTIL_INDEX_MODIFY_H + +void util_shorten_ubyte_elts(struct pipe_context *context, + struct pipe_resource **elts, + int index_bias, + unsigned start, + unsigned count); + +void util_rebuild_ushort_elts(struct pipe_context *context, + struct pipe_resource **elts, + int index_bias, + unsigned start, unsigned count); + +void util_rebuild_uint_elts(struct pipe_context *context, + struct pipe_resource **elts, + int index_bias, + unsigned start, unsigned count); +#endif diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h index 69a7681494..37294b7203 100644 --- a/src/gallium/auxiliary/util/u_math.h +++ b/src/gallium/auxiliary/util/u_math.h @@ -118,6 +118,11 @@ __inline double __cdecl atan2(double val) #endif +#ifndef M_SQRT2 +#define M_SQRT2 1.41421356237309504880 +#endif + + #if defined(_MSC_VER) #if _MSC_VER < 1400 && !defined(__cplusplus) || defined(PIPE_SUBSYSTEM_WINDOWS_CE) diff --git a/src/gallium/auxiliary/util/u_pack_color.h b/src/gallium/auxiliary/util/u_pack_color.h index aae8b8bdf1..5378f2d782 100644 --- a/src/gallium/auxiliary/util/u_pack_color.h +++ b/src/gallium/auxiliary/util/u_pack_color.h @@ -394,7 +394,7 @@ util_pack_color(const float rgba[4], enum pipe_format format, union util_color * return; case PIPE_FORMAT_B4G4R4A4_UNORM: { - uc->ub = ((a & 0xf0) << 8) | ((r & 0xf0) << 4) | ((g & 0xf0) << 0) | (b >> 4); + uc->us = ((a & 0xf0) << 8) | ((r & 0xf0) << 4) | ((g & 0xf0) << 0) | (b >> 4); } return; case PIPE_FORMAT_A8_UNORM: @@ -434,8 +434,8 @@ util_pack_color(const float rgba[4], enum pipe_format format, union util_color * /* Integer versions of util_pack_z and util_pack_z_stencil - useful for * constructing clear masks. 
*/ -static INLINE uint -util_pack_uint_z(enum pipe_format format, unsigned z) +static INLINE uint32_t +util_pack_mask_z(enum pipe_format format, uint32_t z) { switch (format) { case PIPE_FORMAT_Z16_UNORM: @@ -452,29 +452,32 @@ util_pack_uint_z(enum pipe_format format, unsigned z) case PIPE_FORMAT_S8_USCALED: return 0; default: - debug_print_format("gallium: unhandled format in util_pack_z()", format); + debug_print_format("gallium: unhandled format in util_pack_mask_z()", format); assert(0); return 0; } } -static INLINE uint -util_pack_uint_z_stencil(enum pipe_format format, double z, uint s) +static INLINE uint32_t +util_pack_mask_z_stencil(enum pipe_format format, uint32_t z, uint8_t s) { - unsigned packed = util_pack_uint_z(format, z); - - s &= 0xff; + uint32_t packed = util_pack_mask_z(format, z); switch (format) { case PIPE_FORMAT_Z24_UNORM_S8_USCALED: - return packed | (s << 24); + packed |= (uint32_t)s << 24; + break; case PIPE_FORMAT_S8_USCALED_Z24_UNORM: - return packed | s; + packed |= s; + break; case PIPE_FORMAT_S8_USCALED: - return packed | s; + packed |= s; + break; default: - return packed; + break; } + + return packed; } @@ -482,9 +485,11 @@ util_pack_uint_z_stencil(enum pipe_format format, double z, uint s) /** * Note: it's assumed that z is in [0,1] */ -static INLINE uint +static INLINE uint32_t util_pack_z(enum pipe_format format, double z) { + union fi fui; + if (z == 0.0) return 0; @@ -492,24 +497,25 @@ util_pack_z(enum pipe_format format, double z) case PIPE_FORMAT_Z16_UNORM: if (z == 1.0) return 0xffff; - return (uint) (z * 0xffff); + return (uint32_t) (z * 0xffff); case PIPE_FORMAT_Z32_UNORM: /* special-case to avoid overflow */ if (z == 1.0) return 0xffffffff; - return (uint) (z * 0xffffffff); + return (uint32_t) (z * 0xffffffff); case PIPE_FORMAT_Z32_FLOAT: - return (uint)z; + fui.f = (float)z; + return fui.ui; case PIPE_FORMAT_Z24_UNORM_S8_USCALED: case PIPE_FORMAT_Z24X8_UNORM: if (z == 1.0) return 0xffffff; - return (uint) (z * 0xffffff); + return (uint32_t) (z * 0xffffff); case PIPE_FORMAT_S8_USCALED_Z24_UNORM: case PIPE_FORMAT_X8Z24_UNORM: if (z == 1.0) return 0xffffff00; - return ((uint) (z * 0xffffff)) << 8; + return ((uint32_t) (z * 0xffffff)) << 8; case PIPE_FORMAT_S8_USCALED: /* this case can get it via util_pack_z_stencil() */ return 0; @@ -525,14 +531,14 @@ util_pack_z(enum pipe_format format, double z) * Pack Z and/or stencil values into a 32-bit value described by format. 
* Note: it's assumed that z is in [0,1] and s in [0,255] */ -static INLINE uint -util_pack_z_stencil(enum pipe_format format, double z, uint s) +static INLINE uint32_t +util_pack_z_stencil(enum pipe_format format, double z, uint8_t s) { - unsigned packed = util_pack_z(format, z); + uint32_t packed = util_pack_z(format, z); switch (format) { case PIPE_FORMAT_Z24_UNORM_S8_USCALED: - packed |= s << 24; + packed |= (uint32_t)s << 24; break; case PIPE_FORMAT_S8_USCALED_Z24_UNORM: packed |= s; diff --git a/src/gallium/auxiliary/util/u_simple_list.h b/src/gallium/auxiliary/util/u_simple_list.h index f5f43b0faa..fe59771371 100644 --- a/src/gallium/auxiliary/util/u_simple_list.h +++ b/src/gallium/auxiliary/util/u_simple_list.h @@ -46,6 +46,8 @@ do { \ (elem)->next->prev = (elem)->prev; \ (elem)->prev->next = (elem)->next; \ + (elem)->next = elem; \ + (elem)->prev = elem; \ } while (0) /** diff --git a/src/gallium/auxiliary/util/u_sse.h b/src/gallium/auxiliary/util/u_sse.h index 87959ab0aa..1df6c87267 100644 --- a/src/gallium/auxiliary/util/u_sse.h +++ b/src/gallium/auxiliary/util/u_sse.h @@ -71,6 +71,96 @@ _mm_castps_si128(__m128 a) #endif /* defined(_MSC_VER) && _MSC_VER < 1500 */ +union m128i { + __m128i m; + ubyte ub[16]; + ushort us[8]; + uint ui[4]; +}; + +static INLINE void u_print_epi8(const char *name, __m128i r) +{ + union { __m128i m; ubyte ub[16]; } u; + u.m = r; + + debug_printf("%s: " + "%02x/" + "%02x/" + "%02x/" + "%02x/" + "%02x/" + "%02x/" + "%02x/" + "%02x/" + "%02x/" + "%02x/" + "%02x/" + "%02x/" + "%02x/" + "%02x/" + "%02x/" + "%02x\n", + name, + u.ub[0], u.ub[1], u.ub[2], u.ub[3], + u.ub[4], u.ub[5], u.ub[6], u.ub[7], + u.ub[8], u.ub[9], u.ub[10], u.ub[11], + u.ub[12], u.ub[13], u.ub[14], u.ub[15]); +} + +static INLINE void u_print_epi16(const char *name, __m128i r) +{ + union { __m128i m; ushort us[8]; } u; + u.m = r; + + debug_printf("%s: " + "%04x/" + "%04x/" + "%04x/" + "%04x/" + "%04x/" + "%04x/" + "%04x/" + "%04x\n", + name, + u.us[0], u.us[1], u.us[2], u.us[3], + u.us[4], u.us[5], u.us[6], u.us[7]); +} + +static INLINE void u_print_epi32(const char *name, __m128i r) +{ + union { __m128i m; uint ui[4]; } u; + u.m = r; + + debug_printf("%s: " + "%08x/" + "%08x/" + "%08x/" + "%08x\n", + name, + u.ui[0], u.ui[1], u.ui[2], u.ui[3]); +} + +static INLINE void u_print_ps(const char *name, __m128 r) +{ + union { __m128 m; float f[4]; } u; + u.m = r; + + debug_printf("%s: " + "%f/" + "%f/" + "%f/" + "%f\n", + name, + u.f[0], u.f[1], u.f[2], u.f[3]); +} + + +#define U_DUMP_EPI32(a) u_print_epi32(#a, a) +#define U_DUMP_EPI16(a) u_print_epi16(#a, a) +#define U_DUMP_EPI8(a) u_print_epi8(#a, a) +#define U_DUMP_PS(a) u_print_ps(#a, a) + + #if defined(PIPE_ARCH_SSSE3) @@ -78,8 +168,6 @@ _mm_castps_si128(__m128 a) #else /* !PIPE_ARCH_SSSE3 */ -#include <emmintrin.h> - /** * Describe _mm_shuffle_epi8() with gcc extended inline assembly, for cases * where -mssse3 is not supported/enabled. @@ -100,6 +188,68 @@ _mm_shuffle_epi8(__m128i a, __m128i mask) #endif /* !PIPE_ARCH_SSSE3 */ -#endif /* PIPE_ARCH_X86 || PIPE_ARCH_X86_64 */ + + +/* Provide an SSE2 implementation of _mm_mullo_epi32() in terms of + * _mm_mul_epu32(). + * + * I suspect this works fine for us because one of our operands is + * always positive, but not sure that this can be used for general + * signed integer multiplication. + * + * This seems close enough to the speed of SSE4 and the real + * _mm_mullo_epi32() intrinsic as to not justify adding an sse4 + * dependency at this point. 
+ */ +static INLINE __m128i mm_mullo_epi32(const __m128i a, const __m128i b) +{ + __m128i a4 = _mm_srli_epi64(a, 32); /* shift by one dword */ + __m128i b4 = _mm_srli_epi64(b, 32); /* shift by one dword */ + __m128i ba = _mm_mul_epu32(b, a); /* multply dwords 0, 2 */ + __m128i b4a4 = _mm_mul_epu32(b4, a4); /* multiply dwords 1, 3 */ + + /* Interleave the results, either with shuffles or (slightly + * faster) direct bit operations: + */ +#if 0 + __m128i ba8 = _mm_shuffle_epi32(ba, 8); + __m128i b4a48 = _mm_shuffle_epi32(b4a4, 8); + __m128i result = _mm_unpacklo_epi32(ba8, b4a48); +#else + __m128i mask = _mm_setr_epi32(~0,0,~0,0); + __m128i ba_mask = _mm_and_si128(ba, mask); + __m128i b4a4_mask_shift = _mm_slli_epi64(b4a4, 32); + __m128i result = _mm_or_si128(ba_mask, b4a4_mask_shift); +#endif + + return result; +} + + +static INLINE void +transpose4_epi32(const __m128i * restrict a, + const __m128i * restrict b, + const __m128i * restrict c, + const __m128i * restrict d, + __m128i * restrict o, + __m128i * restrict p, + __m128i * restrict q, + __m128i * restrict r) +{ + __m128i t0 = _mm_unpacklo_epi32(*a, *b); + __m128i t1 = _mm_unpacklo_epi32(*c, *d); + __m128i t2 = _mm_unpackhi_epi32(*a, *b); + __m128i t3 = _mm_unpackhi_epi32(*c, *d); + + *o = _mm_unpacklo_epi64(t0, t1); + *p = _mm_unpackhi_epi64(t0, t1); + *q = _mm_unpacklo_epi64(t2, t3); + *r = _mm_unpackhi_epi64(t2, t3); +} + +#define SCALAR_EPI32(m, i) _mm_shuffle_epi32((m), _MM_SHUFFLE(i,i,i,i)) + + +#endif /* PIPE_ARCH_SSE */ #endif /* U_SSE_H_ */ diff --git a/src/gallium/auxiliary/util/u_surface.c b/src/gallium/auxiliary/util/u_surface.c index af99163b2e..f78b6838a7 100644 --- a/src/gallium/auxiliary/util/u_surface.c +++ b/src/gallium/auxiliary/util/u_surface.c @@ -332,7 +332,7 @@ util_clear_depth_stencil(struct pipe_context *pipe, uint32_t *row = (uint32_t *)dst_map; for (j = 0; j < width; j++) { uint32_t tmp = *row & dst_mask; - *row++ = tmp & (zstencil & ~dst_mask); + *row++ = tmp | (zstencil & ~dst_mask); } dst_map += dst_stride; } diff --git a/src/gallium/auxiliary/util/u_tile.c b/src/gallium/auxiliary/util/u_tile.c index f7aa1403d0..44cadbfcdd 100644 --- a/src/gallium/auxiliary/util/u_tile.c +++ b/src/gallium/auxiliary/util/u_tile.c @@ -217,6 +217,81 @@ z24s8_get_tile_rgba(const unsigned *src, } } +/*** PIPE_FORMAT_S8X24_USCALED ***/ + +/** + * Return S component as four uint32_t in [0..255]. Z part ignored. + */ +static void +s8x24_get_tile_rgba(const unsigned *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + float *pRow = p; + + for (j = 0; j < w; j++, pRow += 4) { + pRow[0] = + pRow[1] = + pRow[2] = + pRow[3] = (float)((*src++ >> 24) & 0xff); + } + + p += dst_stride; + } +} + +/*** PIPE_FORMAT_X24S8_USCALED ***/ + +/** + * Return S component as four uint32_t in [0..255]. Z part ignored. + */ +static void +x24s8_get_tile_rgba(const unsigned *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + pRow[0] = + pRow[1] = + pRow[2] = + pRow[3] = (float)(*src++ & 0xff); + } + p += dst_stride; + } +} + + +/** + * Return S component as four uint32_t in [0..255]. Z part ignored. 
+ */ +static void +s8_get_tile_rgba(const unsigned char *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + pRow[0] = + pRow[1] = + pRow[2] = + pRow[3] = (float)(*src++ & 0xff); + } + p += dst_stride; + } +} /*** PIPE_FORMAT_Z32_FLOAT ***/ @@ -261,10 +336,19 @@ pipe_tile_raw_to_rgba(enum pipe_format format, case PIPE_FORMAT_Z24X8_UNORM: s8z24_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride); break; + case PIPE_FORMAT_S8_USCALED: + s8_get_tile_rgba((unsigned char *) src, w, h, dst, dst_stride); + break; + case PIPE_FORMAT_X24S8_USCALED: + s8x24_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride); + break; case PIPE_FORMAT_S8_USCALED_Z24_UNORM: case PIPE_FORMAT_X8Z24_UNORM: z24s8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride); break; + case PIPE_FORMAT_S8X24_USCALED: + x24s8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride); + break; case PIPE_FORMAT_Z32_FLOAT: z32f_get_tile_rgba((float *) src, w, h, dst, dst_stride); break;
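A few usage sketches for helpers introduced by the patches above. The rbug opcode checks in rbug_context.c, rbug_core.c, rbug_shader.c and rbug_texture.c now compare against int32_t, and rbug_demarshal.c gains rbug_proto_get_name() for turning an opcode into a printable string (NULL for unknown values). A minimal logging sketch; log_rbug_packet is a hypothetical helper, not part of the patch, and assumes rbug_proto.h and util/u_debug.h are included:

    static void log_rbug_packet(const struct rbug_proto_header *header)
    {
       const char *name = rbug_proto_get_name((enum rbug_opcode)header->opcode);

       /* fall back to a fixed string for opcodes the helper does not know */
       debug_printf("rbug: received %s (opcode %d)\n",
                    name ? name : "UNKNOWN", (int)header->opcode);
    }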
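tgsi_ureg grows ureg_DECL_fs_input_cyl_centroid(), with ureg_DECL_fs_input_cyl() kept as an inline wrapper that passes centroid = 0. A small sketch of declaring a centroid-sampled input; declare_centroid_input is hypothetical and the semantic/interpolation choices are only an example:

    static struct ureg_src
    declare_centroid_input(struct ureg_program *ureg)
    {
       return ureg_DECL_fs_input_cyl_centroid(ureg,
                                              TGSI_SEMANTIC_GENERIC, 0,
                                              TGSI_INTERPOLATE_PERSPECTIVE,
                                              0,  /* cylindrical_wrap */
                                              1); /* centroid */
    }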
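u_atomic.h gets a dedicated GCC x86_64 backend with lock-prefixed inc/dec and __sync_val_compare_and_swap() for the cmpxchg. A reference-counting sketch using those primitives; struct my_object and my_object_destroy() are hypothetical:

    struct my_object {
       int32_t refcount;
    };

    static void my_object_ref(struct my_object *obj)
    {
       p_atomic_inc(&obj->refcount);
    }

    static void my_object_unref(struct my_object *obj)
    {
       /* p_atomic_dec_zero() returns TRUE once the count reaches zero */
       if (p_atomic_dec_zero(&obj->refcount))
          my_object_destroy(obj);
    }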
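util_blitter_custom_depth_stencil() draws a full-surface quad with a caller-supplied depth/stencil/alpha state object (the r600g case), and util_blitter_flush_depth_stencil() is now a thin wrapper around it. A hedged sketch of a driver-side caller; struct my_context, my_save_blitter_states() and the custom_dsa_flush member are hypothetical, and the saved-state requirement follows from the asserts in the function:

    static void my_flush_depth(struct my_context *ctx,
                               struct pipe_surface *zsurf,
                               struct pipe_surface *cbsurf)
    {
       /* the blitter asserts that the framebuffer state (and the other CSOs
        * it binds) were saved beforehand with the util_blitter_save_*()
        * helpers */
       my_save_blitter_states(ctx);

       util_blitter_custom_depth_stencil(ctx->blitter, zsurf, cbsurf,
                                         ctx->custom_dsa_flush, 1.0f);
    }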
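The new u_index_modify helpers rewrite an index buffer through the caller's pointer: util_shorten_ubyte_elts() copies a ubyte index range into a freshly created ushort buffer, and the rebuild variants do the same for ushort/uint while applying index_bias. A usage sketch for hardware without 8-bit index support; draw_converted_elts() is a hypothetical driver hook:

    static void draw_ubyte_elts(struct pipe_context *pipe,
                                struct pipe_resource *index_buffer,
                                int index_bias,
                                unsigned start, unsigned count)
    {
       struct pipe_resource *ib = index_buffer;

       /* allocates a 16-bit index buffer covering [start, start + count)
        * with index_bias applied, and points 'ib' at it */
       util_shorten_ubyte_elts(pipe, &ib, index_bias, start, count);

       /* the converted buffer starts at element 0 */
       draw_converted_elts(pipe, ib, 0, count);

       /* the helper does not release the buffer it created */
       if (ib != index_buffer)
          pipe_resource_reference(&ib, NULL);
    }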
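In u_pack_color.h, util_pack_z_stencil() now takes the stencil value as uint8_t and returns uint32_t, while the integer variants become util_pack_mask_z()/util_pack_mask_z_stencil() for building clear masks from raw values. A worked example for PIPE_FORMAT_Z24_UNORM_S8_USCALED, where depth occupies bits 0..23 and stencil bits 24..31; the result follows directly from the code above:

    static uint32_t zs_clear_value_example(void)
    {
       /* util_pack_z() returns 0x00ffffff for z == 1.0 with this format,
        * and the stencil byte is then shifted into bits 24..31,
        * giving 0xffffffff */
       return util_pack_z_stencil(PIPE_FORMAT_Z24_UNORM_S8_USCALED, 1.0, 0xff);
    }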
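u_sse.h picks up register-dump helpers (u_print_epi8/16/32, u_print_ps and the U_DUMP_* macros), an SSE2 fallback mm_mullo_epi32() built from _mm_mul_epu32(), and a 4x4 dword transpose. A small sketch of the multiply helper, assuming util/u_sse.h is included; scale_offsets() is hypothetical, and the caveat from the comment above (one operand expected non-negative) still applies:

    /* multiply four 32-bit lane offsets by a constant stride */
    static INLINE __m128i
    scale_offsets(__m128i offsets, int stride)
    {
       __m128i stride4 = _mm_set1_epi32(stride);

       return mm_mullo_epi32(offsets, stride4);
    }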